Jakub Gajski 2025-08-29 11:04:53 +07:00 committed by GitHub
commit 373291bea2
3 changed files with 134 additions and 10 deletions

Dockerfile_llamacpp_cpuonly

@@ -0,0 +1,95 @@
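# Stage 1: build llama.cpp as shared libraries with the OpenBLAS backend and install the Rust toolchain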
FROM ubuntu:24.04 AS deps
ARG llamacpp_version=b4827
ARG llamacpp_native=ON
ARG llamacpp_cpu_arm_arch=native
WORKDIR /opt/src
ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt upgrade -y && apt install -y \
clang \
cmake \
curl \
git \
python3-dev \
libssl-dev \
pkg-config \
tar \
libopenblas-dev \
libblas-dev \
liblapack-dev
ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
RUN mkdir -p llama.cpp \
&& tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \
&& cd llama.cpp \
&& cmake -B build \
-DCMAKE_INSTALL_PREFIX=/usr \
-DCMAKE_INSTALL_LIBDIR=/usr/lib \
-DCMAKE_C_COMPILER=clang \
-DCMAKE_CXX_COMPILER=clang++ \
-DGGML_NATIVE=${llamacpp_native} \
-DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS \
-DGGML_BACKEND_BLAS=ON \
-DBUILD_SHARED_LIBS=ON \
&& cmake --build build --parallel --config Release \
&& cmake --install build
WORKDIR /app
COPY rust-toolchain.toml rust-toolchain.toml
RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile minimal -y
ENV PATH="/root/.cargo/bin:$PATH"
RUN cargo install cargo-chef --locked
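# Stage 2: let cargo-chef compute the dependency recipe for the router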
FROM deps AS planner
COPY . .
RUN cargo chef prepare --recipe-path recipe.json
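# Stage 3: build the Rust dependencies from the recipe, then the router binary itself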
FROM deps AS builder
COPY --from=planner /app/recipe.json recipe.json
RUN cargo chef cook \
--recipe-path recipe.json \
--profile release \
--package text-generation-router-llamacpp
COPY . .
RUN cargo build \
--profile release \
--package text-generation-router-llamacpp --frozen
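# Stage 4: minimal runtime image with the Python tooling needed for GGUF conversion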
FROM ubuntu:24.04
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt upgrade -y && apt install -y \
python3-venv \
python3-pip \
libopenblas0 \
libblas3 \
liblapack3
RUN python3 -m venv /venv
ENV PATH="/venv/bin:$PATH"
COPY backends/llamacpp/requirements-cpu.txt requirements.txt
COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
# install torch manually to avoid automatic download of nvidia dependencies (leaner image)
RUN pip3 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu \
torch==2.8.0 \
&& pip3 install --no-cache-dir -r requirements.txt -e gguf-py
COPY --from=builder /usr/lib/libllama.so /usr/lib/
COPY --from=builder /usr/lib/libggml*.so /usr/lib/
COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENTRYPOINT ["text-generation-router-llamacpp"]

backends/llamacpp/requirements-cpu.txt

@@ -0,0 +1,4 @@
transformers[torch]==4.49
huggingface-hub==0.28.1
hf-transfer==0.1.9
# when changing transformers version, adjust torch version in Dockerfile_llamacpp_cpuonly

@@ -12,7 +12,7 @@ environments.
- Full compatibility with GGUF format and all quantization formats
(GGUF-related constraints may be mitigated dynamically by on-the-fly
generation in future updates)
- Optimized inference on CPU and GPU architectures, with two dedicated Dockerfiles
- Containerized deployment, eliminating dependency complexity
- Seamless interoperability with the Hugging Face ecosystem
@@ -24,13 +24,24 @@ You will find the best models on [Hugging Face][GGUF].
## Build Docker image
### For CPU-only inference
For optimal performance, the Docker image is compiled with native CPU
instructions by default. As a result, it is strongly recommended to run
the container on the same host architecture used during the build
process. Efforts are ongoing to improve portability across different
systems while preserving high computational efficiency.
To build the CPU-only Docker image, use the following command:
```bash
docker build \
-t tgi-llamacpp-cpu \
https://github.com/huggingface/text-generation-inference.git \
-f Dockerfile_llamacpp_cpuonly
```
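If the image must run on hosts other than the build machine, the portability concern described above can be mitigated by disabling native CPU detection through the `llamacpp_native` build argument (see the build parameters below), at some cost in peak performance. The following is only a sketch of such a build; on ARM, combine it with `llamacpp_cpu_arm_arch`, as in the Graviton4 example further down:

```bash
# Portable CPU-only build: disable native instruction selection
# (assumed trade-off: lower peak performance than a native build).
docker build \
    -t tgi-llamacpp-cpu \
    --build-arg llamacpp_native=OFF \
    https://github.com/huggingface/text-generation-inference.git \
    -f Dockerfile_llamacpp_cpuonly
```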
### For GPU-enabled inference
```bash
docker build \
@@ -41,13 +52,13 @@ docker build \
### Build parameters
| Parameter (with --build-arg)              | Description                      | CPU or GPU? |
| ----------------------------------------- | -------------------------------- | ----------- |
| `llamacpp_version=bXXXX`                  | Specific version of llama.cpp    | Both        |
| `llamacpp_cuda=ON`                        | Enables CUDA acceleration        | GPU         |
| `llamacpp_native=OFF`                     | Disables automatic CPU detection | Both        |
| `llamacpp_cpu_arm_arch=ARCH[+FEATURE]...` | Specific ARM CPU and features    | Both        |
| `cuda_arch=ARCH`                          | Defines target CUDA architecture | GPU         |
For example, to target Graviton4 when building on another ARM
architecture:
@@ -61,6 +72,20 @@ docker build \
-f Dockerfile_llamacpp
```
For example, to target a local CPU without GPU acceleration:
```bash
docker build \
-t tgi-llamacpp-cpu \
--build-arg llamacpp_native=ON \
https://github.com/huggingface/text-generation-inference.git \
-f Dockerfile_llamacpp_cpuonly
```
As a rule of thumb, if you do not need GPU acceleration, build the CPU-only image: it is significantly smaller (roughly 1.7 GB for Dockerfile_llamacpp_cpuonly vs. 17 GB for Dockerfile_llamacpp).
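To compare the sizes of the two images on your own machine (assuming both were built with the tags used in the examples above):

```bash
# List the locally built TGI llama.cpp images and their sizes
docker images | grep tgi-llamacpp
```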
## Run Docker image
### CPU-based inference
@@ -70,7 +95,7 @@ docker run \
-p 3000:3000 \
-e "HF_TOKEN=$HF_TOKEN" \
-v "$HOME/models:/app/models" \
tgi-llamacpp-cpu \
--model-id "Qwen/Qwen2.5-3B-Instruct"
```
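Once the container is up, a quick way to check that the server is answering is to send it a request. The snippet below is a minimal smoke test; it assumes the router exposes the standard TGI OpenAI-compatible chat endpoint on the published port 3000:

```bash
curl http://localhost:3000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "messages": [{"role": "user", "content": "What is deep learning?"}],
        "max_tokens": 32
    }'
```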