Jakub Gajski 2025-08-29 11:04:53 +07:00 committed by GitHub
commit 373291bea2
3 changed files with 134 additions and 10 deletions

Dockerfile_llamacpp_cpuonly

@@ -0,0 +1,95 @@
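# Stage 1: build llama.cpp as shared libraries with the OpenBLAS backend and install the Rust toolchain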
FROM ubuntu:24.04 AS deps
ARG llamacpp_version=b4827
ARG llamacpp_native=ON
ARG llamacpp_cpu_arm_arch=native
WORKDIR /opt/src
ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt upgrade -y && apt install -y \
clang \
cmake \
curl \
git \
python3-dev \
libssl-dev \
pkg-config \
tar \
libopenblas-dev \
libblas-dev \
liblapack-dev
ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
RUN mkdir -p llama.cpp \
&& tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \
&& cd llama.cpp \
&& cmake -B build \
-DCMAKE_INSTALL_PREFIX=/usr \
-DCMAKE_INSTALL_LIBDIR=/usr/lib \
-DCMAKE_C_COMPILER=clang \
-DCMAKE_CXX_COMPILER=clang++ \
-DGGML_NATIVE=${llamacpp_native} \
-DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS \
-DGGML_BACKEND_BLAS=ON \
-DBUILD_SHARED_LIBS=ON \
&& cmake --build build --parallel --config Release \
&& cmake --install build
WORKDIR /app
COPY rust-toolchain.toml rust-toolchain.toml
RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile minimal -y
ENV PATH="/root/.cargo/bin:$PATH"
RUN cargo install cargo-chef --locked
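# Stage 2: let cargo-chef compute the dependency recipe for the router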
FROM deps AS planner
COPY . .
RUN cargo chef prepare --recipe-path recipe.json
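# Stage 3: build the Rust dependencies from the recipe, then the router binary itself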
FROM deps AS builder
COPY --from=planner /app/recipe.json recipe.json
RUN cargo chef cook \
--recipe-path recipe.json \
--profile release \
--package text-generation-router-llamacpp
COPY . .
RUN cargo build \
--profile release \
--package text-generation-router-llamacpp --frozen
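# Stage 4: minimal runtime image with the Python tooling needed for GGUF conversion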
FROM ubuntu:24.04
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt upgrade -y && apt install -y \
python3-venv \
python3-pip \
libopenblas0 \
libblas3 \
liblapack3
RUN python3 -m venv /venv
ENV PATH="/venv/bin:$PATH"
COPY backends/llamacpp/requirements-cpu.txt requirements.txt
COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
# install torch manually to avoid automatic download of nvidia dependencies (leaner image)
RUN pip3 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu \
torch==2.8.0 \
&& pip3 install --no-cache-dir -r requirements.txt -e gguf-py
COPY --from=builder /usr/lib/libllama.so /usr/lib/
COPY --from=builder /usr/lib/libggml*.so /usr/lib/
COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENTRYPOINT ["text-generation-router-llamacpp"]

backends/llamacpp/requirements-cpu.txt

@@ -0,0 +1,4 @@
transformers[torch]==4.49
huggingface-hub==0.28.1
hf-transfer==0.1.9
# when changing transformers version, adjust torch version in Dockerfile_llamacpp_cpuonly

@@ -12,7 +12,7 @@ environments.
- Full compatibility with GGUF format and all quantization formats
(GGUF-related constraints may be mitigated dynamically by on-the-fly
generation in future updates)
- Optimized inference on CPU and GPU architectures, with two dedicated Dockerfiles
- Containerized deployment, eliminating dependency complexity
- Seamless interoperability with the Hugging Face ecosystem
@@ -24,13 +24,24 @@ You will find the best models on [Hugging Face][GGUF].
## Build Docker image
### For CPU-only inference
For optimal performance, the Docker image is compiled with native CPU
instructions by default. As a result, it is strongly recommended to run
the container on the same host architecture used during the build
process. Efforts are ongoing to improve portability across different
systems while preserving high computational efficiency.
To build the CPU-only Docker image, use the following command:
```bash
docker build \
-t tgi-llamacpp-cpu \
https://github.com/huggingface/text-generation-inference.git \
-f Dockerfile_llamacpp_cpuonly
```
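If the image must run on hosts other than the build machine, the portability concern described above can be mitigated by disabling native CPU detection through the `llamacpp_native` build argument (see the build parameters below), at some cost in peak performance. The following is only a sketch of such a build; on ARM, combine it with `llamacpp_cpu_arm_arch`, as in the Graviton4 example further down:

```bash
# Portable CPU-only build: disable native instruction selection
# (assumed trade-off: lower peak performance than a native build).
docker build \
    -t tgi-llamacpp-cpu \
    --build-arg llamacpp_native=OFF \
    https://github.com/huggingface/text-generation-inference.git \
    -f Dockerfile_llamacpp_cpuonly
```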
### For GPU-enabled inference
```bash
docker build \
@@ -41,13 +52,13 @@ docker build \
### Build parameters
| Parameter (with --build-arg)              | Description                      | CPU or GPU? |
| ----------------------------------------- | -------------------------------- | ----------- |
| `llamacpp_version=bXXXX`                  | Specific version of llama.cpp    | Both        |
| `llamacpp_cuda=ON`                        | Enables CUDA acceleration        | GPU         |
| `llamacpp_native=OFF`                     | Disables automatic CPU detection | Both        |
| `llamacpp_cpu_arm_arch=ARCH[+FEATURE]...` | Specific ARM CPU and features    | Both        |
| `cuda_arch=ARCH`                          | Defines target CUDA architecture | GPU         |
For example, to target Graviton4 when building on another ARM
architecture:
@@ -61,6 +72,20 @@ docker build \
-f Dockerfile_llamacpp
```
For example, to target a local CPU without GPU acceleration:
```bash
docker build \
-t tgi-llamacpp-cpu \
--build-arg llamacpp_native=ON \
https://github.com/huggingface/text-generation-inference.git \
-f Dockerfile_llamacpp_cpuonly
```
As a rule of thumb, if you do not need GPU acceleration, build the CPU-only image: it is significantly smaller (roughly 1.7 GB for Dockerfile_llamacpp_cpuonly vs. 17 GB for Dockerfile_llamacpp).
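To compare the sizes of the two images on your own machine (assuming both were built with the tags used in the examples above):

```bash
# List the locally built TGI llama.cpp images and their sizes
docker images | grep tgi-llamacpp
```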
## Run Docker image
### CPU-based inference
@@ -70,7 +95,7 @@ docker run \
-p 3000:3000 \
-e "HF_TOKEN=$HF_TOKEN" \
-v "$HOME/models:/app/models" \
tgi-llamacpp-cpu \
--model-id "Qwen/Qwen2.5-3B-Instruct"
```
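Once the container is up, a quick way to check that the server is answering is to send it a request. The snippet below is a minimal smoke test; it assumes the router exposes the standard TGI OpenAI-compatible chat endpoint on the published port 3000:

```bash
curl http://localhost:3000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "messages": [{"role": "user", "content": "What is deep learning?"}],
        "max_tokens": 32
    }'
```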