mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-09 03:14:53 +00:00

Merge b6795e041a into 06d9d88b95
This commit is contained in commit 373291bea2
Dockerfile_llamacpp_cpuonly (new file, 95 lines)
@@ -0,0 +1,95 @@
FROM ubuntu:24.04 AS deps

ARG llamacpp_version=b4827
ARG llamacpp_native=ON
ARG llamacpp_cpu_arm_arch=native

WORKDIR /opt/src

ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt upgrade -y && apt install -y \
    clang \
    cmake \
    curl \
    git \
    python3-dev \
    libssl-dev \
    pkg-config \
    tar \
    libopenblas-dev \
    libblas-dev \
    liblapack-dev

ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
RUN mkdir -p llama.cpp \
    && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \
    && cd llama.cpp \
    && cmake -B build \
        -DCMAKE_INSTALL_PREFIX=/usr \
        -DCMAKE_INSTALL_LIBDIR=/usr/lib \
        -DCMAKE_C_COMPILER=clang \
        -DCMAKE_CXX_COMPILER=clang++ \
        -DGGML_NATIVE=${llamacpp_native} \
        -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \
        -DLLAMA_BUILD_COMMON=OFF \
        -DLLAMA_BUILD_TESTS=OFF \
        -DLLAMA_BUILD_EXAMPLES=OFF \
        -DLLAMA_BUILD_SERVER=OFF \
        -DGGML_BLAS=ON \
        -DGGML_BLAS_VENDOR=OpenBLAS \
        -DGGML_BACKEND_BLAS=ON \
        -DBUILD_SHARED_LIBS=ON \
    && cmake --build build --parallel --config Release \
    && cmake --install build

WORKDIR /app
COPY rust-toolchain.toml rust-toolchain.toml
RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile minimal -y
ENV PATH="/root/.cargo/bin:$PATH"
RUN cargo install cargo-chef --locked

FROM deps AS planner
COPY . .
RUN cargo chef prepare --recipe-path recipe.json

FROM deps AS builder
COPY --from=planner /app/recipe.json recipe.json
RUN cargo chef cook \
    --recipe-path recipe.json \
    --profile release \
    --package text-generation-router-llamacpp
COPY . .
RUN cargo build \
    --profile release \
    --package text-generation-router-llamacpp --frozen

FROM ubuntu:24.04
WORKDIR /app

ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt upgrade -y && apt install -y \
    python3-venv \
    python3-pip \
    libopenblas0 \
    libblas3 \
    liblapack3

RUN python3 -m venv /venv
ENV PATH="/venv/bin:$PATH"

COPY backends/llamacpp/requirements-cpu.txt requirements.txt
COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/

# install torch manually to avoid automatic download of nvidia dependencies (leaner image)
RUN pip3 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu \
    torch==2.8.0 \
    && pip3 install --no-cache-dir -r requirements.txt -e gguf-py

COPY --from=builder /usr/lib/libllama.so /usr/lib/
COPY --from=builder /usr/lib/libggml*.so /usr/lib/
COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/

ENV HF_HUB_ENABLE_HF_TRANSFER=1

ENTRYPOINT ["text-generation-router-llamacpp"]
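For orientation (not part of the diff itself): a minimal sketch of building and running this image from a local checkout of the repository, assuming the default build arguments above; the image tag is illustrative, and the run flags mirror the documentation changes further below.

```bash
# Build the CPU-only image from the repository root (local checkout assumed).
docker build \
    -t tgi-llamacpp-cpu \
    -f Dockerfile_llamacpp_cpuonly \
    .

# Run the router on port 3000, mounting a local model cache.
docker run \
    -p 3000:3000 \
    -e "HF_TOKEN=$HF_TOKEN" \
    -v "$HOME/models:/app/models" \
    tgi-llamacpp-cpu \
    --model-id "Qwen/Qwen2.5-3B-Instruct"
```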
backends/llamacpp/requirements-cpu.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
transformers[torch]==4.49
huggingface-hub==0.28.1
hf-transfer==0.1.9
# when changing transformers version, adjust torch version in Dockerfile_llamacpp_cpuonly
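Not part of the diff: a hedged sketch of reproducing this environment outside Docker by mirroring the pip steps from Dockerfile_llamacpp_cpuonly; the venv path is illustrative, and the image additionally installs llama.cpp's gguf-py package from its source tree, which is omitted here.

```bash
# Create an isolated environment (path is illustrative).
python3 -m venv ~/tgi-cpu-venv
. ~/tgi-cpu-venv/bin/activate

# Install CPU-only torch first so no NVIDIA wheels are pulled in,
# then the pinned requirements from this file.
pip3 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu torch==2.8.0
pip3 install --no-cache-dir -r backends/llamacpp/requirements-cpu.txt
```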
@@ -12,7 +12,7 @@ environments.
 - Full compatibility with GGUF format and all quantization formats
   (GGUF-related constraints may be mitigated dynamically by on-the-fly
   generation in future updates)
-- Optimized inference on CPU and GPU architectures
+- Optimized inference on CPU and GPU architectures with two dedicated Dockerfiles
 - Containerized deployment, eliminating dependency complexity
 - Seamless interoperability with the Hugging Face ecosystem
 
@@ -24,13 +24,24 @@ You will find the best models on [Hugging Face][GGUF].
 
 ## Build Docker image
 
+### For CPU-only inference
+
 For optimal performance, the Docker image is compiled with native CPU
 instructions by default. As a result, it is strongly recommended to run
 the container on the same host architecture used during the build
 process. Efforts are ongoing to improve portability across different
 systems while preserving high computational efficiency.
 
-To build the Docker image, use the following command:
+To build the Docker image for CPU, use the following command:
 
+```bash
+docker build \
+    -t tgi-llamacpp-cpu \
+    https://github.com/huggingface/text-generation-inference.git \
+    -f Dockerfile_llamacpp_cpuonly
+```
+
+### For GPU-enabled inference
+
 ```bash
 docker build \
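Not part of the diff: if the image must run on hosts that differ from the build machine, a hedged variant of the CPU build above disables host-specific instructions via the `llamacpp_native=OFF` parameter documented in the table further below.

```bash
# Portable CPU-only build: skip automatic detection of the build host's CPU.
docker build \
    -t tgi-llamacpp-cpu \
    --build-arg llamacpp_native=OFF \
    https://github.com/huggingface/text-generation-inference.git \
    -f Dockerfile_llamacpp_cpuonly
```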
@@ -41,13 +52,13 @@ docker build \
 
 ### Build parameters
 
-| Parameter (with --build-arg)               | Description                      |
-| ------------------------------------------ | -------------------------------- |
-| `llamacpp_version=bXXXX`                   | Specific version of llama.cpp    |
-| `llamacpp_cuda=ON`                         | Enables CUDA acceleration        |
-| `llamacpp_native=OFF`                      | Disable automatic CPU detection  |
-| `llamacpp_cpu_arm_arch=ARCH[+FEATURE]...`  | Specific ARM CPU and features    |
-| `cuda_arch=ARCH`                           | Defines target CUDA architecture |
+| Parameter (with --build-arg)               | Description                      | CPU or GPU? |
+| ------------------------------------------ | -------------------------------- | ----------- |
+| `llamacpp_version=bXXXX`                   | Specific version of llama.cpp    | Both        |
+| `llamacpp_cuda=ON`                         | Enables CUDA acceleration        | GPU         |
+| `llamacpp_native=OFF`                      | Disable automatic CPU detection  | Both        |
+| `llamacpp_cpu_arm_arch=ARCH[+FEATURE]...`  | Specific ARM CPU and features    | Both        |
+| `cuda_arch=ARCH`                           | Defines target CUDA architecture | GPU         |
 
 For example, to target Graviton4 when building on another ARM
 architecture:
@@ -61,6 +72,20 @@ docker build \
     -f Dockerfile_llamacpp
 ```
 
+For example, to target a local CPU without GPU acceleration:
+
+```bash
+docker build \
+    -t tgi-llamacpp-cpu \
+    --build-arg llamacpp_native=ON \
+    https://github.com/huggingface/text-generation-inference.git \
+    -f Dockerfile_llamacpp_cpuonly
+```
+
+As a rule of thumb, if you are not interested in GPU acceleration,
+you should build the CPU-only image, which is significantly smaller
+(Dockerfile_llamacpp_cpuonly - 1.7GB vs Dockerfile_llamacpp - 17GB).
+
 ## Run Docker image
 
 ### CPU-based inference
@@ -70,7 +95,7 @@ docker run \
     -p 3000:3000 \
     -e "HF_TOKEN=$HF_TOKEN" \
     -v "$HOME/models:/app/models" \
-    tgi-llamacpp \
+    tgi-llamacpp-cpu \
     --model-id "Qwen/Qwen2.5-3B-Instruct"
 ```
 
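Not part of the diff: once the container reports ready, a hedged example of querying it, assuming the standard TGI OpenAI-compatible chat route on port 3000 is exposed by this backend.

```bash
# Send a chat request to the router started above; payload fields follow the
# OpenAI-compatible API that TGI exposes (assumed available in this backend).
curl http://localhost:3000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "Qwen/Qwen2.5-3B-Instruct",
          "messages": [{"role": "user", "content": "What is deep learning?"}],
          "max_tokens": 64
        }'
```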