Merge b6795e041a into 06d9d88b95
commit 373291bea2
Dockerfile_llamacpp_cpuonly (new file, 95 lines)
@@ -0,0 +1,95 @@
FROM ubuntu:24.04 AS deps

ARG llamacpp_version=b4827
ARG llamacpp_native=ON
ARG llamacpp_cpu_arm_arch=native

WORKDIR /opt/src

ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt upgrade -y && apt install -y \
    clang \
    cmake \
    curl \
    git \
    python3-dev \
    libssl-dev \
    pkg-config \
    tar \
    libopenblas-dev \
    libblas-dev \
    liblapack-dev

ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
RUN mkdir -p llama.cpp \
    && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \
    && cd llama.cpp \
    && cmake -B build \
        -DCMAKE_INSTALL_PREFIX=/usr \
        -DCMAKE_INSTALL_LIBDIR=/usr/lib \
        -DCMAKE_C_COMPILER=clang \
        -DCMAKE_CXX_COMPILER=clang++ \
        -DGGML_NATIVE=${llamacpp_native} \
        -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \
        -DLLAMA_BUILD_COMMON=OFF \
        -DLLAMA_BUILD_TESTS=OFF \
        -DLLAMA_BUILD_EXAMPLES=OFF \
        -DLLAMA_BUILD_SERVER=OFF \
        -DGGML_BLAS=ON \
        -DGGML_BLAS_VENDOR=OpenBLAS \
        -DGGML_BACKEND_BLAS=ON \
        -DBUILD_SHARED_LIBS=ON \
    && cmake --build build --parallel --config Release \
    && cmake --install build

WORKDIR /app
COPY rust-toolchain.toml rust-toolchain.toml
RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile minimal -y
ENV PATH="/root/.cargo/bin:$PATH"
RUN cargo install cargo-chef --locked

FROM deps AS planner
COPY . .
RUN cargo chef prepare --recipe-path recipe.json

FROM deps AS builder
COPY --from=planner /app/recipe.json recipe.json
RUN cargo chef cook \
    --recipe-path recipe.json \
    --profile release \
    --package text-generation-router-llamacpp
COPY . .
RUN cargo build \
    --profile release \
    --package text-generation-router-llamacpp --frozen

FROM ubuntu:24.04
WORKDIR /app

ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt upgrade -y && apt install -y \
    python3-venv \
    python3-pip \
    libopenblas0 \
    libblas3 \
    liblapack3

RUN python3 -m venv /venv
ENV PATH="/venv/bin:$PATH"

COPY backends/llamacpp/requirements-cpu.txt requirements.txt
COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/

# install torch manually to avoid automatic download of nvidia dependencies (leaner image)
RUN pip3 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu \
    torch==2.8.0 \
    && pip3 install --no-cache-dir -r requirements.txt -e gguf-py

COPY --from=builder /usr/lib/libllama.so /usr/lib/
COPY --from=builder /usr/lib/libggml*.so /usr/lib/
COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/

ENV HF_HUB_ENABLE_HF_TRANSFER=1

ENTRYPOINT ["text-generation-router-llamacpp"]
backends/llamacpp/requirements-cpu.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
transformers[torch]==4.49
huggingface-hub==0.28.1
hf-transfer==0.1.9
# when changing transformers version, adjust torch version in Dockerfile_llamacpp_cpuonly
llama.cpp backend documentation (modified)
@@ -12,7 +12,7 @@ environments.
- Full compatibility with GGUF format and all quantization formats
  (GGUF-related constraints may be mitigated dynamically by on-the-fly
  generation in future updates)
- Optimized inference on CPU and GPU architectures
- Optimized inference on CPU and GPU architectures with two dedicated Dockerfiles
- Containerized deployment, eliminating dependency complexity
- Seamless interoperability with the Hugging Face ecosystem

@@ -24,13 +24,24 @@ You will find the best models on [Hugging Face][GGUF].

## Build Docker image

### For CPU-only inference

For optimal performance, the Docker image is compiled with native CPU
instructions by default. As a result, it is strongly recommended to run
the container on the same host architecture used during the build
process. Efforts are ongoing to improve portability across different
systems while preserving high computational efficiency.

To build the Docker image, use the following command:
To build the Docker image for CPU, use the following command:

```bash
docker build \
    -t tgi-llamacpp-cpu \
    https://github.com/huggingface/text-generation-inference.git \
    -f Dockerfile_llamacpp_cpuonly
```
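
The same Dockerfile accepts the build arguments listed in the table below. For instance, if the resulting image has to run on hosts other than the build machine, automatic CPU detection can be disabled; a rough sketch of such an invocation, trading some CPU-specific optimization for portability:

```bash
# More portable CPU-only build: llamacpp_native maps to GGML_NATIVE in the Dockerfile.
# On ARM hosts, also pass --build-arg llamacpp_cpu_arm_arch=<arch> (see the Graviton4 example below).
docker build \
    -t tgi-llamacpp-cpu \
    --build-arg llamacpp_native=OFF \
    https://github.com/huggingface/text-generation-inference.git \
    -f Dockerfile_llamacpp_cpuonly
```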

### For GPU-enabled inference

```bash
docker build \

@@ -41,13 +52,13 @@ docker build \

### Build parameters

| Parameter (with --build-arg)               | Description                      |
| ------------------------------------------ | -------------------------------- |
| `llamacpp_version=bXXXX`                   | Specific version of llama.cpp    |
| `llamacpp_cuda=ON`                         | Enables CUDA acceleration        |
| `llamacpp_native=OFF`                      | Disable automatic CPU detection  |
| `llamacpp_cpu_arm_arch=ARCH[+FEATURE]...`  | Specific ARM CPU and features    |
| `cuda_arch=ARCH`                           | Defines target CUDA architecture |

| Parameter (with --build-arg)               | Description                      | CPU or GPU? |
| ------------------------------------------ | -------------------------------- | ----------- |
| `llamacpp_version=bXXXX`                   | Specific version of llama.cpp    | Both        |
| `llamacpp_cuda=ON`                         | Enables CUDA acceleration        | GPU         |
| `llamacpp_native=OFF`                      | Disable automatic CPU detection  | Both        |
| `llamacpp_cpu_arm_arch=ARCH[+FEATURE]...`  | Specific ARM CPU and features    | Both        |
| `cuda_arch=ARCH`                           | Defines target CUDA architecture | GPU         |

For example, to target Graviton4 when building on another ARM
architecture:

@@ -61,6 +72,20 @@ docker build \
    -f Dockerfile_llamacpp
```

For example, to target a local CPU without GPU acceleration:

```bash
docker build \
    -t tgi-llamacpp-cpu \
    --build-arg llamacpp_native=ON \
    https://github.com/huggingface/text-generation-inference.git \
    -f Dockerfile_llamacpp_cpuonly
```
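
The other build arguments work the same way. For example, to pin the llama.cpp release instead of relying on the default baked into the Dockerfile, something like this should work (b4827 is simply the current default tag, shown for illustration):

```bash
# Pin the llama.cpp tag used for the CPU-only build; substitute any released tag.
docker build \
    -t tgi-llamacpp-cpu \
    --build-arg llamacpp_version=b4827 \
    https://github.com/huggingface/text-generation-inference.git \
    -f Dockerfile_llamacpp_cpuonly
```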

As a rule of thumb, if you are not interested in GPU acceleration,
you should build the CPU-only image, which is significantly smaller
(Dockerfile_llamacpp_cpuonly is about 1.7GB, vs 17GB for Dockerfile_llamacpp).
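
To check the difference locally once both images have been built, a simple listing is enough (assuming the image tags used in the examples above):

```bash
# Show the locally built images and their sizes
docker images | grep tgi-llamacpp
```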

## Run Docker image

### CPU-based inference

@@ -70,7 +95,7 @@ docker run \
    -p 3000:3000 \
    -e "HF_TOKEN=$HF_TOKEN" \
    -v "$HOME/models:/app/models" \
    tgi-llamacpp \
    tgi-llamacpp-cpu \
    --model-id "Qwen/Qwen2.5-3B-Instruct"
```
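
Once the router reports it is ready, the endpoint can be smoke-tested from the host. A minimal check, assuming the container exposes TGI's standard `/generate` route on the mapped port 3000:

```bash
# Send a single generation request to the container started above
curl -s http://localhost:3000/generate \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is deep learning?", "parameters": {"max_new_tokens": 32}}'
```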