mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-09 03:14:53 +00:00

Merge b6795e041a into 06d9d88b95
This commit is contained in commit 373291bea2
Dockerfile_llamacpp_cpuonly (new file, 95 lines)
@@ -0,0 +1,95 @@
FROM ubuntu:24.04 AS deps

ARG llamacpp_version=b4827
ARG llamacpp_native=ON
ARG llamacpp_cpu_arm_arch=native

WORKDIR /opt/src

ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt upgrade -y && apt install -y \
    clang \
    cmake \
    curl \
    git \
    python3-dev \
    libssl-dev \
    pkg-config \
    tar \
    libopenblas-dev \
    libblas-dev \
    liblapack-dev

ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
RUN mkdir -p llama.cpp \
    && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \
    && cd llama.cpp \
    && cmake -B build \
        -DCMAKE_INSTALL_PREFIX=/usr \
        -DCMAKE_INSTALL_LIBDIR=/usr/lib \
        -DCMAKE_C_COMPILER=clang \
        -DCMAKE_CXX_COMPILER=clang++ \
        -DGGML_NATIVE=${llamacpp_native} \
        -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \
        -DLLAMA_BUILD_COMMON=OFF \
        -DLLAMA_BUILD_TESTS=OFF \
        -DLLAMA_BUILD_EXAMPLES=OFF \
        -DLLAMA_BUILD_SERVER=OFF \
        -DGGML_BLAS=ON \
        -DGGML_BLAS_VENDOR=OpenBLAS \
        -DGGML_BACKEND_BLAS=ON \
        -DBUILD_SHARED_LIBS=ON \
    && cmake --build build --parallel --config Release \
    && cmake --install build

WORKDIR /app
COPY rust-toolchain.toml rust-toolchain.toml
RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile minimal -y
ENV PATH="/root/.cargo/bin:$PATH"
RUN cargo install cargo-chef --locked

FROM deps AS planner
COPY . .
RUN cargo chef prepare --recipe-path recipe.json

FROM deps AS builder
COPY --from=planner /app/recipe.json recipe.json
RUN cargo chef cook \
    --recipe-path recipe.json \
    --profile release \
    --package text-generation-router-llamacpp
COPY . .
RUN cargo build \
    --profile release \
    --package text-generation-router-llamacpp --frozen

FROM ubuntu:24.04
WORKDIR /app

ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt upgrade -y && apt install -y \
    python3-venv \
    python3-pip \
    libopenblas0 \
    libblas3 \
    liblapack3

RUN python3 -m venv /venv
ENV PATH="/venv/bin:$PATH"

COPY backends/llamacpp/requirements-cpu.txt requirements.txt
COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/

# install torch manually to avoid automatic download of nvidia dependencies (leaner image)
RUN pip3 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu \
    torch==2.8.0 \
    && pip3 install --no-cache-dir -r requirements.txt -e gguf-py

COPY --from=builder /usr/lib/libllama.so /usr/lib/
COPY --from=builder /usr/lib/libggml*.so /usr/lib/
COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/

ENV HF_HUB_ENABLE_HF_TRANSFER=1

ENTRYPOINT ["text-generation-router-llamacpp"]
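For orientation (not part of the diff itself): a minimal sketch of building and running this image from a local checkout of the repository, assuming the default build arguments above; the image tag is illustrative, and the run flags mirror the documentation changes further below.

```bash
# Build the CPU-only image from the repository root (local checkout assumed).
docker build \
    -t tgi-llamacpp-cpu \
    -f Dockerfile_llamacpp_cpuonly \
    .

# Run the router on port 3000, mounting a local model cache.
docker run \
    -p 3000:3000 \
    -e "HF_TOKEN=$HF_TOKEN" \
    -v "$HOME/models:/app/models" \
    tgi-llamacpp-cpu \
    --model-id "Qwen/Qwen2.5-3B-Instruct"
```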
backends/llamacpp/requirements-cpu.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
transformers[torch]==4.49
huggingface-hub==0.28.1
hf-transfer==0.1.9
# when changing transformers version, adjust torch version in Dockerfile_llamacpp_cpuonly
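Not part of the diff: a hedged sketch of reproducing this environment outside Docker by mirroring the pip steps from Dockerfile_llamacpp_cpuonly; the venv path is illustrative, and the image additionally installs llama.cpp's gguf-py package from its source tree, which is omitted here.

```bash
# Create an isolated environment (path is illustrative).
python3 -m venv ~/tgi-cpu-venv
. ~/tgi-cpu-venv/bin/activate

# Install CPU-only torch first so no NVIDIA wheels are pulled in,
# then the pinned requirements from this file.
pip3 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu torch==2.8.0
pip3 install --no-cache-dir -r backends/llamacpp/requirements-cpu.txt
```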
@@ -12,7 +12,7 @@ environments.
 - Full compatibility with GGUF format and all quantization formats
   (GGUF-related constraints may be mitigated dynamically by on-the-fly
   generation in future updates)
-- Optimized inference on CPU and GPU architectures
+- Optimized inference on CPU and GPU architectures with two dedicated Dockerfiles
 - Containerized deployment, eliminating dependency complexity
 - Seamless interoperability with the Hugging Face ecosystem
 
@@ -24,13 +24,24 @@ You will find the best models on [Hugging Face][GGUF].
 
 ## Build Docker image
 
+### For CPU-only inference
+
 For optimal performance, the Docker image is compiled with native CPU
 instructions by default. As a result, it is strongly recommended to run
 the container on the same host architecture used during the build
 process. Efforts are ongoing to improve portability across different
 systems while preserving high computational efficiency.
 
-To build the Docker image, use the following command:
+To build the Docker image for CPU, use the following command:
 
+```bash
+docker build \
+    -t tgi-llamacpp-cpu \
+    https://github.com/huggingface/text-generation-inference.git \
+    -f Dockerfile_llamacpp_cpuonly
+```
+
+### For GPU-enabled inference
+
 ```bash
 docker build \
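Not part of the diff: if the image must run on hosts that differ from the build machine, a hedged variant of the CPU build above disables host-specific instructions via the `llamacpp_native=OFF` parameter documented in the table further below.

```bash
# Portable CPU-only build: skip automatic detection of the build host's CPU.
docker build \
    -t tgi-llamacpp-cpu \
    --build-arg llamacpp_native=OFF \
    https://github.com/huggingface/text-generation-inference.git \
    -f Dockerfile_llamacpp_cpuonly
```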
@@ -41,13 +52,13 @@ docker build \
 
 ### Build parameters
 
-| Parameter (with --build-arg)               | Description                      |
-| ------------------------------------------ | -------------------------------- |
-| `llamacpp_version=bXXXX`                   | Specific version of llama.cpp    |
-| `llamacpp_cuda=ON`                         | Enables CUDA acceleration        |
-| `llamacpp_native=OFF`                      | Disable automatic CPU detection  |
-| `llamacpp_cpu_arm_arch=ARCH[+FEATURE]...`  | Specific ARM CPU and features    |
-| `cuda_arch=ARCH`                           | Defines target CUDA architecture |
+| Parameter (with --build-arg)               | Description                      | CPU or GPU? |
+| ------------------------------------------ | -------------------------------- | ----------- |
+| `llamacpp_version=bXXXX`                   | Specific version of llama.cpp    | Both        |
+| `llamacpp_cuda=ON`                         | Enables CUDA acceleration        | GPU         |
+| `llamacpp_native=OFF`                      | Disable automatic CPU detection  | Both        |
+| `llamacpp_cpu_arm_arch=ARCH[+FEATURE]...`  | Specific ARM CPU and features    | Both        |
+| `cuda_arch=ARCH`                           | Defines target CUDA architecture | GPU         |
 
 For example, to target Graviton4 when building on another ARM
 architecture:
@@ -61,6 +72,20 @@ docker build \
     -f Dockerfile_llamacpp
 ```
 
+For example, to target a local CPU without GPU acceleration:
+
+```bash
+docker build \
+    -t tgi-llamacpp-cpu \
+    --build-arg llamacpp_native=ON \
+    https://github.com/huggingface/text-generation-inference.git \
+    -f Dockerfile_llamacpp_cpuonly
+```
+
+As a rule of thumb, if you are not interested in GPU acceleration,
+you should build the CPU-only image, which is significantly smaller
+(Dockerfile_llamacpp_cpuonly - 1.7GB vs Dockerfile_llamacpp - 17GB).
+
 ## Run Docker image
 
 ### CPU-based inference
@@ -70,7 +95,7 @@ docker run \
     -p 3000:3000 \
     -e "HF_TOKEN=$HF_TOKEN" \
     -v "$HOME/models:/app/models" \
-    tgi-llamacpp \
+    tgi-llamacpp-cpu \
     --model-id "Qwen/Qwen2.5-3B-Instruct"
 ```
 
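Not part of the diff: once the container reports ready, a hedged example of querying it, assuming the standard TGI OpenAI-compatible chat route on port 3000 is exposed by this backend.

```bash
# Send a chat request to the router started above; payload fields follow the
# OpenAI-compatible API that TGI exposes (assumed available in this backend).
curl http://localhost:3000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "Qwen/Qwen2.5-3B-Instruct",
          "messages": [{"role": "user", "content": "What is deep learning?"}],
          "max_tokens": 64
        }'
```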