Update Dockerfile_llamacpp

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
Adrien Gallouët 2025-02-04 12:34:02 +00:00
parent d883109df6
commit df2a4fbb8a
2 changed files with 42 additions and 47 deletions

Dockerfile_llamacpp

@@ -1,41 +1,27 @@
-ARG llama_version=b4623
-ARG llama_hardware_target=cpu
+FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS deps
+
+ARG llama_version=b4628
 ARG llama_cuda_arch=75-real;80-real;86-real;89-real;90-real
 
-FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS base
+WORKDIR /opt/src
 
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt update && apt install -y \
-    python3-venv \
-    python3-pip
-
-RUN python3 -m venv /venv
-ENV PATH="/venv/bin:$PATH"
-RUN pip3 install --no-cache-dir transformers
-
-FROM base AS deps
-WORKDIR /opt/src
-
-RUN apt install -y \
     clang \
     cmake \
     curl \
     git \
+    python3-dev \
     libssl-dev \
     pkg-config \
     tar
 
-FROM deps AS llamacpp-builder
-ARG llama_version
-ARG llama_cuda_arch
-ENV LLAMA_VERSION=${llama_version}
-
-ADD https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_VERSION}.tar.gz /opt/src/
-RUN tar -xzf ${LLAMA_VERSION}.tar.gz && \
-    cd llama.cpp-${LLAMA_VERSION} && \
-    cmake \
-        -B build \
-        -DCMAKE_INSTALL_PREFIX=/usr/llama \
+ADD https://github.com/ggerganov/llama.cpp/archive/refs/tags/${llama_version}.tar.gz /opt/src/
+RUN tar -xzf ${llama_version}.tar.gz \
+ && cd llama.cpp-${llama_version} \
+ && cmake -B build \
+        -DCMAKE_INSTALL_PREFIX=/usr \
+        -DCMAKE_INSTALL_LIBDIR=/usr/lib \
         -DCMAKE_C_COMPILER=clang \
         -DCMAKE_CXX_COMPILER=clang++ \
         -DCMAKE_CUDA_ARCHITECTURES=${llama_cuda_arch} \
@@ -44,44 +30,49 @@ RUN tar -xzf ${LLAMA_VERSION}.tar.gz && \
         -DLLAMA_BUILD_TESTS=OFF \
         -DLLAMA_BUILD_EXAMPLES=OFF \
         -DLLAMA_BUILD_SERVER=OFF \
- && cmake --build build --parallel --config Release -j \
+ && cmake --build build --parallel --config Release \
  && cmake --install build
 
-FROM deps AS rust-builder
+WORKDIR /app
 COPY rust-toolchain.toml rust-toolchain.toml
 RUN curl -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain none
 ENV PATH="/root/.cargo/bin:$PATH"
+RUN cargo install cargo-chef --locked
+
+FROM deps AS planner
 COPY . .
-COPY --from=llamacpp-builder /usr/llama/lib/ /usr/lib/
-COPY --from=llamacpp-builder /usr/llama/include/ /usr/include/
+RUN cargo chef prepare --recipe-path recipe.json
 
-ARG llama_hardware_target
-ENV TGI_LLAMA_HARDWARE_TARGET=${llama_hardware_target}
-RUN export TGI_LIB_SEARCH_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs && \
-    ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
-    cargo build \
+FROM deps AS builder
+COPY --from=planner /app/recipe.json recipe.json
+RUN cargo chef cook \
+    --recipe-path recipe.json \
     --profile release-opt \
     --package text-generation-router-llamacpp
+COPY . .
+ENV TGI_LLAMA_PKG_CUDA=cuda-12.6
+RUN cargo build \
+    --profile release-opt \
+    --package text-generation-router-llamacpp --frozen
+
+# fix libcuda.so.1 ?
+RUN cp "$(pkg-config --variable=libdir cuda-12.6)"/stubs/libcuda.so /usr/lib/libcuda.so.1
 
 FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04
-WORKDIR /usr/bin
-ENV DEBIAN_FRONTEND=noninteractive
-ENV PATH="/venv/bin:$PATH"
+
 RUN apt update && apt install -y \
-    openssl \
     python3-venv \
     python3-pip
-RUN python3 -m venv /venv && \
-    pip3 install --no-cache-dir -r transformers
-COPY --from=llamacpp-builder /usr/llama/lib/ /usr/lib/
-COPY --from=llamacpp-builder /usr/llama/include/ /usr/include/
-COPY --from=llamacpp-builder /usr/llama/bin/ /usr/bin/
-COPY --from=rust-builder /opt/src/target/release-opt/text-generation-router-llamacpp /usr/bin/text-generation-launcher
-ENTRYPOINT ["text-generation-launcher"]
+
+RUN python3 -m venv /venv
+ENV PATH="/venv/bin:$PATH"
+
+COPY backends/llamacpp/requirements.txt requirements.txt
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+COPY --from=builder /usr/lib/libllama.so /usr/lib/
+COPY --from=builder /usr/lib/libggml*.so /usr/lib/
+COPY --from=builder /usr/lib/libcuda.so.1 /usr/lib/
+COPY --from=builder /app/target/release-opt/text-generation-router-llamacpp /usr/bin/
+ENTRYPOINT ["text-generation-router-llamacpp"]

backends/llamacpp/build.rs

@@ -21,6 +21,7 @@ fn inject_transient_dependencies(lib_search_path: Option<&str>, lib_target_hardware: &str) {
 }
 
 fn main() {
+    let pkg_cuda = option_env!("TGI_LLAMA_PKG_CUDA");
     let lib_search_path = option_env!("TGI_LLAMA_LD_LIBRARY_PATH");
     let lib_target_hardware = option_env!("TGI_LLAMA_HARDWARE_TARGET").unwrap_or("cpu");
 
@@ -36,6 +37,9 @@ fn main() {
         .write_to_file(out_path.join("bindings.rs"))
         .expect("Couldn't write bindings!");
 
+    if let Some(pkg_cuda) = pkg_cuda {
+        pkg_config::Config::new().probe(pkg_cuda).unwrap();
+    }
     pkg_config::Config::new().probe("llama").unwrap();
 
     inject_transient_dependencies(lib_search_path, lib_target_hardware);
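
The build-script side mirrors the Dockerfile's `ENV TGI_LLAMA_PKG_CUDA=cuda-12.6`: when that variable is present while build.rs is compiled, the pkg_config crate resolves the CUDA toolkit's .pc file (the same cuda-12.6 package the Dockerfile queries with `pkg-config --variable=libdir`) and emits the link directives cargo needs, before `llama` itself is probed. A condensed, standalone sketch of that logic, assuming the pkg-config crate is declared as a build-dependency:

    // build.rs (condensed sketch, not the full script)
    fn main() {
        // option_env! is read when this script is compiled, so the Dockerfile's
        // `ENV TGI_LLAMA_PKG_CUDA=cuda-12.6` selects the CUDA flavor; when it
        // is unset (e.g. a CPU-only build) the probe is skipped entirely.
        if let Some(pkg_cuda) = option_env!("TGI_LLAMA_PKG_CUDA") {
            // Resolves cuda-12.6.pc and prints cargo:rustc-link-search= /
            // cargo:rustc-link-lib= lines, so no CUDA paths are hard-coded.
            pkg_config::Config::new().probe(pkg_cuda).unwrap();
        }
        // llama.pc comes from `cmake --install` in the deps stage
        // (CMAKE_INSTALL_PREFIX=/usr, CMAKE_INSTALL_LIBDIR=/usr/lib).
        pkg_config::Config::new().probe("llama").unwrap();
    }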