diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp
index 4b7dacd35..20c560f7e 100644
--- a/Dockerfile_llamacpp
+++ b/Dockerfile_llamacpp
@@ -18,8 +18,9 @@ RUN apt update && apt install -y \
     tar

 ADD https://github.com/ggerganov/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
-RUN tar -xzf ${llamacpp_version}.tar.gz \
-    && cd llama.cpp-${llamacpp_version} \
+RUN mkdir -p llama.cpp \
+    && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \
+    && cd llama.cpp \
     && cmake -B build \
         -DCMAKE_INSTALL_PREFIX=/usr \
         -DCMAKE_INSTALL_LIBDIR=/usr/lib \
@@ -27,9 +28,7 @@ RUN tar -xzf ${llamacpp_version}.tar.gz \
         -DCMAKE_CXX_COMPILER=clang++ \
         -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
         -DGGML_CUDA=${llamacpp_cuda} \
-        -DLLAMA_BUILD_COMMON=OFF \
         -DLLAMA_BUILD_TESTS=OFF \
-        -DLLAMA_BUILD_EXAMPLES=OFF \
         -DLLAMA_BUILD_SERVER=OFF \
     && cmake --build build --parallel --config Release \
     && cmake --install build
@@ -56,7 +55,9 @@ RUN cargo build \
     --package text-generation-router-llamacpp --frozen

 FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04
+WORKDIR /app
+ENV DEBIAN_FRONTEND=noninteractive

 RUN apt update && apt install -y \
     python3-venv \
     python3-pip
@@ -65,12 +66,21 @@ RUN python3 -m venv /venv
 ENV PATH="/venv/bin:$PATH"

 COPY backends/llamacpp/requirements.txt requirements.txt
-RUN pip3 install --no-cache-dir -r requirements.txt
+COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
+COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
+COPY --from=builder /usr/bin/llama-quantize /usr/bin/
+
+RUN pip3 install --no-cache-dir \
+    -r requirements.txt \
+    -e gguf-py

 COPY --from=builder /usr/lib/libllama.so /usr/lib/
 COPY --from=builder /usr/lib/libggml*.so /usr/lib/
 COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/

+COPY backends/llamacpp/make-gguf.sh make-gguf.sh
+ENV MAKE_GGUF=./make-gguf.sh
+
 ENV HF_HUB_ENABLE_HF_TRANSFER=1

 ENTRYPOINT ["text-generation-router-llamacpp"]
diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml
index 18c2ed0a8..555ad2ff8 100644
--- a/backends/llamacpp/Cargo.toml
+++ b/backends/llamacpp/Cargo.toml
@@ -16,6 +16,6 @@ num_cpus = "1.16.0"
 text-generation-router = { path = "../../router" }
 thiserror = "2.0.11"
 tokenizers.workspace = true
-tokio = "1.43.0"
+tokio = { version = "1.43.0", features = ["process"] }
 tokio-stream = "0.1.17"
 tracing = "0.1.41"
diff --git a/backends/llamacpp/make-gguf.sh b/backends/llamacpp/make-gguf.sh
new file mode 100755
index 000000000..713330ff0
--- /dev/null
+++ b/backends/llamacpp/make-gguf.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+[ "$#" -ge 2 ] || {
+    echo "Usage: $0 <gguf> <model_id> [<revision>]" >&2
+    return 1
+}
+
+GGUF="$1"
+GGUF_DIR=$(dirname "$GGUF")
+GGUF_TMP="model.src/tmp.gguf"
+MODEL_ID="$2"
+MODEL_DIR="model.src/$2"
+REV="${3-main}"
+
+[ -e "$GGUF" ] && return
+
+mkdir -p model.src "$GGUF_DIR"
+
+huggingface-cli download \
+    --revision "$REV" \
+    --local-dir "$MODEL_DIR" \
+    "$MODEL_ID" &&
+
+convert_hf_to_gguf.py \
+    --outfile "$GGUF_TMP" \
+    "$MODEL_DIR" &&
+
+llama-quantize \
+    "$GGUF_TMP" \
+    "$GGUF" \
+    "Q4_0"
+
+rm -rf model.src
diff --git a/backends/llamacpp/requirements.txt b/backends/llamacpp/requirements.txt
index d19c9e5bd..293cd2055 100644
--- a/backends/llamacpp/requirements.txt
+++ b/backends/llamacpp/requirements.txt
@@ -1,3 +1,4 @@
 transformers==4.49
 huggingface-hub==0.28.1
 hf-transfer==0.1.9
+torch==2.6.0
diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs
index 5a07acdcd..fe22c1d7b 100644
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@@ -8,6 +8,7 @@ use clap::Parser;
 use text_generation_router::{logging, server, usage_stats};
 use thiserror::Error;
 use tokenizers::{FromPretrainedParameters, Tokenizer};
+use tokio::process::Command;
 use tokio::sync::oneshot::error::RecvError;
 use tracing::{error, warn};

@@ -25,7 +26,7 @@ struct Args {

     /// Path to the GGUF model file for inference.
     #[clap(long, env)]
-    model_gguf: String, // TODO Option() with hf->gguf & quantize
+    model_gguf: Option<String>,

     /// Number of threads to use for generation.
     #[clap(long, env)]
@@ -205,12 +206,39 @@ async fn main() -> Result<(), RouterError> {
             token,
             ..Default::default()
         };
-        Tokenizer::from_pretrained(args.model_id.clone(), Some(params))?
+        Tokenizer::from_pretrained(&args.model_id, Some(params))?
+    };
+
+    let model_gguf = match args.model_gguf {
+        Some(model_gguf) => model_gguf,
+        None => {
+            let make_gguf = match std::env::var("MAKE_GGUF") {
+                Ok(make_gguf) => make_gguf,
+                Err(e) => {
+                    error!("Missing env: MAKE_GGUF");
+                    return Err(RouterError::VarError(e));
+                }
+            };
+            let model_gguf = "models/model.gguf".to_string();
+
+            let status = Command::new(make_gguf)
+                .arg(&model_gguf)
+                .arg(&args.model_id)
+                .arg(&args.revision)
+                .spawn()?
+                .wait()
+                .await?;
+
+            if !status.success() {
+                error!("Failed to generate GGUF");
+            }
+            model_gguf
+        }
     };

     let (backend, ok, shutdown) = LlamacppBackend::new(
         LlamacppConfig {
-            model_gguf: args.model_gguf,
+            model_gguf,
             n_threads,
             n_threads_batch,
             n_gpu_layers: args.n_gpu_layers,
@@ -281,4 +309,8 @@ enum RouterError {
     WebServer(#[from] server::WebServerError),
     #[error("Recv error: {0}")]
     RecvError(#[from] RecvError),
+    #[error("IoError: {0}")]
+    IoError(#[from] std::io::Error),
+    #[error("VarError: {0}")]
+    VarError(#[from] std::env::VarError),
 }
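
For illustration only (not part of the patch): when --model-gguf/MODEL_GGUF is not set, the router runs the script named by MAKE_GGUF with the output path, model id, and revision as positional arguments. A rough manual equivalent, where "org/model" is a placeholder model id and "models/model.gguf" matches the path hard-coded in main.rs:

    # download the HF checkpoint, convert it to GGUF, then quantize to Q4_0,
    # mirroring the Command::new(make_gguf) invocation in main.rs
    ./make-gguf.sh models/model.gguf org/model main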