Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-20 22:32:07 +00:00)
Make --model-gguf optional
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
parent bda39e42c2
commit 2d4aa25b9c
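
When --model-gguf is omitted, the router now falls back to the MAKE_GGUF script (installed in the image as ./make-gguf.sh) to download the Hugging Face model, convert it to GGUF, and quantize it into models/model.gguf before starting. A rough usage sketch, assuming the usual --model-id flag; the model id below is only a placeholder:

    # No --model-gguf given: MAKE_GGUF builds models/model.gguf first, then the server starts.
    text-generation-router-llamacpp --model-id <org>/<model>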
@@ -18,8 +18,9 @@ RUN apt update && apt install -y \
     tar
 
 ADD https://github.com/ggerganov/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
-RUN tar -xzf ${llamacpp_version}.tar.gz \
-    && cd llama.cpp-${llamacpp_version} \
+RUN mkdir -p llama.cpp \
+    && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \
+    && cd llama.cpp \
     && cmake -B build \
         -DCMAKE_INSTALL_PREFIX=/usr \
         -DCMAKE_INSTALL_LIBDIR=/usr/lib \
@@ -27,9 +28,7 @@ RUN tar -xzf ${llamacpp_version}.tar.gz \
         -DCMAKE_CXX_COMPILER=clang++ \
         -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
         -DGGML_CUDA=${llamacpp_cuda} \
-        -DLLAMA_BUILD_COMMON=OFF \
         -DLLAMA_BUILD_TESTS=OFF \
-        -DLLAMA_BUILD_EXAMPLES=OFF \
         -DLLAMA_BUILD_SERVER=OFF \
     && cmake --build build --parallel --config Release \
     && cmake --install build
@@ -56,7 +55,9 @@ RUN cargo build \
     --package text-generation-router-llamacpp --frozen
 
 FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04
+WORKDIR /app
 
+ENV DEBIAN_FRONTEND=noninteractive
 RUN apt update && apt install -y \
     python3-venv \
     python3-pip
@@ -65,12 +66,21 @@ RUN python3 -m venv /venv
 ENV PATH="/venv/bin:$PATH"
 
 COPY backends/llamacpp/requirements.txt requirements.txt
-RUN pip3 install --no-cache-dir -r requirements.txt
+COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
+COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
+COPY --from=builder /usr/bin/llama-quantize /usr/bin/
+
+RUN pip3 install --no-cache-dir \
+    -r requirements.txt \
+    -e gguf-py
 
 COPY --from=builder /usr/lib/libllama.so /usr/lib/
 COPY --from=builder /usr/lib/libggml*.so /usr/lib/
 COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/
 
+COPY backends/llamacpp/make-gguf.sh make-gguf.sh
+ENV MAKE_GGUF=./make-gguf.sh
+
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
 
 ENTRYPOINT ["text-generation-router-llamacpp"]
@@ -16,6 +16,6 @@ num_cpus = "1.16.0"
 text-generation-router = { path = "../../router" }
 thiserror = "2.0.11"
 tokenizers.workspace = true
-tokio = "1.43.0"
+tokio = { version = "1.43.0", features = ["process"] }
 tokio-stream = "0.1.17"
 tracing = "0.1.41"
backends/llamacpp/make-gguf.sh (new executable file, 33 lines)
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+[ "$#" -ge 2 ] || {
+    echo "Usage: $0 <GGUF> <MODEL_ID> [<REV>]" >&2
+    return 1
+}
+
+GGUF="$1"
+GGUF_DIR=$(dirname "$GGUF")
+GGUF_TMP="model.src/tmp.gguf"
+MODEL_ID="$2"
+MODEL_DIR="model.src/$2"
+REV="${3-main}"
+
+[ -e "$GGUF" ] && return
+
+mkdir -p model.src "$GGUF_DIR"
+
+huggingface-cli download \
+    --revision "$REV" \
+    --local-dir "$MODEL_DIR" \
+    "$MODEL_ID" &&
+
+convert_hf_to_gguf.py \
+    --outfile "$GGUF_TMP" \
+    "$MODEL_DIR" &&
+
+llama-quantize \
+    "$GGUF_TMP" \
+    "$GGUF" \
+    "Q4_0"
+
+rm -rf model.src
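
For reference, a hedged sketch of how this script gets invoked (the router passes exactly these three positional arguments, per the main.rs change below; the model id is a placeholder):

    # Arguments: <GGUF> <MODEL_ID> [<REV>]
    ./make-gguf.sh models/model.gguf <org>/<model> main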
@@ -1,3 +1,4 @@
 transformers==4.49
 huggingface-hub==0.28.1
 hf-transfer==0.1.9
+torch==2.6.0
@@ -8,6 +8,7 @@ use clap::Parser;
 use text_generation_router::{logging, server, usage_stats};
 use thiserror::Error;
 use tokenizers::{FromPretrainedParameters, Tokenizer};
+use tokio::process::Command;
 use tokio::sync::oneshot::error::RecvError;
 use tracing::{error, warn};
 
@@ -25,7 +26,7 @@ struct Args {
 
     /// Path to the GGUF model file for inference.
     #[clap(long, env)]
-    model_gguf: String, // TODO Option() with hf->gguf & quantize
+    model_gguf: Option<String>,
 
     /// Number of threads to use for generation.
     #[clap(long, env)]
@@ -205,12 +206,39 @@ async fn main() -> Result<(), RouterError> {
             token,
             ..Default::default()
         };
-        Tokenizer::from_pretrained(args.model_id.clone(), Some(params))?
+        Tokenizer::from_pretrained(&args.model_id, Some(params))?
+    };
+
+    let model_gguf = match args.model_gguf {
+        Some(model_gguf) => model_gguf,
+        None => {
+            let make_gguf = match std::env::var("MAKE_GGUF") {
+                Ok(make_gguf) => make_gguf,
+                Err(e) => {
+                    error!("Missing env: MAKE_GGUF");
+                    return Err(RouterError::VarError(e));
+                }
+            };
+            let model_gguf = "models/model.gguf".to_string();
+
+            let status = Command::new(make_gguf)
+                .arg(&model_gguf)
+                .arg(&args.model_id)
+                .arg(&args.revision)
+                .spawn()?
+                .wait()
+                .await?;
+
+            if !status.success() {
+                error!("Failed to generate GGUF");
+            }
+            model_gguf
+        }
     };
 
     let (backend, ok, shutdown) = LlamacppBackend::new(
         LlamacppConfig {
-            model_gguf: args.model_gguf,
+            model_gguf,
             n_threads,
             n_threads_batch,
             n_gpu_layers: args.n_gpu_layers,
@@ -281,4 +309,8 @@ enum RouterError {
     WebServer(#[from] server::WebServerError),
     #[error("Recv error: {0}")]
     RecvError(#[from] RecvError),
+    #[error("IoError: {0}")]
+    IoError(#[from] std::io::Error),
+    #[error("VarError: {0}")]
+    VarError(#[from] std::env::VarError),
 }