diff --git a/Cargo.lock b/Cargo.lock
index 0c28b285..4851e4df 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4754,6 +4754,7 @@ dependencies = [
  "async-trait",
  "bindgen 0.71.1",
  "clap 4.5.30",
+ "hf-hub",
  "num_cpus",
  "pkg-config",
  "text-generation-router",
diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp
index 7615626f..61c34b8e 100644
--- a/Dockerfile_llamacpp
+++ b/Dockerfile_llamacpp
@@ -79,9 +79,6 @@ COPY --from=builder /usr/lib/libllama.so /usr/lib/
 COPY --from=builder /usr/lib/libggml*.so /usr/lib/
 COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/
 
-COPY backends/llamacpp/make-gguf.sh make-gguf.sh
-ENV MAKE_GGUF=./make-gguf.sh
-
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
 
 ENTRYPOINT ["text-generation-router-llamacpp"]
diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml
index 555ad2ff..685a313f 100644
--- a/backends/llamacpp/Cargo.toml
+++ b/backends/llamacpp/Cargo.toml
@@ -12,6 +12,7 @@ pkg-config = "0.3.31"
 [dependencies]
 async-trait = "0.1.85"
 clap = "4.5.27"
+hf-hub.workspace = true
 num_cpus = "1.16.0"
 text-generation-router = { path = "../../router" }
 thiserror = "2.0.11"
diff --git a/backends/llamacpp/make-gguf.sh b/backends/llamacpp/make-gguf.sh
deleted file mode 100755
index e4823af1..00000000
--- a/backends/llamacpp/make-gguf.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/sh
-
-[ "$#" -ge 2 ] || {
-    echo "Usage: $0 <gguf> <model_id> [<revision>]" >&2
-    return 1
-}
-
-case "$1" in (*?.gguf) ;; (*)
-    echo "Not a valid GGUF file: $1"
-    return 1;
-esac
-
-GGUF="$1"
-GGUF_DIR=$(dirname -- "$GGUF")
-MODEL_ID="$2"
-MODEL_DIR="model.src/$2"
-REV="${3-main}"
-
-mkdir -p model.src "$GGUF_DIR"
-
-huggingface-cli download \
-    --revision "$REV" \
-    --local-dir "$MODEL_DIR" \
-    "$MODEL_ID" &&
-
-convert_hf_to_gguf.py \
-    --outfile "$GGUF" \
-    "$MODEL_DIR"
-
-rm -rf -- model.src
diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs
index c5e72d4f..aceb86ef 100644
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@@ -9,10 +9,12 @@ use backend::{
     LlamacppSplitMode,
 };
 use clap::Parser;
+use hf_hub::api::tokio::ApiBuilder;
+use hf_hub::{Repo, RepoType};
 use std::path::Path;
 use text_generation_router::{logging, server, usage_stats};
 use thiserror::Error;
-use tokenizers::{FromPretrainedParameters, Tokenizer};
+use tokenizers::Tokenizer;
 use tokio::process::Command;
 use tokio::sync::oneshot::error::RecvError;
 use tracing::{error, warn};
@@ -200,37 +202,47 @@ async fn main() -> Result<(), RouterError> {
         ));
     }
 
-    // TODO: check if we use the same cache of Server
-    // check if llamacpp is faster
-    let tokenizer = {
-        let token = std::env::var("HF_TOKEN")
-            .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
-            .ok();
-        let params = FromPretrainedParameters {
-            revision: args.revision.clone(),
-            token,
-            ..Default::default()
-        };
-        Tokenizer::from_pretrained(&args.model_id, Some(params))?
+    let api_builder = || {
+        let mut builder = ApiBuilder::new().with_progress(true);
+
+        if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") {
+            builder = builder.with_cache_dir(cache_dir.into());
+        }
+        if let Ok(token) = std::env::var("HF_TOKEN") {
+            builder = builder.with_token(token.into());
+        }
+        builder
     };
+    let api_repo = api_builder().build()?.repo(Repo::with_revision(
+        args.model_id.clone(),
+        RepoType::Model,
+        args.revision.clone(),
+    ));
+
+    let tokenizer_path = api_repo.get("tokenizer.json").await?;
+    let tokenizer = Tokenizer::from_file(&tokenizer_path)?;
 
     let model_gguf = if let Some(model_gguf) = args.model_gguf {
         model_gguf
     } else {
-        let make_gguf = std::env::var("MAKE_GGUF").map_err(|e| {
-            error!("No GGUF model given and environment variable MAKE_GGUF is missing.");
-            RouterError::VarError(e)
-        })?;
-
         let model_gguf = format!("models/{}/model.gguf", args.model_id);
+        let model_gguf_path = Path::new(&model_gguf);
 
-        if !Path::new(&model_gguf).exists() {
+        if !model_gguf_path.exists() {
             let tmp_gguf = "models/tmp.gguf";
 
-            let status = Command::new(make_gguf)
+            if let Some(parent) = Path::new(model_gguf_path).parent() {
+                std::fs::create_dir_all(parent)?;
+            }
+            let cache_path = tokenizer_path.parent().unwrap();
+
+            for sibling in api_repo.info().await?.siblings {
+                let _ = api_repo.get(&sibling.rfilename).await?;
+            }
+            let status = Command::new("convert_hf_to_gguf.py")
+                .arg("--outfile")
                 .arg(tmp_gguf)
-                .arg(&args.model_id)
-                .arg(&args.revision)
+                .arg(cache_path)
                 .spawn()?
                 .wait()
                 .await?;
@@ -327,4 +339,6 @@ enum RouterError {
     QuantizeError(String),
     #[error("Command error: {0}")]
     CommandError(i32),
+    #[error("HF hub error: {0}")]
+    HubError(#[from] hf_hub::api::tokio::ApiError),
 }
diff --git a/backends/llamacpp/src/quantize.rs b/backends/llamacpp/src/quantize.rs
index 7f0cde9f..31307bec 100644
--- a/backends/llamacpp/src/quantize.rs
+++ b/backends/llamacpp/src/quantize.rs
@@ -1,7 +1,6 @@
 use crate::llamacpp;
 
 use std::ffi::CString;
-use std::path::Path;
 
 #[repr(u32)]
 #[derive(Debug, Clone, Copy)]
@@ -15,9 +14,6 @@ pub fn model(
     ftype: QuantizeType,
     n_threads: usize,
 ) -> Result<(), String> {
-    if !Path::new(input_path).exists() {
-        return Err(format!("Input file '{}' does not exist", input_path));
-    }
     let c_input_path =
         CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?;