Mirror of https://github.com/huggingface/text-generation-inference.git
Remove make-gguf.sh

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

Commit 46feaf6296 (parent 3849223340)
Cargo.lock (generated)
@@ -4754,6 +4754,7 @@ dependencies = [
  "async-trait",
  "bindgen 0.71.1",
  "clap 4.5.30",
+ "hf-hub",
  "num_cpus",
  "pkg-config",
  "text-generation-router",
Dockerfile (llama.cpp backend image)

@@ -79,9 +79,6 @@ COPY --from=builder /usr/lib/libllama.so /usr/lib/
 COPY --from=builder /usr/lib/libggml*.so /usr/lib/
 COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/
 
-COPY backends/llamacpp/make-gguf.sh make-gguf.sh
-ENV MAKE_GGUF=./make-gguf.sh
-
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
 
 ENTRYPOINT ["text-generation-router-llamacpp"]
backends/llamacpp/Cargo.toml

@@ -12,6 +12,7 @@ pkg-config = "0.3.31"
 [dependencies]
 async-trait = "0.1.85"
 clap = "4.5.27"
+hf-hub.workspace = true
 num_cpus = "1.16.0"
 text-generation-router = { path = "../../router" }
 thiserror = "2.0.11"
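Note: hf-hub.workspace = true pulls in the async Hugging Face Hub client that the router now uses to fetch files directly. A minimal standalone sketch of that API (not the router's code), assuming the hf-hub crate with its tokio feature plus tokio, and a placeholder model id:

use hf_hub::api::tokio::ApiBuilder;
use hf_hub::{Repo, RepoType};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Build a Hub client; a token or custom cache dir could be added here.
    let api = ApiBuilder::new().with_progress(true).build()?;
    // Point at a model repo pinned to a revision ("gpt2"/"main" are placeholders).
    let repo = api.repo(Repo::with_revision(
        "gpt2".to_string(),
        RepoType::Model,
        "main".to_string(),
    ));
    // Download tokenizer.json (or reuse the cached copy) and get its local path.
    let path = repo.get("tokenizer.json").await?;
    println!("tokenizer at {}", path.display());
    Ok(())
}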
backends/llamacpp/make-gguf.sh (deleted)

@@ -1,30 +0,0 @@
-#!/bin/sh
-
-[ "$#" -ge 2 ] || {
-    echo "Usage: $0 <GGUF> <MODEL_ID> [<REV>]" >&2
-    return 1
-}
-
-case "$1" in (*?.gguf) ;; (*)
-    echo "Not a valid GGUF file: $1"
-    return 1;
-esac
-
-GGUF="$1"
-GGUF_DIR=$(dirname -- "$GGUF")
-MODEL_ID="$2"
-MODEL_DIR="model.src/$2"
-REV="${3-main}"
-
-mkdir -p model.src "$GGUF_DIR"
-
-huggingface-cli download \
-    --revision "$REV" \
-    --local-dir "$MODEL_DIR" \
-    "$MODEL_ID" &&
-
-convert_hf_to_gguf.py \
-    --outfile "$GGUF" \
-    "$MODEL_DIR"
-
-rm -rf -- model.src
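Note: the deleted script downloaded a snapshot with huggingface-cli and converted it with convert_hf_to_gguf.py. The same conversion step now runs from the router itself, as the src/main.rs hunks below show. A rough sketch of that invocation with tokio, assuming convert_hf_to_gguf.py is on PATH; the function and parameter names are illustrative only:

use tokio::process::Command;

// Run the llama.cpp conversion script on an already-downloaded model directory
// and report whether it exited successfully.
async fn convert_to_gguf(outfile: &str, model_dir: &str) -> std::io::Result<bool> {
    let status = Command::new("convert_hf_to_gguf.py")
        .arg("--outfile")
        .arg(outfile)
        .arg(model_dir)
        .spawn()?
        .wait()
        .await?;
    Ok(status.success())
}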
backends/llamacpp/src/main.rs

@@ -9,10 +9,12 @@ use backend::{
     LlamacppSplitMode,
 };
 use clap::Parser;
+use hf_hub::api::tokio::ApiBuilder;
+use hf_hub::{Repo, RepoType};
 use std::path::Path;
 use text_generation_router::{logging, server, usage_stats};
 use thiserror::Error;
-use tokenizers::{FromPretrainedParameters, Tokenizer};
+use tokenizers::Tokenizer;
 use tokio::process::Command;
 use tokio::sync::oneshot::error::RecvError;
 use tracing::{error, warn};
@@ -200,37 +202,47 @@ async fn main() -> Result<(), RouterError> {
         ));
     }
 
-    // TODO: check if we use the same cache of Server
-    // check if llamacpp is faster
-    let tokenizer = {
-        let token = std::env::var("HF_TOKEN")
-            .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
-            .ok();
-        let params = FromPretrainedParameters {
-            revision: args.revision.clone(),
-            token,
-            ..Default::default()
-        };
-        Tokenizer::from_pretrained(&args.model_id, Some(params))?
+    let api_builder = || {
+        let mut builder = ApiBuilder::new().with_progress(true);
+        if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") {
+            builder = builder.with_cache_dir(cache_dir.into());
+        }
+        if let Ok(token) = std::env::var("HF_TOKEN") {
+            builder = builder.with_token(token.into());
+        }
+        builder
     };
+    let api_repo = api_builder().build()?.repo(Repo::with_revision(
+        args.model_id.clone(),
+        RepoType::Model,
+        args.revision.clone(),
+    ));
+
+    let tokenizer_path = api_repo.get("tokenizer.json").await?;
+    let tokenizer = Tokenizer::from_file(&tokenizer_path)?;
 
     let model_gguf = if let Some(model_gguf) = args.model_gguf {
         model_gguf
     } else {
-        let make_gguf = std::env::var("MAKE_GGUF").map_err(|e| {
-            error!("No GGUF model given and environment variable MAKE_GGUF is missing.");
-            RouterError::VarError(e)
-        })?;
 
         let model_gguf = format!("models/{}/model.gguf", args.model_id);
+        let model_gguf_path = Path::new(&model_gguf);
 
-        if !Path::new(&model_gguf).exists() {
+        if !model_gguf_path.exists() {
             let tmp_gguf = "models/tmp.gguf";
 
-            let status = Command::new(make_gguf)
+            if let Some(parent) = Path::new(model_gguf_path).parent() {
+                std::fs::create_dir_all(parent)?;
+            }
+            let cache_path = tokenizer_path.parent().unwrap();
+
+            for sibling in api_repo.info().await?.siblings {
+                let _ = api_repo.get(&sibling.rfilename).await?;
+            }
+            let status = Command::new("convert_hf_to_gguf.py")
+                .arg("--outfile")
                 .arg(tmp_gguf)
-                .arg(&args.model_id)
-                .arg(&args.revision)
+                .arg(cache_path)
                 .spawn()?
                 .wait()
                 .await?;
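Note: with MAKE_GGUF gone, the router materializes the model itself: it lists the repo's files through api_repo.info() and fetches each sibling into the Hub cache before running the converter against the cached snapshot. A self-contained sketch of that download loop, assuming an already-built ApiRepo handle (the helper name is illustrative):

use hf_hub::api::tokio::{ApiError, ApiRepo};

// Fetch every file listed for the repo so a local conversion step can read a
// complete snapshot; files already present in the cache are reused.
async fn fetch_all_files(api_repo: &ApiRepo) -> Result<(), ApiError> {
    for sibling in api_repo.info().await?.siblings {
        let _local_path = api_repo.get(&sibling.rfilename).await?;
    }
    Ok(())
}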
@@ -327,4 +339,6 @@ enum RouterError {
     QuantizeError(String),
     #[error("Command error: {0}")]
     CommandError(i32),
+    #[error("HF hub error: {0}")]
+    HubError(#[from] hf_hub::api::tokio::ApiError),
 }
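Note: the new HubError variant uses thiserror's #[from], so an hf_hub::api::tokio::ApiError returned by the download calls is converted into RouterError by the ? operator, with no manual map_err. A minimal sketch of the pattern with a stand-in error type and a hypothetical helper:

use std::path::PathBuf;
use thiserror::Error;

#[derive(Debug, Error)]
enum DemoError {
    // #[from] generates From<ApiError>, which is what `?` relies on.
    #[error("HF hub error: {0}")]
    HubError(#[from] hf_hub::api::tokio::ApiError),
}

// The ApiError from get() converts into DemoError automatically.
async fn fetch_config(repo: &hf_hub::api::tokio::ApiRepo) -> Result<PathBuf, DemoError> {
    Ok(repo.get("config.json").await?)
}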
backends/llamacpp/src/quantize.rs

@@ -1,7 +1,6 @@
 use crate::llamacpp;
 
 use std::ffi::CString;
-use std::path::Path;
 
 #[repr(u32)]
 #[derive(Debug, Clone, Copy)]
@@ -15,9 +14,6 @@ pub fn model(
     ftype: QuantizeType,
     n_threads: usize,
 ) -> Result<(), String> {
-    if !Path::new(input_path).exists() {
-        return Err(format!("Input file '{}' does not exist", input_path));
-    }
     let c_input_path =
         CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?;
 
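Note: quantize::model no longer pre-checks that input_path exists before calling into llama.cpp; the only validation left at this point is the CString conversion. A small sketch of that remaining step (the helper name is illustrative):

use std::ffi::CString;

// Convert a Rust path string for FFI; this only fails if the string
// contains an interior NUL byte.
fn to_c_path(input_path: &str) -> Result<CString, String> {
    CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))
}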