Remove make-gguf.sh

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2025-07-08 08:50:17 +00:00 · 2025-02-22 12:54:46 +00:00 · 2025-02-22 12:54:46 +00:00 · 46feaf6296
commit 46feaf6296
parent 3849223340
6 changed files with 38 additions and 59 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -4754,6 +4754,7 @@ dependencies = [
 "async-trait",
 "bindgen 0.71.1",
 "clap 4.5.30",
+ "hf-hub",
 "num_cpus",
 "pkg-config",
 "text-generation-router",
--- a/3
+++ b/3
@ -79,9 +79,6 @@ COPY --from=builder /usr/lib/libllama.so /usr/lib/
 COPY --from=builder /usr/lib/libggml*.so /usr/lib/
 COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/

-COPY backends/llamacpp/make-gguf.sh make-gguf.sh
-ENV MAKE_GGUF=./make-gguf.sh
-
 ENV HF_HUB_ENABLE_HF_TRANSFER=1

 ENTRYPOINT ["text-generation-router-llamacpp"]
--- a/backends/llamacpp/Cargo.toml
+++ b/backends/llamacpp/Cargo.toml
@ -12,6 +12,7 @@ pkg-config = "0.3.31"
 [dependencies]
 async-trait = "0.1.85"
 clap = "4.5.27"
+hf-hub.workspace = true
 num_cpus = "1.16.0"
 text-generation-router = { path = "../../router" }
 thiserror = "2.0.11"
--- a/backends/llamacpp/make-gguf.sh
+++ b/backends/llamacpp/make-gguf.sh
@ -1,30 +0,0 @@
-#!/bin/sh
-
-[ "$#" -ge 2 ] || {
-	echo "Usage: $0 <GGUF> <MODEL_ID> [<REV>]" >&2
-	return 1
-}
-
-case "$1" in (*?.gguf) ;; (*)
-	echo "Not a valid GGUF file: $1"
-	return 1;
-esac
-
-GGUF="$1"
-GGUF_DIR=$(dirname -- "$GGUF")
-MODEL_ID="$2"
-MODEL_DIR="model.src/$2"
-REV="${3-main}"
-
-mkdir -p model.src "$GGUF_DIR"
-
-huggingface-cli download \
-	--revision "$REV" \
-	--local-dir "$MODEL_DIR" \
-	"$MODEL_ID" &&
-
-convert_hf_to_gguf.py \
-	--outfile "$GGUF" \
-	"$MODEL_DIR"
-
-rm -rf -- model.src
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@ -9,10 +9,12 @@ use backend::{
    LlamacppSplitMode,
 };
 use clap::Parser;
+use hf_hub::api::tokio::ApiBuilder;
+use hf_hub::{Repo, RepoType};
 use std::path::Path;
 use text_generation_router::{logging, server, usage_stats};
 use thiserror::Error;
-use tokenizers::{FromPretrainedParameters, Tokenizer};
+use tokenizers::Tokenizer;
 use tokio::process::Command;
 use tokio::sync::oneshot::error::RecvError;
 use tracing::{error, warn};
@ -200,37 +202,47 @@ async fn main() -> Result<(), RouterError> {
        ));
    }

-    // TODO: check if we use the same cache of Server
-    // check if llamacpp is faster
-    let tokenizer = {
-        let token = std::env::var("HF_TOKEN")
-            .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
-            .ok();
-        let params = FromPretrainedParameters {
-            revision: args.revision.clone(),
-            token,
-            ..Default::default()
-        };
-        Tokenizer::from_pretrained(&args.model_id, Some(params))?
+    let api_builder = || {
+        let mut builder = ApiBuilder::new().with_progress(true);
+
+        if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") {
+            builder = builder.with_cache_dir(cache_dir.into());
+        }
+        if let Ok(token) = std::env::var("HF_TOKEN") {
+            builder = builder.with_token(token.into());
+        }
+        builder
    };
+    let api_repo = api_builder().build()?.repo(Repo::with_revision(
+        args.model_id.clone(),
+        RepoType::Model,
+        args.revision.clone(),
+    ));
+
+    let tokenizer_path = api_repo.get("tokenizer.json").await?;
+    let tokenizer = Tokenizer::from_file(&tokenizer_path)?;

    let model_gguf = if let Some(model_gguf) = args.model_gguf {
        model_gguf
    } else {
-        let make_gguf = std::env::var("MAKE_GGUF").map_err(|e| {
-            error!("No GGUF model given and environment variable MAKE_GGUF is missing.");
-            RouterError::VarError(e)
-        })?;
-
        let model_gguf = format!("models/{}/model.gguf", args.model_id);
+        let model_gguf_path = Path::new(&model_gguf);

-        if !Path::new(&model_gguf).exists() {
+        if !model_gguf_path.exists() {
            let tmp_gguf = "models/tmp.gguf";

-            let status = Command::new(make_gguf)
+            if let Some(parent) = Path::new(model_gguf_path).parent() {
+                std::fs::create_dir_all(parent)?;
+            }
+            let cache_path = tokenizer_path.parent().unwrap();
+
+            for sibling in api_repo.info().await?.siblings {
+                let _ = api_repo.get(&sibling.rfilename).await?;
+            }
+            let status = Command::new("convert_hf_to_gguf.py")
+                .arg("--outfile")
                .arg(tmp_gguf)
-                .arg(&args.model_id)
-                .arg(&args.revision)
+                .arg(cache_path)
                .spawn()?
                .wait()
                .await?;
@ -327,4 +339,6 @@ enum RouterError {
    QuantizeError(String),
    #[error("Command error: {0}")]
    CommandError(i32),
+    #[error("HF hub error: {0}")]
+    HubError(#[from] hf_hub::api::tokio::ApiError),
 }
--- a/backends/llamacpp/src/quantize.rs
+++ b/backends/llamacpp/src/quantize.rs
@ -1,7 +1,6 @@
 use crate::llamacpp;

 use std::ffi::CString;
-use std::path::Path;

 #[repr(u32)]
 #[derive(Debug, Clone, Copy)]
@ -15,9 +14,6 @@ pub fn model(
    ftype: QuantizeType,
    n_threads: usize,
 ) -> Result<(), String> {
-    if !Path::new(input_path).exists() {
-        return Err(format!("Input file '{}' does not exist", input_path));
-    }
    let c_input_path =
        CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?;