diff --git a/Cargo.lock b/Cargo.lock
index 0c28b285..4851e4df 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4754,6 +4754,7 @@ dependencies = [
  "async-trait",
  "bindgen 0.71.1",
  "clap 4.5.30",
+ "hf-hub",
  "num_cpus",
  "pkg-config",
  "text-generation-router",
diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp
index 7615626f..61c34b8e 100644
--- a/Dockerfile_llamacpp
+++ b/Dockerfile_llamacpp
@@ -79,9 +79,6 @@ COPY --from=builder /usr/lib/libllama.so /usr/lib/
 COPY --from=builder /usr/lib/libggml*.so /usr/lib/
 COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/
 
-COPY backends/llamacpp/make-gguf.sh make-gguf.sh
-ENV MAKE_GGUF=./make-gguf.sh
-
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
 
 ENTRYPOINT ["text-generation-router-llamacpp"]
diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml
index 555ad2ff..685a313f 100644
--- a/backends/llamacpp/Cargo.toml
+++ b/backends/llamacpp/Cargo.toml
@@ -12,6 +12,7 @@ pkg-config = "0.3.31"
 [dependencies]
 async-trait = "0.1.85"
 clap = "4.5.27"
+hf-hub.workspace = true
 num_cpus = "1.16.0"
 text-generation-router = { path = "../../router" }
 thiserror = "2.0.11"
diff --git a/backends/llamacpp/make-gguf.sh b/backends/llamacpp/make-gguf.sh
deleted file mode 100755
index e4823af1..00000000
--- a/backends/llamacpp/make-gguf.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/sh
-
-[ "$#" -ge 2 ] || {
-    echo "Usage: $0 <gguf> <model_id> [<revision>]" >&2
-    return 1
-}
-
-case "$1" in (*?.gguf) ;; (*)
-    echo "Not a valid GGUF file: $1"
-    return 1;
-esac
-
-GGUF="$1"
-GGUF_DIR=$(dirname -- "$GGUF")
-MODEL_ID="$2"
-MODEL_DIR="model.src/$2"
-REV="${3-main}"
-
-mkdir -p model.src "$GGUF_DIR"
-
-huggingface-cli download \
-    --revision "$REV" \
-    --local-dir "$MODEL_DIR" \
-    "$MODEL_ID" &&
-
-convert_hf_to_gguf.py \
-    --outfile "$GGUF" \
-    "$MODEL_DIR"
-
-rm -rf -- model.src
diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs
index c5e72d4f..aceb86ef 100644
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@@ -9,10 +9,12 @@ use backend::{
     LlamacppSplitMode,
 };
 use clap::Parser;
+use hf_hub::api::tokio::ApiBuilder;
+use hf_hub::{Repo, RepoType};
 use std::path::Path;
 use text_generation_router::{logging, server, usage_stats};
 use thiserror::Error;
-use tokenizers::{FromPretrainedParameters, Tokenizer};
+use tokenizers::Tokenizer;
 use tokio::process::Command;
 use tokio::sync::oneshot::error::RecvError;
 use tracing::{error, warn};
@@ -200,37 +202,47 @@ async fn main() -> Result<(), RouterError> {
         ));
     }
 
-    // TODO: check if we use the same cache of Server
-    // check if llamacpp is faster
-    let tokenizer = {
-        let token = std::env::var("HF_TOKEN")
-            .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
-            .ok();
-        let params = FromPretrainedParameters {
-            revision: args.revision.clone(),
-            token,
-            ..Default::default()
-        };
-        Tokenizer::from_pretrained(&args.model_id, Some(params))?
+    let api_builder = || {
+        let mut builder = ApiBuilder::new().with_progress(true);
+
+        if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") {
+            builder = builder.with_cache_dir(cache_dir.into());
+        }
+        if let Ok(token) = std::env::var("HF_TOKEN") {
+            builder = builder.with_token(token.into());
+        }
+        builder
     };
+    let api_repo = api_builder().build()?.repo(Repo::with_revision(
+        args.model_id.clone(),
+        RepoType::Model,
+        args.revision.clone(),
+    ));
+
+    let tokenizer_path = api_repo.get("tokenizer.json").await?;
+    let tokenizer = Tokenizer::from_file(&tokenizer_path)?;
 
     let model_gguf = if let Some(model_gguf) = args.model_gguf {
         model_gguf
     } else {
-        let make_gguf = std::env::var("MAKE_GGUF").map_err(|e| {
-            error!("No GGUF model given and environment variable MAKE_GGUF is missing.");
-            RouterError::VarError(e)
-        })?;
-
         let model_gguf = format!("models/{}/model.gguf", args.model_id);
+        let model_gguf_path = Path::new(&model_gguf);
 
-        if !Path::new(&model_gguf).exists() {
+        if !model_gguf_path.exists() {
             let tmp_gguf = "models/tmp.gguf";
 
-            let status = Command::new(make_gguf)
+            if let Some(parent) = Path::new(model_gguf_path).parent() {
+                std::fs::create_dir_all(parent)?;
+            }
+            let cache_path = tokenizer_path.parent().unwrap();
+
+            for sibling in api_repo.info().await?.siblings {
+                let _ = api_repo.get(&sibling.rfilename).await?;
+            }
+            let status = Command::new("convert_hf_to_gguf.py")
+                .arg("--outfile")
                 .arg(tmp_gguf)
-                .arg(&args.model_id)
-                .arg(&args.revision)
+                .arg(cache_path)
                 .spawn()?
                 .wait()
                 .await?;
@@ -327,4 +339,6 @@ enum RouterError {
     QuantizeError(String),
     #[error("Command error: {0}")]
     CommandError(i32),
+    #[error("HF hub error: {0}")]
+    HubError(#[from] hf_hub::api::tokio::ApiError),
 }
diff --git a/backends/llamacpp/src/quantize.rs b/backends/llamacpp/src/quantize.rs
index 7f0cde9f..31307bec 100644
--- a/backends/llamacpp/src/quantize.rs
+++ b/backends/llamacpp/src/quantize.rs
@@ -1,7 +1,6 @@
 use crate::llamacpp;
 
 use std::ffi::CString;
-use std::path::Path;
 
 #[repr(u32)]
 #[derive(Debug, Clone, Copy)]
@@ -15,9 +14,6 @@ pub fn model(
     ftype: QuantizeType,
     n_threads: usize,
 ) -> Result<(), String> {
-    if !Path::new(input_path).exists() {
-        return Err(format!("Input file '{}' does not exist", input_path));
-    }
     let c_input_path =
         CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?;