Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-20 22:32:07 +00:00)
Quantize without llama-quantize
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
Commit 0a55bd3db9 (parent 6223b6e264)
@@ -28,7 +28,9 @@ RUN mkdir -p llama.cpp \
         -DCMAKE_CXX_COMPILER=clang++ \
         -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
         -DGGML_CUDA=${llamacpp_cuda} \
+        -DLLAMA_BUILD_COMMON=OFF \
+        -DLLAMA_BUILD_TESTS=OFF \
+        -DLLAMA_BUILD_EXAMPLES=OFF \
+        -DLLAMA_BUILD_SERVER=OFF \
     && cmake --build build --parallel --config Release \
     && cmake --install build
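Note: with -DLLAMA_BUILD_EXAMPLES=OFF the llama.cpp build no longer produces the llama-quantize binary (historically built as an example), which is why the COPY of /usr/bin/llama-quantize disappears in the next hunk; quantization now goes through the library API instead.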
@@ -68,7 +70,6 @@ ENV PATH="/venv/bin:$PATH"
 COPY backends/llamacpp/requirements.txt requirements.txt
 COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
 COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
-COPY --from=builder /usr/bin/llama-quantize /usr/bin/

 RUN pip3 install --no-cache-dir \
     -r requirements.txt \
@@ -5,15 +5,17 @@
 	return 1
 }

 case "$1" in (*?.gguf) ;; (*)
 	echo "Not a valid GGUF file: $1"
 	return 1;
 esac

 GGUF="$1"
-GGUF_DIR=$(dirname "$GGUF")
-GGUF_TMP="model.src/tmp.gguf"
+GGUF_DIR=$(dirname -- "$GGUF")
 MODEL_ID="$2"
 MODEL_DIR="model.src/$2"
 REV="${3-main}"

 [ -e "$GGUF" ] && return

 mkdir -p model.src "$GGUF_DIR"
@@ -22,12 +24,7 @@ huggingface-cli download \
 	"$MODEL_ID" &&

 convert_hf_to_gguf.py \
-	--outfile "$GGUF_TMP" \
-	"$MODEL_DIR" &&
-
-llama-quantize \
-	"$GGUF_TMP" \
-	"$GGUF" \
-	"Q4_0"
+	--outfile "$GGUF" \
+	"$MODEL_DIR"

-rm -rf model.src
+rm -rf -- model.src
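The MAKE_GGUF script keeps the same three positional arguments, a target GGUF path, a model id, and a revision (see the .arg() calls in main.rs below), but it now stops after convert_hf_to_gguf.py writes "$GGUF" directly; the Q4_0 quantization step moved out of the script and into the router.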
@@ -1,10 +1,5 @@
-mod llamacpp {
-    #![allow(non_upper_case_globals)]
-    #![allow(non_camel_case_types)]
-    #![allow(non_snake_case)]
-    #![allow(dead_code)]
-    include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
-}
+use crate::llamacpp;

 use async_trait::async_trait;
 use std::ffi::CString;
 use std::mem::replace;
backends/llamacpp/src/llamacpp.rs (new file, 5 lines)
@@ -0,0 +1,5 @@
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(dead_code)]
+include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
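The include! pulls in bindings that are generated into OUT_DIR at build time. A minimal build.rs sketch of how such a file can be produced with bindgen; the wrapper header name is an assumption for illustration, not taken from this commit:

// build.rs (illustrative sketch, assuming a wrapper header that includes llama.h)
use std::{env, path::PathBuf};

fn main() {
    let bindings = bindgen::Builder::default()
        .header("src/wrapper.h") // assumed header location
        .generate()
        .expect("failed to generate llama.cpp bindings");

    // Write to $OUT_DIR/llamacpp.rs, the path the include! above expects.
    let out = PathBuf::from(env::var("OUT_DIR").unwrap()).join("llamacpp.rs");
    bindings.write_to_file(out).expect("failed to write bindings");
}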
@@ -1,10 +1,15 @@
 mod backend;
+mod llamacpp;
+mod quantize;
+
+use quantize::QuantizeType;

 use backend::{
     BackendError, LlamacppBackend, LlamacppConfig, LlamacppGGMLType, LlamacppNuma,
     LlamacppSplitMode,
 };
 use clap::Parser;
+use std::path::Path;
 use text_generation_router::{logging, server, usage_stats};
 use thiserror::Error;
 use tokenizers::{FromPretrainedParameters, Tokenizer};
@@ -216,18 +221,27 @@ async fn main() -> Result<(), RouterError> {
             error!("No GGUF model given and environment variable MAKE_GGUF is missing.");
             RouterError::VarError(e)
         })?;

         let model_gguf = format!("models/{}/model.gguf", args.model_id);

-        let status = Command::new(make_gguf)
-            .arg(&model_gguf)
-            .arg(&args.model_id)
-            .arg(&args.revision)
-            .spawn()?
-            .wait()
-            .await?;
-
-        if !status.success() {
-            error!("Failed to generate GGUF");
+        if !Path::new(&model_gguf).exists() {
+            let tmp_gguf = "models/tmp.gguf";
+
+            let status = Command::new(make_gguf)
+                .arg(tmp_gguf)
+                .arg(&args.model_id)
+                .arg(&args.revision)
+                .spawn()?
+                .wait()
+                .await?;
+
+            if !status.success() {
+                let exit_code = status.code().unwrap_or(-1);
+                error!("Failed to generate GGUF, exit code: {}", exit_code);
+                return Err(RouterError::CommandError(exit_code));
+            }
+            quantize::model(tmp_gguf, &model_gguf, QuantizeType::MostlyQ4_0, n_threads)
+                .map_err(RouterError::QuantizeError)?;
         }
         model_gguf
     };
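Net effect: the router only rebuilds when models/<model_id>/model.gguf is missing, MAKE_GGUF now writes an intermediate GGUF to models/tmp.gguf, and the Q4_0 pass runs in-process via quantize::model, with script failures surfaced as RouterError::CommandError carrying the child's exit code.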
@@ -305,8 +319,12 @@ enum RouterError {
     WebServer(#[from] server::WebServerError),
     #[error("Recv error: {0}")]
     RecvError(#[from] RecvError),
-    #[error("IoError: {0}")]
+    #[error("Io error: {0}")]
     IoError(#[from] std::io::Error),
-    #[error("VarError: {0}")]
+    #[error("Var error: {0}")]
     VarError(#[from] std::env::VarError),
+    #[error("Quantize error: {0}")]
+    QuantizeError(String),
+    #[error("Command error: {0}")]
+    CommandError(i32),
 }
backends/llamacpp/src/quantize.rs (new file, 39 lines)
@@ -0,0 +1,39 @@
+use crate::llamacpp;
+
+use std::ffi::CString;
+use std::path::Path;
+
+#[repr(u32)]
+#[derive(Debug, Clone, Copy)]
+pub enum QuantizeType {
+    MostlyQ4_0 = 2,
+}
+
+pub fn model(
+    input_path: &str,
+    output_path: &str,
+    ftype: QuantizeType,
+    n_threads: usize,
+) -> Result<(), String> {
+    if !Path::new(input_path).exists() {
+        return Err(format!("Input file '{}' does not exist", input_path));
+    }
+    let c_input_path =
+        CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?;
+
+    let c_output_path =
+        CString::new(output_path).map_err(|e| format!("Failed to convert output path: {}", e))?;
+
+    let result = unsafe {
+        let mut params = llamacpp::model_quantize_default_params();
+        params.nthread = n_threads as _;
+        params.ftype = ftype as _;
+        params.quantize_output_tensor = true;
+        llamacpp::model_quantize(c_input_path.as_ptr(), c_output_path.as_ptr(), &params)
+    };
+    if result == 0 {
+        Ok(())
+    } else {
+        Err(format!("Quantization failed, error code: {}", result))
+    }
+}
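For reference, a sketch of how the new module is used, mirroring the call in main.rs above; the output path here is illustrative:

// Convert first (done by the MAKE_GGUF script), then quantize in-process.
// QuantizeType::MostlyQ4_0 = 2 matches llama.cpp's LLAMA_FTYPE_MOSTLY_Q4_0.
quantize::model(
    "models/tmp.gguf",       // input written by convert_hf_to_gguf.py
    "models/foo/model.gguf", // output path, illustrative
    QuantizeType::MostlyQ4_0,
    4,                       // forwarded to params.nthread
)
.map_err(RouterError::QuantizeError)?;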