Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-20 22:32:07 +00:00)

Quantize without llama-quantize

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

parent 6223b6e264
commit 0a55bd3db9
@ -28,7 +28,9 @@ RUN mkdir -p llama.cpp \
     -DCMAKE_CXX_COMPILER=clang++ \
     -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
     -DGGML_CUDA=${llamacpp_cuda} \
+    -DLLAMA_BUILD_COMMON=OFF \
     -DLLAMA_BUILD_TESTS=OFF \
+    -DLLAMA_BUILD_EXAMPLES=OFF \
     -DLLAMA_BUILD_SERVER=OFF \
     && cmake --build build --parallel --config Release \
     && cmake --install build

@ -68,7 +70,6 @@ ENV PATH="/venv/bin:$PATH"
 COPY backends/llamacpp/requirements.txt requirements.txt
 COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
 COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
-COPY --from=builder /usr/bin/llama-quantize /usr/bin/

 RUN pip3 install --no-cache-dir \
     -r requirements.txt \
@ -5,15 +5,17 @@
     return 1
 }

+case "$1" in (*?.gguf) ;; (*)
+    echo "Not a valid GGUF file: $1"
+    return 1;
+esac
+
 GGUF="$1"
-GGUF_DIR=$(dirname "$GGUF")
-GGUF_TMP="model.src/tmp.gguf"
+GGUF_DIR=$(dirname -- "$GGUF")
 MODEL_ID="$2"
 MODEL_DIR="model.src/$2"
 REV="${3-main}"

-[ -e "$GGUF" ] && return

 mkdir -p model.src "$GGUF_DIR"

 huggingface-cli download \

@ -22,12 +24,7 @@ huggingface-cli download \
     "$MODEL_ID" &&

 convert_hf_to_gguf.py \
-    --outfile "$GGUF_TMP" \
-    "$MODEL_DIR" &&
+    --outfile "$GGUF" \
+    "$MODEL_DIR"

-llama-quantize \
-    "$GGUF_TMP" \
-    "$GGUF" \
-    "Q4_0"
-
-rm -rf model.src
+rm -rf -- model.src
@ -1,10 +1,5 @@
-mod llamacpp {
-    #![allow(non_upper_case_globals)]
-    #![allow(non_camel_case_types)]
-    #![allow(non_snake_case)]
-    #![allow(dead_code)]
-    include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
-}
+use crate::llamacpp;
 use async_trait::async_trait;
 use std::ffi::CString;
 use std::mem::replace;
backends/llamacpp/src/llamacpp.rs (new file, 5 lines)
@ -0,0 +1,5 @@
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(dead_code)]
+include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
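The new llamacpp.rs only re-exports bindings that are generated into OUT_DIR at build time. For orientation, here is a minimal sketch of the kind of build.rs that could produce such a file with bindgen; the repository's actual build script, header path, and bindgen options are not shown in this diff and may differ.

// build.rs sketch (illustrative only; not the repository's actual build script).
// Assumes the bindgen crate and a llama.cpp header installed by the builder stage.
use std::env;
use std::path::PathBuf;

fn main() {
    let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
    bindgen::Builder::default()
        .header("/usr/include/llama.h") // assumption: header location after `cmake --install`
        .generate()
        .expect("failed to generate llama.cpp bindings")
        .write_to_file(out_dir.join("llamacpp.rs")) // consumed by include!(... "/llamacpp.rs")
        .expect("failed to write llamacpp.rs");
}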
@ -1,10 +1,15 @@
 mod backend;
+mod llamacpp;
+mod quantize;
+
+use quantize::QuantizeType;

 use backend::{
     BackendError, LlamacppBackend, LlamacppConfig, LlamacppGGMLType, LlamacppNuma,
     LlamacppSplitMode,
 };
 use clap::Parser;
+use std::path::Path;
 use text_generation_router::{logging, server, usage_stats};
 use thiserror::Error;
 use tokenizers::{FromPretrainedParameters, Tokenizer};
@ -216,10 +221,14 @@ async fn main() -> Result<(), RouterError> {
         error!("No GGUF model given and environment variable MAKE_GGUF is missing.");
         RouterError::VarError(e)
     })?;

     let model_gguf = format!("models/{}/model.gguf", args.model_id);

+    if !Path::new(&model_gguf).exists() {
+        let tmp_gguf = "models/tmp.gguf";
+
     let status = Command::new(make_gguf)
-        .arg(&model_gguf)
+        .arg(tmp_gguf)
         .arg(&args.model_id)
         .arg(&args.revision)
         .spawn()?
@ -227,7 +236,12 @@ async fn main() -> Result<(), RouterError> {
         .await?;

     if !status.success() {
-        error!("Failed to generate GGUF");
+        let exit_code = status.code().unwrap_or(-1);
+        error!("Failed to generate GGUF, exit code: {}", exit_code);
+        return Err(RouterError::CommandError(exit_code));
+    }
+    quantize::model(tmp_gguf, &model_gguf, QuantizeType::MostlyQ4_0, n_threads)
+        .map_err(RouterError::QuantizeError)?;
     }
     model_gguf
 };
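Taken together, the two hunks above change the startup path so that the MAKE_GGUF helper only downloads and converts the model into a temporary GGUF, and the router then quantizes it in-process. A rough consolidation of the new logic is sketched below; identifiers such as make_gguf, n_threads, and args come from surrounding code that is not part of this diff, and the `.wait()` call stands in for a line elided between the two hunks, so treat it as an assumption.

// Sketch of the reconstructed flow (context outside the diff is assumed).
if !Path::new(&model_gguf).exists() {
    let tmp_gguf = "models/tmp.gguf";

    // Run the external MAKE_GGUF script to download and convert the model.
    let status = Command::new(make_gguf)
        .arg(tmp_gguf)
        .arg(&args.model_id)
        .arg(&args.revision)
        .spawn()?
        .wait() // assumption: the elided line awaits the child process
        .await?;

    if !status.success() {
        let exit_code = status.code().unwrap_or(-1);
        error!("Failed to generate GGUF, exit code: {}", exit_code);
        return Err(RouterError::CommandError(exit_code));
    }

    // Quantize the temporary GGUF to Q4_0 in-process via the llama.cpp bindings.
    quantize::model(tmp_gguf, &model_gguf, QuantizeType::MostlyQ4_0, n_threads)
        .map_err(RouterError::QuantizeError)?;
}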
@ -305,8 +319,12 @@ enum RouterError {
     WebServer(#[from] server::WebServerError),
     #[error("Recv error: {0}")]
     RecvError(#[from] RecvError),
-    #[error("IoError: {0}")]
+    #[error("Io error: {0}")]
     IoError(#[from] std::io::Error),
-    #[error("VarError: {0}")]
+    #[error("Var error: {0}")]
     VarError(#[from] std::env::VarError),
+    #[error("Quantize error: {0}")]
+    QuantizeError(String),
+    #[error("Command error: {0}")]
+    CommandError(i32),
 }
backends/llamacpp/src/quantize.rs (new file, 39 lines)
@ -0,0 +1,39 @@
+use crate::llamacpp;
+
+use std::ffi::CString;
+use std::path::Path;
+
+#[repr(u32)]
+#[derive(Debug, Clone, Copy)]
+pub enum QuantizeType {
+    MostlyQ4_0 = 2,
+}
+
+pub fn model(
+    input_path: &str,
+    output_path: &str,
+    ftype: QuantizeType,
+    n_threads: usize,
+) -> Result<(), String> {
+    if !Path::new(input_path).exists() {
+        return Err(format!("Input file '{}' does not exist", input_path));
+    }
+    let c_input_path =
+        CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?;
+
+    let c_output_path =
+        CString::new(output_path).map_err(|e| format!("Failed to convert output path: {}", e))?;
+
+    let result = unsafe {
+        let mut params = llamacpp::model_quantize_default_params();
+        params.nthread = n_threads as _;
+        params.ftype = ftype as _;
+        params.quantize_output_tensor = true;
+        llamacpp::model_quantize(c_input_path.as_ptr(), c_output_path.as_ptr(), &params)
+    };
+    if result == 0 {
+        Ok(())
+    } else {
+        Err(format!("Quantization failed, error code: {}", result))
+    }
+}
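quantize.rs relies on llamacpp::model_quantize_default_params and llamacpp::model_quantize from the bindgen-generated module included above, which wrap llama.cpp's model-quantization C API. As a rough orientation, a simplified sketch of the corresponding declarations follows; the generated struct has more fields and the exact names depend on the bindgen configuration, so this is illustrative only.

// Illustrative FFI sketch of the llama.cpp quantization API the generated
// bindings are assumed to expose (abridged; not the real generated code).
use std::os::raw::c_char;

#[repr(C)]
pub struct model_quantize_params {
    pub nthread: i32,                 // number of threads to use while quantizing
    pub ftype: u32,                   // target file type; 2 corresponds to Q4_0
    pub quantize_output_tensor: bool, // whether to quantize the output tensor as well
    // ... remaining fields of llama_model_quantize_params omitted here
}

extern "C" {
    // Returns a params struct filled with llama.cpp's defaults.
    pub fn model_quantize_default_params() -> model_quantize_params;
    // Quantizes fname_inp into fname_out; returns 0 on success.
    pub fn model_quantize(
        fname_inp: *const c_char,
        fname_out: *const c_char,
        params: *const model_quantize_params,
    ) -> u32;
}

In the real bindings the struct layout must match llama_model_quantize_params exactly, which is why the project generates the module with bindgen at build time rather than writing the declarations by hand.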