From 0a55bd3db9704686c98a36f87263daa3843ee27f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?=
Date: Thu, 20 Feb 2025 15:40:40 +0000
Subject: [PATCH] Quantize without llama-quantize
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Adrien Gallouët
---
 Dockerfile_llamacpp               |  3 ++-
 backends/llamacpp/make-gguf.sh    | 21 +++++++---------
 backends/llamacpp/src/backend.rs  |  9 ++-----
 backends/llamacpp/src/llamacpp.rs |  5 ++++
 backends/llamacpp/src/main.rs     | 40 ++++++++++++++++++++++---------
 backends/llamacpp/src/quantize.rs | 39 ++++++++++++++++++++++++++++++
 6 files changed, 86 insertions(+), 31 deletions(-)
 create mode 100644 backends/llamacpp/src/llamacpp.rs
 create mode 100644 backends/llamacpp/src/quantize.rs

diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp
index 736066b9c..00924faed 100644
--- a/Dockerfile_llamacpp
+++ b/Dockerfile_llamacpp
@@ -28,7 +28,9 @@ RUN mkdir -p llama.cpp \
         -DCMAKE_CXX_COMPILER=clang++ \
         -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
         -DGGML_CUDA=${llamacpp_cuda} \
+        -DLLAMA_BUILD_COMMON=OFF \
         -DLLAMA_BUILD_TESTS=OFF \
+        -DLLAMA_BUILD_EXAMPLES=OFF \
         -DLLAMA_BUILD_SERVER=OFF \
     && cmake --build build --parallel --config Release \
     && cmake --install build
@@ -68,7 +70,6 @@ ENV PATH="/venv/bin:$PATH"
 COPY backends/llamacpp/requirements.txt requirements.txt
 COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
 COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
-COPY --from=builder /usr/bin/llama-quantize /usr/bin/
 
 RUN pip3 install --no-cache-dir \
     -r requirements.txt \
diff --git a/backends/llamacpp/make-gguf.sh b/backends/llamacpp/make-gguf.sh
index 713330ff0..e4823af1d 100755
--- a/backends/llamacpp/make-gguf.sh
+++ b/backends/llamacpp/make-gguf.sh
@@ -5,15 +5,17 @@
 	return 1
 }
 
+case "$1" in (*?.gguf) ;; (*)
+	echo "Not a valid GGUF file: $1"
+	return 1;
+esac
+
 GGUF="$1"
-GGUF_DIR=$(dirname "$GGUF")
-GGUF_TMP="model.src/tmp.gguf"
+GGUF_DIR=$(dirname -- "$GGUF")
 MODEL_ID="$2"
 MODEL_DIR="model.src/$2"
 REV="${3-main}"
 
-[ -e "$GGUF" ] && return
-
 mkdir -p model.src "$GGUF_DIR"
 
 huggingface-cli download \
@@ -22,12 +24,7 @@ huggingface-cli download \
 	"$MODEL_ID" &&
 
 convert_hf_to_gguf.py \
-	--outfile "$GGUF_TMP" \
-	"$MODEL_DIR" &&
+	--outfile "$GGUF" \
+	"$MODEL_DIR"
 
-llama-quantize \
-	"$GGUF_TMP" \
-	"$GGUF" \
-	"Q4_0"
-
-rm -rf model.src
+rm -rf -- model.src
diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs
index 1566e1bf9..3405cfadd 100644
--- a/backends/llamacpp/src/backend.rs
+++ b/backends/llamacpp/src/backend.rs
@@ -1,10 +1,5 @@
-mod llamacpp {
-    #![allow(non_upper_case_globals)]
-    #![allow(non_camel_case_types)]
-    #![allow(non_snake_case)]
-    #![allow(dead_code)]
-    include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
-}
+use crate::llamacpp;
+
 use async_trait::async_trait;
 use std::ffi::CString;
 use std::mem::replace;
diff --git a/backends/llamacpp/src/llamacpp.rs b/backends/llamacpp/src/llamacpp.rs
new file mode 100644
index 000000000..fb206df27
--- /dev/null
+++ b/backends/llamacpp/src/llamacpp.rs
@@ -0,0 +1,5 @@
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(dead_code)]
+include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs
index c49f9a237..c5e72d4f8 100644
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@@ -1,10 +1,15 @@
 mod backend;
+mod llamacpp;
+mod quantize;
+
+use quantize::QuantizeType;
 
 use backend::{
     BackendError, LlamacppBackend, LlamacppConfig, LlamacppGGMLType, LlamacppNuma,
     LlamacppSplitMode,
 };
 use clap::Parser;
+use std::path::Path;
 use text_generation_router::{logging, server, usage_stats};
 use thiserror::Error;
 use tokenizers::{FromPretrainedParameters, Tokenizer};
@@ -216,18 +221,27 @@ async fn main() -> Result<(), RouterError> {
             error!("No GGUF model given and environment variable MAKE_GGUF is missing.");
             RouterError::VarError(e)
         })?;
+        let model_gguf = format!("models/{}/model.gguf", args.model_id);
 
-        let status = Command::new(make_gguf)
-            .arg(&model_gguf)
-            .arg(&args.model_id)
-            .arg(&args.revision)
-            .spawn()?
-            .wait()
-            .await?;
+        if !Path::new(&model_gguf).exists() {
+            let tmp_gguf = "models/tmp.gguf";
 
-        if !status.success() {
-            error!("Failed to generate GGUF");
+            let status = Command::new(make_gguf)
+                .arg(tmp_gguf)
+                .arg(&args.model_id)
+                .arg(&args.revision)
+                .spawn()?
+                .wait()
+                .await?;
+
+            if !status.success() {
+                let exit_code = status.code().unwrap_or(-1);
+                error!("Failed to generate GGUF, exit code: {}", exit_code);
+                return Err(RouterError::CommandError(exit_code));
+            }
+            quantize::model(tmp_gguf, &model_gguf, QuantizeType::MostlyQ4_0, n_threads)
+                .map_err(RouterError::QuantizeError)?;
         }
 
         model_gguf
     };
@@ -305,8 +319,12 @@ enum RouterError {
     WebServer(#[from] server::WebServerError),
     #[error("Recv error: {0}")]
     RecvError(#[from] RecvError),
-    #[error("IoError: {0}")]
+    #[error("Io error: {0}")]
     IoError(#[from] std::io::Error),
-    #[error("VarError: {0}")]
+    #[error("Var error: {0}")]
     VarError(#[from] std::env::VarError),
+    #[error("Quantize error: {0}")]
+    QuantizeError(String),
+    #[error("Command error: {0}")]
+    CommandError(i32),
 }
diff --git a/backends/llamacpp/src/quantize.rs b/backends/llamacpp/src/quantize.rs
new file mode 100644
index 000000000..7f0cde9f8
--- /dev/null
+++ b/backends/llamacpp/src/quantize.rs
@@ -0,0 +1,39 @@
+use crate::llamacpp;
+
+use std::ffi::CString;
+use std::path::Path;
+
+#[repr(u32)]
+#[derive(Debug, Clone, Copy)]
+pub enum QuantizeType {
+    MostlyQ4_0 = 2,
+}
+
+pub fn model(
+    input_path: &str,
+    output_path: &str,
+    ftype: QuantizeType,
+    n_threads: usize,
+) -> Result<(), String> {
+    if !Path::new(input_path).exists() {
+        return Err(format!("Input file '{}' does not exist", input_path));
+    }
+    let c_input_path =
+        CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?;
+
+    let c_output_path =
+        CString::new(output_path).map_err(|e| format!("Failed to convert output path: {}", e))?;
+
+    let result = unsafe {
+        let mut params = llamacpp::model_quantize_default_params();
+        params.nthread = n_threads as _;
+        params.ftype = ftype as _;
+        params.quantize_output_tensor = true;
+        llamacpp::model_quantize(c_input_path.as_ptr(), c_output_path.as_ptr(), &params)
+    };
+    if result == 0 {
+        Ok(())
+    } else {
+        Err(format!("Quantization failed, error code: {}", result))
+    }
+}