Quantize without llama-quantize

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
Adrien Gallouët 2025-02-20 15:40:40 +00:00
parent 6223b6e264
commit 0a55bd3db9
6 changed files with 86 additions and 31 deletions

View File

@@ -28,7 +28,9 @@ RUN mkdir -p llama.cpp \
     -DCMAKE_CXX_COMPILER=clang++ \
     -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
     -DGGML_CUDA=${llamacpp_cuda} \
+    -DLLAMA_BUILD_COMMON=OFF \
     -DLLAMA_BUILD_TESTS=OFF \
+    -DLLAMA_BUILD_EXAMPLES=OFF \
     -DLLAMA_BUILD_SERVER=OFF \
     && cmake --build build --parallel --config Release \
     && cmake --install build
@@ -68,7 +70,6 @@ ENV PATH="/venv/bin:$PATH"
 COPY backends/llamacpp/requirements.txt requirements.txt
 COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
 COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
-COPY --from=builder /usr/bin/llama-quantize /usr/bin/
 RUN pip3 install --no-cache-dir \
     -r requirements.txt \

View File

@@ -5,15 +5,17 @@
     return 1
 }
+case "$1" in (*?.gguf) ;; (*)
+    echo "Not a valid GGUF file: $1"
+    return 1;
+esac
 GGUF="$1"
-GGUF_DIR=$(dirname "$GGUF")
-GGUF_TMP="model.src/tmp.gguf"
+GGUF_DIR=$(dirname -- "$GGUF")
 MODEL_ID="$2"
 MODEL_DIR="model.src/$2"
 REV="${3-main}"
-[ -e "$GGUF" ] && return
 mkdir -p model.src "$GGUF_DIR"
 huggingface-cli download \
@@ -22,12 +24,7 @@ huggingface-cli download \
     "$MODEL_ID" &&
 convert_hf_to_gguf.py \
-    --outfile "$GGUF_TMP" \
-    "$MODEL_DIR" &&
-llama-quantize \
-    "$GGUF_TMP" \
-    "$GGUF" \
-    "Q4_0"
-rm -rf model.src
+    --outfile "$GGUF" \
+    "$MODEL_DIR"
+rm -rf -- model.src

View File

@@ -1,10 +1,5 @@
-mod llamacpp {
-    #![allow(non_upper_case_globals)]
-    #![allow(non_camel_case_types)]
-    #![allow(non_snake_case)]
-    #![allow(dead_code)]
-    include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
-}
+use crate::llamacpp;
 use async_trait::async_trait;
 use std::ffi::CString;
 use std::mem::replace;

View File

@@ -0,0 +1,5 @@
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(dead_code)]
+include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
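Note: the include! above pulls in bindings generated by the build script, so the model_quantize symbols used by the new quantize module in this commit never appear in the diff. The following Rust sketch (not part of the commit) shows the shape those generated bindings are assumed to take, mirroring llama_model_quantize_default_params() and llama_model_quantize() from llama.cpp's llama.h with the llama_ prefix stripped; the prefix handling and the field list are assumptions, and most params fields are omitted.

    // Sketch only: assumed shape of the bindgen output included as llamacpp.rs.
    use std::os::raw::c_char;

    #[repr(C)]
    pub struct model_quantize_params {
        pub nthread: i32,                 // worker threads used for quantization
        pub ftype: u32,                   // target file type, e.g. 2 == MOSTLY_Q4_0
        pub quantize_output_tensor: bool, // also quantize the output tensor
        // ... remaining fields of llama_model_quantize_params omitted in this sketch
    }

    extern "C" {
        // Returns a params struct filled with llama.cpp's defaults.
        pub fn model_quantize_default_params() -> model_quantize_params;
        // Quantizes fname_inp into fname_out; returns 0 on success.
        pub fn model_quantize(
            fname_inp: *const c_char,
            fname_out: *const c_char,
            params: *const model_quantize_params,
        ) -> u32;
    }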

View File

@@ -1,10 +1,15 @@
 mod backend;
+mod llamacpp;
+mod quantize;
+use quantize::QuantizeType;
 use backend::{
     BackendError, LlamacppBackend, LlamacppConfig, LlamacppGGMLType, LlamacppNuma,
     LlamacppSplitMode,
 };
 use clap::Parser;
+use std::path::Path;
 use text_generation_router::{logging, server, usage_stats};
 use thiserror::Error;
 use tokenizers::{FromPretrainedParameters, Tokenizer};
@@ -216,18 +221,27 @@ async fn main() -> Result<(), RouterError> {
             error!("No GGUF model given and environment variable MAKE_GGUF is missing.");
             RouterError::VarError(e)
         })?;
         let model_gguf = format!("models/{}/model.gguf", args.model_id);
-        let status = Command::new(make_gguf)
-            .arg(&model_gguf)
-            .arg(&args.model_id)
-            .arg(&args.revision)
-            .spawn()?
-            .wait()
-            .await?;
-        if !status.success() {
-            error!("Failed to generate GGUF");
+        if !Path::new(&model_gguf).exists() {
+            let tmp_gguf = "models/tmp.gguf";
+            let status = Command::new(make_gguf)
+                .arg(tmp_gguf)
+                .arg(&args.model_id)
+                .arg(&args.revision)
+                .spawn()?
+                .wait()
+                .await?;
+            if !status.success() {
+                let exit_code = status.code().unwrap_or(-1);
+                error!("Failed to generate GGUF, exit code: {}", exit_code);
+                return Err(RouterError::CommandError(exit_code));
+            }
+            quantize::model(tmp_gguf, &model_gguf, QuantizeType::MostlyQ4_0, n_threads)
+                .map_err(RouterError::QuantizeError)?;
         }
         model_gguf
     };
@@ -305,8 +319,12 @@ enum RouterError {
     WebServer(#[from] server::WebServerError),
     #[error("Recv error: {0}")]
     RecvError(#[from] RecvError),
-    #[error("IoError: {0}")]
+    #[error("Io error: {0}")]
     IoError(#[from] std::io::Error),
-    #[error("VarError: {0}")]
+    #[error("Var error: {0}")]
     VarError(#[from] std::env::VarError),
+    #[error("Quantize error: {0}")]
+    QuantizeError(String),
+    #[error("Command error: {0}")]
+    CommandError(i32),
 }

View File

@@ -0,0 +1,39 @@
+use crate::llamacpp;
+use std::ffi::CString;
+use std::path::Path;
+
+#[repr(u32)]
+#[derive(Debug, Clone, Copy)]
+pub enum QuantizeType {
+    MostlyQ4_0 = 2,
+}
+
+pub fn model(
+    input_path: &str,
+    output_path: &str,
+    ftype: QuantizeType,
+    n_threads: usize,
+) -> Result<(), String> {
+    if !Path::new(input_path).exists() {
+        return Err(format!("Input file '{}' does not exist", input_path));
+    }
+    let c_input_path =
+        CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?;
+    let c_output_path =
+        CString::new(output_path).map_err(|e| format!("Failed to convert output path: {}", e))?;
+
+    let result = unsafe {
+        let mut params = llamacpp::model_quantize_default_params();
+        params.nthread = n_threads as _;
+        params.ftype = ftype as _;
+        params.quantize_output_tensor = true;
+        llamacpp::model_quantize(c_input_path.as_ptr(), c_output_path.as_ptr(), &params)
+    };
+    if result == 0 {
+        Ok(())
+    } else {
+        Err(format!("Quantization failed, error code: {}", result))
+    }
+}
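A small test sketch (not part of the commit) exercising the early existence check in quantize::model; it assumes the test module sits next to the code above in the same crate, and it never reaches the FFI call.

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn missing_input_fails_before_any_ffi_call() {
            // The Path::new(input_path).exists() guard returns a readable error
            // instead of handing a nonexistent path to llama.cpp.
            let err = model("no-such-file.gguf", "out.gguf", QuantizeType::MostlyQ4_0, 1)
                .unwrap_err();
            assert!(err.contains("does not exist"));
        }
    }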