Quantize without llama-quantize

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
Author: Adrien Gallouët <angt@huggingface.co>
Date:   2025-02-20 15:40:40 +00:00
parent 6223b6e264
commit 0a55bd3db9
6 changed files with 86 additions and 31 deletions

Dockerfile_llamacpp

@@ -28,7 +28,9 @@ RUN mkdir -p llama.cpp \
         -DCMAKE_CXX_COMPILER=clang++ \
         -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
         -DGGML_CUDA=${llamacpp_cuda} \
+        -DLLAMA_BUILD_COMMON=OFF \
         -DLLAMA_BUILD_TESTS=OFF \
+        -DLLAMA_BUILD_EXAMPLES=OFF \
         -DLLAMA_BUILD_SERVER=OFF \
     && cmake --build build --parallel --config Release \
     && cmake --install build
@@ -68,7 +70,6 @@ ENV PATH="/venv/bin:$PATH"
 COPY backends/llamacpp/requirements.txt requirements.txt
 COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
 COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
-COPY --from=builder /usr/bin/llama-quantize /usr/bin/
 RUN pip3 install --no-cache-dir \
     -r requirements.txt \

backends/llamacpp/make-gguf.sh

@@ -5,15 +5,17 @@
     return 1
 }
 case "$1" in (*?.gguf) ;; (*)
     echo "Not a valid GGUF file: $1"
     return 1;
 esac
 GGUF="$1"
-GGUF_DIR=$(dirname "$GGUF")
-GGUF_TMP="model.src/tmp.gguf"
+GGUF_DIR=$(dirname -- "$GGUF")
 MODEL_ID="$2"
 MODEL_DIR="model.src/$2"
 REV="${3-main}"
+
 [ -e "$GGUF" ] && return
+
 mkdir -p model.src "$GGUF_DIR"
+
 huggingface-cli download \
@@ -22,12 +24,7 @@ huggingface-cli download \
     "$MODEL_ID" &&
 
 convert_hf_to_gguf.py \
-    --outfile "$GGUF_TMP" \
-    "$MODEL_DIR" &&
-
-llama-quantize \
-    "$GGUF_TMP" \
-    "$GGUF" \
-    "Q4_0"
+    --outfile "$GGUF" \
+    "$MODEL_DIR"
 
-rm -rf model.src
+rm -rf -- model.src

backends/llamacpp/src/backend.rs

@@ -1,10 +1,5 @@
-mod llamacpp {
-    #![allow(non_upper_case_globals)]
-    #![allow(non_camel_case_types)]
-    #![allow(non_snake_case)]
-    #![allow(dead_code)]
-    include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
-}
+use crate::llamacpp;
+
 use async_trait::async_trait;
 use std::ffi::CString;
 use std::mem::replace;

backends/llamacpp/src/llamacpp.rs

@@ -0,0 +1,5 @@
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(dead_code)]
+include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
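
Aside (not part of the diff): the new llamacpp.rs only re-exports FFI bindings that the crate's build script writes into $OUT_DIR at compile time. Below is a minimal build.rs sketch of how such a file is typically produced with bindgen; the wrapper header name, link flag, and bindgen options are assumptions for illustration, not taken from this commit. The identifiers used elsewhere in the crate (model_quantize rather than llama_model_quantize) suggest the real script also applies renaming options not shown here.

// build.rs -- hypothetical sketch; the actual build script is not shown in this diff.
fn main() {
    // Assumption: the llama.cpp library was installed system-wide by the builder stage.
    println!("cargo:rustc-link-lib=llama");

    // Assumption: src/wrapper.h is a small header that does `#include <llama.h>`.
    let bindings = bindgen::Builder::default()
        .header("src/wrapper.h")
        .generate()
        .expect("failed to generate llama.cpp bindings");

    // Write to $OUT_DIR/llamacpp.rs, the path consumed by the include! above.
    let out = std::path::PathBuf::from(std::env::var("OUT_DIR").unwrap());
    bindings
        .write_to_file(out.join("llamacpp.rs"))
        .expect("failed to write llamacpp.rs");
}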

backends/llamacpp/src/main.rs

@@ -1,10 +1,15 @@
 mod backend;
+mod llamacpp;
+mod quantize;
 
+use quantize::QuantizeType;
+
 use backend::{
     BackendError, LlamacppBackend, LlamacppConfig, LlamacppGGMLType, LlamacppNuma,
     LlamacppSplitMode,
 };
 use clap::Parser;
+use std::path::Path;
 use text_generation_router::{logging, server, usage_stats};
 use thiserror::Error;
 use tokenizers::{FromPretrainedParameters, Tokenizer};
@@ -216,18 +221,27 @@ async fn main() -> Result<(), RouterError> {
             error!("No GGUF model given and environment variable MAKE_GGUF is missing.");
             RouterError::VarError(e)
         })?;
         let model_gguf = format!("models/{}/model.gguf", args.model_id);
 
-        let status = Command::new(make_gguf)
-            .arg(&model_gguf)
-            .arg(&args.model_id)
-            .arg(&args.revision)
-            .spawn()?
-            .wait()
-            .await?;
-        if !status.success() {
-            error!("Failed to generate GGUF");
-        }
+        if !Path::new(&model_gguf).exists() {
+            let tmp_gguf = "models/tmp.gguf";
+            let status = Command::new(make_gguf)
+                .arg(tmp_gguf)
+                .arg(&args.model_id)
+                .arg(&args.revision)
+                .spawn()?
+                .wait()
+                .await?;
+
+            if !status.success() {
+                let exit_code = status.code().unwrap_or(-1);
+                error!("Failed to generate GGUF, exit code: {}", exit_code);
+                return Err(RouterError::CommandError(exit_code));
+            }
+
+            quantize::model(tmp_gguf, &model_gguf, QuantizeType::MostlyQ4_0, n_threads)
+                .map_err(RouterError::QuantizeError)?;
+        }
 
         model_gguf
     };
@@ -305,8 +319,12 @@ enum RouterError {
     WebServer(#[from] server::WebServerError),
     #[error("Recv error: {0}")]
     RecvError(#[from] RecvError),
-    #[error("IoError: {0}")]
+    #[error("Io error: {0}")]
     IoError(#[from] std::io::Error),
-    #[error("VarError: {0}")]
+    #[error("Var error: {0}")]
     VarError(#[from] std::env::VarError),
+    #[error("Quantize error: {0}")]
+    QuantizeError(String),
+    #[error("Command error: {0}")]
+    CommandError(i32),
 }

backends/llamacpp/src/quantize.rs

@@ -0,0 +1,39 @@
+use crate::llamacpp;
+
+use std::ffi::CString;
+use std::path::Path;
+
+#[repr(u32)]
+#[derive(Debug, Clone, Copy)]
+pub enum QuantizeType {
+    MostlyQ4_0 = 2,
+}
+
+pub fn model(
+    input_path: &str,
+    output_path: &str,
+    ftype: QuantizeType,
+    n_threads: usize,
+) -> Result<(), String> {
+    if !Path::new(input_path).exists() {
+        return Err(format!("Input file '{}' does not exist", input_path));
+    }
+    let c_input_path =
+        CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?;
+    let c_output_path =
+        CString::new(output_path).map_err(|e| format!("Failed to convert output path: {}", e))?;
+
+    let result = unsafe {
+        let mut params = llamacpp::model_quantize_default_params();
+        params.nthread = n_threads as _;
+        params.ftype = ftype as _;
+        params.quantize_output_tensor = true;
+        llamacpp::model_quantize(c_input_path.as_ptr(), c_output_path.as_ptr(), &params)
+    };
+
+    if result == 0 {
+        Ok(())
+    } else {
+        Err(format!("Quantization failed, error code: {}", result))
+    }
+}
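
For context: llamacpp::model_quantize is the generated binding for llama.cpp's C function llama_model_quantize(), which returns 0 on success, and the discriminant 2 corresponds to LLAMA_FTYPE_MOSTLY_Q4_0 in llama.cpp's llama_ftype enum, matching MostlyQ4_0 above. A hedged sketch of calling the new helper directly; the paths and the thread-count choice are illustrative, not from this commit:

// Hypothetical call site for quantize::model(); assumes the quantize module is in scope.
use std::thread;

fn quantize_to_q4_0() -> Result<(), String> {
    // Use all available cores, falling back to 1 if parallelism cannot be queried.
    let n_threads = thread::available_parallelism().map_or(1, |n| n.get());

    // Example paths: an f16 GGUF written by convert_hf_to_gguf.py in,
    // a Q4_0-quantized GGUF out.
    quantize::model(
        "models/tmp.gguf",
        "models/model.gguf",
        quantize::QuantizeType::MostlyQ4_0,
        n_threads,
    )
}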