From 0a55bd3db9704686c98a36f87263daa3843ee27f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?=
Date: Thu, 20 Feb 2025 15:40:40 +0000
Subject: [PATCH] Quantize without llama-quantize
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Adrien Gallouët
---
 Dockerfile_llamacpp               |  3 ++-
 backends/llamacpp/make-gguf.sh    | 21 +++++++---------
 backends/llamacpp/src/backend.rs  |  9 ++-----
 backends/llamacpp/src/llamacpp.rs |  5 ++++
 backends/llamacpp/src/main.rs     | 40 ++++++++++++++++++++++---------
 backends/llamacpp/src/quantize.rs | 39 ++++++++++++++++++++++++++++++
 6 files changed, 86 insertions(+), 31 deletions(-)
 create mode 100644 backends/llamacpp/src/llamacpp.rs
 create mode 100644 backends/llamacpp/src/quantize.rs

diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp
index 736066b9c..00924faed 100644
--- a/Dockerfile_llamacpp
+++ b/Dockerfile_llamacpp
@@ -28,7 +28,9 @@ RUN mkdir -p llama.cpp \
         -DCMAKE_CXX_COMPILER=clang++ \
         -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
         -DGGML_CUDA=${llamacpp_cuda} \
+        -DLLAMA_BUILD_COMMON=OFF \
         -DLLAMA_BUILD_TESTS=OFF \
+        -DLLAMA_BUILD_EXAMPLES=OFF \
         -DLLAMA_BUILD_SERVER=OFF \
     && cmake --build build --parallel --config Release \
     && cmake --install build
@@ -68,7 +70,6 @@ ENV PATH="/venv/bin:$PATH"
 COPY backends/llamacpp/requirements.txt requirements.txt
 COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
 COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
-COPY --from=builder /usr/bin/llama-quantize /usr/bin/
 
 RUN pip3 install --no-cache-dir \
     -r requirements.txt \
diff --git a/backends/llamacpp/make-gguf.sh b/backends/llamacpp/make-gguf.sh
index 713330ff0..e4823af1d 100755
--- a/backends/llamacpp/make-gguf.sh
+++ b/backends/llamacpp/make-gguf.sh
@@ -5,15 +5,17 @@
 	return 1
 }
 
+case "$1" in (*?.gguf) ;; (*)
+	echo "Not a valid GGUF file: $1"
+	return 1;
+esac
+
 GGUF="$1"
-GGUF_DIR=$(dirname "$GGUF")
-GGUF_TMP="model.src/tmp.gguf"
+GGUF_DIR=$(dirname -- "$GGUF")
 MODEL_ID="$2"
 MODEL_DIR="model.src/$2"
 REV="${3-main}"
 
-[ -e "$GGUF" ] && return
-
 mkdir -p model.src "$GGUF_DIR"
 
 huggingface-cli download \
@@ -22,12 +24,7 @@ huggingface-cli download \
 	"$MODEL_ID" &&
 
 convert_hf_to_gguf.py \
-	--outfile "$GGUF_TMP" \
-	"$MODEL_DIR" &&
+	--outfile "$GGUF" \
+	"$MODEL_DIR"
 
-llama-quantize \
-	"$GGUF_TMP" \
-	"$GGUF" \
-	"Q4_0"
-
-rm -rf model.src
+rm -rf -- model.src
diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs
index 1566e1bf9..3405cfadd 100644
--- a/backends/llamacpp/src/backend.rs
+++ b/backends/llamacpp/src/backend.rs
@@ -1,10 +1,5 @@
-mod llamacpp {
-    #![allow(non_upper_case_globals)]
-    #![allow(non_camel_case_types)]
-    #![allow(non_snake_case)]
-    #![allow(dead_code)]
-    include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
-}
+use crate::llamacpp;
+
 use async_trait::async_trait;
 use std::ffi::CString;
 use std::mem::replace;
diff --git a/backends/llamacpp/src/llamacpp.rs b/backends/llamacpp/src/llamacpp.rs
new file mode 100644
index 000000000..fb206df27
--- /dev/null
+++ b/backends/llamacpp/src/llamacpp.rs
@@ -0,0 +1,5 @@
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(dead_code)]
+include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs
index c49f9a237..c5e72d4f8 100644
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@@ -1,10 +1,15 @@
 mod backend;
+mod llamacpp;
+mod quantize;
+
+use quantize::QuantizeType;
 
 use backend::{
     BackendError, LlamacppBackend, LlamacppConfig, LlamacppGGMLType, LlamacppNuma,
     LlamacppSplitMode,
 };
 use clap::Parser;
+use std::path::Path;
 use text_generation_router::{logging, server, usage_stats};
 use thiserror::Error;
 use tokenizers::{FromPretrainedParameters, Tokenizer};
@@ -216,18 +221,27 @@ async fn main() -> Result<(), RouterError> {
             error!("No GGUF model given and environment variable MAKE_GGUF is missing.");
             RouterError::VarError(e)
         })?;
+        let model_gguf = format!("models/{}/model.gguf", args.model_id);
 
-        let status = Command::new(make_gguf)
-            .arg(&model_gguf)
-            .arg(&args.model_id)
-            .arg(&args.revision)
-            .spawn()?
-            .wait()
-            .await?;
+        if !Path::new(&model_gguf).exists() {
+            let tmp_gguf = "models/tmp.gguf";
 
-        if !status.success() {
-            error!("Failed to generate GGUF");
+            let status = Command::new(make_gguf)
+                .arg(tmp_gguf)
+                .arg(&args.model_id)
+                .arg(&args.revision)
+                .spawn()?
+                .wait()
+                .await?;
+
+            if !status.success() {
+                let exit_code = status.code().unwrap_or(-1);
+                error!("Failed to generate GGUF, exit code: {}", exit_code);
+                return Err(RouterError::CommandError(exit_code));
+            }
+            quantize::model(tmp_gguf, &model_gguf, QuantizeType::MostlyQ4_0, n_threads)
+                .map_err(RouterError::QuantizeError)?;
         }
 
         model_gguf
     };
@@ -305,8 +319,12 @@ enum RouterError {
     WebServer(#[from] server::WebServerError),
     #[error("Recv error: {0}")]
     RecvError(#[from] RecvError),
-    #[error("IoError: {0}")]
+    #[error("Io error: {0}")]
     IoError(#[from] std::io::Error),
-    #[error("VarError: {0}")]
+    #[error("Var error: {0}")]
     VarError(#[from] std::env::VarError),
+    #[error("Quantize error: {0}")]
+    QuantizeError(String),
+    #[error("Command error: {0}")]
+    CommandError(i32),
 }
diff --git a/backends/llamacpp/src/quantize.rs b/backends/llamacpp/src/quantize.rs
new file mode 100644
index 000000000..7f0cde9f8
--- /dev/null
+++ b/backends/llamacpp/src/quantize.rs
@@ -0,0 +1,39 @@
+use crate::llamacpp;
+
+use std::ffi::CString;
+use std::path::Path;
+
+#[repr(u32)]
+#[derive(Debug, Clone, Copy)]
+pub enum QuantizeType {
+    MostlyQ4_0 = 2,
+}
+
+pub fn model(
+    input_path: &str,
+    output_path: &str,
+    ftype: QuantizeType,
+    n_threads: usize,
+) -> Result<(), String> {
+    if !Path::new(input_path).exists() {
+        return Err(format!("Input file '{}' does not exist", input_path));
+    }
+    let c_input_path =
+        CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?;
+
+    let c_output_path =
+        CString::new(output_path).map_err(|e| format!("Failed to convert output path: {}", e))?;
+
+    let result = unsafe {
+        let mut params = llamacpp::model_quantize_default_params();
+        params.nthread = n_threads as _;
+        params.ftype = ftype as _;
+        params.quantize_output_tensor = true;
+        llamacpp::model_quantize(c_input_path.as_ptr(), c_output_path.as_ptr(), &params)
+    };
+    if result == 0 {
+        Ok(())
+    } else {
+        Err(format!("Quantization failed, error code: {}", result))
+    }
+}