Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-20 22:32:07 +00:00)
Quantize without llama-quantize
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
Commit 0a55bd3db9 (parent 6223b6e264)
@@ -28,7 +28,9 @@ RUN mkdir -p llama.cpp \
         -DCMAKE_CXX_COMPILER=clang++ \
         -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
         -DGGML_CUDA=${llamacpp_cuda} \
+        -DLLAMA_BUILD_COMMON=OFF \
+        -DLLAMA_BUILD_TESTS=OFF \
+        -DLLAMA_BUILD_EXAMPLES=OFF \
+        -DLLAMA_BUILD_SERVER=OFF \
     && cmake --build build --parallel --config Release \
     && cmake --install build
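Note: with -DLLAMA_BUILD_EXAMPLES=OFF the llama.cpp build no longer produces the llama-quantize binary (historically built as an example), which is why the COPY of /usr/bin/llama-quantize disappears in the next hunk; quantization now goes through the library API instead.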
@@ -68,7 +70,6 @@ ENV PATH="/venv/bin:$PATH"
 COPY backends/llamacpp/requirements.txt requirements.txt
 COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
 COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
-COPY --from=builder /usr/bin/llama-quantize /usr/bin/

 RUN pip3 install --no-cache-dir \
     -r requirements.txt \
@@ -5,15 +5,17 @@
 	return 1
 }

 case "$1" in (*?.gguf) ;; (*)
 	echo "Not a valid GGUF file: $1"
 	return 1;
 esac

 GGUF="$1"
-GGUF_DIR=$(dirname "$GGUF")
-GGUF_TMP="model.src/tmp.gguf"
+GGUF_DIR=$(dirname -- "$GGUF")
 MODEL_ID="$2"
 MODEL_DIR="model.src/$2"
 REV="${3-main}"

 [ -e "$GGUF" ] && return

 mkdir -p model.src "$GGUF_DIR"
@@ -22,12 +24,7 @@ huggingface-cli download \
 	"$MODEL_ID" &&

 convert_hf_to_gguf.py \
-	--outfile "$GGUF_TMP" \
-	"$MODEL_DIR" &&
-
-llama-quantize \
-	"$GGUF_TMP" \
-	"$GGUF" \
-	"Q4_0"
+	--outfile "$GGUF" \
+	"$MODEL_DIR"

-rm -rf model.src
+rm -rf -- model.src
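The MAKE_GGUF script keeps the same three positional arguments, a target GGUF path, a model id, and a revision (see the .arg() calls in main.rs below), but it now stops after convert_hf_to_gguf.py writes "$GGUF" directly; the Q4_0 quantization step moved out of the script and into the router.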
@@ -1,10 +1,5 @@
-mod llamacpp {
-    #![allow(non_upper_case_globals)]
-    #![allow(non_camel_case_types)]
-    #![allow(non_snake_case)]
-    #![allow(dead_code)]
-    include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
-}
+use crate::llamacpp;

 use async_trait::async_trait;
 use std::ffi::CString;
 use std::mem::replace;
backends/llamacpp/src/llamacpp.rs (new file, 5 lines)
@@ -0,0 +1,5 @@
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(dead_code)]
+include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
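The include! pulls in bindings that are generated into OUT_DIR at build time. A minimal build.rs sketch of how such a file can be produced with bindgen; the wrapper header name is an assumption for illustration, not taken from this commit:

// build.rs (illustrative sketch, assuming a wrapper header that includes llama.h)
use std::{env, path::PathBuf};

fn main() {
    let bindings = bindgen::Builder::default()
        .header("src/wrapper.h") // assumed header location
        .generate()
        .expect("failed to generate llama.cpp bindings");

    // Write to $OUT_DIR/llamacpp.rs, the path the include! above expects.
    let out = PathBuf::from(env::var("OUT_DIR").unwrap()).join("llamacpp.rs");
    bindings.write_to_file(out).expect("failed to write bindings");
}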
@@ -1,10 +1,15 @@
 mod backend;
+mod llamacpp;
+mod quantize;
+
+use quantize::QuantizeType;

 use backend::{
     BackendError, LlamacppBackend, LlamacppConfig, LlamacppGGMLType, LlamacppNuma,
     LlamacppSplitMode,
 };
 use clap::Parser;
+use std::path::Path;
 use text_generation_router::{logging, server, usage_stats};
 use thiserror::Error;
 use tokenizers::{FromPretrainedParameters, Tokenizer};
@@ -216,18 +221,27 @@ async fn main() -> Result<(), RouterError> {
             error!("No GGUF model given and environment variable MAKE_GGUF is missing.");
             RouterError::VarError(e)
         })?;

         let model_gguf = format!("models/{}/model.gguf", args.model_id);

-        let status = Command::new(make_gguf)
-            .arg(&model_gguf)
-            .arg(&args.model_id)
-            .arg(&args.revision)
-            .spawn()?
-            .wait()
-            .await?;
-
-        if !status.success() {
-            error!("Failed to generate GGUF");
+        if !Path::new(&model_gguf).exists() {
+            let tmp_gguf = "models/tmp.gguf";
+
+            let status = Command::new(make_gguf)
+                .arg(tmp_gguf)
+                .arg(&args.model_id)
+                .arg(&args.revision)
+                .spawn()?
+                .wait()
+                .await?;
+
+            if !status.success() {
+                let exit_code = status.code().unwrap_or(-1);
+                error!("Failed to generate GGUF, exit code: {}", exit_code);
+                return Err(RouterError::CommandError(exit_code));
+            }
+            quantize::model(tmp_gguf, &model_gguf, QuantizeType::MostlyQ4_0, n_threads)
+                .map_err(RouterError::QuantizeError)?;
         }
         model_gguf
     };
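Net effect: the router only rebuilds when models/<model_id>/model.gguf is missing, MAKE_GGUF now writes an intermediate GGUF to models/tmp.gguf, and the Q4_0 pass runs in-process via quantize::model, with script failures surfaced as RouterError::CommandError carrying the child's exit code.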
@@ -305,8 +319,12 @@ enum RouterError {
     WebServer(#[from] server::WebServerError),
     #[error("Recv error: {0}")]
     RecvError(#[from] RecvError),
-    #[error("IoError: {0}")]
+    #[error("Io error: {0}")]
     IoError(#[from] std::io::Error),
-    #[error("VarError: {0}")]
+    #[error("Var error: {0}")]
     VarError(#[from] std::env::VarError),
+    #[error("Quantize error: {0}")]
+    QuantizeError(String),
+    #[error("Command error: {0}")]
+    CommandError(i32),
 }
backends/llamacpp/src/quantize.rs (new file, 39 lines)
@@ -0,0 +1,39 @@
+use crate::llamacpp;
+
+use std::ffi::CString;
+use std::path::Path;
+
+#[repr(u32)]
+#[derive(Debug, Clone, Copy)]
+pub enum QuantizeType {
+    MostlyQ4_0 = 2,
+}
+
+pub fn model(
+    input_path: &str,
+    output_path: &str,
+    ftype: QuantizeType,
+    n_threads: usize,
+) -> Result<(), String> {
+    if !Path::new(input_path).exists() {
+        return Err(format!("Input file '{}' does not exist", input_path));
+    }
+    let c_input_path =
+        CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?;
+
+    let c_output_path =
+        CString::new(output_path).map_err(|e| format!("Failed to convert output path: {}", e))?;
+
+    let result = unsafe {
+        let mut params = llamacpp::model_quantize_default_params();
+        params.nthread = n_threads as _;
+        params.ftype = ftype as _;
+        params.quantize_output_tensor = true;
+        llamacpp::model_quantize(c_input_path.as_ptr(), c_output_path.as_ptr(), &params)
+    };
+    if result == 0 {
+        Ok(())
+    } else {
+        Err(format!("Quantization failed, error code: {}", result))
+    }
+}
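For reference, a sketch of how the new module is used, mirroring the call in main.rs above; the output path here is illustrative:

// Convert first (done by the MAKE_GGUF script), then quantize in-process.
// QuantizeType::MostlyQ4_0 = 2 matches llama.cpp's LLAMA_FTYPE_MOSTLY_Q4_0.
quantize::model(
    "models/tmp.gguf",       // input written by convert_hf_to_gguf.py
    "models/foo/model.gguf", // output path, illustrative
    QuantizeType::MostlyQ4_0,
    4,                       // forwarded to params.nthread
)
.map_err(RouterError::QuantizeError)?;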