Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-20 22:32:07 +00:00)

Quantize without llama-quantize

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

parent 6223b6e264
commit 0a55bd3db9
@ -28,7 +28,9 @@ RUN mkdir -p llama.cpp \
     -DCMAKE_CXX_COMPILER=clang++ \
     -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
     -DGGML_CUDA=${llamacpp_cuda} \
+    -DLLAMA_BUILD_COMMON=OFF \
     -DLLAMA_BUILD_TESTS=OFF \
+    -DLLAMA_BUILD_EXAMPLES=OFF \
     -DLLAMA_BUILD_SERVER=OFF \
     && cmake --build build --parallel --config Release \
     && cmake --install build

@ -68,7 +70,6 @@ ENV PATH="/venv/bin:$PATH"
 COPY backends/llamacpp/requirements.txt requirements.txt
 COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
 COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
-COPY --from=builder /usr/bin/llama-quantize /usr/bin/

 RUN pip3 install --no-cache-dir \
     -r requirements.txt \
@ -5,15 +5,17 @@
     return 1
 }

+case "$1" in (*?.gguf) ;; (*)
+    echo "Not a valid GGUF file: $1"
+    return 1;
+esac
+
 GGUF="$1"
-GGUF_DIR=$(dirname "$GGUF")
-GGUF_TMP="model.src/tmp.gguf"
+GGUF_DIR=$(dirname -- "$GGUF")
 MODEL_ID="$2"
 MODEL_DIR="model.src/$2"
 REV="${3-main}"

-[ -e "$GGUF" ] && return

 mkdir -p model.src "$GGUF_DIR"

 huggingface-cli download \

@ -22,12 +24,7 @@ huggingface-cli download \
     "$MODEL_ID" &&

 convert_hf_to_gguf.py \
-    --outfile "$GGUF_TMP" \
-    "$MODEL_DIR" &&
+    --outfile "$GGUF" \
+    "$MODEL_DIR"

-llama-quantize \
-    "$GGUF_TMP" \
-    "$GGUF" \
-    "Q4_0"
-
-rm -rf model.src
+rm -rf -- model.src
@ -1,10 +1,5 @@
-mod llamacpp {
-    #![allow(non_upper_case_globals)]
-    #![allow(non_camel_case_types)]
-    #![allow(non_snake_case)]
-    #![allow(dead_code)]
-    include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
-}
+use crate::llamacpp;
 use async_trait::async_trait;
 use std::ffi::CString;
 use std::mem::replace;
backends/llamacpp/src/llamacpp.rs (new file, 5 lines)
@ -0,0 +1,5 @@
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(dead_code)]
+include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
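The new llamacpp.rs only re-exports bindings that are generated into OUT_DIR at build time. For orientation, here is a minimal sketch of the kind of build.rs that could produce such a file with bindgen; the repository's actual build script, header path, and bindgen options are not shown in this diff and may differ.

// build.rs sketch (illustrative only; not the repository's actual build script).
// Assumes the bindgen crate and a llama.cpp header installed by the builder stage.
use std::env;
use std::path::PathBuf;

fn main() {
    let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
    bindgen::Builder::default()
        .header("/usr/include/llama.h") // assumption: header location after `cmake --install`
        .generate()
        .expect("failed to generate llama.cpp bindings")
        .write_to_file(out_dir.join("llamacpp.rs")) // consumed by include!(... "/llamacpp.rs")
        .expect("failed to write llamacpp.rs");
}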
@ -1,10 +1,15 @@
 mod backend;
+mod llamacpp;
+mod quantize;
+
+use quantize::QuantizeType;

 use backend::{
     BackendError, LlamacppBackend, LlamacppConfig, LlamacppGGMLType, LlamacppNuma,
     LlamacppSplitMode,
 };
 use clap::Parser;
+use std::path::Path;
 use text_generation_router::{logging, server, usage_stats};
 use thiserror::Error;
 use tokenizers::{FromPretrainedParameters, Tokenizer};
@ -216,10 +221,14 @@ async fn main() -> Result<(), RouterError> {
         error!("No GGUF model given and environment variable MAKE_GGUF is missing.");
         RouterError::VarError(e)
     })?;

     let model_gguf = format!("models/{}/model.gguf", args.model_id);

+    if !Path::new(&model_gguf).exists() {
+        let tmp_gguf = "models/tmp.gguf";
+
     let status = Command::new(make_gguf)
-        .arg(&model_gguf)
+        .arg(tmp_gguf)
         .arg(&args.model_id)
         .arg(&args.revision)
         .spawn()?
@ -227,7 +236,12 @@ async fn main() -> Result<(), RouterError> {
         .await?;

     if !status.success() {
-        error!("Failed to generate GGUF");
+        let exit_code = status.code().unwrap_or(-1);
+        error!("Failed to generate GGUF, exit code: {}", exit_code);
+        return Err(RouterError::CommandError(exit_code));
+    }
+    quantize::model(tmp_gguf, &model_gguf, QuantizeType::MostlyQ4_0, n_threads)
+        .map_err(RouterError::QuantizeError)?;
     }
     model_gguf
 };
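Taken together, the two hunks above change the startup path so that the MAKE_GGUF helper only downloads and converts the model into a temporary GGUF, and the router then quantizes it in-process. A rough consolidation of the new logic is sketched below; identifiers such as make_gguf, n_threads, and args come from surrounding code that is not part of this diff, and the `.wait()` call stands in for a line elided between the two hunks, so treat it as an assumption.

// Sketch of the reconstructed flow (context outside the diff is assumed).
if !Path::new(&model_gguf).exists() {
    let tmp_gguf = "models/tmp.gguf";

    // Run the external MAKE_GGUF script to download and convert the model.
    let status = Command::new(make_gguf)
        .arg(tmp_gguf)
        .arg(&args.model_id)
        .arg(&args.revision)
        .spawn()?
        .wait() // assumption: the elided line awaits the child process
        .await?;

    if !status.success() {
        let exit_code = status.code().unwrap_or(-1);
        error!("Failed to generate GGUF, exit code: {}", exit_code);
        return Err(RouterError::CommandError(exit_code));
    }

    // Quantize the temporary GGUF to Q4_0 in-process via the llama.cpp bindings.
    quantize::model(tmp_gguf, &model_gguf, QuantizeType::MostlyQ4_0, n_threads)
        .map_err(RouterError::QuantizeError)?;
}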
@ -305,8 +319,12 @@ enum RouterError {
     WebServer(#[from] server::WebServerError),
     #[error("Recv error: {0}")]
     RecvError(#[from] RecvError),
-    #[error("IoError: {0}")]
+    #[error("Io error: {0}")]
     IoError(#[from] std::io::Error),
-    #[error("VarError: {0}")]
+    #[error("Var error: {0}")]
     VarError(#[from] std::env::VarError),
+    #[error("Quantize error: {0}")]
+    QuantizeError(String),
+    #[error("Command error: {0}")]
+    CommandError(i32),
 }
backends/llamacpp/src/quantize.rs (new file, 39 lines)
@ -0,0 +1,39 @@
+use crate::llamacpp;
+
+use std::ffi::CString;
+use std::path::Path;
+
+#[repr(u32)]
+#[derive(Debug, Clone, Copy)]
+pub enum QuantizeType {
+    MostlyQ4_0 = 2,
+}
+
+pub fn model(
+    input_path: &str,
+    output_path: &str,
+    ftype: QuantizeType,
+    n_threads: usize,
+) -> Result<(), String> {
+    if !Path::new(input_path).exists() {
+        return Err(format!("Input file '{}' does not exist", input_path));
+    }
+    let c_input_path =
+        CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?;
+
+    let c_output_path =
+        CString::new(output_path).map_err(|e| format!("Failed to convert output path: {}", e))?;
+
+    let result = unsafe {
+        let mut params = llamacpp::model_quantize_default_params();
+        params.nthread = n_threads as _;
+        params.ftype = ftype as _;
+        params.quantize_output_tensor = true;
+        llamacpp::model_quantize(c_input_path.as_ptr(), c_output_path.as_ptr(), &params)
+    };
+    if result == 0 {
+        Ok(())
+    } else {
+        Err(format!("Quantization failed, error code: {}", result))
+    }
+}
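quantize.rs relies on llamacpp::model_quantize_default_params and llamacpp::model_quantize from the bindgen-generated module included above, which wrap llama.cpp's model-quantization C API. As a rough orientation, a simplified sketch of the corresponding declarations follows; the generated struct has more fields and the exact names depend on the bindgen configuration, so this is illustrative only.

// Illustrative FFI sketch of the llama.cpp quantization API the generated
// bindings are assumed to expose (abridged; not the real generated code).
use std::os::raw::c_char;

#[repr(C)]
pub struct model_quantize_params {
    pub nthread: i32,                 // number of threads to use while quantizing
    pub ftype: u32,                   // target file type; 2 corresponds to Q4_0
    pub quantize_output_tensor: bool, // whether to quantize the output tensor as well
    // ... remaining fields of llama_model_quantize_params omitted here
}

extern "C" {
    // Returns a params struct filled with llama.cpp's defaults.
    pub fn model_quantize_default_params() -> model_quantize_params;
    // Quantizes fname_inp into fname_out; returns 0 on success.
    pub fn model_quantize(
        fname_inp: *const c_char,
        fname_out: *const c_char,
        params: *const model_quantize_params,
    ) -> u32;
}

In the real bindings the struct layout must match llama_model_quantize_params exactly, which is why the project generates the module with bindgen at build time rather than writing the declarations by hand.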