mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-28 05:22:07 +00:00
* Build faster Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Make --model-gguf optional Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Bump llama.cpp Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Enable mmap, offload_kqv & flash_attention by default Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Update doc Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Better error message Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Update doc Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Update installed packages Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Save gguf in models/MODEL_ID/model.gguf Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Fix build with Mach-O Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Quantize without llama-quantize Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Bump llama.cpp and switch to ggml-org Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Remove make-gguf.sh Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Update Cargo.lock Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Support HF_HUB_USER_AGENT_ORIGIN Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Bump llama.cpp Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Add --build-arg llamacpp_native & llamacpp_cpu_arm_arch Signed-off-by: Adrien Gallouët <angt@huggingface.co> --------- Signed-off-by: Adrien Gallouët <angt@huggingface.co>
36 lines
946 B
Rust
36 lines
946 B
Rust
use crate::llamacpp;
|
|
|
|
use std::ffi::CString;
|
|
|
|
/// Target quantization formats accepted by `llamacpp::model_quantize`.
///
/// `#[repr(u32)]` fixes the discriminant width so a variant can be cast
/// directly into the FFI `ftype` field (see `model` below).
#[repr(u32)]
#[derive(Debug, Clone, Copy)]
pub enum QuantizeType {
    // NOTE(review): the value 2 presumably mirrors llama.cpp's
    // `LLAMA_FTYPE_MOSTLY_Q4_0` encoding — confirm against the bindings.
    MostlyQ4_0 = 2,
}
|
|
|
|
pub fn model(
|
|
input_path: &str,
|
|
output_path: &str,
|
|
ftype: QuantizeType,
|
|
n_threads: usize,
|
|
) -> Result<(), String> {
|
|
let c_input_path =
|
|
CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?;
|
|
|
|
let c_output_path =
|
|
CString::new(output_path).map_err(|e| format!("Failed to convert output path: {}", e))?;
|
|
|
|
let result = unsafe {
|
|
let mut params = llamacpp::model_quantize_default_params();
|
|
params.nthread = n_threads as _;
|
|
params.ftype = ftype as _;
|
|
params.quantize_output_tensor = true;
|
|
llamacpp::model_quantize(c_input_path.as_ptr(), c_output_path.as_ptr(), ¶ms)
|
|
};
|
|
if result == 0 {
|
|
Ok(())
|
|
} else {
|
|
Err(format!("Quantization failed, error code: {}", result))
|
|
}
|
|
}
|