text-generation-inference/backends/llamacpp/src/quantize.rs

use crate::llamacpp;
use std::ffi::CString;

#[repr(u32)]
#[derive(Debug, Clone, Copy)]
pub enum QuantizeType {
    // Matches LLAMA_FTYPE_MOSTLY_Q4_0 in llama.cpp.
    MostlyQ4_0 = 2,
}

/// Quantize the GGUF model at `input_path`, writing the result to
/// `output_path` with the requested `ftype`, using `n_threads` worker threads.
pub fn model(
    input_path: &str,
    output_path: &str,
    ftype: QuantizeType,
    n_threads: usize,
) -> Result<(), String> {
    let c_input_path =
        CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?;
    let c_output_path =
        CString::new(output_path).map_err(|e| format!("Failed to convert output path: {}", e))?;
    let result = unsafe {
        // Start from llama.cpp's default quantization parameters, then
        // override the thread count and target file type before calling
        // into the C API.
        let mut params = llamacpp::model_quantize_default_params();
        params.nthread = n_threads as _;
        params.ftype = ftype as _;
        params.quantize_output_tensor = true;
        llamacpp::model_quantize(c_input_path.as_ptr(), c_output_path.as_ptr(), &params)
    };
    // llama.cpp reports success with a zero return code.
    if result == 0 {
        Ok(())
    } else {
        Err(format!("Quantization failed, error code: {}", result))
    }
}
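
// A minimal usage sketch (illustrative only, not part of the upstream file):
// quantize a full-precision GGUF model down to Q4_0 using every available
// core. The paths are hypothetical placeholders, and the `quantize::` prefix
// assumes a caller in a sibling module of this crate.
//
// let n_threads = std::thread::available_parallelism()
//     .map(|n| n.get())
//     .unwrap_or(1);
// quantize::model(
//     "models/MODEL_ID/model.gguf",
//     "models/MODEL_ID/model.q4_0.gguf",
//     quantize::QuantizeType::MostlyQ4_0,
//     n_threads,
// )?;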