Add --defrag-threshold

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
This commit is contained in:
Adrien Gallouët 2025-01-31 10:38:34 +00:00
parent f388747985
commit e07835c5b5
No known key found for this signature in database
2 changed files with 7 additions and 0 deletions

View File

@@ -52,6 +52,7 @@ pub struct LlamacppConfig {
pub n_threads: usize, pub n_threads: usize,
pub n_gpu_layers: usize, pub n_gpu_layers: usize,
pub split_mode: LlamacppSplitMode, pub split_mode: LlamacppSplitMode,
pub defrag_threshold: f32,
pub use_mmap: bool, pub use_mmap: bool,
pub use_mlock: bool, pub use_mlock: bool,
pub flash_attention: bool, pub flash_attention: bool,
@@ -167,6 +168,7 @@ impl Llamacpp {
params.n_seq_max = conf.max_batch_size as _; params.n_seq_max = conf.max_batch_size as _;
params.n_threads = conf.n_threads as _; params.n_threads = conf.n_threads as _;
params.n_threads_batch = conf.n_threads as _; // TODO ? params.n_threads_batch = conf.n_threads as _; // TODO ?
params.defrag_thold = conf.defrag_threshold;
params.flash_attn = conf.flash_attention; params.flash_attn = conf.flash_attention;
params.no_perf = true; params.no_perf = true;
bindings::llama_init_from_model(model, params) bindings::llama_init_from_model(model, params)

View File

@@ -40,6 +40,10 @@ struct Args {
#[clap(default_value = "Layer", value_enum, long, env)] #[clap(default_value = "Layer", value_enum, long, env)]
split_mode: LlamacppSplitMode, split_mode: LlamacppSplitMode,
/// Defragment the KV cache if holes/size > threshold.
#[clap(default_value = "-1.0", long, env)]
defrag_threshold: f32,
#[clap(default_value = "true", long, env)] #[clap(default_value = "true", long, env)]
/// Whether to use memory mapping. /// Whether to use memory mapping.
use_mmap: bool, use_mmap: bool,
@@ -188,6 +192,7 @@ async fn main() -> Result<(), RouterError> {
n_threads: args.n_threads, n_threads: args.n_threads,
n_gpu_layers: args.n_gpu_layers, n_gpu_layers: args.n_gpu_layers,
split_mode: args.split_mode, split_mode: args.split_mode,
defrag_threshold: args.defrag_threshold,
use_mmap: args.use_mmap, use_mmap: args.use_mmap,
use_mlock: args.use_mlock, use_mlock: args.use_mlock,
flash_attention: args.flash_attention, flash_attention: args.flash_attention,