diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs
index d7bc31de..53f2c098 100644
--- a/backends/llamacpp/src/backend.rs
+++ b/backends/llamacpp/src/backend.rs
@@ -52,6 +52,7 @@ pub struct LlamacppConfig {
     pub n_threads: usize,
     pub n_gpu_layers: usize,
     pub split_mode: LlamacppSplitMode,
+    pub defrag_threshold: f32,
     pub use_mmap: bool,
     pub use_mlock: bool,
     pub flash_attention: bool,
@@ -167,6 +168,7 @@ impl Llamacpp {
         params.n_seq_max = conf.max_batch_size as _;
         params.n_threads = conf.n_threads as _;
         params.n_threads_batch = conf.n_threads as _; // TODO ?
+        params.defrag_thold = conf.defrag_threshold;
         params.flash_attn = conf.flash_attention;
         params.no_perf = true;
         bindings::llama_init_from_model(model, params)
diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs
index fe7c1cd1..53a83aa1 100644
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@@ -40,6 +40,10 @@ struct Args {
     #[clap(default_value = "Layer", value_enum, long, env)]
     split_mode: LlamacppSplitMode,
 
+    /// Defragment the KV cache if holes/size > threshold.
+    #[clap(default_value = "-1.0", long, env)]
+    defrag_threshold: f32,
+
     #[clap(default_value = "true", long, env)]
     /// Whether to use memory mapping.
     use_mmap: bool,
@@ -188,6 +192,7 @@ async fn main() -> Result<(), RouterError> {
         n_threads: args.n_threads,
         n_gpu_layers: args.n_gpu_layers,
         split_mode: args.split_mode,
+        defrag_threshold: args.defrag_threshold,
         use_mmap: args.use_mmap,
         use_mlock: args.use_mlock,
         flash_attention: args.flash_attention,
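
Note on semantics (not part of the diff): the flag's doc comment mirrors llama.cpp's `defrag_thold` context parameter, which triggers KV-cache defragmentation once the ratio of holes to cache size exceeds the threshold; a negative value (hence the default of -1.0) disables it. With clap's `long, env` attributes, the field is exposed as `--defrag-threshold` on the command line and as the `DEFRAG_THRESHOLD` environment variable. Below is a minimal Rust sketch of the triggering policy, assuming llama.cpp's documented behaviour; `should_defrag` is a hypothetical helper for illustration, not code from this change:

/// Sketch of the policy llama.cpp applies with `defrag_thold`
/// (assumption: "defragment if holes/size > thold; negative disables").
/// Hypothetical helper, not part of this PR.
fn should_defrag(holes: usize, size: usize, threshold: f32) -> bool {
    if threshold < 0.0 {
        return false; // negative threshold (the default -1.0) disables defrag
    }
    size > 0 && (holes as f32 / size as f32) > threshold
}

fn main() {
    // With the diff's default of -1.0, defragmentation never triggers.
    assert!(!should_defrag(512, 1024, -1.0));
    // With a threshold of 0.1, a cache that is 50% holes would be defragmented.
    assert!(should_defrag(512, 1024, 0.1));
}

Defaulting to -1.0 preserves the previous behaviour: a negative threshold can never be exceeded by a non-negative holes/size ratio, so defragmentation stays off unless the operator opts in.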