From 30cd3cf510714f293d65844f990d7d0c76ff232c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 14 Feb 2025 13:18:28 +0000 Subject: [PATCH] Enable mmap, offload_kqv & flash_attention by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/main.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index fe22c1d7b..604d9ddc4 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -54,7 +54,7 @@ struct Args { /// Use memory mapping for the model. #[clap(long, env)] - use_mmap: bool, + disable_mmap: bool, /// Use memory locking to prevent swapping. #[clap(long, env)] @@ -62,11 +62,11 @@ struct Args { /// Enable offloading of KQV operations to the GPU. #[clap(long, env)] - offload_kqv: bool, + disable_offload_kqv: bool, /// Enable flash attention for faster inference. (EXPERIMENTAL) #[clap(long, env)] - flash_attention: bool, + disable_flash_attention: bool, /// Data type used for K cache. #[clap(default_value = "f16", value_enum, long, env)] @@ -245,12 +245,12 @@ async fn main() -> Result<(), RouterError> { split_mode: args.split_mode, defrag_threshold: args.defrag_threshold, numa: args.numa, - use_mmap: args.use_mmap, + use_mmap: !args.disable_mmap, use_mlock: args.use_mlock, - flash_attention: args.flash_attention, + flash_attention: !args.disable_flash_attention, type_k: args.type_k, type_v: args.type_v, - offload_kqv: args.offload_kqv, + offload_kqv: !args.disable_offload_kqv, max_batch_total_tokens, max_physical_batch_total_tokens, max_batch_size,