diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 753138f9..e8aa579f 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -60,7 +60,7 @@ struct Args { use_mlock: bool, /// Enable offloading of KQV operations to the GPU. - #[clap(default_value = "false", long, env)] + #[clap(default_value = "true", long, env)] offload_kqv: bool, /// Enable flash attention for faster inference. (EXPERIMENTAL)