Enable KQV offload by default

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2025-09-11 04:14:52 +00:00 · 2025-02-06 18:33:30 +00:00 · 2025-02-06 18:33:30 +00:00 · 3b1b049b32
commit 3b1b049b32
parent 809e288b5a
1 changed files with 1 additions and 1 deletions
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@@ -60,7 +60,7 @@ struct Args {
    use_mlock: bool,

    /// Enable offloading of KQV operations to the GPU.
-    #[clap(default_value = "false", long, env)]
+    #[clap(default_value = "true", long, env)]
    offload_kqv: bool,

    /// Enable flash attention for faster inference. (EXPERIMENTAL)