diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs
index a8edc081..5a07acdc 100644
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@@ -52,19 +52,19 @@ struct Args {
     numa: LlamacppNuma,
 
     /// Use memory mapping for the model.
-    #[clap(default_value = "true", long, env)]
+    #[clap(long, env)]
     use_mmap: bool,
 
     /// Use memory locking to prevent swapping.
-    #[clap(default_value = "false", long, env)]
+    #[clap(long, env)]
     use_mlock: bool,
 
     /// Enable offloading of KQV operations to the GPU.
-    #[clap(default_value = "true", long, env)]
+    #[clap(long, env)]
     offload_kqv: bool,
 
     /// Enable flash attention for faster inference. (EXPERIMENTAL)
-    #[clap(default_value = "true", long, env)]
+    #[clap(long, env)]
     flash_attention: bool,
 
     /// Data type used for K cache.
@@ -132,7 +132,7 @@ struct Args {
     tokenizer_config_path: Option,
 
     /// Disable grammar support.
-    #[clap(long, env, default_value_t = false)]
+    #[clap(long, env)]
     disable_grammar_support: bool,
 
     /// Maximum number of inputs per request.