diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs
index ba5ca186..bf45a67f 100644
--- a/backends/llamacpp/src/backend.rs
+++ b/backends/llamacpp/src/backend.rs
@@ -151,8 +151,6 @@ impl Llamacpp {
                 LlamacppSplitMode::GPU(n) => n as _,
                 _ => 0,
             };
-            info!(?params.split_mode);
-            info!(?params.main_gpu);
             params.use_mmap = conf.use_mmap;
             params.use_mlock = conf.use_mlock;
             bindings::llama_model_load_from_file(gguf.as_ptr(), params)
diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs
index e1edd72d..5fb23d17 100644
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@@ -37,19 +37,19 @@ struct Args {
     n_gpu_layers: usize,
 
     /// Split the model across multiple GPUs.
-    #[clap(default_value = "Layer", value_enum, long, env)]
+    #[clap(default_value = "Layer", long, env)]
     split_mode: LlamacppSplitMode,
 
     /// Defragment the KV cache if holes/size > threshold.
     #[clap(default_value = "-1.0", long, env)]
     defrag_threshold: f32,
 
-    #[clap(default_value = "true", long, env)]
     /// Whether to use memory mapping.
+    #[clap(default_value = "true", long, env)]
     use_mmap: bool,
 
-    #[clap(default_value = "false", long, env)]
     /// Whether to use memory locking.
+    #[clap(default_value = "false", long, env)]
     use_mlock: bool,
 
     /// Enable flash attention for faster inference. (EXPERIMENTAL)
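A note on the `value_enum` removal: clap's `ValueEnum` derive only supports unit variants, while the backend.rs hunk shows `LlamacppSplitMode::GPU(n)` carrying a payload, so the argument presumably parses through the type's `FromStr` implementation instead (clap's derive falls back to `FromStr` when no explicit value parser is given). The mmap/mlock hunks are purely cosmetic, moving the doc comments ahead of the `#[clap(...)]` attributes so every field follows the same ordering. Below is a minimal sketch of the `FromStr` fallback; `Layer` and `GPU(n)` appear in the diff, but the `Row` variant and the exact parsing rules (lowercase names, bare integer selects a GPU) are illustrative assumptions, not taken from the PR.

```rust
use std::str::FromStr;

/// Illustrative stand-in for the backend's split-mode enum. `Layer` and
/// `GPU(n)` appear in the diff; `Row` is an assumed variant.
#[derive(Debug, Clone, Copy, PartialEq)]
enum LlamacppSplitMode {
    Layer,
    Row,
    GPU(usize),
}

impl FromStr for LlamacppSplitMode {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s.to_ascii_lowercase().as_str() {
            "layer" => Ok(Self::Layer),
            "row" => Ok(Self::Row),
            // A bare integer selects a single GPU: a data-carrying case
            // that the `ValueEnum` derive cannot express.
            other => other
                .parse::<usize>()
                .map(Self::GPU)
                .map_err(|_| format!("invalid split mode: {s}")),
        }
    }
}

fn main() {
    assert_eq!("Layer".parse::<LlamacppSplitMode>(), Ok(LlamacppSplitMode::Layer));
    assert_eq!("3".parse::<LlamacppSplitMode>(), Ok(LlamacppSplitMode::GPU(3)));
    assert!("bogus".parse::<LlamacppSplitMode>().is_err());
}
```

With a `FromStr` impl like this in scope, `#[clap(default_value = "Layer", long, env)]` keeps working after `value_enum` is dropped, which would explain why the flag could be removed without touching the field's type or default.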