mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-06-19 15:52:08 +00:00
Cleanup
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
This commit is contained in:
parent
d6ded897a8
commit
390f0ec061
@@ -151,8 +151,6 @@ impl Llamacpp {
|
||||
LlamacppSplitMode::GPU(n) => n as _,
|
||||
_ => 0,
|
||||
};
|
||||
info!(?params.split_mode);
|
||||
info!(?params.main_gpu);
|
||||
params.use_mmap = conf.use_mmap;
|
||||
params.use_mlock = conf.use_mlock;
|
||||
bindings::llama_model_load_from_file(gguf.as_ptr(), params)
|
||||
|
@@ -37,19 +37,19 @@ struct Args {
|
||||
n_gpu_layers: usize,
|
||||
|
||||
/// Split the model across multiple GPUs.
|
||||
#[clap(default_value = "Layer", value_enum, long, env)]
|
||||
#[clap(default_value = "Layer", long, env)]
|
||||
split_mode: LlamacppSplitMode,
|
||||
|
||||
/// Defragment the KV cache if holes/size > threshold.
|
||||
#[clap(default_value = "-1.0", long, env)]
|
||||
defrag_threshold: f32,
|
||||
|
||||
#[clap(default_value = "true", long, env)]
|
||||
/// Whether to use memory mapping.
|
||||
#[clap(default_value = "true", long, env)]
|
||||
use_mmap: bool,
|
||||
|
||||
#[clap(default_value = "false", long, env)]
|
||||
/// Whether to use memory locking.
|
||||
#[clap(default_value = "false", long, env)]
|
||||
use_mlock: bool,
|
||||
|
||||
/// Enable flash attention for faster inference. (EXPERIMENTAL)
|
||||
|
Loading…
Reference in New Issue
Block a user