Add --numa

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
This commit is contained in:
Adrien Gallouët 2025-01-31 15:09:29 +00:00
parent 390f0ec061
commit 7a3ed4171e
No known key found for this signature in database
2 changed files with 23 additions and 2 deletions

View File

@ -43,6 +43,15 @@ impl FromStr for LlamacppSplitMode {
} }
} }
/// NUMA optimization strategy passed through to `llama_numa_init`.
///
/// Selected on the CLI via `--numa` (parsed by clap's `ValueEnum`); each
/// variant maps 1:1 onto a `bindings::GGML_NUMA_STRATEGY_*` constant in the
/// backend init code.
#[derive(Debug, Clone, Copy, clap::ValueEnum)]
pub enum LlamacppNuma {
    /// No NUMA handling (GGML_NUMA_STRATEGY_DISABLED).
    Disabled,
    /// Spread execution evenly over all nodes (GGML_NUMA_STRATEGY_DISTRIBUTE).
    Distribute,
    /// Only spawn threads on CPUs on the node that execution started on
    /// (GGML_NUMA_STRATEGY_ISOLATE).
    Isolate,
    /// Use the CPU map provided by numactl (GGML_NUMA_STRATEGY_NUMACTL).
    Numactl,
    /// Mirror model data across NUMA nodes (GGML_NUMA_STRATEGY_MIRROR).
    Mirror,
}
pub struct LlamacppConfig { pub struct LlamacppConfig {
pub model_gguf: String, pub model_gguf: String,
pub n_ctx: usize, pub n_ctx: usize,
@ -52,6 +61,7 @@ pub struct LlamacppConfig {
pub n_threads: usize, pub n_threads: usize,
pub n_gpu_layers: usize, pub n_gpu_layers: usize,
pub split_mode: LlamacppSplitMode, pub split_mode: LlamacppSplitMode,
pub numa: LlamacppNuma,
pub defrag_threshold: f32, pub defrag_threshold: f32,
pub use_mmap: bool, pub use_mmap: bool,
pub use_mlock: bool, pub use_mlock: bool,
@ -387,7 +397,13 @@ impl LlamacppBackend {
INIT.call_once(|| unsafe { INIT.call_once(|| unsafe {
bindings::llama_log_set(Some(llamacpp_log_callback), std::ptr::null_mut()); bindings::llama_log_set(Some(llamacpp_log_callback), std::ptr::null_mut());
bindings::llama_backend_init(); bindings::llama_backend_init();
bindings::llama_numa_init(bindings::GGML_NUMA_STRATEGY_NUMACTL); // TODO add option & test bindings::llama_numa_init(match conf.numa {
LlamacppNuma::Disabled => bindings::GGML_NUMA_STRATEGY_DISABLED,
LlamacppNuma::Distribute => bindings::GGML_NUMA_STRATEGY_DISTRIBUTE,
LlamacppNuma::Isolate => bindings::GGML_NUMA_STRATEGY_ISOLATE,
LlamacppNuma::Numactl => bindings::GGML_NUMA_STRATEGY_NUMACTL,
LlamacppNuma::Mirror => bindings::GGML_NUMA_STRATEGY_MIRROR,
});
}); });
let (status_tx, status_rx) = watch::channel(false); let (status_tx, status_rx) = watch::channel(false);

View File

@ -1,6 +1,6 @@
mod backend; mod backend;
use backend::{LlamacppSplitMode, LlamacppConfig, LlamacppBackend, BackendError}; use backend::{LlamacppNuma, LlamacppSplitMode, LlamacppConfig, LlamacppBackend, BackendError};
use clap::{Parser}; use clap::{Parser};
use text_generation_router::{logging, server, usage_stats}; use text_generation_router::{logging, server, usage_stats};
use thiserror::Error; use thiserror::Error;
@ -44,6 +44,10 @@ struct Args {
#[clap(default_value = "-1.0", long, env)] #[clap(default_value = "-1.0", long, env)]
defrag_threshold: f32, defrag_threshold: f32,
/// Setup NUMA optimizations.
#[clap(default_value = "Disabled", value_enum, long, env)]
numa: LlamacppNuma,
/// Whether to use memory mapping. /// Whether to use memory mapping.
#[clap(default_value = "true", long, env)] #[clap(default_value = "true", long, env)]
use_mmap: bool, use_mmap: bool,
@ -193,6 +197,7 @@ async fn main() -> Result<(), RouterError> {
n_gpu_layers: args.n_gpu_layers, n_gpu_layers: args.n_gpu_layers,
split_mode: args.split_mode, split_mode: args.split_mode,
defrag_threshold: args.defrag_threshold, defrag_threshold: args.defrag_threshold,
numa: args.numa,
use_mmap: args.use_mmap, use_mmap: args.use_mmap,
use_mlock: args.use_mlock, use_mlock: args.use_mlock,
flash_attention: args.flash_attention, flash_attention: args.flash_attention,