Enable mmap, offload_kqv & flash_attention by default

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
This commit is contained in:
Adrien Gallouët 2025-02-14 13:18:28 +00:00
parent 46bc8e6bc7
commit 30cd3cf510
No known key found for this signature in database

View File

@@ -54,7 +54,7 @@ struct Args {
/// Use memory mapping for the model. /// Use memory mapping for the model.
#[clap(long, env)] #[clap(long, env)]
use_mmap: bool, disable_mmap: bool,
/// Use memory locking to prevent swapping. /// Use memory locking to prevent swapping.
#[clap(long, env)] #[clap(long, env)]
@@ -62,11 +62,11 @@ struct Args {
/// Enable offloading of KQV operations to the GPU. /// Enable offloading of KQV operations to the GPU.
#[clap(long, env)] #[clap(long, env)]
offload_kqv: bool, disable_offload_kqv: bool,
/// Enable flash attention for faster inference. (EXPERIMENTAL) /// Enable flash attention for faster inference. (EXPERIMENTAL)
#[clap(long, env)] #[clap(long, env)]
flash_attention: bool, disable_flash_attention: bool,
/// Data type used for K cache. /// Data type used for K cache.
#[clap(default_value = "f16", value_enum, long, env)] #[clap(default_value = "f16", value_enum, long, env)]
@@ -245,12 +245,12 @@ async fn main() -> Result<(), RouterError> {
split_mode: args.split_mode, split_mode: args.split_mode,
defrag_threshold: args.defrag_threshold, defrag_threshold: args.defrag_threshold,
numa: args.numa, numa: args.numa,
use_mmap: args.use_mmap, use_mmap: !args.disable_mmap,
use_mlock: args.use_mlock, use_mlock: args.use_mlock,
flash_attention: args.flash_attention, flash_attention: !args.disable_flash_attention,
type_k: args.type_k, type_k: args.type_k,
type_v: args.type_v, type_v: args.type_v,
offload_kqv: args.offload_kqv, offload_kqv: !args.disable_offload_kqv,
max_batch_total_tokens, max_batch_total_tokens,
max_physical_batch_total_tokens, max_physical_batch_total_tokens,
max_batch_size, max_batch_size,