mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-06-19 15:52:08 +00:00
Add --defrag-threshold
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
This commit is contained in:
parent
f388747985
commit
e07835c5b5
@ -52,6 +52,7 @@ pub struct LlamacppConfig {
|
|||||||
pub n_threads: usize,
|
pub n_threads: usize,
|
||||||
pub n_gpu_layers: usize,
|
pub n_gpu_layers: usize,
|
||||||
pub split_mode: LlamacppSplitMode,
|
pub split_mode: LlamacppSplitMode,
|
||||||
|
pub defrag_threshold: f32,
|
||||||
pub use_mmap: bool,
|
pub use_mmap: bool,
|
||||||
pub use_mlock: bool,
|
pub use_mlock: bool,
|
||||||
pub flash_attention: bool,
|
pub flash_attention: bool,
|
||||||
@ -167,6 +168,7 @@ impl Llamacpp {
|
|||||||
params.n_seq_max = conf.max_batch_size as _;
|
params.n_seq_max = conf.max_batch_size as _;
|
||||||
params.n_threads = conf.n_threads as _;
|
params.n_threads = conf.n_threads as _;
|
||||||
params.n_threads_batch = conf.n_threads as _; // TODO ?
|
params.n_threads_batch = conf.n_threads as _; // TODO ?
|
||||||
|
params.defrag_thold = conf.defrag_threshold;
|
||||||
params.flash_attn = conf.flash_attention;
|
params.flash_attn = conf.flash_attention;
|
||||||
params.no_perf = true;
|
params.no_perf = true;
|
||||||
bindings::llama_init_from_model(model, params)
|
bindings::llama_init_from_model(model, params)
|
||||||
|
@ -40,6 +40,10 @@ struct Args {
|
|||||||
#[clap(default_value = "Layer", value_enum, long, env)]
|
#[clap(default_value = "Layer", value_enum, long, env)]
|
||||||
split_mode: LlamacppSplitMode,
|
split_mode: LlamacppSplitMode,
|
||||||
|
|
||||||
|
/// Defragment the KV cache if holes/size > threshold.
|
||||||
|
#[clap(default_value = "-1.0", long, env)]
|
||||||
|
defrag_threshold: f32,
|
||||||
|
|
||||||
#[clap(default_value = "true", long, env)]
|
#[clap(default_value = "true", long, env)]
|
||||||
/// Whether to use memory mapping.
|
/// Whether to use memory mapping.
|
||||||
use_mmap: bool,
|
use_mmap: bool,
|
||||||
@ -188,6 +192,7 @@ async fn main() -> Result<(), RouterError> {
|
|||||||
n_threads: args.n_threads,
|
n_threads: args.n_threads,
|
||||||
n_gpu_layers: args.n_gpu_layers,
|
n_gpu_layers: args.n_gpu_layers,
|
||||||
split_mode: args.split_mode,
|
split_mode: args.split_mode,
|
||||||
|
defrag_threshold: args.defrag_threshold,
|
||||||
use_mmap: args.use_mmap,
|
use_mmap: args.use_mmap,
|
||||||
use_mlock: args.use_mlock,
|
use_mlock: args.use_mlock,
|
||||||
flash_attention: args.flash_attention,
|
flash_attention: args.flash_attention,
|
||||||
|
Loading…
Reference in New Issue
Block a user