mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-06-19 15:52:08 +00:00
Add --offload-kqv
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
This commit is contained in:
parent
ae5bb789c2
commit
e88a527fcf
@ -65,6 +65,7 @@ pub struct LlamacppConfig {
|
||||
pub defrag_threshold: f32,
|
||||
pub use_mmap: bool,
|
||||
pub use_mlock: bool,
|
||||
pub offload_kqv: bool,
|
||||
pub flash_attention: bool,
|
||||
}
|
||||
|
||||
@ -177,6 +178,7 @@ impl Llamacpp {
|
||||
params.n_threads = conf.n_threads as _;
|
||||
params.n_threads_batch = conf.n_threads as _; // TODO ?
|
||||
params.defrag_thold = conf.defrag_threshold;
|
||||
params.offload_kqv = conf.offload_kqv;
|
||||
params.flash_attn = conf.flash_attention;
|
||||
params.no_perf = true;
|
||||
bindings::llama_init_from_model(model, params)
|
||||
|
@ -56,6 +56,10 @@ struct Args {
|
||||
#[clap(default_value = "false", long, env)]
|
||||
use_mlock: bool,
|
||||
|
||||
/// Enable offloading of KQV operations to the GPU.
|
||||
#[clap(default_value = "false", long, env)]
|
||||
offload_kqv: bool,
|
||||
|
||||
/// Enable flash attention for faster inference. (EXPERIMENTAL)
|
||||
#[clap(default_value = "true", long, env)]
|
||||
flash_attention: bool,
|
||||
@ -201,6 +205,7 @@ async fn main() -> Result<(), RouterError> {
|
||||
use_mmap: args.use_mmap,
|
||||
use_mlock: args.use_mlock,
|
||||
flash_attention: args.flash_attention,
|
||||
offload_kqv: args.offload_kqv,
|
||||
max_batch_total_tokens: args.max_batch_total_tokens,
|
||||
max_batch_size: args.max_batch_size,
|
||||
batch_timeout: tokio::time::Duration::from_millis(5),
|
||||
|
Loading…
Reference in New Issue
Block a user