Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-12 12:54:52 +00:00)
Fix cache block size for flash decoding
This seems to have been accidentally dropped during the TRT-LLM PR rebase.
parent 9ab9937414
commit 278697cf55
@@ -35,9 +35,16 @@ impl BackendV3 {
         window_size: Option<u32>,
         speculate: u32,
     ) -> Self {
+        let flashdecoding = if let Ok(flashdecoding) = std::env::var("FLASH_DECODING") {
+            matches!(flashdecoding.to_lowercase().as_str(), "1" | "true")
+        } else {
+            false
+        };
+        let block_size = if flashdecoding { 256 } else { 16 };
+
         let queue = Queue::new(
             requires_padding,
-            16,
+            block_size,
             window_size,
             speculate,
             max_batch_total_tokens,
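
For context, a minimal standalone sketch of the block-size selection this patch restores: flash decoding uses 256-token KV-cache blocks, while the default path keeps 16. The cache_block_size helper and main function below are illustrative only, not part of the repository.

use std::env;

// Selects the KV-cache block size based on the FLASH_DECODING environment
// variable; "1" or "true" (case-insensitive) enables flash decoding.
fn cache_block_size() -> u32 {
    let flashdecoding = env::var("FLASH_DECODING")
        .map(|v| matches!(v.to_lowercase().as_str(), "1" | "true"))
        .unwrap_or(false);

    // Flash decoding works on larger cache blocks than regular paged attention.
    if flashdecoding { 256 } else { 16 }
}

fn main() {
    println!("KV-cache block size: {}", cache_block_size());
}

In the actual backend, the computed block_size is passed to Queue::new in place of the previously hard-coded 16, as the diff above shows.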