Fix cache block size for flash decoding

The flash-decoding block size (256 instead of the default 16) seems to have
been accidentally dropped during the TRT-LLM PR rebase.
This commit is contained in:
Daniël de Kok 2024-08-01 12:34:34 +00:00
parent 9ab9937414
commit 278697cf55

View File

@@ -35,9 +35,16 @@ impl BackendV3 {
window_size: Option<u32>, window_size: Option<u32>,
speculate: u32, speculate: u32,
) -> Self { ) -> Self {
let flashdecoding = if let Ok(flashdecoding) = std::env::var("FLASH_DECODING") {
matches!(flashdecoding.to_lowercase().as_str(), "1" | "true")
} else {
false
};
let block_size = if flashdecoding { 256 } else { 16 };
let queue = Queue::new( let queue = Queue::new(
requires_padding, requires_padding,
16, block_size,
window_size, window_size,
speculate, speculate,
max_batch_total_tokens, max_batch_total_tokens,