From 278697cf5570333c9ed43513d7cf9d6e1922ed34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Thu, 1 Aug 2024 12:34:34 +0000 Subject: [PATCH] Fix cache block size for flash decoding This seems to have been accidentally dropped during the TRT-LLM PR rebase. --- backends/v3/src/backend.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/backends/v3/src/backend.rs b/backends/v3/src/backend.rs index 49e2bc8f..d82355de 100644 --- a/backends/v3/src/backend.rs +++ b/backends/v3/src/backend.rs @@ -35,9 +35,16 @@ impl BackendV3 { window_size: Option<u32>, speculate: u32, ) -> Self { + let flashdecoding = if let Ok(flashdecoding) = std::env::var("FLASH_DECODING") { + matches!(flashdecoding.to_lowercase().as_str(), "1" | "true") + } else { + false + }; + let block_size = if flashdecoding { 256 } else { 16 }; + let queue = Queue::new( requires_padding, - 16, + block_size, window_size, speculate, max_batch_total_tokens,