Mirror of https://github.com/huggingface/text-generation-inference.git
Fix cache block size for flash decoding
This seems to have been accidentally dropped during the TRT-LLM PR rebase.
Parent: 9ab9937414
Commit: 278697cf55
@@ -35,9 +35,16 @@ impl BackendV3 {
         window_size: Option<u32>,
         speculate: u32,
     ) -> Self {
+        let flashdecoding = if let Ok(flashdecoding) = std::env::var("FLASH_DECODING") {
+            matches!(flashdecoding.to_lowercase().as_str(), "1" | "true")
+        } else {
+            false
+        };
+        let block_size = if flashdecoding { 256 } else { 16 };
+
         let queue = Queue::new(
             requires_padding,
-            16,
+            block_size,
             window_size,
             speculate,
             max_batch_total_tokens,