Allocate from block 1 in router

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
2025-09-09 19:34:53 +00:00 · 2025-05-18 06:42:11 -07:00 · 2025-05-18 06:42:11 -07:00 · c18766afec
commit c18766afec
parent becf36f5e4
1 changed file with 2 additions and 2 deletions
--- a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
@@ -1610,8 +1610,8 @@ class FlashCausalLM(Model):
            max_input_tokens,
            max_total_tokens_aligned,
        )
-        max_blocks = max(
+        max_blocks = (
-            BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE
+            max(BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE) + 1
        )
        self.bucketing_ctx.num_hpu_blocks = min(max_blocks, num_blocks)
        if os.getenv("VLLM_SKIP_WARMUP", "false").lower() == "true":