From c18766afecc36426759d648c100b49ee39479a4e Mon Sep 17 00:00:00 2001
From: "Wang, Yi A"
Date: Sun, 18 May 2025 06:42:11 -0700
Subject: [PATCH] allocate from 1 block in router

Signed-off-by: Wang, Yi A
---
 .../server/text_generation_server/models/flash_causal_lm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
index 09a05585..eb0f7454 100644
--- a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
@@ -1610,8 +1610,8 @@ class FlashCausalLM(Model):
             max_input_tokens,
             max_total_tokens_aligned,
         )
-        max_blocks = max(
-            BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE
+        max_blocks = (
+            max(BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE) + 1
         )
         self.bucketing_ctx.num_hpu_blocks = min(max_blocks, num_blocks)
         if os.getenv("VLLM_SKIP_WARMUP", "false").lower() == "true":
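
Note on the change: the "+ 1" grows the worst-case block count by one. Read
together with the subject line, the router appears to hand out block IDs
starting at 1 rather than 0 (this rationale is inferred from the commit
subject, not stated in the diff), so the pool needs one spare block beyond
the max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE worst case. A
minimal Python sketch of the sizing arithmetic, using made-up values for
BLOCK_SIZE, max_num_seqs, max_total_tokens_aligned, and num_blocks:

    # Hypothetical values; the real ones come from the model config and warmup.
    BLOCK_SIZE = 128
    max_num_seqs = 8
    max_total_tokens_aligned = 2048  # assumed already padded to a BLOCK_SIZE multiple
    num_blocks = 1000                # stand-in for the device's measured block capacity

    # Worst case: every sequence fills its whole aligned token budget.
    # The trailing "+ 1" reserves one extra block so allocation can start
    # at block 1 (per the commit subject) without exhausting the pool.
    max_blocks = (
        max(BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE) + 1
    )

    print(max_blocks)                   # 129 with the numbers above
    print(min(max_blocks, num_blocks))  # what bucketing_ctx.num_hpu_blocks receives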