From c18766afecc36426759d648c100b49ee39479a4e Mon Sep 17 00:00:00 2001
From: "Wang, Yi A"
Date: Sun, 18 May 2025 06:42:11 -0700
Subject: [PATCH] allocate from 1 block in router

Signed-off-by: Wang, Yi A
---
 .../server/text_generation_server/models/flash_causal_lm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
index 09a05585..eb0f7454 100644
--- a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
@@ -1610,8 +1610,8 @@ class FlashCausalLM(Model):
             max_input_tokens,
             max_total_tokens_aligned,
         )
-        max_blocks = max(
-            BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE
+        max_blocks = (
+            max(BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE) + 1
         )
         self.bucketing_ctx.num_hpu_blocks = min(max_blocks, num_blocks)
         if os.getenv("VLLM_SKIP_WARMUP", "false").lower() == "true":
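
Note on the change: the "+ 1" grows the worst-case block count by one. Read
together with the subject line, the router appears to hand out block IDs
starting at 1 rather than 0 (this rationale is inferred from the commit
subject, not stated in the diff), so the pool needs one spare block beyond
the max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE worst case. A
minimal Python sketch of the sizing arithmetic, using made-up values for
BLOCK_SIZE, max_num_seqs, max_total_tokens_aligned, and num_blocks:

    # Hypothetical values; the real ones come from the model config and warmup.
    BLOCK_SIZE = 128
    max_num_seqs = 8
    max_total_tokens_aligned = 2048  # assumed already padded to a BLOCK_SIZE multiple
    num_blocks = 1000                # stand-in for the device's measured block capacity

    # Worst case: every sequence fills its whole aligned token budget.
    # The trailing "+ 1" reserves one extra block so allocation can start
    # at block 1 (per the commit subject) without exhausting the pool.
    max_blocks = (
        max(BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE) + 1
    )

    print(max_blocks)                   # 129 with the numbers above
    print(min(max_blocks, num_blocks))  # what bucketing_ctx.num_hpu_blocks receives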