allocate from 1 block in router

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
This commit is contained in:
Wang, Yi A 2025-05-18 06:42:11 -07:00
parent becf36f5e4
commit c18766afec

View File

@@ -1610,8 +1610,8 @@ class FlashCausalLM(Model):
             max_input_tokens,
             max_total_tokens_aligned,
         )
-        max_blocks = max(
-            BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE
-        )
+        max_blocks = (
+            max(BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE) + 1
+        )
         self.bucketing_ctx.num_hpu_blocks = min(max_blocks, num_blocks)
         if os.getenv("VLLM_SKIP_WARMUP", "false").lower() == "true":