Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-19 13:52:07 +00:00)
work with the latest vllm extension ops
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
This commit is contained in: parent 4de8fb0127, commit a83e9fe003
@@ -1512,9 +1512,10 @@ class FlashCausalLM(Model):
         self.bucketing_ctx = HPUBucketingContext(
             os.getenv("DECODE_MAX_BS", 128), # self.max_num_seqs, #TODO
-            os.getenv("PREFILL_MAX_BS", 16), # self.max_num_prefill_seqs, #TODO
+            os.getenv("PREFILL_MAX_BS", 64), # self.max_num_prefill_seqs, #TODO
             BLOCK_SIZE,
             num_blocks * BLOCK_SIZE,
             False,
         )
         self.bucketing_ctx.num_hpu_blocks = num_blocks
         if os.getenv("VLLM_SKIP_WARMUP", "false").lower() == "true":