work with the latest vllm extension ops

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Wang, Yi A 2025-04-10 19:56:58 -07:00
parent 4de8fb0127
commit a83e9fe003
@@ -1512,9 +1512,10 @@ class FlashCausalLM(Model):
         self.bucketing_ctx = HPUBucketingContext(
             os.getenv("DECODE_MAX_BS", 128),  # self.max_num_seqs, #TODO
-            os.getenv("PREFILL_MAX_BS", 16),  # self.max_num_prefill_seqs, #TODO
+            os.getenv("PREFILL_MAX_BS", 64),  # self.max_num_prefill_seqs, #TODO
             BLOCK_SIZE,
             num_blocks * BLOCK_SIZE,
+            False,
         )
         self.bucketing_ctx.num_hpu_blocks = num_blocks
         if os.getenv("VLLM_SKIP_WARMUP", "false").lower() == "true":
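For context, below is a minimal standalone sketch of the updated constructor call as it appears after this commit. The import path, the placeholder values for BLOCK_SIZE and num_blocks, and the meaning of the new trailing boolean are assumptions not confirmed by the diff; the int() casts are added here only so the sketch runs when the environment variables are set.

# Minimal sketch, assuming HPUBucketingContext is exposed by the
# vllm-hpu-extension package under vllm_hpu_extension.bucketing.
import os

from vllm_hpu_extension.bucketing import HPUBucketingContext  # import path assumed

BLOCK_SIZE = 128   # placeholder; the real value comes from the model/backend config
num_blocks = 1024  # placeholder; computed from available HPU memory at warmup time

bucketing_ctx = HPUBucketingContext(
    int(os.getenv("DECODE_MAX_BS", 128)),   # max decode batch size
    int(os.getenv("PREFILL_MAX_BS", 64)),   # max prefill batch size (raised from 16)
    BLOCK_SIZE,
    num_blocks * BLOCK_SIZE,                # total token capacity across blocks
    False,                                  # new trailing flag required by the latest extension ops
)
bucketing_ctx.num_hpu_blocks = num_blocks

The only behavioral changes in the commit itself are the larger PREFILL_MAX_BS default (16 to 64) and the extra False argument that the newer vllm extension ops expect.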