Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-19 13:52:07 +00:00)
work with the latest vllm extension ops
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
This commit is contained in: parent 4de8fb0127, commit a83e9fe003
@@ -1512,9 +1512,10 @@ class FlashCausalLM(Model):
         self.bucketing_ctx = HPUBucketingContext(
             os.getenv("DECODE_MAX_BS", 128), # self.max_num_seqs, #TODO
-            os.getenv("PREFILL_MAX_BS", 16), # self.max_num_prefill_seqs, #TODO
+            os.getenv("PREFILL_MAX_BS", 64), # self.max_num_prefill_seqs, #TODO
             BLOCK_SIZE,
             num_blocks * BLOCK_SIZE,
             False,
         )
         self.bucketing_ctx.num_hpu_blocks = num_blocks
         if os.getenv("VLLM_SKIP_WARMUP", "false").lower() == "true":