mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-09 19:34:53 +00:00
allocate from 1 block in router
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
This commit is contained in:
parent
becf36f5e4
commit
c18766afec
@@ -1610,8 +1610,8 @@ class FlashCausalLM(Model):
|
|||||||
max_input_tokens,
|
max_input_tokens,
|
||||||
max_total_tokens_aligned,
|
max_total_tokens_aligned,
|
||||||
)
|
)
|
||||||
max_blocks = max(
|
max_blocks = (
|
||||||
BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE
|
max(BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE) + 1
|
||||||
)
|
)
|
||||||
self.bucketing_ctx.num_hpu_blocks = min(max_blocks, num_blocks)
|
self.bucketing_ctx.num_hpu_blocks = min(max_blocks, num_blocks)
|
||||||
if os.getenv("VLLM_SKIP_WARMUP", "false").lower() == "true":
|
if os.getenv("VLLM_SKIP_WARMUP", "false").lower() == "true":
|
||||||
|
Loading…
Reference in New Issue
Block a user