mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-09 03:14:53 +00:00

commit ae0c9dfb62
parent 550c85c39e

    enable VLLM_EXPONENTIAL_BUCKETING

    Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
@@ -62,6 +62,7 @@ ENV PREFIX_CACHING=0
 ENV PREFILL_CHUNKING=0
 ENV PT_HPU_LAZY_MODE=1
 ENV PT_HPU_WEIGHT_SHARING=0
+ENV VLLM_EXPONENTIAL_BUCKETING=true
 
 # Text Generation Inference base env
 ENV HF_HOME=/data \
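The added ENV line only bakes a default into the image; the flag is presumably consumed by the HPU bucketing helper returned by get_bucketing_context() rather than by TGI itself, and it can still be overridden when the container is started. A minimal sketch of how such a boolean flag might be parsed on the Python side (the parsing convention here is an assumption, not the actual vllm-hpu-extension code):

import os

# Assumed parsing of the flag set by the ENV line above; the real bucketing
# code may use a different convention for truthy values.
use_exponential_bucketing = os.getenv(
    "VLLM_EXPONENTIAL_BUCKETING", "true"
).lower() in ("1", "true", "yes")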
@@ -1603,7 +1603,11 @@ class FlashCausalLM(Model):
         self.max_batch_prefill_tokens = get_max_prefill_tokens()
         max_num_seqs = int(os.getenv("MAX_BATCH_SIZE"))
         HPUBucketingContext = get_bucketing_context()
-        max_total_tokens_aligned = math.ceil(max_total_tokens / BLOCK_SIZE) * BLOCK_SIZE
+        # need to warmup one more step since block is allocated from 1
+        block_step = os.getenv("VLLM_DECODE_BLOCK_BUCKET_STEP", BLOCK_SIZE)
+        max_total_tokens_aligned = math.ceil(
+            max_total_tokens / BLOCK_SIZE
+        ) * BLOCK_SIZE + math.ceil(block_step * BLOCK_SIZE / max_num_seqs)
         model_max_length = self.tokenizer.model_max_length
         max_position_embeddings = getattr(
             self.config, "max_position_embeddings", model_max_length
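To make the new alignment concrete, here is a minimal, self-contained sketch (not part of the commit) evaluating the old and new formulas with assumed values: BLOCK_SIZE = 128, max_total_tokens = 2048, max_num_seqs = 256, and VLLM_DECODE_BLOCK_BUCKET_STEP unset so block_step falls back to BLOCK_SIZE:

import math

BLOCK_SIZE = 128          # assumed HPU block size
max_total_tokens = 2048   # assumed max-total-tokens setting
max_num_seqs = 256        # assumed MAX_BATCH_SIZE
block_step = BLOCK_SIZE   # default when VLLM_DECODE_BLOCK_BUCKET_STEP is unset

# Old formula: round max_total_tokens up to a multiple of BLOCK_SIZE.
aligned_old = math.ceil(max_total_tokens / BLOCK_SIZE) * BLOCK_SIZE

# New formula: add one extra bucket step of headroom, since blocks are
# allocated starting from 1 and warmup must cover one more step.
aligned_new = (
    math.ceil(max_total_tokens / BLOCK_SIZE) * BLOCK_SIZE
    + math.ceil(block_step * BLOCK_SIZE / max_num_seqs)
)

print(aligned_old)  # 2048
print(aligned_new)  # 2112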
@@ -1619,8 +1623,8 @@ class FlashCausalLM(Model):
             max_input_tokens,
             max_total_tokens_aligned,
         )
-        max_blocks = (
-            max(BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE) + 1
+        max_blocks = max(
+            BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE
         )
         self.bucketing_ctx.num_hpu_blocks = min(max_blocks, num_blocks)
         synchronize(self.device)
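Continuing with the same assumed numbers, a short self-contained sketch of how the max_blocks change interacts with the alignment change: the "+ 1" extra block is dropped because the warmup headroom is now carried by max_total_tokens_aligned itself.

BLOCK_SIZE = 128
max_num_seqs = 256
aligned_old, aligned_new = 2048, 2112  # results from the sketch above

# Old: one extra block added on top of the un-padded alignment.
blocks_old = max(BLOCK_SIZE, max_num_seqs * aligned_old // BLOCK_SIZE) + 1  # 4097

# New: no "+ 1"; the padding inside aligned_new already provides the margin.
blocks_new = max(BLOCK_SIZE, max_num_seqs * aligned_new // BLOCK_SIZE)      # 4224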
@@ -1683,8 +1687,7 @@ class FlashCausalLM(Model):
             f"Using {format_bytes(graph_free_mem)}"
             f"/{format_bytes(free_mem)} "
             "of free device memory for HPUGraphs, "
-            f"{format_bytes(prompt_available_memory)} \
-                for prompt and "
+            f"{format_bytes(prompt_available_memory)} for prompt and "
             f"{format_bytes(decode_available_memory)} for decode "
             f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})"
         )
@@ -366,8 +366,7 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
             f"Using {format_bytes(graph_free_mem)}"
             f"/{format_bytes(free_mem)} "
             "of free device memory for HPUGraphs, "
-            f"{format_bytes(prompt_available_memory)} \
-                for prompt and "
+            f"{format_bytes(prompt_available_memory)} for prompt and "
             f"{format_bytes(decode_available_memory)} for decode "
             f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})"
         )
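The last two hunks are the same cleanup applied in two classes: the old log string used a backslash line continuation inside an f-string, which keeps the indentation of the next source line inside the message. A minimal sketch (with a made-up value, not the real log line) of the effect being fixed:

# A backslash continuation inside a string literal drops the newline but keeps
# the next line's leading spaces, so the old message contained a long run of
# spaces before "for prompt and".
broken = "1.00 GB \
                for prompt and "
fixed = "1.00 GB for prompt and "

print(repr(broken))  # stray indentation shows up inside the string
print(repr(fixed))   # single space, as intended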