From ae0c9dfb6243fec86466a3ad306771cdc4fee286 Mon Sep 17 00:00:00 2001
From: "Wang, Yi A"
Date: Sun, 18 May 2025 19:56:11 -0700
Subject: [PATCH] enable VLLM_EXPONENTIAL_BUCKETING

Signed-off-by: Wang, Yi A
---
 Dockerfile_gaudi               |  1 +
 .../models/flash_causal_lm.py  | 13 ++++++++-----
 .../models/mllama_causal_lm.py |  3 +--
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/Dockerfile_gaudi b/Dockerfile_gaudi
index 54a0bb7c..e6c40d68 100644
--- a/Dockerfile_gaudi
+++ b/Dockerfile_gaudi
@@ -62,6 +62,7 @@ ENV PREFIX_CACHING=0
 ENV PREFILL_CHUNKING=0
 ENV PT_HPU_LAZY_MODE=1
 ENV PT_HPU_WEIGHT_SHARING=0
+ENV VLLM_EXPONENTIAL_BUCKETING=true
 
 # Text Generation Inference base env
 ENV HF_HOME=/data \
diff --git a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
index c617d33a..bc0d240e 100644
--- a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
@@ -1603,7 +1603,11 @@ class FlashCausalLM(Model):
         self.max_batch_prefill_tokens = get_max_prefill_tokens()
         max_num_seqs = int(os.getenv("MAX_BATCH_SIZE"))
         HPUBucketingContext = get_bucketing_context()
-        max_total_tokens_aligned = math.ceil(max_total_tokens / BLOCK_SIZE) * BLOCK_SIZE
+        # need to warmup one more step since block is allocated from 1
+        block_step = os.getenv("VLLM_DECODE_BLOCK_BUCKET_STEP", BLOCK_SIZE)
+        max_total_tokens_aligned = math.ceil(
+            max_total_tokens / BLOCK_SIZE
+        ) * BLOCK_SIZE + math.ceil(block_step * BLOCK_SIZE / max_num_seqs)
         model_max_length = self.tokenizer.model_max_length
         max_position_embeddings = getattr(
             self.config, "max_position_embeddings", model_max_length
@@ -1619,8 +1623,8 @@ class FlashCausalLM(Model):
             max_input_tokens,
             max_total_tokens_aligned,
         )
-        max_blocks = (
-            max(BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE) + 1
+        max_blocks = max(
+            BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE
         )
         self.bucketing_ctx.num_hpu_blocks = min(max_blocks, num_blocks)
         synchronize(self.device)
@@ -1683,8 +1687,7 @@ class FlashCausalLM(Model):
             f"Using {format_bytes(graph_free_mem)}"
             f"/{format_bytes(free_mem)} "
             "of free device memory for HPUGraphs, "
-            f"{format_bytes(prompt_available_memory)} \
-            for prompt and "
+            f"{format_bytes(prompt_available_memory)} for prompt and "
             f"{format_bytes(decode_available_memory)} for decode "
             f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})"
         )
diff --git a/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py b/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
index 8037b492..db3904a2 100644
--- a/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
@@ -366,8 +366,7 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
             f"Using {format_bytes(graph_free_mem)}"
             f"/{format_bytes(free_mem)} "
             "of free device memory for HPUGraphs, "
-            f"{format_bytes(prompt_available_memory)} \
-            for prompt and "
+            f"{format_bytes(prompt_available_memory)} for prompt and "
             f"{format_bytes(decode_available_memory)} for decode "
             f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})"
         )
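
Note on the warmup sizing change (illustrative sketch, not part of the patch):
the new max_total_tokens_aligned pads the block-aligned token budget by roughly
one decode bucketing step per sequence, since block allocation starts from 1
and the largest real batch could otherwise fall outside the warmed-up buckets.
The snippet below reproduces that arithmetic with assumed values; BLOCK_SIZE=128,
block_step=128, max_num_seqs=256, max_total_tokens=2048 and num_blocks=4096 are
placeholders, not values taken from this patch, and plain ints stand in for the
VLLM_DECODE_BLOCK_BUCKET_STEP / MAX_BATCH_SIZE environment lookups.

import math

BLOCK_SIZE = 128        # assumed KV-cache block size
block_step = 128        # assumed VLLM_DECODE_BLOCK_BUCKET_STEP (patch defaults it to BLOCK_SIZE)
max_num_seqs = 256      # assumed MAX_BATCH_SIZE
max_total_tokens = 2048
num_blocks = 4096       # assumed free HPU blocks reported by the runtime

# Old behaviour: round max_total_tokens up to a whole number of blocks.
old_aligned = math.ceil(max_total_tokens / BLOCK_SIZE) * BLOCK_SIZE

# New behaviour: add about one decode bucket step's worth of tokens per
# sequence so warmup covers one more block bucket than strictly required.
new_aligned = (
    math.ceil(max_total_tokens / BLOCK_SIZE) * BLOCK_SIZE
    + math.ceil(block_step * BLOCK_SIZE / max_num_seqs)
)

# The warmed-up block budget is then capped by what is actually available,
# mirroring max_blocks = max(BLOCK_SIZE, max_num_seqs * aligned // BLOCK_SIZE)
# followed by min(max_blocks, num_blocks) in the patch.
max_blocks = max(BLOCK_SIZE, max_num_seqs * new_aligned // BLOCK_SIZE)
print(old_aligned, new_aligned, min(max_blocks, num_blocks))
# With these assumptions: 2048 2112 4096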