mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-09 03:14:53 +00:00

commit ae0c9dfb62
parent 550c85c39e

    enable VLLM_EXPONENTIAL_BUCKETING

    Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
@@ -62,6 +62,7 @@ ENV PREFIX_CACHING=0
 ENV PREFILL_CHUNKING=0
 ENV PT_HPU_LAZY_MODE=1
 ENV PT_HPU_WEIGHT_SHARING=0
+ENV VLLM_EXPONENTIAL_BUCKETING=true
 
 # Text Generation Inference base env
 ENV HF_HOME=/data \
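The added ENV line only bakes a default into the image; the flag is presumably consumed by the HPU bucketing helper returned by get_bucketing_context() rather than by TGI itself, and it can still be overridden when the container is started. A minimal sketch of how such a boolean flag might be parsed on the Python side (the parsing convention here is an assumption, not the actual vllm-hpu-extension code):

import os

# Assumed parsing of the flag set by the ENV line above; the real bucketing
# code may use a different convention for truthy values.
use_exponential_bucketing = os.getenv(
    "VLLM_EXPONENTIAL_BUCKETING", "true"
).lower() in ("1", "true", "yes")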
@@ -1603,7 +1603,11 @@ class FlashCausalLM(Model):
         self.max_batch_prefill_tokens = get_max_prefill_tokens()
         max_num_seqs = int(os.getenv("MAX_BATCH_SIZE"))
         HPUBucketingContext = get_bucketing_context()
-        max_total_tokens_aligned = math.ceil(max_total_tokens / BLOCK_SIZE) * BLOCK_SIZE
+        # need to warmup one more step since block is allocated from 1
+        block_step = os.getenv("VLLM_DECODE_BLOCK_BUCKET_STEP", BLOCK_SIZE)
+        max_total_tokens_aligned = math.ceil(
+            max_total_tokens / BLOCK_SIZE
+        ) * BLOCK_SIZE + math.ceil(block_step * BLOCK_SIZE / max_num_seqs)
         model_max_length = self.tokenizer.model_max_length
         max_position_embeddings = getattr(
             self.config, "max_position_embeddings", model_max_length
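To make the new alignment concrete, here is a minimal, self-contained sketch (not part of the commit) evaluating the old and new formulas with assumed values: BLOCK_SIZE = 128, max_total_tokens = 2048, max_num_seqs = 256, and VLLM_DECODE_BLOCK_BUCKET_STEP unset so block_step falls back to BLOCK_SIZE:

import math

BLOCK_SIZE = 128          # assumed HPU block size
max_total_tokens = 2048   # assumed max-total-tokens setting
max_num_seqs = 256        # assumed MAX_BATCH_SIZE
block_step = BLOCK_SIZE   # default when VLLM_DECODE_BLOCK_BUCKET_STEP is unset

# Old formula: round max_total_tokens up to a multiple of BLOCK_SIZE.
aligned_old = math.ceil(max_total_tokens / BLOCK_SIZE) * BLOCK_SIZE

# New formula: add one extra bucket step of headroom, since blocks are
# allocated starting from 1 and warmup must cover one more step.
aligned_new = (
    math.ceil(max_total_tokens / BLOCK_SIZE) * BLOCK_SIZE
    + math.ceil(block_step * BLOCK_SIZE / max_num_seqs)
)

print(aligned_old)  # 2048
print(aligned_new)  # 2112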
@@ -1619,8 +1623,8 @@ class FlashCausalLM(Model):
             max_input_tokens,
             max_total_tokens_aligned,
         )
-        max_blocks = (
-            max(BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE) + 1
+        max_blocks = max(
+            BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE
         )
         self.bucketing_ctx.num_hpu_blocks = min(max_blocks, num_blocks)
         synchronize(self.device)
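Continuing with the same assumed numbers, a short self-contained sketch of how the max_blocks change interacts with the alignment change: the "+ 1" extra block is dropped because the warmup headroom is now carried by max_total_tokens_aligned itself.

BLOCK_SIZE = 128
max_num_seqs = 256
aligned_old, aligned_new = 2048, 2112  # results from the sketch above

# Old: one extra block added on top of the un-padded alignment.
blocks_old = max(BLOCK_SIZE, max_num_seqs * aligned_old // BLOCK_SIZE) + 1  # 4097

# New: no "+ 1"; the padding inside aligned_new already provides the margin.
blocks_new = max(BLOCK_SIZE, max_num_seqs * aligned_new // BLOCK_SIZE)      # 4224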
@@ -1683,8 +1687,7 @@ class FlashCausalLM(Model):
             f"Using {format_bytes(graph_free_mem)}"
             f"/{format_bytes(free_mem)} "
             "of free device memory for HPUGraphs, "
-            f"{format_bytes(prompt_available_memory)} \
-                for prompt and "
+            f"{format_bytes(prompt_available_memory)} for prompt and "
             f"{format_bytes(decode_available_memory)} for decode "
             f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})"
         )
@@ -366,8 +366,7 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
             f"Using {format_bytes(graph_free_mem)}"
             f"/{format_bytes(free_mem)} "
             "of free device memory for HPUGraphs, "
-            f"{format_bytes(prompt_available_memory)} \
-                for prompt and "
+            f"{format_bytes(prompt_available_memory)} for prompt and "
             f"{format_bytes(decode_available_memory)} for decode "
             f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})"
         )
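The last two hunks are the same cleanup applied in two classes: the old log string used a backslash line continuation inside an f-string, which keeps the indentation of the next source line inside the message. A minimal sketch (with a made-up value, not the real log line) of the effect being fixed:

# A backslash continuation inside a string literal drops the newline but keeps
# the next line's leading spaces, so the old message contained a long run of
# spaces before "for prompt and".
broken = "1.00 GB \
                for prompt and "
fixed = "1.00 GB for prompt and "

print(repr(broken))  # stray indentation shows up inside the string
print(repr(fixed))   # single space, as intended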