From ae0c9dfb6243fec86466a3ad306771cdc4fee286 Mon Sep 17 00:00:00 2001
From: "Wang, Yi A"
Date: Sun, 18 May 2025 19:56:11 -0700
Subject: [PATCH] enable VLLM_EXPONENTIAL_BUCKETING

Signed-off-by: Wang, Yi A
---
 Dockerfile_gaudi               |  1 +
 .../models/flash_causal_lm.py  | 13 ++++++++-----
 .../models/mllama_causal_lm.py |  3 +--
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/Dockerfile_gaudi b/Dockerfile_gaudi
index 54a0bb7c..e6c40d68 100644
--- a/Dockerfile_gaudi
+++ b/Dockerfile_gaudi
@@ -62,6 +62,7 @@ ENV PREFIX_CACHING=0
 ENV PREFILL_CHUNKING=0
 ENV PT_HPU_LAZY_MODE=1
 ENV PT_HPU_WEIGHT_SHARING=0
+ENV VLLM_EXPONENTIAL_BUCKETING=true
 
 # Text Generation Inference base env
 ENV HF_HOME=/data \
diff --git a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
index c617d33a..bc0d240e 100644
--- a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
@@ -1603,7 +1603,11 @@ class FlashCausalLM(Model):
         self.max_batch_prefill_tokens = get_max_prefill_tokens()
         max_num_seqs = int(os.getenv("MAX_BATCH_SIZE"))
         HPUBucketingContext = get_bucketing_context()
-        max_total_tokens_aligned = math.ceil(max_total_tokens / BLOCK_SIZE) * BLOCK_SIZE
+        # need to warmup one more step since block is allocated from 1
+        block_step = os.getenv("VLLM_DECODE_BLOCK_BUCKET_STEP", BLOCK_SIZE)
+        max_total_tokens_aligned = math.ceil(
+            max_total_tokens / BLOCK_SIZE
+        ) * BLOCK_SIZE + math.ceil(block_step * BLOCK_SIZE / max_num_seqs)
         model_max_length = self.tokenizer.model_max_length
         max_position_embeddings = getattr(
             self.config, "max_position_embeddings", model_max_length
@@ -1619,8 +1623,8 @@ class FlashCausalLM(Model):
             max_input_tokens,
             max_total_tokens_aligned,
         )
-        max_blocks = (
-            max(BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE) + 1
+        max_blocks = max(
+            BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE
         )
         self.bucketing_ctx.num_hpu_blocks = min(max_blocks, num_blocks)
         synchronize(self.device)
@@ -1683,8 +1687,7 @@ class FlashCausalLM(Model):
             f"Using {format_bytes(graph_free_mem)}"
             f"/{format_bytes(free_mem)} "
             "of free device memory for HPUGraphs, "
-            f"{format_bytes(prompt_available_memory)} \
-            for prompt and "
+            f"{format_bytes(prompt_available_memory)} for prompt and "
             f"{format_bytes(decode_available_memory)} for decode "
             f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})"
         )
diff --git a/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py b/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
index 8037b492..db3904a2 100644
--- a/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
@@ -366,8 +366,7 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
             f"Using {format_bytes(graph_free_mem)}"
             f"/{format_bytes(free_mem)} "
             "of free device memory for HPUGraphs, "
-            f"{format_bytes(prompt_available_memory)} \
-            for prompt and "
+            f"{format_bytes(prompt_available_memory)} for prompt and "
             f"{format_bytes(decode_available_memory)} for decode "
             f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})"
         )
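
Note on the warmup sizing change (illustrative sketch, not part of the patch):
the new max_total_tokens_aligned pads the block-aligned token budget by roughly
one decode bucketing step per sequence, since block allocation starts from 1
and the largest real batch could otherwise fall outside the warmed-up buckets.
The snippet below reproduces that arithmetic with assumed values; BLOCK_SIZE=128,
block_step=128, max_num_seqs=256, max_total_tokens=2048 and num_blocks=4096 are
placeholders, not values taken from this patch, and plain ints stand in for the
VLLM_DECODE_BLOCK_BUCKET_STEP / MAX_BATCH_SIZE environment lookups.

import math

BLOCK_SIZE = 128        # assumed KV-cache block size
block_step = 128        # assumed VLLM_DECODE_BLOCK_BUCKET_STEP (patch defaults it to BLOCK_SIZE)
max_num_seqs = 256      # assumed MAX_BATCH_SIZE
max_total_tokens = 2048
num_blocks = 4096       # assumed free HPU blocks reported by the runtime

# Old behaviour: round max_total_tokens up to a whole number of blocks.
old_aligned = math.ceil(max_total_tokens / BLOCK_SIZE) * BLOCK_SIZE

# New behaviour: add about one decode bucket step's worth of tokens per
# sequence so warmup covers one more block bucket than strictly required.
new_aligned = (
    math.ceil(max_total_tokens / BLOCK_SIZE) * BLOCK_SIZE
    + math.ceil(block_step * BLOCK_SIZE / max_num_seqs)
)

# The warmed-up block budget is then capped by what is actually available,
# mirroring max_blocks = max(BLOCK_SIZE, max_num_seqs * aligned // BLOCK_SIZE)
# followed by min(max_blocks, num_blocks) in the patch.
max_blocks = max(BLOCK_SIZE, max_num_seqs * new_aligned // BLOCK_SIZE)
print(old_aligned, new_aligned, min(max_blocks, num_blocks))
# With these assumptions: 2048 2112 4096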