From f13e28c98dcaf0fd9bbee4eef4cae5e9fd0fd04c Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Wed, 18 Jun 2025 04:34:00 -0600
Subject: [PATCH] [gaudi] Refine logging for Gaudi warmup (#3222)

* Refine logging for Gaudi warmup

* Make style

* Make style 2

* Flash causal LM case

* Add log_master & VLM cases

* Black
---
 .../models/custom_modeling/flash_llava_next.py      |  2 +-
 .../models/custom_modeling/idefics2.py              |  2 +-
 .../models/custom_modeling/idefics3.py              |  2 +-
 .../models/custom_modeling/idefics_config.py        |  2 +-
 .../models/custom_modeling/idefics_modeling.py      |  2 +-
 .../models/custom_modeling/idefics_vision.py        |  2 +-
 .../models/flash_causal_lm.py                       | 13 +++++++++++++
 .../models/flash_vlm_causal_lm.py                   |  5 +++++
 .../models/mllama_causal_lm.py                      | 12 ++++++++++++
 .../server/text_generation_server/utils/segments.py |  2 +-
 10 files changed, 37 insertions(+), 7 deletions(-)

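The hunks below apply the same logging pattern to the prefill and decode warmup
loops of FlashCausalLM, FlashVlmCausalLM and FlashMllamaCausalLM: the bucket
lists are logged before each warmup loop, the per-shape progress message gains
a ", this may take a while..." suffix, and a success message is logged once
each phase completes. The following is a minimal standalone sketch of that
pattern, not the actual TGI code: log_master and logger are stdlib stand-ins,
the bucket values are invented, and the per-shape message is simplified.

    import logging

    logging.basicConfig(level=logging.INFO, format="%(message)s")
    logger = logging.getLogger("gaudi-warmup-sketch")


    def log_master(log_fn, msg):
        # Stand-in for TGI's log_master, which only logs from the master rank.
        log_fn(msg)


    # Hypothetical (batch_size, seq_len) prefill buckets; the real values come
    # from the bucket generation in flash_causal_lm.py.
    buckets = [(1, 128), (2, 256), (4, 512)]

    msg = (
        f"Prefill batch size list:{[bsz[0] for bsz in buckets]}\n"
        f"Prefill sequence length list:{[seq[1] for seq in buckets]}\n"
    )
    log_master(logger.info, msg)

    for i, (batch_size, seq_len) in enumerate(buckets):
        # The real per-shape message also reports bypass and free_mem.
        log_master(
            logger.info,
            f"[{i + 1}/{len(buckets)}] batch_size:{batch_size} seq_len:{seq_len}"
            ", this may take a while...",
        )
        # ... warm up the HPU graph for this shape here ...

    log_master(logger.info, "Prefill warmup successful.\n")
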
diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llava_next.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llava_next.py
index c4d4f728..d884f413 100644
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llava_next.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llava_next.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Llava-NeXT model."""
+"""PyTorch Llava-NeXT model."""

 from typing import List, Optional, Tuple

diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py
index 41a45373..0579ca5d 100644
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Idefics2 model."""
+"""PyTorch Idefics2 model."""

 from typing import List, Optional, Tuple

diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics3.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics3.py
index 6dd44c11..e12f2209 100644
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics3.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics3.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Idefics3 model."""
+"""PyTorch Idefics3 model."""

 from typing import List, Optional, Tuple

diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_config.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_config.py
index a5565819..6ce2054e 100644
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_config.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_config.py
@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Idefics model configuration"""
+"""Idefics model configuration"""
 import copy

 from transformers import PretrainedConfig
diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_modeling.py
index a130dbc1..910e9bcd 100644
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_modeling.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_modeling.py
@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Idefics model."""
+"""PyTorch Idefics model."""
 from typing import List, Optional, Tuple, Union

 import torch
diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_vision.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_vision.py
index dd8f76bc..7d2051e0 100644
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_vision.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_vision.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""
+"""PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""

 from dataclasses import dataclass

diff --git a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
index 9081daa0..51dca2e9 100644
--- a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
@@ -1721,6 +1721,7 @@ class FlashCausalLM(Model):
             f"{dim}:{seq_len} "
             f"bypass:{bypass} "
             f"free_mem:{free_mem}"
+            ", this may take a while..."
         )
         log_master(logger.info, msg)

@@ -1772,6 +1773,11 @@ class FlashCausalLM(Model):
         total_batch_seq = 0.001
         total_mem = 0
         available_mem = prompt_available_memory
+        msg = (
+            f"Prefill batch size list:{[bsz[0] for bsz in buckets]}\n"
+            f"Prefill sequence length list:{[seq[1] for seq in buckets]}\n"
+        )
+        log_master(logger.info, msg)
         for i, (batch_size, seq_len) in enumerate(buckets):
             if batch_size * seq_len > self.max_batch_prefill_tokens:
                 continue
@@ -1798,6 +1804,8 @@ class FlashCausalLM(Model):
             total_mem += used_mem
             total_batch_seq += batch_seq

+        log_master(logger.info, "Prefill warmup successful.\n")
+
         def ordering_function_max_bs(b):
             return (-b[0], b[1])

@@ -1809,6 +1817,9 @@ class FlashCausalLM(Model):
         total_batch_seq = 0.001
         total_mem = 0
         available_mem = free_mem - self.mem_reserved
+        log_master(
+            logger.info, f"Decode batch size list:{[bsz[0] for bsz in buckets]}\n"
+        )
         for i, (batch_size, block_num) in enumerate(buckets):
             if batch_size > block_num:
                 continue
@@ -1833,6 +1844,8 @@ class FlashCausalLM(Model):
             total_mem += used_mem
             total_batch_seq += batch_seq

+        log_master(logger.info, "Decode warmup successful.\n")
+
         log_master(
             logger.info,
             f"warmup hpu graph time {int(time.time() - start_time)}s warmup shape count {warmup_shape_count}",
diff --git a/backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py b/backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py
index a9dcdf11..4220f40e 100644
--- a/backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py
@@ -822,6 +822,9 @@ class FlashVlmCausalLM(FlashCausalLM):
         total_batch_seq = 0.001
         total_mem = 0
         available_mem = decode_available_memory
+        log_master(
+            logger.info, f"Decode batch size list:{[bsz[0] for bsz in buckets]}\n"
+        )
         for i, (batch_size, block_num) in enumerate(buckets):
             if batch_size > block_num:
                 continue
@@ -847,6 +850,8 @@ class FlashVlmCausalLM(FlashCausalLM):
             total_mem += used_mem
             total_batch_seq += batch_seq

+        log_master(logger.info, "Decode warmup successful.\n")
+
         log_master(
             logger.info,
             f"warmup hpu graph time {int(time.time() - start_time)}s warmup shape count {warmup_shape_count}",
diff --git a/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py b/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
index a26b9111..dbaccfa0 100644
--- a/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
@@ -398,6 +398,11 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
         total_batch_seq = 0.001
         total_mem = 0
         available_mem = prompt_available_memory
+        msg = (
+            f"Prefill batch size list:{[bsz[0] for bsz in buckets]}\n"
+            f"Prefill sequence length list:{[seq[1] for seq in buckets]}\n"
+        )
+        log_master(logger.info, msg)
         for i, (batch_size, seq_len) in enumerate(buckets):
             if batch_size * seq_len > self.max_batch_prefill_tokens:
                 continue
@@ -424,6 +429,8 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
             total_mem += used_mem
             total_batch_seq += batch_seq

+        log_master(logger.info, "Prefill warmup successful.\n")
+
         def ordering_function_max_bs(b):
             return (-b[0], b[1])

@@ -435,6 +442,9 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
         total_batch_seq = 0.001
         total_mem = 0
         available_mem = free_mem - self.mem_reserved
+        log_master(
+            logger.info, f"Decode batch size list:{[bsz[0] for bsz in buckets]}\n"
+        )
         for i, (batch_size, block_num) in enumerate(buckets):
             if batch_size > block_num:
                 continue
@@ -459,6 +469,8 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
             total_mem += used_mem
             total_batch_seq += batch_seq

+        log_master(logger.info, "Decode warmup successful.\n")
+
         log_master(
             logger.info,
             f"warmup hpu graph time {int(time.time() - start_time)}s warmup shape count {warmup_shape_count}",
diff --git a/backends/gaudi/server/text_generation_server/utils/segments.py b/backends/gaudi/server/text_generation_server/utils/segments.py
index f5961102..133049be 100644
--- a/backends/gaudi/server/text_generation_server/utils/segments.py
+++ b/backends/gaudi/server/text_generation_server/utils/segments.py
@@ -8,7 +8,7 @@ import torch


 def find_segments(
-    adapter_indices: Union[torch.Tensor, List[int]]
+    adapter_indices: Union[torch.Tensor, List[int]],
 ) -> Tuple[List[int], List[int]]:
     segments = [0]
     segment_indices = []
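With these changes the warmup phase announces the shapes it is about to warm up,
marks slow per-shape steps, and confirms completion of each phase. Purely as an
illustration built from the format strings added above (values invented, not
captured output), the new messages read roughly as:

    Prefill batch size list:[1, 2, 4]
    Prefill sequence length list:[128, 256, 512]
    ...per-shape warmup messages, each ending in ", this may take a while..."...
    Prefill warmup successful.

    Decode batch size list:[1, 2, 4]
    ...per-shape warmup messages...
    Decode warmup successful.

    warmup hpu graph time 42s warmup shape count 6

The final segments.py hunk is unrelated to logging: it only adds a trailing comma
to the find_segments signature, a pure Black formatting change.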