From f13e28c98dcaf0fd9bbee4eef4cae5e9fd0fd04c Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Wed, 18 Jun 2025 04:34:00 -0600
Subject: [PATCH] [gaudi] Refine logging for Gaudi warmup (#3222)

* Refine logging for Gaudi warmup

* Make style

* Make style 2

* Flash causal LM case

* Add log_master & VLM cases

* Black
---
 .../models/custom_modeling/flash_llava_next.py      |  2 +-
 .../models/custom_modeling/idefics2.py              |  2 +-
 .../models/custom_modeling/idefics3.py              |  2 +-
 .../models/custom_modeling/idefics_config.py        |  2 +-
 .../models/custom_modeling/idefics_modeling.py      |  2 +-
 .../models/custom_modeling/idefics_vision.py        |  2 +-
 .../models/flash_causal_lm.py                       | 13 +++++++++++++
 .../models/flash_vlm_causal_lm.py                   |  5 +++++
 .../models/mllama_causal_lm.py                      | 12 ++++++++++++
 .../server/text_generation_server/utils/segments.py |  2 +-
 10 files changed, 37 insertions(+), 7 deletions(-)

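The hunks below apply the same logging pattern to the prefill and decode warmup
loops of FlashCausalLM, FlashVlmCausalLM and FlashMllamaCausalLM: the bucket
lists are logged before each warmup loop, the per-shape progress message gains
a ", this may take a while..." suffix, and a success message is logged once
each phase completes. The following is a minimal standalone sketch of that
pattern, not the actual TGI code: log_master and logger are stdlib stand-ins,
the bucket values are invented, and the per-shape message is simplified.

    import logging

    logging.basicConfig(level=logging.INFO, format="%(message)s")
    logger = logging.getLogger("gaudi-warmup-sketch")


    def log_master(log_fn, msg):
        # Stand-in for TGI's log_master, which only logs from the master rank.
        log_fn(msg)


    # Hypothetical (batch_size, seq_len) prefill buckets; the real values come
    # from the bucket generation in flash_causal_lm.py.
    buckets = [(1, 128), (2, 256), (4, 512)]

    msg = (
        f"Prefill batch size list:{[bsz[0] for bsz in buckets]}\n"
        f"Prefill sequence length list:{[seq[1] for seq in buckets]}\n"
    )
    log_master(logger.info, msg)

    for i, (batch_size, seq_len) in enumerate(buckets):
        # The real per-shape message also reports bypass and free_mem.
        log_master(
            logger.info,
            f"[{i + 1}/{len(buckets)}] batch_size:{batch_size} seq_len:{seq_len}"
            ", this may take a while...",
        )
        # ... warm up the HPU graph for this shape here ...

    log_master(logger.info, "Prefill warmup successful.\n")
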
diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llava_next.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llava_next.py
index c4d4f728..d884f413 100644
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llava_next.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llava_next.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Llava-NeXT model."""
+"""PyTorch Llava-NeXT model."""

 from typing import List, Optional, Tuple

diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py
index 41a45373..0579ca5d 100644
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Idefics2 model."""
+"""PyTorch Idefics2 model."""

 from typing import List, Optional, Tuple

diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics3.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics3.py
index 6dd44c11..e12f2209 100644
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics3.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics3.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Idefics3 model."""
+"""PyTorch Idefics3 model."""

 from typing import List, Optional, Tuple

diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_config.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_config.py
index a5565819..6ce2054e 100644
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_config.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_config.py
@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Idefics model configuration"""
+"""Idefics model configuration"""
 import copy

 from transformers import PretrainedConfig
diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_modeling.py
index a130dbc1..910e9bcd 100644
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_modeling.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_modeling.py
@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Idefics model."""
+"""PyTorch Idefics model."""
 from typing import List, Optional, Tuple, Union

 import torch
diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_vision.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_vision.py
index dd8f76bc..7d2051e0 100644
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_vision.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_vision.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""
+"""PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""

 from dataclasses import dataclass

diff --git a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
index 9081daa0..51dca2e9 100644
--- a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
@@ -1721,6 +1721,7 @@ class FlashCausalLM(Model):
             f"{dim}:{seq_len} "
             f"bypass:{bypass} "
             f"free_mem:{free_mem}"
+            ", this may take a while..."
         )
         log_master(logger.info, msg)

@@ -1772,6 +1773,11 @@ class FlashCausalLM(Model):
         total_batch_seq = 0.001
         total_mem = 0
         available_mem = prompt_available_memory
+        msg = (
+            f"Prefill batch size list:{[bsz[0] for bsz in buckets]}\n"
+            f"Prefill sequence length list:{[seq[1] for seq in buckets]}\n"
+        )
+        log_master(logger.info, msg)
         for i, (batch_size, seq_len) in enumerate(buckets):
             if batch_size * seq_len > self.max_batch_prefill_tokens:
                 continue
@@ -1798,6 +1804,8 @@ class FlashCausalLM(Model):
             total_mem += used_mem
             total_batch_seq += batch_seq

+        log_master(logger.info, "Prefill warmup successful.\n")
+
         def ordering_function_max_bs(b):
             return (-b[0], b[1])

@@ -1809,6 +1817,9 @@ class FlashCausalLM(Model):
         total_batch_seq = 0.001
         total_mem = 0
         available_mem = free_mem - self.mem_reserved
+        log_master(
+            logger.info, f"Decode batch size list:{[bsz[0] for bsz in buckets]}\n"
+        )
         for i, (batch_size, block_num) in enumerate(buckets):
             if batch_size > block_num:
                 continue
@@ -1833,6 +1844,8 @@ class FlashCausalLM(Model):
             total_mem += used_mem
             total_batch_seq += batch_seq

+        log_master(logger.info, "Decode warmup successful.\n")
+
         log_master(
             logger.info,
             f"warmup hpu graph time {int(time.time() - start_time)}s warmup shape count {warmup_shape_count}",
diff --git a/backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py b/backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py
index a9dcdf11..4220f40e 100644
--- a/backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py
@@ -822,6 +822,9 @@ class FlashVlmCausalLM(FlashCausalLM):
         total_batch_seq = 0.001
         total_mem = 0
         available_mem = decode_available_memory
+        log_master(
+            logger.info, f"Decode batch size list:{[bsz[0] for bsz in buckets]}\n"
+        )
         for i, (batch_size, block_num) in enumerate(buckets):
             if batch_size > block_num:
                 continue
@@ -847,6 +850,8 @@ class FlashVlmCausalLM(FlashCausalLM):
             total_mem += used_mem
             total_batch_seq += batch_seq

+        log_master(logger.info, "Decode warmup successful.\n")
+
         log_master(
             logger.info,
             f"warmup hpu graph time {int(time.time() - start_time)}s warmup shape count {warmup_shape_count}",
diff --git a/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py b/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
index a26b9111..dbaccfa0 100644
--- a/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
@@ -398,6 +398,11 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
         total_batch_seq = 0.001
         total_mem = 0
         available_mem = prompt_available_memory
+        msg = (
+            f"Prefill batch size list:{[bsz[0] for bsz in buckets]}\n"
+            f"Prefill sequence length list:{[seq[1] for seq in buckets]}\n"
+        )
+        log_master(logger.info, msg)
         for i, (batch_size, seq_len) in enumerate(buckets):
             if batch_size * seq_len > self.max_batch_prefill_tokens:
                 continue
@@ -424,6 +429,8 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
             total_mem += used_mem
             total_batch_seq += batch_seq

+        log_master(logger.info, "Prefill warmup successful.\n")
+
         def ordering_function_max_bs(b):
             return (-b[0], b[1])

@@ -435,6 +442,9 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
         total_batch_seq = 0.001
         total_mem = 0
         available_mem = free_mem - self.mem_reserved
+        log_master(
+            logger.info, f"Decode batch size list:{[bsz[0] for bsz in buckets]}\n"
+        )
         for i, (batch_size, block_num) in enumerate(buckets):
             if batch_size > block_num:
                 continue
@@ -459,6 +469,8 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
             total_mem += used_mem
             total_batch_seq += batch_seq

+        log_master(logger.info, "Decode warmup successful.\n")
+
         log_master(
             logger.info,
             f"warmup hpu graph time {int(time.time() - start_time)}s warmup shape count {warmup_shape_count}",
diff --git a/backends/gaudi/server/text_generation_server/utils/segments.py b/backends/gaudi/server/text_generation_server/utils/segments.py
index f5961102..133049be 100644
--- a/backends/gaudi/server/text_generation_server/utils/segments.py
+++ b/backends/gaudi/server/text_generation_server/utils/segments.py
@@ -8,7 +8,7 @@ import torch


 def find_segments(
-    adapter_indices: Union[torch.Tensor, List[int]]
+    adapter_indices: Union[torch.Tensor, List[int]],
 ) -> Tuple[List[int], List[int]]:
     segments = [0]
     segment_indices = []
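With these changes the warmup phase announces the shapes it is about to warm up,
marks slow per-shape steps, and confirms completion of each phase. Purely as an
illustration built from the format strings added above (values invented, not
captured output), the new messages read roughly as:

    Prefill batch size list:[1, 2, 4]
    Prefill sequence length list:[128, 256, 512]
    ...per-shape warmup messages, each ending in ", this may take a while..."...
    Prefill warmup successful.

    Decode batch size list:[1, 2, 4]
    ...per-shape warmup messages...
    Decode warmup successful.

    warmup hpu graph time 42s warmup shape count 6

The final segments.py hunk is unrelated to logging: it only adds a trailing comma
to the find_segments signature, a pure Black formatting change.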