diff --git a/backends/gaudi/server/text_generation_server/models/causal_lm.py b/backends/gaudi/server/text_generation_server/models/causal_lm.py
index 9b58b676..967d4e2c 100644
--- a/backends/gaudi/server/text_generation_server/models/causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/causal_lm.py
@@ -1380,10 +1380,7 @@ class CausalLM(Model):
         prefill_seqlen_list.sort()
         prefill_batch_size_list.sort()
         mem_stats = get_hpu_memory_stats(self.device)
-        logger.info(
-            f"Prefill warmup successful.\n"
-            f"Memory stats: {mem_stats} "
-        )
+        logger.info(f"Prefill warmup successful.\n" f"Memory stats: {mem_stats} ")
 
         max_decode_batch_size = math.floor(MAX_BATCH_TOTAL_TOKENS / MAX_TOTAL_TOKENS)
         max_exp = math.ceil(math.log(max_decode_batch_size, BATCH_SIZE_EXPONENT_BASE))
@@ -1395,7 +1392,9 @@ class CausalLM(Model):
 
         try:
             for batch_size in decode_batch_size_list:
-                logger.info(f"Decode warmup for `batch_size={batch_size}`, this may take a while...")
+                logger.info(
+                    f"Decode warmup for `batch_size={batch_size}`, this may take a while..."
+                )
                 batches = []
                 iters = math.floor(batch_size / max_prefill_batch_size)
                 for i in range(iters):
@@ -1428,10 +1427,7 @@ class CausalLM(Model):
         decode_batch_size_list.sort()
         max_supported_total_tokens = MAX_TOTAL_TOKENS * decode_batch_size_list[-1]
         mem_stats = get_hpu_memory_stats(self.device)
-        logger.info(
-            f"Decode warmup successful.\n"
-            f"Memory stats: {mem_stats} "
-        )
+        logger.info(f"Decode warmup successful.\n" f"Memory stats: {mem_stats} ")
 
         max_input_tokens = max_input_tokens
         max_total_tokens = MAX_TOTAL_TOKENS
diff --git a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py
index 9fbf62d9..6d0f920e 100644
--- a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py
@@ -1539,10 +1539,7 @@ class VlmCausalLM(Model):
             )
 
         mem_stats = get_hpu_memory_stats(self.device)
-        logger.info(
-            f"Prefill warmup successful.\n"
-            f"Memory stats: {mem_stats} "
-        )
+        logger.info(f"Prefill warmup successful.\n" f"Memory stats: {mem_stats} ")
 
         max_decode_batch_size = MAX_BATCH_SIZE
         batch_size = max_prefill_batch_size * 2
@@ -1556,7 +1553,9 @@ class VlmCausalLM(Model):
             batches = []
             while batch_size <= max_decode_batch_size:
                 for i in range(int(batch_size / max_prefill_batch_size)):
-                    logger.info(f"Decode warmup for `batch_size={batch_size}`, this may take a while...")
+                    logger.info(
+                        f"Decode warmup for `batch_size={batch_size}`, this may take a while..."
+                    )
                     batch = self.generate_warmup_batch(
                         request,
                         PREFILL_WARMUP_SEQLEN_LIST[0] - 1,
@@ -1599,10 +1598,7 @@ class VlmCausalLM(Model):
             )
 
         mem_stats = get_hpu_memory_stats(self.device)
-        logger.info(
-            f"Decode warmup successful.\n"
-            f"Memory stats: {mem_stats}"
-        )
+        logger.info(f"Decode warmup successful.\n" f"Memory stats: {mem_stats}")
 
         max_supported_total_tokens = MAX_BATCH_SIZE * MAX_TOTAL_TOKENS
         max_input_tokens = max_input_tokens