diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen3_moe_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen3_moe_modeling.py
index 28a96523..f1e73f46 100644
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen3_moe_modeling.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen3_moe_modeling.py
@@ -229,21 +229,8 @@ class Qwen3MoE(nn.Module):
         self.process_group = weights.process_group
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # router_logits: (num_tokens, n_experts)
         router_logits = self.gate(x)
-        # synchronize(x.device)
-        # real_free_memory = get_free_memory(x.device, 1)
-        # log_master(
-        #     logger.debug,
-        #     f"moe forward 1Free memory real: {real_free_memory / 1e9:.2f}GB"
-        # )
         out = self.moe(x, gating_output=router_logits)
-        # synchronize(x.device)
-        # real_free_memory = get_free_memory(x.device, 1)
-        # log_master(
-        #     logger.debug,
-        #     f"moe forward 2 Free memory real: {real_free_memory / 1e9:.2f}GB"
-        # )
 
         # Reduce sum
         if self.process_group.size() > 1:
diff --git a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
index be24ed03..b6cf271d 100644
--- a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
@@ -1412,7 +1412,6 @@ class FlashCausalLM(Model):
             aliases=aliases,
             weights_loader=weights_loader,
         )
-        print(f"weights: {weights}")
 
         prefix = None
         model = model_class(prefix, config, weights)
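
For context, the deleted lines in both files are leftover debug instrumentation: the MoE forward pass had commented-out probes that logged free device memory before and after expert dispatch, and the model loader printed the full weights object. The memory probe follows a common accelerator-debugging pattern: synchronize the device so pending ops are flushed, read free memory, then log from the master rank only. Below is a minimal sketch of that pattern; the module paths for `synchronize`, `get_free_memory`, and `log_master` are an assumption based on the main TGI server utilities, since the patch itself only shows the commented-out call sites.

from loguru import logger
import torch

# Assumed import locations (the patch does not show these imports).
from text_generation_server.utils.import_utils import synchronize, get_free_memory
from text_generation_server.utils.log import log_master


def log_free_memory(tag: str, device: torch.device) -> None:
    # Flush pending device work so the reading reflects the current step.
    synchronize(device)
    # A memory fraction of 1 reports the device's raw free memory.
    free = get_free_memory(device, 1)
    # Log from the master rank only to avoid duplicate lines under tensor parallelism.
    log_master(logger.debug, f"{tag} free memory real: {free / 1e9:.2f}GB")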