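Remove debug info

Drop the device-placement prints and the free-memory debug logging from
the Gaudi Qwen3 MoE modeling code. Also import VideoInput from
transformers.video_utils instead of transformers.image_utils in
qwen2_5_vl.py.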

Signed-off-by: yuanwu <yuan.wu@intel.com>
2025-09-09 11:24:53 +00:00 · 2025-06-03 05:28:38 +00:00 · 2025-06-03 05:28:38 +00:00 · 1a5ef906ae
commit 1a5ef906ae
parent 5155fef477
2 changed files with 2 additions and 20 deletions
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen3_moe_modeling.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen3_moe_modeling.py
@@ -30,12 +30,6 @@ from text_generation_server.layers import (
    SpeculativeHead,
    FastLinear,
 )
 from text_generation_server.utils.import_utils import (
    synchronize,
    get_free_memory,
 )
 from loguru import logger
 from text_generation_server.utils.log import log_master
 from text_generation_server.layers.layernorm import (
    FastRMSNorm,
@@ -359,9 +353,6 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
        routing_weights, selected_experts = torch.topk(
            routing_weights, self.top_k, dim=-1
        )
        print(
            f"routing_weights: {routing_weights.device}, selected_experts: {selected_experts.device}"
        )
        if self.norm_topk_prob:  # only diff with mixtral sparse moe block!
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
@@ -376,7 +367,6 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
        expert_mask = torch.nn.functional.one_hot(
            selected_experts, num_classes=self.num_experts
        ).permute(2, 1, 0)
        print(f"expert_mask: {expert_mask.device}")
        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.num_experts):
            expert_layer = self.experts[expert_idx]
@@ -432,10 +422,6 @@ class Qwen3MoeDecoderLayer(nn.Module):
            SparseMoELayer if SparseMoELayer.is_supported(weights) else DenseMoELayer
        )
        moe_layer_cls = (
            SparseMoELayer if SparseMoELayer.is_supported(weights) else DenseMoELayer
        )
        if (layer_idx not in config.mlp_only_layers) and (
            config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
        ):
@@ -516,11 +502,6 @@ class Qwen3MoeModel(nn.Module):
        self.norm = FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )
        synchronize(weights.device)
        real_free_memory = get_free_memory(weights.device, 1)
        log_master(
            logger.debug, f"init model Free memory real: {real_free_memory / 1e9:.2f}GB"
        )
    def forward(
        self,
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py
@@ -49,7 +49,8 @@ from text_generation_server.models.custom_modeling.flash_qwen2_modeling import (
 # Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py
 from typing import Union
 from transformers.feature_extraction_utils import BatchFeature
-from transformers.image_utils import ImageInput, VideoInput
+from transformers.image_utils import ImageInput
 from transformers.video_utils import VideoInput
 from transformers.processing_utils import (
    ProcessingKwargs,
    ProcessorMixin,