diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen3_moe_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen3_moe_modeling.py
index d04146f9..1a264fbe 100644
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen3_moe_modeling.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen3_moe_modeling.py
@@ -30,12 +30,6 @@ from text_generation_server.layers import (
     SpeculativeHead,
     FastLinear,
 )
-from text_generation_server.utils.import_utils import (
-    synchronize,
-    get_free_memory,
-)
-from loguru import logger
-from text_generation_server.utils.log import log_master
 from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )
@@ -359,9 +353,6 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         routing_weights, selected_experts = torch.topk(
             routing_weights, self.top_k, dim=-1
         )
-        print(
-            f"routing_weights: {routing_weights.device}, selected_experts: {selected_experts.device}"
-        )
         if self.norm_topk_prob:  # only diff with mixtral sparse moe block!
             routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
         # we cast back to the input dtype
@@ -376,7 +367,6 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         expert_mask = torch.nn.functional.one_hot(
             selected_experts, num_classes=self.num_experts
         ).permute(2, 1, 0)
-        print(f"expert_mask: {expert_mask.device}")
         # Loop over all available experts in the model and perform the computation on each expert
         for expert_idx in range(self.num_experts):
             expert_layer = self.experts[expert_idx]
@@ -432,10 +422,6 @@ class Qwen3MoeDecoderLayer(nn.Module):
             SparseMoELayer if SparseMoELayer.is_supported(weights) else DenseMoELayer
         )
 
-        moe_layer_cls = (
-            SparseMoELayer if SparseMoELayer.is_supported(weights) else DenseMoELayer
-        )
-
         if (layer_idx not in config.mlp_only_layers) and (
             config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
         ):
@@ -516,11 +502,6 @@ class Qwen3MoeModel(nn.Module):
         self.norm = FastRMSNorm.load(
             prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
         )
-        synchronize(weights.device)
-        real_free_memory = get_free_memory(weights.device, 1)
-        log_master(
-            logger.debug, f"init model Free memory real: {real_free_memory / 1e9:.2f}GB"
-        )
 
     def forward(
         self,
diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py
index 441b0016..5f180458 100644
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py
@@ -49,7 +49,8 @@ from text_generation_server.models.custom_modeling.flash_qwen2_modeling import (
 # Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py
 from typing import Union
 from transformers.feature_extraction_utils import BatchFeature
-from transformers.image_utils import ImageInput, VideoInput
+from transformers.image_utils import ImageInput
+from transformers.video_utils import VideoInput
 from transformers.processing_utils import (
     ProcessingKwargs,
     ProcessorMixin,
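Note on the qwen2_5_vl.py hunk: newer transformers releases expose VideoInput from transformers.video_utils instead of transformers.image_utils, which is why the combined import is split above. If the server ever needs to tolerate both layouts, a minimal compatibility sketch (an assumption for illustration, not part of this patch) would be:

    try:
        # newer transformers: VideoInput lives in video_utils (as this patch assumes)
        from transformers.video_utils import VideoInput
    except ImportError:
        # older transformers kept VideoInput in image_utils
        from transformers.image_utils import VideoInput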