Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-06-19 07:42:06 +00:00
Remove useless code
Signed-off-by: yuanwu <yuan.wu@intel.com>
parent 91c40e6c58, commit 0bbd8d1645
@@ -344,27 +344,6 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         return final_hidden_states
 
-
-# @use_kernel_forward_from_hub("RMSNorm")
-# class Qwen3MoeRMSNorm(nn.Module):
-#     def __init__(self, hidden_size, eps=1e-6):
-#         """
-#         Qwen3MoeRMSNorm is equivalent to T5LayerNorm
-#         """
-#         super().__init__()
-#         self.weight = nn.Parameter(torch.ones(hidden_size))
-#         self.variance_epsilon = eps
-
-#     def forward(self, hidden_states):
-#         input_dtype = hidden_states.dtype
-#         hidden_states = hidden_states.to(torch.float32)
-#         variance = hidden_states.pow(2).mean(-1, keepdim=True)
-#         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-#         return self.weight * hidden_states.to(input_dtype)
-
-#     def extra_repr(self):
-#         return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
-
 
 class Qwen3MoeDecoderLayer(nn.Module):
     def __init__(self, config, prefix, weights, layer_idx: int):
         super().__init__()
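For reference, the commented-out Qwen3MoeRMSNorm block removed above spelled out the standard RMSNorm computation: square the activations, average over the hidden dimension in float32, rescale by the reciprocal root, and multiply by a learned weight. A minimal standalone sketch of that math (plain PyTorch, illustrative class name, not the normalization layer TGI actually wires into the model) looks like this:

import torch
import torch.nn as nn


class RMSNormSketch(nn.Module):
    # Same math as the removed commented-out Qwen3MoeRMSNorm ("equivalent to T5LayerNorm").
    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        input_dtype = hidden_states.dtype
        # Accumulate the mean square in float32 for numerical stability.
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        # Apply the learned scale and cast back to the input dtype.
        return self.weight * hidden_states.to(input_dtype)

Since the served model already has a working normalization layer, this duplicate definition was dead code, which is what the commit removes.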
@@ -508,239 +487,6 @@ class Qwen3MoeModel(nn.Module):
         return hidden_states
 
-
-# def _update_causal_mask(
-#     self,
-#     attention_mask: Union[torch.Tensor, "BlockMask"],
-#     input_tensor: torch.Tensor,
-#     cache_position: torch.Tensor,
-#     past_key_values: Cache,
-#     output_attentions: bool = False,
-# ):
-#     if self.config._attn_implementation == "flash_attention_2":
-#         if attention_mask is not None and past_key_values is not None:
-#             is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
-#             if is_padding_right:
-#                 raise ValueError(
-#                     "You are attempting to perform batched generation with padding_side='right'"
-#                     " this may lead to unexpected behaviour for Flash Attention version of Qwen3Moe. Make sure to "
-#                     " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
-#                 )
-#         if attention_mask is not None and 0.0 in attention_mask:
-#             return attention_mask
-#         return None
-
-#     # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
-#     # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
-#     # to infer the attention mask.
-#     past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-#     using_static_cache = isinstance(past_key_values, StaticCache)
-#     using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
-
-#     # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
-#     if (
-#         self.config._attn_implementation == "sdpa"
-#         and not (using_static_cache or using_sliding_window_cache)
-#         and not output_attentions
-#     ):
-#         if AttentionMaskConverter._ignore_causal_mask_sdpa(
-#             attention_mask,
-#             inputs_embeds=input_tensor,
-#             past_key_values_length=past_seen_tokens,
-#             sliding_window=self.config.sliding_window,
-#             is_training=self.training,
-#         ):
-#             return None
-
-#     dtype = input_tensor.dtype
-#     min_dtype = torch.finfo(dtype).min
-#     sequence_length = input_tensor.shape[1]
-#     # SlidingWindowCache or StaticCache
-#     if using_sliding_window_cache or using_static_cache:
-#         target_length = past_key_values.get_max_cache_shape()
-#     # DynamicCache or no cache
-#     else:
-#         target_length = (
-#             attention_mask.shape[-1]
-#             if isinstance(attention_mask, torch.Tensor)
-#             else past_seen_tokens + sequence_length + 1
-#         )
-
-#     # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
-#     causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
-#         attention_mask,
-#         sequence_length=sequence_length,
-#         target_length=target_length,
-#         dtype=dtype,
-#         cache_position=cache_position,
-#         batch_size=input_tensor.shape[0],
-#         config=self.config,
-#         past_key_values=past_key_values,
-#     )
-
-#     if (
-#         self.config._attn_implementation == "sdpa"
-#         and attention_mask is not None
-#         and attention_mask.device.type in ["cuda", "xpu", "npu"]
-#         and not output_attentions
-#     ):
-#         # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
-#         # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
-#         # Details: https://github.com/pytorch/pytorch/issues/110213
-#         causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
-
-#     return causal_mask
-
-# @staticmethod
-# def _prepare_4d_causal_attention_mask_with_cache_position(
-#     attention_mask: torch.Tensor,
-#     sequence_length: int,
-#     target_length: int,
-#     dtype: torch.dtype,
-#     cache_position: torch.Tensor,
-#     batch_size: int,
-#     config: Qwen3MoeConfig,
-#     past_key_values: Cache,
-# ):
-#     """
-#     Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
-#     `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
-
-#     Args:
-#         attention_mask (`torch.Tensor`):
-#             A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
-#         sequence_length (`int`):
-#             The sequence length being processed.
-#         target_length (`int`):
-#             The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
-#         dtype (`torch.dtype`):
-#             The dtype to use for the 4D attention mask.
-#         cache_position (`torch.Tensor`):
-#             Indices depicting the position of the input sequence tokens in the sequence.
-#         batch_size (`torch.Tensor`):
-#             Batch size.
-#         config (`Qwen3MoeConfig`):
-#             The model's configuration class
-#         past_key_values (`Cache`):
-#             The cache class that is being used currently to generate
-#     """
-#     if attention_mask is not None and attention_mask.dim() == 4:
-#         # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
-#         causal_mask = attention_mask
-#     else:
-#         min_dtype = torch.finfo(dtype).min
-#         causal_mask = torch.full(
-#             (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
-#         )
-#         diagonal_attend_mask = torch.arange(target_length, device=cache_position.device) > cache_position.reshape(
-#             -1, 1
-#         )
-#         if config.get_text_config().sliding_window is not None:
-#             # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
-#             # the check is needed to verify is current checkpoint was trained with sliding window or not
-#             if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
-#                 sliding_attend_mask = torch.arange(target_length, device=cache_position.device) <= (
-#                     cache_position.reshape(-1, 1) - config.get_text_config().sliding_window
-#                 )
-#                 diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
-#         causal_mask *= diagonal_attend_mask
-#         causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
-#     if attention_mask is not None:
-#         causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
-#         if attention_mask.shape[-1] > target_length:
-#             attention_mask = attention_mask[:, :target_length]
-#         mask_length = attention_mask.shape[-1]
-#         padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
-#             causal_mask.device
-#         )
-#         padding_mask = padding_mask == 0
-#         causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
-#             padding_mask, min_dtype
-#         )
-#     return causal_mask
-
-
-# def load_balancing_loss_func(
-#     gate_logits: Union[torch.Tensor, Tuple[torch.Tensor], None],
-#     num_experts: Optional[int] = None,
-#     top_k=2,
-#     attention_mask: Optional[torch.Tensor] = None,
-# ) -> Union[torch.Tensor, int]:
-#     r"""
-#     Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
-
-#     See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
-#     function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
-#     experts is too unbalanced.
-
-#     Args:
-#         gate_logits:
-#             Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
-#             shape [batch_size X sequence_length, num_experts].
-#         num_experts:
-#             Number of experts
-#         top_k:
-#             The number of experts to route per-token, can be also interpreted as the `top-k` routing
-#             parameter.
-#         attention_mask (`torch.Tensor`, *optional*):
-#             The attention_mask used in forward function
-#             shape [batch_size X sequence_length] if not None.
-
-#     Returns:
-#         The auxiliary loss.
-#     """
-#     if gate_logits is None or not isinstance(gate_logits, tuple):
-#         return 0
-
-#     if isinstance(gate_logits, tuple):
-#         compute_device = gate_logits[0].device
-#         concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
-
-#     routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
-
-#     _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
-
-#     expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
-
-#     if attention_mask is None:
-#         # Compute the percentage of tokens routed to each experts
-#         tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
-
-#         # Compute the average probability of routing to these experts
-#         router_prob_per_expert = torch.mean(routing_weights, dim=0)
-#     else:
-#         batch_size, sequence_length = attention_mask.shape
-#         num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
-
-#         # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
-#         expert_attention_mask = (
-#             attention_mask[None, :, :, None, None]
-#             .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
-#             .reshape(-1, top_k, num_experts)
-#             .to(compute_device)
-#         )
-
-#         # Compute the percentage of tokens routed to each experts
-#         tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
-#             expert_attention_mask, dim=0
-#         )
-
-#         # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
-#         router_per_expert_attention_mask = (
-#             attention_mask[None, :, :, None]
-#             .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
-#             .reshape(-1, num_experts)
-#             .to(compute_device)
-#         )
-
-#         # Compute the average probability of routing to these experts
-#         router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
-#             router_per_expert_attention_mask, dim=0
-#         )
-
-#     overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
-#     return overall_loss * num_experts
-
-
 
 
 class Qwen3MoeForCausalLM(nn.Module):
     def __init__(self, prefix: str, config, weights):
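The removed _update_causal_mask and _prepare_4d_causal_attention_mask_with_cache_position comments appear to come from the upstream transformers eager/SDPA code path, which builds an additive 4D causal mask; nothing in this file calls them, so they were dead code here. As a rough illustration of the idea only (a simplified sketch with a hypothetical name, ignoring sliding windows and static caches), turning a 2D padding mask into an additive 4D causal mask can look like this:

import torch


def build_causal_mask_sketch(
    attention_mask_2d: torch.Tensor,  # [batch, key_value_length]; 1 = real token, 0 = padding
    query_length: int,
    dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    # Expand a 2D padding mask into an additive 4D causal mask
    # of shape [batch, 1, query_length, key_value_length].
    batch_size, key_value_length = attention_mask_2d.shape
    min_dtype = torch.finfo(dtype).min

    # Start fully masked, then unmask the causally visible positions. The query
    # tokens are assumed to occupy the last `query_length` cache slots.
    causal_mask = torch.full((query_length, key_value_length), min_dtype, dtype=dtype)
    cache_position = torch.arange(key_value_length - query_length, key_value_length)
    allowed = torch.arange(key_value_length)[None, :] <= cache_position[:, None]
    causal_mask = causal_mask.masked_fill(allowed, 0.0)

    # Broadcast to [batch, 1, query_length, key_value_length] and also mask padded keys.
    causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
    padding = attention_mask_2d[:, None, None, :] == 0
    return causal_mask.masked_fill(padding, min_dtype)

For example, build_causal_mask_sketch(torch.ones(2, 16), query_length=4) returns a [2, 1, 4, 16] mask in which positions a query may attend to hold 0 and all other positions hold the dtype's minimum value.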
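Similarly, the removed load_balancing_loss_func comments documented the Switch Transformer auxiliary load-balancing loss (equations (4)-(6) of https://arxiv.org/abs/2101.03961), a training-time quantity that an inference server has no use for. Ignoring the attention-mask branch, a minimal sketch of that loss (hypothetical function name, a single already-concatenated logits tensor assumed) is:

import torch
import torch.nn.functional as F


def load_balancing_loss_sketch(gate_logits: torch.Tensor, num_experts: int, top_k: int = 2) -> torch.Tensor:
    # gate_logits: [num_tokens, num_experts] router logits, already concatenated across layers.
    routing_weights = F.softmax(gate_logits, dim=-1)
    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
    expert_mask = F.one_hot(selected_experts, num_experts)   # [num_tokens, top_k, num_experts]
    # Fraction of tokens routed to each expert, and mean router probability per expert.
    tokens_per_expert = expert_mask.float().mean(dim=0)      # [top_k, num_experts]
    router_prob_per_expert = routing_weights.mean(dim=0)     # [num_experts]
    # The loss grows when routing concentrates on a few experts.
    return (tokens_per_expert * router_prob_per_expert.unsqueeze(0)).sum() * num_experts

In the removed version, the per-layer router logits were first concatenated with torch.cat and padding tokens were excluded via the attention mask; the sketch keeps only the core computation.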