Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-06-19 07:42:06 +00:00
Remove useless code
Signed-off-by: yuanwu <yuan.wu@intel.com>
parent 91c40e6c58, commit 0bbd8d1645
@@ -344,27 +344,6 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         return final_hidden_states
 
-
-# @use_kernel_forward_from_hub("RMSNorm")
-# class Qwen3MoeRMSNorm(nn.Module):
-#     def __init__(self, hidden_size, eps=1e-6):
-#         """
-#         Qwen3MoeRMSNorm is equivalent to T5LayerNorm
-#         """
-#         super().__init__()
-#         self.weight = nn.Parameter(torch.ones(hidden_size))
-#         self.variance_epsilon = eps
-
-#     def forward(self, hidden_states):
-#         input_dtype = hidden_states.dtype
-#         hidden_states = hidden_states.to(torch.float32)
-#         variance = hidden_states.pow(2).mean(-1, keepdim=True)
-#         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-#         return self.weight * hidden_states.to(input_dtype)
-
-#     def extra_repr(self):
-#         return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
-
 
 class Qwen3MoeDecoderLayer(nn.Module):
     def __init__(self, config, prefix, weights, layer_idx: int):
         super().__init__()
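For reference, the commented-out Qwen3MoeRMSNorm block removed above spelled out the standard RMSNorm computation: square the activations, average over the hidden dimension in float32, rescale by the reciprocal root, and multiply by a learned weight. A minimal standalone sketch of that math (plain PyTorch, illustrative class name, not the normalization layer TGI actually wires into the model) looks like this:

import torch
import torch.nn as nn


class RMSNormSketch(nn.Module):
    # Same math as the removed commented-out Qwen3MoeRMSNorm ("equivalent to T5LayerNorm").
    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        input_dtype = hidden_states.dtype
        # Accumulate the mean square in float32 for numerical stability.
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        # Apply the learned scale and cast back to the input dtype.
        return self.weight * hidden_states.to(input_dtype)

Since the served model already has a working normalization layer, this duplicate definition was dead code, which is what the commit removes.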
@@ -508,239 +487,6 @@ class Qwen3MoeModel(nn.Module):
         return hidden_states
 
-
-# def _update_causal_mask(
-#     self,
-#     attention_mask: Union[torch.Tensor, "BlockMask"],
-#     input_tensor: torch.Tensor,
-#     cache_position: torch.Tensor,
-#     past_key_values: Cache,
-#     output_attentions: bool = False,
-# ):
-#     if self.config._attn_implementation == "flash_attention_2":
-#         if attention_mask is not None and past_key_values is not None:
-#             is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
-#             if is_padding_right:
-#                 raise ValueError(
-#                     "You are attempting to perform batched generation with padding_side='right'"
-#                     " this may lead to unexpected behaviour for Flash Attention version of Qwen3Moe. Make sure to "
-#                     " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
-#                 )
-#         if attention_mask is not None and 0.0 in attention_mask:
-#             return attention_mask
-#         return None
-
-#     # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
-#     # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
-#     # to infer the attention mask.
-#     past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-#     using_static_cache = isinstance(past_key_values, StaticCache)
-#     using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
-
-#     # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
-#     if (
-#         self.config._attn_implementation == "sdpa"
-#         and not (using_static_cache or using_sliding_window_cache)
-#         and not output_attentions
-#     ):
-#         if AttentionMaskConverter._ignore_causal_mask_sdpa(
-#             attention_mask,
-#             inputs_embeds=input_tensor,
-#             past_key_values_length=past_seen_tokens,
-#             sliding_window=self.config.sliding_window,
-#             is_training=self.training,
-#         ):
-#             return None
-
-#     dtype = input_tensor.dtype
-#     min_dtype = torch.finfo(dtype).min
-#     sequence_length = input_tensor.shape[1]
-#     # SlidingWindowCache or StaticCache
-#     if using_sliding_window_cache or using_static_cache:
-#         target_length = past_key_values.get_max_cache_shape()
-#     # DynamicCache or no cache
-#     else:
-#         target_length = (
-#             attention_mask.shape[-1]
-#             if isinstance(attention_mask, torch.Tensor)
-#             else past_seen_tokens + sequence_length + 1
-#         )
-
-#     # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
-#     causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
-#         attention_mask,
-#         sequence_length=sequence_length,
-#         target_length=target_length,
-#         dtype=dtype,
-#         cache_position=cache_position,
-#         batch_size=input_tensor.shape[0],
-#         config=self.config,
-#         past_key_values=past_key_values,
-#     )
-
-#     if (
-#         self.config._attn_implementation == "sdpa"
-#         and attention_mask is not None
-#         and attention_mask.device.type in ["cuda", "xpu", "npu"]
-#         and not output_attentions
-#     ):
-#         # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
-#         # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
-#         # Details: https://github.com/pytorch/pytorch/issues/110213
-#         causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
-
-#     return causal_mask
-
-# @staticmethod
-# def _prepare_4d_causal_attention_mask_with_cache_position(
-#     attention_mask: torch.Tensor,
-#     sequence_length: int,
-#     target_length: int,
-#     dtype: torch.dtype,
-#     cache_position: torch.Tensor,
-#     batch_size: int,
-#     config: Qwen3MoeConfig,
-#     past_key_values: Cache,
-# ):
-#     """
-#     Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
-#     `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
-
-#     Args:
-#         attention_mask (`torch.Tensor`):
-#             A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
-#         sequence_length (`int`):
-#             The sequence length being processed.
-#         target_length (`int`):
-#             The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
-#         dtype (`torch.dtype`):
-#             The dtype to use for the 4D attention mask.
-#         cache_position (`torch.Tensor`):
-#             Indices depicting the position of the input sequence tokens in the sequence.
-#         batch_size (`torch.Tensor`):
-#             Batch size.
-#         config (`Qwen3MoeConfig`):
-#             The model's configuration class
-#         past_key_values (`Cache`):
-#             The cache class that is being used currently to generate
-#     """
-#     if attention_mask is not None and attention_mask.dim() == 4:
-#         # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
-#         causal_mask = attention_mask
-#     else:
-#         min_dtype = torch.finfo(dtype).min
-#         causal_mask = torch.full(
-#             (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
-#         )
-#         diagonal_attend_mask = torch.arange(target_length, device=cache_position.device) > cache_position.reshape(
-#             -1, 1
-#         )
-#         if config.get_text_config().sliding_window is not None:
-#             # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
-#             # the check is needed to verify is current checkpoint was trained with sliding window or not
-#             if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
-#                 sliding_attend_mask = torch.arange(target_length, device=cache_position.device) <= (
-#                     cache_position.reshape(-1, 1) - config.get_text_config().sliding_window
-#                 )
-#                 diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
-#         causal_mask *= diagonal_attend_mask
-#         causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
-#     if attention_mask is not None:
-#         causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
-#         if attention_mask.shape[-1] > target_length:
-#             attention_mask = attention_mask[:, :target_length]
-#         mask_length = attention_mask.shape[-1]
-#         padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
-#             causal_mask.device
-#         )
-#         padding_mask = padding_mask == 0
-#         causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
-#             padding_mask, min_dtype
-#         )
-#     return causal_mask
-
-
-# def load_balancing_loss_func(
-#     gate_logits: Union[torch.Tensor, Tuple[torch.Tensor], None],
-#     num_experts: Optional[int] = None,
-#     top_k=2,
-#     attention_mask: Optional[torch.Tensor] = None,
-# ) -> Union[torch.Tensor, int]:
-#     r"""
-#     Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
-
-#     See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
-#     function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
-#     experts is too unbalanced.
-
-#     Args:
-#         gate_logits:
-#             Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
-#             shape [batch_size X sequence_length, num_experts].
-#         num_experts:
-#             Number of experts
-#         top_k:
-#             The number of experts to route per-token, can be also interpreted as the `top-k` routing
-#             parameter.
-#         attention_mask (`torch.Tensor`, *optional*):
-#             The attention_mask used in forward function
-#             shape [batch_size X sequence_length] if not None.
-
-#     Returns:
-#         The auxiliary loss.
-#     """
-#     if gate_logits is None or not isinstance(gate_logits, tuple):
-#         return 0
-
-#     if isinstance(gate_logits, tuple):
-#         compute_device = gate_logits[0].device
-#         concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
-
-#     routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
-
-#     _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
-
-#     expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
-
-#     if attention_mask is None:
-#         # Compute the percentage of tokens routed to each experts
-#         tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
-
-#         # Compute the average probability of routing to these experts
-#         router_prob_per_expert = torch.mean(routing_weights, dim=0)
-#     else:
-#         batch_size, sequence_length = attention_mask.shape
-#         num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
-
-#         # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
-#         expert_attention_mask = (
-#             attention_mask[None, :, :, None, None]
-#             .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
-#             .reshape(-1, top_k, num_experts)
-#             .to(compute_device)
-#         )
-
-#         # Compute the percentage of tokens routed to each experts
-#         tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
-#             expert_attention_mask, dim=0
-#         )
-
-#         # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
-#         router_per_expert_attention_mask = (
-#             attention_mask[None, :, :, None]
-#             .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
-#             .reshape(-1, num_experts)
-#             .to(compute_device)
-#         )
-
-#         # Compute the average probability of routing to these experts
-#         router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
-#             router_per_expert_attention_mask, dim=0
-#         )
-
-#     overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
-#     return overall_loss * num_experts
-
-
 
 
 class Qwen3MoeForCausalLM(nn.Module):
     def __init__(self, prefix: str, config, weights):
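The removed _update_causal_mask and _prepare_4d_causal_attention_mask_with_cache_position comments appear to come from the upstream transformers eager/SDPA code path, which builds an additive 4D causal mask; nothing in this file calls them, so they were dead code here. As a rough illustration of the idea only (a simplified sketch with a hypothetical name, ignoring sliding windows and static caches), turning a 2D padding mask into an additive 4D causal mask can look like this:

import torch


def build_causal_mask_sketch(
    attention_mask_2d: torch.Tensor,  # [batch, key_value_length]; 1 = real token, 0 = padding
    query_length: int,
    dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    # Expand a 2D padding mask into an additive 4D causal mask
    # of shape [batch, 1, query_length, key_value_length].
    batch_size, key_value_length = attention_mask_2d.shape
    min_dtype = torch.finfo(dtype).min

    # Start fully masked, then unmask the causally visible positions. The query
    # tokens are assumed to occupy the last `query_length` cache slots.
    causal_mask = torch.full((query_length, key_value_length), min_dtype, dtype=dtype)
    cache_position = torch.arange(key_value_length - query_length, key_value_length)
    allowed = torch.arange(key_value_length)[None, :] <= cache_position[:, None]
    causal_mask = causal_mask.masked_fill(allowed, 0.0)

    # Broadcast to [batch, 1, query_length, key_value_length] and also mask padded keys.
    causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
    padding = attention_mask_2d[:, None, None, :] == 0
    return causal_mask.masked_fill(padding, min_dtype)

For example, build_causal_mask_sketch(torch.ones(2, 16), query_length=4) returns a [2, 1, 4, 16] mask in which positions a query may attend to hold 0 and all other positions hold the dtype's minimum value.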
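Similarly, the removed load_balancing_loss_func comments documented the Switch Transformer auxiliary load-balancing loss (equations (4)-(6) of https://arxiv.org/abs/2101.03961), a training-time quantity that an inference server has no use for. Ignoring the attention-mask branch, a minimal sketch of that loss (hypothetical function name, a single already-concatenated logits tensor assumed) is:

import torch
import torch.nn.functional as F


def load_balancing_loss_sketch(gate_logits: torch.Tensor, num_experts: int, top_k: int = 2) -> torch.Tensor:
    # gate_logits: [num_tokens, num_experts] router logits, already concatenated across layers.
    routing_weights = F.softmax(gate_logits, dim=-1)
    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
    expert_mask = F.one_hot(selected_experts, num_experts)   # [num_tokens, top_k, num_experts]
    # Fraction of tokens routed to each expert, and mean router probability per expert.
    tokens_per_expert = expert_mask.float().mean(dim=0)      # [top_k, num_experts]
    router_prob_per_expert = routing_weights.mean(dim=0)     # [num_experts]
    # The loss grows when routing concentrates on a few experts.
    return (tokens_per_expert * router_prob_per_expert.unsqueeze(0)).sum() * num_experts

In the removed version, the per-layer router logits were first concatenated with torch.cat and padding tokens were excluded via the attention mask; the sketch keeps only the core computation.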