mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-06-19 15:52:08 +00:00
Remove debug info
Signed-off-by: yuanwu <yuan.wu@intel.com>
This commit is contained in:
parent
5155fef477
commit
1a5ef906ae
@ -30,12 +30,6 @@ from text_generation_server.layers import (
|
|||||||
SpeculativeHead,
|
SpeculativeHead,
|
||||||
FastLinear,
|
FastLinear,
|
||||||
)
|
)
|
||||||
from text_generation_server.utils.import_utils import (
|
|
||||||
synchronize,
|
|
||||||
get_free_memory,
|
|
||||||
)
|
|
||||||
from loguru import logger
|
|
||||||
from text_generation_server.utils.log import log_master
|
|
||||||
|
|
||||||
from text_generation_server.layers.layernorm import (
|
from text_generation_server.layers.layernorm import (
|
||||||
FastRMSNorm,
|
FastRMSNorm,
|
||||||
@ -359,9 +353,6 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
|
|||||||
routing_weights, selected_experts = torch.topk(
|
routing_weights, selected_experts = torch.topk(
|
||||||
routing_weights, self.top_k, dim=-1
|
routing_weights, self.top_k, dim=-1
|
||||||
)
|
)
|
||||||
print(
|
|
||||||
f"routing_weights: {routing_weights.device}, selected_experts: {selected_experts.device}"
|
|
||||||
)
|
|
||||||
if self.norm_topk_prob: # only diff with mixtral sparse moe block!
|
if self.norm_topk_prob: # only diff with mixtral sparse moe block!
|
||||||
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
|
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
|
||||||
# we cast back to the input dtype
|
# we cast back to the input dtype
|
||||||
@ -376,7 +367,6 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
|
|||||||
expert_mask = torch.nn.functional.one_hot(
|
expert_mask = torch.nn.functional.one_hot(
|
||||||
selected_experts, num_classes=self.num_experts
|
selected_experts, num_classes=self.num_experts
|
||||||
).permute(2, 1, 0)
|
).permute(2, 1, 0)
|
||||||
print(f"expert_mask: {expert_mask.device}")
|
|
||||||
# Loop over all available experts in the model and perform the computation on each expert
|
# Loop over all available experts in the model and perform the computation on each expert
|
||||||
for expert_idx in range(self.num_experts):
|
for expert_idx in range(self.num_experts):
|
||||||
expert_layer = self.experts[expert_idx]
|
expert_layer = self.experts[expert_idx]
|
||||||
@ -432,10 +422,6 @@ class Qwen3MoeDecoderLayer(nn.Module):
|
|||||||
SparseMoELayer if SparseMoELayer.is_supported(weights) else DenseMoELayer
|
SparseMoELayer if SparseMoELayer.is_supported(weights) else DenseMoELayer
|
||||||
)
|
)
|
||||||
|
|
||||||
moe_layer_cls = (
|
|
||||||
SparseMoELayer if SparseMoELayer.is_supported(weights) else DenseMoELayer
|
|
||||||
)
|
|
||||||
|
|
||||||
if (layer_idx not in config.mlp_only_layers) and (
|
if (layer_idx not in config.mlp_only_layers) and (
|
||||||
config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
|
config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
|
||||||
):
|
):
|
||||||
@ -516,11 +502,6 @@ class Qwen3MoeModel(nn.Module):
|
|||||||
self.norm = FastRMSNorm.load(
|
self.norm = FastRMSNorm.load(
|
||||||
prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
|
prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
|
||||||
)
|
)
|
||||||
synchronize(weights.device)
|
|
||||||
real_free_memory = get_free_memory(weights.device, 1)
|
|
||||||
log_master(
|
|
||||||
logger.debug, f"init model Free memory real: {real_free_memory / 1e9:.2f}GB"
|
|
||||||
)
|
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
|
@ -49,7 +49,8 @@ from text_generation_server.models.custom_modeling.flash_qwen2_modeling import (
|
|||||||
# Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py
|
# Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from transformers.feature_extraction_utils import BatchFeature
|
from transformers.feature_extraction_utils import BatchFeature
|
||||||
from transformers.image_utils import ImageInput, VideoInput
|
from transformers.image_utils import ImageInput
|
||||||
|
from transformers.video_utils import VideoInput
|
||||||
from transformers.processing_utils import (
|
from transformers.processing_utils import (
|
||||||
ProcessingKwargs,
|
ProcessingKwargs,
|
||||||
ProcessorMixin,
|
ProcessorMixin,
|
||||||
|
Loading…
Reference in New Issue
Block a user