mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-09 11:24:53 +00:00

Fix crash

Signed-off-by: yuanwu <yuan.wu@intel.com>

parent 3482d7ca82
commit ccddbba752
@@ -25,7 +25,6 @@ class FastLinear(torch.nn.Module):
         return cls(weight, bias)

     def forward(self, input: torch.Tensor) -> torch.Tensor:
-        print(f"input.shape={input.shape}, self.weight={self.weight.shape}")
         return F.linear(input, self.weight, self.bias)


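For orientation, a minimal sketch of the pattern FastLinear implements: a thin inference-only wrapper that holds a preloaded weight and optional bias and applies F.linear. The loader plumbing in the real class (quantization, sharded weights) is elided, and the class is renamed to make clear it is a stand-in:

from typing import Optional

import torch
import torch.nn.functional as F


class FastLinearSketch(torch.nn.Module):
    # Simplified stand-in for FastLinear: an inference-only linear layer
    # built from tensors that were already loaded from a checkpoint.
    def __init__(self, weight: torch.Tensor, bias: Optional[torch.Tensor]) -> None:
        super().__init__()
        self.weight = torch.nn.Parameter(weight, requires_grad=False)
        self.bias = (
            torch.nn.Parameter(bias, requires_grad=False) if bias is not None else None
        )

    @classmethod
    def load(cls, weight: torch.Tensor, bias: Optional[torch.Tensor]) -> "FastLinearSketch":
        return cls(weight, bias)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return F.linear(input, self.weight, self.bias)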
@@ -37,7 +37,6 @@ class UnquantizedSparseMoELayer(nn.Module):
         self.weight_block_size = weights.weights_loader.weight_block_size
         self.scoring_func = scoring_func
         self.e_score_correction_bias = e_score_correction_bias

         self.gate_up_proj = _load_expert_multi_weights_col(
             prefix=prefix,
             n_experts=n_experts,
@@ -52,7 +51,6 @@ class UnquantizedSparseMoELayer(nn.Module):
             name=down_proj_name,
             weights=weights,
         )

         self.hpu_fused_moe = DynamicFusedMOE(n_experts)
         for i in range(n_experts):
             self.hpu_fused_moe.MoeOp.w13_list[i].set_weight(self.gate_up_proj[i])
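The two hunks above show how UnquantizedSparseMoELayer hands per-expert weights to the Habana fused-MoE op. A hedged sketch of that wiring, with mock stand-ins for DynamicFusedMOE (the real op lives in the Habana software stack) and hypothetical shapes:

from typing import List, Optional

import torch


class _ExpertSlot:
    # Stand-in for one expert's weight slot on the fused op.
    def __init__(self) -> None:
        self.weight: Optional[torch.Tensor] = None

    def set_weight(self, weight: torch.Tensor) -> None:
        self.weight = weight


class _MockMoeOp:
    # Stand-in for DynamicFusedMOE.MoeOp: one slot per expert for the
    # fused gate+up projection ("w13") and the down projection ("w2").
    def __init__(self, n_experts: int) -> None:
        self.w13_list: List[_ExpertSlot] = [_ExpertSlot() for _ in range(n_experts)]
        self.w2_list: List[_ExpertSlot] = [_ExpertSlot() for _ in range(n_experts)]


n_experts = 8
moe_op = _MockMoeOp(n_experts)
# Hypothetical shapes: [n_experts, 2 * intermediate, hidden] for gate+up.
gate_up_proj = torch.randn(n_experts, 2 * 128, 64)

for i in range(n_experts):
    moe_op.w13_list[i].set_weight(gate_up_proj[i])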
(File diff suppressed because it is too large.)
@@ -31,6 +31,7 @@ from text_generation_server.layers.attention import (
     KVCache,
     get_kv_scales,
 )
+from text_generation_server.utils.log import log_master
 from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer
 from text_generation_server.layers.attention import (
     paged_attention,
@@ -46,6 +47,7 @@ from text_generation_server.layers import (
     TensorParallelMultiAdapterLinear,
     TensorParallelAdapterRowLinear,
 )
+from loguru import logger
 from text_generation_server.layers.rotary import PositionRotaryEmbedding
 from text_generation_server.layers.layernorm import (
     FastRMSNorm,
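Both added imports serve the debug logging introduced in the next hunk. log_master is TGI's helper for emitting a log line once from the master rank instead of once per shard; a rough sketch of the idea, assuming the rank is carried in the RANK environment variable:

import os

from loguru import logger


def log_master(log, msg: str):
    # Sketch: emit the message only on rank 0, so multi-shard runs do
    # not repeat every log line once per process.
    if int(os.getenv("RANK", "0")) == 0:
        log(msg)


log_master(logger.debug, "loaded model shard")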
@@ -633,7 +635,14 @@ class FlashLlamaForCausalLM(torch.nn.Module):
         adapter_data: Optional[torch.Tensor] = None,
         cross_attention_states=None,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+
+
+        log_master(
+            logger.debug,
+            f"input_ids: {input_ids}, input_ids.shape={input_ids.shape}, input_ids={input_ids[:-20]}"
+        )
         inputs_embeds = self.embed_tokens(input_ids)
+        print(f"111111111 inputs_embeds: {inputs_embeds}")
         hidden_states = self.model(
             inputs_embeds,
             position_ids,
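A small aside on the logging added here: if the intent was to preview the last 20 tokens, input_ids[-20:] is presumably what was meant, since input_ids[:-20] drops the tail instead of showing it:

import torch

input_ids = torch.arange(100)
print(input_ids[:-20].shape)  # torch.Size([80]): everything except the last 20
print(input_ids[-20:].shape)  # torch.Size([20]): only the last 20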
@@ -1792,7 +1792,7 @@ class FlashCausalLM(Model):
         kwargs = {}
         if htorch.utils.internal.is_lazy():
             kwargs["bypass_hpu_graphs"] = batch.prefilling
-
+        print(f"11111111111111111111input_ids: {input_ids.shape}")
         logits, speculative_logits = self.model.forward(
             input_ids=input_ids,
             position_ids=position_ids,
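For context on the bypass_hpu_graphs flag: under Habana lazy mode the decode path is typically captured as an HPU graph (models are usually wrapped with htorch.hpu.wrap_in_hpu_graph, whose wrapper understands this keyword), while prefill steps, whose shapes vary batch to batch, are routed around the capture. A hedged sketch of the pattern; model, batch, and the input tensors are hypothetical placeholders, and it runs only where the habana_frameworks stack is installed:

import habana_frameworks.torch as htorch  # Habana stack; HPU-only


def run_forward(model, batch, input_ids, position_ids):
    # Sketch: replay the captured HPU graph for steady-state decode,
    # but bypass it during prefill, where tensor shapes keep changing.
    kwargs = {}
    if htorch.utils.internal.is_lazy():
        kwargs["bypass_hpu_graphs"] = batch.prefilling
    return model.forward(
        input_ids=input_ids,
        position_ids=position_ids,
        **kwargs,
    )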