commit 8f1de30b0f
parent b3e9a13e27
Author: Felix Marty
Date:   2024-06-13 07:31:11 +00:00


@@ -298,7 +298,7 @@ class GPT2MLP(nn.Module):


 class FlashGPT2Layer(nn.Module):
-    def __init__(self, prefix, config, weights):
+    def __init__(self, prefix, config, weights, layer_idx):
         super().__init__()
         self.self_attn = FlashGPT2Attention(
             prefix=f"{prefix}.attn", config=config, weights=weights
@@ -313,6 +313,7 @@ class FlashGPT2Layer(nn.Module):
             weights=weights,
             eps=config.layer_norm_epsilon,
         )
+        self.layer_idx = layer_idx

     def forward(
         self,
@@ -324,10 +325,14 @@ class FlashGPT2Layer(nn.Module):
         slots,
         input_lengths,
         max_s,
+        step,
     ):
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)

+        if self.layer_idx < 5:
+            torch.save(hidden_states, f"hidden_states_bef_attn_step{step}_layer{self.layer_idx}.pt")
+
         # Self Attention
         attn_output = self.self_attn(
             hidden_states,
@@ -339,6 +344,9 @@ class FlashGPT2Layer(nn.Module):
             max_s,
         )

+        if self.layer_idx < 5:
+            torch.save(attn_output, f"attn_output_step{step}_layer{self.layer_idx}.pt")
+
         hidden_states = attn_output + residual
         residual = hidden_states
@@ -346,6 +354,9 @@ class FlashGPT2Layer(nn.Module):
         mlp_output = self.mlp(hidden_states)

+        if self.layer_idx < 5:
+            torch.save(mlp_output, f"mlp_output_step{step}_layer{self.layer_idx}.pt")
+
         return residual + mlp_output, residual
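The three dumps above share one naming scheme, `<name>_step{step}_layer{layer_idx}.pt`, restricted to the first five layers. A minimal sketch of a loader that gathers those files back into a per-step, per-layer dict; the function name, directory argument, and pathlib-based parsing are illustrative assumptions, not part of this commit:

    from pathlib import Path

    import torch

    def load_debug_dumps(dump_dir=".", max_layers=5):
        # Gather the files written above into dumps[step][layer][name] = tensor,
        # e.g. dumps[0][3]["attn_output"]. Names are parsed from the filename
        # pattern used by the torch.save calls in the diff.
        dumps = {}
        for path in Path(dump_dir).glob("*_step*_layer*.pt"):
            stem = path.stem                      # e.g. "attn_output_step0_layer3"
            rest, _, layer = stem.rpartition("_layer")
            name, _, step = rest.rpartition("_step")
            if int(layer) >= max_layers:
                continue
            dumps.setdefault(int(step), {}).setdefault(int(layer), {})[name] = (
                torch.load(path, map_location="cpu")
            )
        return dumps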
@@ -364,6 +375,7 @@ class FlashGPT2Model(torch.nn.Module):
                 ),
                 config=config,
                 weights=weights,
+                layer_idx=layer_id
             )
             for layer_id in range(config.num_hidden_layers)
         ]
@@ -379,6 +391,7 @@ class FlashGPT2Model(torch.nn.Module):
         self.head_size = self.layers[0].self_attn.head_size
         self.num_heads = self.layers[0].self_attn.num_heads
+        self.step = 0

     def forward(
         self,
@@ -406,7 +419,9 @@ class FlashGPT2Model(torch.nn.Module):
                 slots,
                 input_lengths,
                 max_s,
+                self.step,
             )
+        self.step += 1

         hidden_states = self.norm(hidden_states)
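Since self.step is bumped once per FlashGPT2Model.forward call, dumps from successive forward passes land in distinct files, so the same tensor can be diffed across two runs (say, a reference eager implementation and this flash-attention one). A hedged sketch of such a comparison; the directory names, tolerance, and compare_dump helper are hypothetical, not part of this commit:

    import torch

    def compare_dump(file_a, file_b, atol=1e-3):
        # Load the same dump from two runs and report the largest absolute deviation.
        a = torch.load(file_a, map_location="cpu").float()
        b = torch.load(file_b, map_location="cpu").float()
        max_diff = (a - b).abs().max().item()
        print(f"{file_a} vs {file_b}: max abs diff = {max_diff:.3e}")
        return max_diff <= atol

    # Hypothetical usage: pre-attention hidden states of layer 0 at step 0,
    # dumped by two runs into "ref/" and "flash/" respectively.
    # compare_dump("ref/hidden_states_bef_attn_step0_layer0.pt",
    #              "flash/hidden_states_bef_attn_step0_layer0.pt")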