mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 12:24:53 +00:00

feat: support force downcast after FastRMSNorm multiply

parent dfbd9a39a2
commit b307fce653
@@ -209,7 +209,7 @@ class GemmaConfig(PretrainedConfig):
         num_attention_heads=16,
         num_key_value_heads=16,
         head_dim=256,
-        hidden_act="gelu",
+        hidden_act="gelu_pytorch_tanh",
         max_position_embeddings=8192,
         initializer_range=0.02,
         rms_norm_eps=1e-6,
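
For context on the activation fix: Gemma was trained with the tanh approximation of GELU, which transformers selects via hidden_act="gelu_pytorch_tanh"; plain "gelu" maps to the exact erf-based form. A minimal sketch of the gap, using PyTorch's approximate flag (illustrative only, not part of this commit):

import torch
import torch.nn.functional as F

x = torch.linspace(-4.0, 4.0, steps=2048)

exact = F.gelu(x)  # x * Phi(x), erf-based GELU
approx = F.gelu(x, approximate="tanh")  # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))

# Small but nonzero difference; Gemma checkpoints expect the tanh variant.
print((exact - approx).abs().max())
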
@@ -473,7 +473,9 @@ class FlashGemmaLayer(nn.Module):
         input_lengths,
         max_s,
     ):
-        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)
+        normed_hidden_states, res = self.input_layernorm(
+            hidden_states, residual, force_downcast_after=True
+        )
 
         # Self Attention
         attn_output = self.self_attn(
@@ -490,7 +492,7 @@ class FlashGemmaLayer(nn.Module):
 
         # faster post attention rms norm
         normed_attn_res_output, attn_res = self.post_attention_layernorm(
-            attn_output, res
+            attn_output, res, force_downcast_after=True
         )
 
         mlp_output = self.mlp(normed_attn_res_output)
@@ -687,7 +687,7 @@ try:
             weight = weights.get_tensor(f"{prefix}.weight")
             return cls(weight, eps)
 
-        def forward(self, hidden_states, residual=None):
+        def forward(self, hidden_states, residual=None, force_downcast_after=False):
             if hidden_states.shape[-1] > 8192:
                 if residual is not None:
                     hidden_states += residual
@@ -701,9 +701,23 @@ try:
 
                 # convert into half-precision if necessary
                 if self.weight.dtype in [torch.float16, torch.bfloat16]:
-                    hidden_states = hidden_states.to(self.weight.dtype)
+                    # perform the multiplication in float32 then cast back to half
+                    if force_downcast_after:
+                        hidden_states = (hidden_states * self.weight).to(
+                            self.weight.dtype
+                        )
+                    else:
+                        # cast to half before the multiplication
+                        hidden_states = self.weight * hidden_states.to(
+                            self.weight.dtype
+                        )
 
-                return self.weight * hidden_states, residual
+                # avoid converting to half and multiply in float32
+                else:
+                    hidden_states = self.weight * hidden_states
+
+                return hidden_states, residual
+
         elif IS_CUDA_SYSTEM:
             # faster post attention rms norm
             (
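
To see what the new flag changes numerically, here is a standalone sketch (illustrative only; shapes, seed, and variable names are made up, and this is not the TGI code path). With force_downcast_after=True the learned-weight multiply runs in float32 and only the product is rounded to half precision, instead of rounding the normalized activations first and multiplying in half precision:

import torch

torch.manual_seed(0)
hidden = torch.randn(4, 2048, dtype=torch.float32)
weight = torch.randn(2048, dtype=torch.bfloat16)
eps = 1e-6

# float32 RMS statistics, as computed earlier in the method
variance = hidden.pow(2).mean(-1, keepdim=True)
normed = hidden * torch.rsqrt(variance + eps)

# previous behaviour: cast to half first, then multiply in half precision
before = weight * normed.to(weight.dtype)

# force_downcast_after=True: multiply in float32 (via type promotion), downcast after
after = (normed * weight).to(weight.dtype)

# The outputs differ slightly because rounding happens at different points.
print((before.float() - after.float()).abs().max())
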