Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-11 12:24:53 +00:00)
feat: prefer gemma specific rms

commit 5b076dfcf2 (parent b307fce653)
@@ -261,6 +261,28 @@ class GemmaFastRMSNorm(FastRMSNorm):
         weight = weights.get_tensor(f"{prefix}.weight") + 1
         return cls(weight, eps)
 
+    # perform the multiplication in full precision and downcast after
+    def forward_downcast_after(self, hidden_states, residual=None):
+        if residual is not None:
+            hidden_states += residual
+        residual = hidden_states
+
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+
+        # convert into half-precision if necessary
+        if self.weight.dtype in [torch.float16, torch.bfloat16]:
+            hidden_states = (hidden_states * self.weight).to(self.weight.dtype)
+        else:
+            hidden_states = hidden_states * self.weight
+
+        return hidden_states, residual
+
+    def forward(self, hidden_states, residual=None):
+        hidden_states, residual = self.forward_downcast_after(hidden_states, residual)
+        return hidden_states, residual
+
 
 def load_attention(config, prefix, weights):
     if config.num_attention_heads != config.num_key_value_heads:
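A minimal standalone sketch (not part of the diff; names, shapes, and eps are illustrative) of why the ordering matters: multiplying in float32 and downcasting the product rounds differently than downcasting the normalized activations before the multiply. Note also the `+ 1` in the loader above, which folds Gemma's `(1 + weight)` convention into the stored tensor at load time.

import torch

# Illustrative tensors only; assumes bfloat16 weights as in Gemma checkpoints.
hidden_states = torch.randn(2, 8, dtype=torch.bfloat16)
weight = torch.randn(8, dtype=torch.bfloat16)
eps = 1e-6

# Shared part of both paths: normalize in float32.
x = hidden_states.to(torch.float32)
variance = x.pow(2).mean(-1, keepdim=True)
x = x * torch.rsqrt(variance + eps)

# Generic FastRMSNorm ordering: downcast first, multiply in bfloat16.
downcast_before = weight * x.to(weight.dtype)

# Gemma-specific ordering: multiply in float32, downcast the product.
downcast_after = (x * weight).to(weight.dtype)

# The two orderings round differently, so outputs can disagree by an ulp or more.
print((downcast_before.float() - downcast_after.float()).abs().max())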
@@ -473,9 +495,7 @@ class FlashGemmaLayer(nn.Module):
         input_lengths,
         max_s,
     ):
-        normed_hidden_states, res = self.input_layernorm(
-            hidden_states, residual, force_downcast_after=True
-        )
+        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)
 
         # Self Attention
         attn_output = self.self_attn(
@@ -492,7 +512,7 @@ class FlashGemmaLayer(nn.Module):
 
         # faster post attention rms norm
         normed_attn_res_output, attn_res = self.post_attention_layernorm(
-            attn_output, res, force_downcast_after=True
+            attn_output, res
         )
 
         mlp_output = self.mlp(normed_attn_res_output)
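With the override in place, both call sites in `FlashGemmaLayer` keep the plain two-argument form: the class, not a flag, now selects the downcast ordering, which is why the `force_downcast_after=True` arguments disappear. A condensed sketch of that dispatch pattern, using simplified hypothetical stand-ins rather than the actual TGI classes:

import torch
from torch import nn

class BaseRMSNorm(nn.Module):
    """Simplified stand-in for the shared norm: downcast before multiplying."""
    def __init__(self, weight, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(weight)
        self.variance_epsilon = eps

    def forward(self, hidden_states, residual=None):
        if residual is not None:
            hidden_states = hidden_states + residual
        residual = hidden_states
        x = hidden_states.to(torch.float32)
        x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.variance_epsilon)
        # generic ordering: downcast first, then multiply
        return self.weight * x.to(self.weight.dtype), residual

class GemmaRMSNorm(BaseRMSNorm):
    """Gemma override: multiply in float32, downcast the product after."""
    def forward(self, hidden_states, residual=None):
        if residual is not None:
            hidden_states = hidden_states + residual
        residual = hidden_states
        x = hidden_states.to(torch.float32)
        x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.variance_epsilon)
        return (x * self.weight).to(self.weight.dtype), residual

# Identical call sites for both classes; the subclass picks the ordering.
norm = GemmaRMSNorm(torch.ones(8, dtype=torch.bfloat16))
out, res = norm(torch.randn(2, 8, dtype=torch.bfloat16))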
@@ -687,7 +687,7 @@ try:
             weight = weights.get_tensor(f"{prefix}.weight")
             return cls(weight, eps)
 
-        def forward(self, hidden_states, residual=None, force_downcast_after=False):
+        def forward(self, hidden_states, residual=None):
             if hidden_states.shape[-1] > 8192:
                 if residual is not None:
                     hidden_states += residual
@@ -701,23 +701,9 @@ try:
 
                 # convert into half-precision if necessary
                 if self.weight.dtype in [torch.float16, torch.bfloat16]:
-                    # perform the multiplication in float32 then cast back to half
-                    if force_downcast_after:
-                        hidden_states = (hidden_states * self.weight).to(
-                            self.weight.dtype
-                        )
-                    else:
-                        # cast to half before the multiplication
-                        hidden_states = self.weight * hidden_states.to(
-                            self.weight.dtype
-                        )
-
-                # avoid converting to half and multiply in float32
-                else:
-                    hidden_states = self.weight * hidden_states
-
-                return hidden_states, residual
-
+                    hidden_states = hidden_states.to(self.weight.dtype)
+
+                return self.weight * hidden_states, residual
         elif IS_CUDA_SYSTEM:
             # faster post attention rms norm
             (
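For non-Gemma models the last two hunks read as a pure revert: the restored body performs the same cast and multiply as the parent's `force_downcast_after=False` branch, just moved into the return. A quick standalone check of that equivalence (illustrative tensors only):

import torch

# Stand-in for already-normalized float32 activations.
hidden_states = torch.randn(4, 16)
weight = torch.randn(16, dtype=torch.float16)

# Parent commit, force_downcast_after=False branch:
old = weight * hidden_states.to(weight.dtype)

# This commit's restored body: cast to the weight dtype, then multiply.
hs = hidden_states.to(weight.dtype)
new = weight * hs

print(torch.equal(old, new))  # True: same casts, same multiply, bit-for-bit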