fix: simplify syntax

2025-09-11 12:24:53 +00:00 · 2024-03-21 03:33:56 +00:00 · 2024-03-21 03:33:56 +00:00 · 704d4ddfaa
commit 704d4ddfaa
parent 5b076dfcf2
1 changed file with 3 additions and 14 deletions
--- a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
@@ -262,26 +262,15 @@ class GemmaFastRMSNorm(FastRMSNorm):
        return cls(weight, eps)

    # perform the multiplication in full precision and downcast after
-    def forward_downcast_after(self, hidden_states, residual=None):
+    def forward(self, hidden_states, residual=None):
        if residual is not None:
            hidden_states += residual
        residual = hidden_states
-
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-
-        # convert into half-precision if necessary
-        if self.weight.dtype in [torch.float16, torch.bfloat16]:
-            hidden_states = (hidden_states * self.weight).to(self.weight.dtype)
-        else:
-            hidden_states = hidden_states * self.weight
-
-        return hidden_states, residual
-
-    def forward(self, hidden_states, residual=None):
-        hidden_states, residual = self.forward_downcast_after(hidden_states, residual)
-        return hidden_states, residual
+        hidden_states = hidden_states * self.weight
+        return hidden_states.to(self.weight.dtype), residual


 def load_attention(config, prefix, weights):