fix: simplify syntax

2025-09-11 12:24:53 +00:00 · 2024-03-21 03:33:56 +00:00 · 2024-03-21 03:33:56 +00:00 · 704d4ddfaa
commit 704d4ddfaa
parent 5b076dfcf2
1 changed files with 3 additions and 14 deletions
--- a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
@@ -262,26 +262,15 @@ class GemmaFastRMSNorm(FastRMSNorm):
        return cls(weight, eps)
    # perform the multiplication in full precision and downcast after
-    def forward_downcast_after(self, hidden_states, residual=None):
+    def forward(self, hidden_states, residual=None):
        if residual is not None:
            hidden_states += residual
        residual = hidden_states
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        # convert into half-precision if necessary
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = (hidden_states * self.weight).to(self.weight.dtype)
        else:
        hidden_states = hidden_states * self.weight
-
+        return hidden_states.to(self.weight.dtype), residual
        return hidden_states, residual
    def forward(self, hidden_states, residual=None):
        hidden_states, residual = self.forward_downcast_after(hidden_states, residual)
        return hidden_states, residual
 def load_attention(config, prefix, weights):