Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-04-23 16:02:10 +00:00
Revert PR 235 as flash attention is not really enabled for gemma (#239)
parent c5e3881051
commit b126bf4785
@@ -694,13 +694,12 @@ class CausalLM(Model):
             "return_dict": True,
         }
 
-        if model.config.model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon", "gemma"]:
+        if model.config.model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon"]:
 
             if model.config.model_type not in ["falcon"]:
                 kwargs["attn_softmax_bf16"] = True
 
-            if model.config.model_type not in ["gemma"]:
-                kwargs["trim_logits"] = True
+            kwargs["trim_logits"] = True
 
             if os.getenv("USE_FLASH_ATTENTION", "false").lower() == "true":
                 kwargs["use_flash_attention"] = True
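For reference, a minimal sketch of how the kwargs selection reads after the revert. The build_generation_kwargs helper is hypothetical (in the repository this logic sits inline in CausalLM); the model-type list, the attn_softmax_bf16, trim_logits, and use_flash_attention flags, and the USE_FLASH_ATTENTION environment variable are taken from the hunk above.

import os


def build_generation_kwargs(model_type: str) -> dict:
    # Hypothetical helper mirroring the post-revert hunk; the real logic
    # lives inline in CausalLM rather than in a standalone function.
    kwargs = {"return_dict": True}

    # "gemma" is no longer in this list after the revert.
    if model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon"]:
        # falcon does not get the bf16 softmax flag.
        if model_type not in ["falcon"]:
            kwargs["attn_softmax_bf16"] = True

        # trim_logits is set unconditionally again; the gemma-specific
        # guard introduced by PR 235 is removed.
        kwargs["trim_logits"] = True

        # Flash attention stays opt-in via an environment variable.
        if os.getenv("USE_FLASH_ATTENTION", "false").lower() == "true":
            kwargs["use_flash_attention"] = True

    return kwargs


print(build_generation_kwargs("llama"))
# -> {'return_dict': True, 'attn_softmax_bf16': True, 'trim_logits': True}
#    (plus 'use_flash_attention': True when USE_FLASH_ATTENTION=true)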