Enables Flash Attention in TGI for gemma models (#235)

2025-07-29 11:20:16 +00:00 · 2024-10-18 09:20:42 -07:00 · 2024-10-18 09:20:42 -07:00 · c5e3881051
commit c5e3881051
parent 9ae5ad5057
1 changed files with 4 additions and 2 deletions
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -694,10 +694,12 @@ class CausalLM(Model):
            "return_dict": True,
        }

-        if model.config.model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon"]:
+        if model.config.model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon", "gemma"]:

            if model.config.model_type not in ["falcon"]:
                kwargs["attn_softmax_bf16"] = True
+
+            if model.config.model_type not in ["gemma"]:
                kwargs["trim_logits"] = True

            if os.getenv("USE_FLASH_ATTENTION", "false").lower() == "true":