From c5e3881051c9aab9bb8edfde6ea581e31905a0bf Mon Sep 17 00:00:00 2001
From: Thanaji Rao Thakkalapelli
Date: Fri, 18 Oct 2024 09:20:42 -0700
Subject: [PATCH] Enables Flash Attention in TGI for gemma models (#235)

---
 server/text_generation_server/models/causal_lm.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
index 7ba959ee..10ebd41c 100644
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -694,11 +694,13 @@ class CausalLM(Model):
             "return_dict": True,
         }
 
-        if model.config.model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon"]:
+        if model.config.model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon", "gemma"]:
             if model.config.model_type not in ["falcon"]:
                 kwargs["attn_softmax_bf16"] = True
-            kwargs["trim_logits"] = True
+
+            if model.config.model_type not in ["gemma"]:
+                kwargs["trim_logits"] = True
 
             if os.getenv("USE_FLASH_ATTENTION", "false").lower() == "true":
                 kwargs["use_flash_attention"] = True
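
For context, a minimal standalone sketch of the generation-kwargs logic this patch produces for gemma. The `model_type` value and the environment setting below are hypothetical example inputs chosen to illustrate the new branch, not part of the patch itself:

```python
import os

# Hypothetical example inputs: a gemma model with flash attention requested.
model_type = "gemma"
os.environ.setdefault("USE_FLASH_ATTENTION", "true")

kwargs = {"return_dict": True}

if model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon", "gemma"]:
    # gemma takes the bf16 softmax path like the other listed models...
    if model_type not in ["falcon"]:
        kwargs["attn_softmax_bf16"] = True

    # ...but is excluded from logit trimming by this patch.
    if model_type not in ["gemma"]:
        kwargs["trim_logits"] = True

    if os.getenv("USE_FLASH_ATTENTION", "false").lower() == "true":
        kwargs["use_flash_attention"] = True

# Expected for gemma with USE_FLASH_ATTENTION=true:
# {'return_dict': True, 'attn_softmax_bf16': True, 'use_flash_attention': True}
print(kwargs)
```

In other words, once gemma is included in the model-type list, setting `USE_FLASH_ATTENTION=true` in the server environment enables flash attention for gemma checkpoints as well, while `trim_logits` is deliberately skipped for that model type.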