Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-27 13:02:12 +00:00)
Enabling Flash Attention support for falcon model (#232)
This commit is contained in:
parent 0578bd917d
commit e06320f64e
@@ -694,9 +694,9 @@ class CausalLM(Model):
         "return_dict": True,
     }
 
-    if model.config.model_type in ["llama", "mistral", "starcoder2", "qwen2"]:
-        if model.config.model_type in ["llama", "mistral", "qwen2"]:
+    if model.config.model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon"]:
+        if model.config.model_type not in ["falcon"]:
             kwargs["attn_softmax_bf16"] = True
         kwargs["trim_logits"] = True
 
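For context, below is a minimal sketch of the kwargs setup as it reads after this change. The model-type lists and the attn_softmax_bf16 / trim_logits flags come straight from the diff; the standalone build_model_kwargs helper is a hypothetical wrapper for illustration, not the repository's actual structure.

# Minimal sketch (hypothetical helper): how the model kwargs look after this
# commit. Flag names and model-type lists are taken verbatim from the diff.
def build_model_kwargs(model_type: str) -> dict:
    kwargs = {
        "return_dict": True,
    }

    # Flash Attention path: falcon is now included in the outer list.
    if model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon"]:
        # falcon is the one model type that skips the bf16 attention softmax...
        if model_type not in ["falcon"]:
            kwargs["attn_softmax_bf16"] = True
        # ...but it still gets logit trimming like the other models.
        kwargs["trim_logits"] = True

    return kwargs

# Example: falcon now takes the Flash Attention path, minus attn_softmax_bf16.
assert build_model_kwargs("falcon") == {"return_dict": True, "trim_logits": True}
assert build_model_kwargs("llama") == {
    "return_dict": True,
    "attn_softmax_bf16": True,
    "trim_logits": True,
}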