Enabling Flash Attention support for falcon model (#232)

2025-09-18 07:44:53 +00:00 · 2024-10-15 10:50:17 -07:00 · 2024-10-15 10:50:17 -07:00 · e06320f64e
commit e06320f64e
parent 0578bd917d
1 changed files with 4 additions and 4 deletions
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -694,11 +694,11 @@ class CausalLM(Model):
            "return_dict": True,
        }

-        if model.config.model_type in ["llama", "mistral", "starcoder2", "qwen2"]:
-            
-            if model.config.model_type in ["llama", "mistral", "qwen2"]:
+        if model.config.model_type in ["llama", "mistral", "starcoder2", "qwen2", "falcon"]:
+
+            if model.config.model_type not in ["falcon"]:
                kwargs["attn_softmax_bf16"] = True
-                kwargs["trim_logits"] = True
+            kwargs["trim_logits"] = True

            if os.getenv("USE_FLASH_ATTENTION", "false").lower() == "true":
                kwargs["use_flash_attention"] = True