Test on mistral with CausalLM

2025-09-10 20:04:52 +00:00 · 2023-11-23 16:34:06 +00:00 · 2023-11-23 16:34:06 +00:00 · f1004498d0
commit f1004498d0
parent 3c02262f29
1 changed file with 20 additions and 4 deletions
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -44,7 +44,7 @@ __all__ = [

 FLASH_ATT_ERROR_MESSAGE = "{} requires Flash Attention enabled models."

-FLASH_ATTENTION = True
+FLASH_ATTENTION = False
 try:
    from text_generation_server.models.flash_rw import FlashRWSharded
    from text_generation_server.models.flash_neox import FlashNeoXSharded
@@ -248,15 +248,31 @@ def get_model(
                )

    if model_type == "mistral":
-        if MISTRAL:
-            return FlashMistral(
+        # if MISTRAL:
+        #     return FlashMistral(
+        #         model_id,
+        #         revision,
+        #         quantize=quantize,
+        #         dtype=dtype,
+        #         trust_remote_code=trust_remote_code,
+        #     )
+        # raise NotImplementedError("Mistral model requires flash attention v2")
+        if FLASH_ATTENTION:
+            return FlashLlama(
+                model_id,
+                revision,
+                quantize=quantize,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
+        else:
+            return CausalLM(
                model_id,
                revision,
                quantize=quantize,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
-        raise NotImplementedError("Mistral model requires flash attention v2")

    if model_type == "opt":
        return OPTSharded(