Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-12 04:44:52 +00:00)
commit 69cb084b5f
parent b28946d695

    First working step.
@@ -88,6 +88,9 @@ try:
     from text_generation_server.models.custom_modeling.flash_mistral_modeling import (
         FlashMistralForCausalLM,
     )
+    from text_generation_server.models.custom_modeling.flash_mixtral_modeling import (
+        FlashMixtralForCausalLM,
+    )
     from text_generation_server.models.flash_phi import FlashPhi
     from text_generation_server.models.flash_starcoder2 import FlashStarcoder2
     from text_generation_server.models.flash_dbrx import FlashDbrx
@@ -106,7 +109,6 @@ if FLASH_ATTENTION:
     # __all__.append(FlashLlama)
     __all__.append(IDEFICSSharded)
     __all__.append(FlashMistral)
-    __all__.append(FlashMixtral)
     __all__.append(FlashDbrx)
     __all__.append(FlashPhi)
     __all__.append(FlashQwen2)
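For context, this export list is built at import time and gated on kernel availability, so deleting the dedicated FlashMixtral wrapper (see the removed file below) also means dropping it here. A simplified sketch of the pattern the surrounding file uses; FLASH_ATTENTION and the model classes are assumed to be defined earlier in the module:

__all__ = []  # simplified; the real list starts with the non-flash models
if FLASH_ATTENTION:
    # A class is only exported when its flash-attention kernels imported
    # successfully; FlashMixtral no longer exists, so its entry is removed.
    __all__.append(FlashMistral)
    __all__.append(FlashQwen2)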
@@ -773,13 +775,15 @@ def get_model(
 
     if model_type == MIXTRAL:
         if FLASH_ATTENTION:
-            return FlashMixtral(
-                model_id,
-                revision,
+            return FlashMistral(
+                model_id=model_id,
+                model_class=FlashMixtralForCausalLM,
+                revision=revision,
                 quantize=quantize,
                 speculator=speculator,
                 dtype=dtype,
                 trust_remote_code=trust_remote_code,
+                lora_adapter_ids=lora_adapter_ids,
             )
         elif sharded:
             raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Mixtral"))
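The call site above implies that FlashMistral now takes the concrete model class as a constructor argument instead of relying on per-model subclasses. A minimal sketch of what that signature presumably looks like; the parameter list is inferred from the keyword arguments in the hunk, and the body is an assumption:

from typing import List, Optional, Type

import torch

class FlashMistral:
    def __init__(
        self,
        model_id: str,
        model_class: Type,  # e.g. FlashMixtralForCausalLM, per the call site
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
        lora_adapter_ids: Optional[List[str]] = None,
    ):
        # Instead of hard-coding a model class in a subclass, the shared
        # loading path instantiates whatever class it was handed.
        self.model_class = model_class
        ...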
@@ -1,31 +0,0 @@
-import torch
-
-from typing import Optional
-
-from text_generation_server.models.flash_mistral import BaseFlashMistral
-from text_generation_server.models.custom_modeling.flash_mixtral_modeling import (
-    MixtralConfig,
-    FlashMixtralForCausalLM,
-)
-
-
-class FlashMixtral(BaseFlashMistral):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        speculator: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        super(FlashMixtral, self).__init__(
-            config_cls=MixtralConfig,
-            model_cls=FlashMixtralForCausalLM,
-            model_id=model_id,
-            revision=revision,
-            quantize=quantize,
-            speculator=speculator,
-            dtype=dtype,
-            trust_remote_code=trust_remote_code,
-        )
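With the FlashMixtral wrapper deleted wholesale (the -1,31 +0,0 hunk above), its config_cls/model_cls plumbing disappears, and the same model is constructed through the generic class instead, mirroring the get_model hunk. A hypothetical direct call, reusing the sketched signature and the names from the diff; the model id is only an example:

model = FlashMistral(
    model_id="mistralai/Mixtral-8x7B-v0.1",  # example id, not from the diff
    model_class=FlashMixtralForCausalLM,
    revision=None,
    quantize=None,
    speculator=None,
    dtype=torch.float16,
    trust_remote_code=False,
    lora_adapter_ids=[],
)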
@@ -8,8 +8,7 @@ from transformers import AutoTokenizer, AutoConfig
 from typing import Optional
 
 from text_generation_server.models.flash_mistral import (
-    BaseFlashMistral,
-    set_sliding_window,
+    FlashMistral,
 )
 from text_generation_server.models.custom_modeling.flash_qwen2_modeling import (
     Qwen2ForCausalLM,
@@ -24,7 +23,7 @@ from text_generation_server.utils.import_utils import SYSTEM
 tracer = trace.get_tracer(__name__)
 
 
-class FlashQwen2(BaseFlashMistral):
+class FlashQwen2(FlashMistral):
     def __init__(
         self,
         model_id: str,
@@ -62,10 +61,6 @@ class FlashQwen2(BaseFlashMistral):
         config.quantize = quantize
         config.speculator = speculator
 
-        # Set context windows
-        if config.sliding_window is not None:
-            set_sliding_window(config.sliding_window)
-
         torch.distributed.barrier(group=self.process_group)
 
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
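The dropped lines called set_sliding_window, previously imported from flash_mistral (see the import hunks above); removing the call from this subclass suggests the window is now configured inside the shared FlashMistral path. A plausible shape for the helper, reconstructed from its call site; only its name appears in the diff, so this is an assumption:

from typing import Optional

SLIDING_WINDOW: Optional[int] = None

def set_sliding_window(sliding_window: int):
    # Stash the model's attention window in a module-level global so the
    # attention code can consult it at runtime (assumed behavior).
    global SLIDING_WINDOW
    SLIDING_WINDOW = sliding_window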
@@ -78,7 +73,7 @@ class FlashQwen2(BaseFlashMistral):
         self.cuda_graphs = {}
 
         torch.distributed.barrier(group=self.process_group)
-        super(BaseFlashMistral, self).__init__(
+        super(FlashMistral, self).__init__(
             model_id=model_id,
             model=model,
             tokenizer=tokenizer,
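Note the two-argument super above: super(FlashMistral, self).__init__ deliberately skips FlashMistral's own __init__ and dispatches to its base class, since FlashQwen2 has already done the model setup itself. A self-contained demonstration of that Python behavior; the class names here are illustrative, not from the diff:

class Base:
    def __init__(self):
        print("Base.__init__")

class Middle(Base):
    def __init__(self):
        print("Middle.__init__")

class Child(Middle):
    def __init__(self):
        # Passing the *parent* class to super() starts the MRO lookup after
        # Middle, so Middle.__init__ is skipped, like the hunk above.
        super(Middle, self).__init__()

Child()  # prints "Base.__init__"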
@@ -7,8 +7,7 @@ from typing import Optional
 from transformers.models.gpt2 import GPT2TokenizerFast
 
 from text_generation_server.models.flash_mistral import (
-    BaseFlashMistral,
-    set_sliding_window,
+    FlashMistral,
 )
 from text_generation_server.models.custom_modeling.flash_starcoder2_modeling import (
     Starcoder2Config,
@@ -22,7 +21,7 @@ from text_generation_server.utils import (
 
 
 # Starcoder2 has the same base as Mistral
-class FlashStarcoder2(BaseFlashMistral):
+class FlashStarcoder2(FlashMistral):
     def __init__(
         self,
         model_id: str,
@@ -11,7 +11,7 @@ from transformers.image_processing_utils import select_best_resolution
 from text_generation_server.pb import generate_pb2
 from text_generation_server.models.flash_causal_lm import FlashCausalLMBatch
 from text_generation_server.models.flash_mistral import (
-    BaseFlashMistral,
+    FlashMistral,
 )
 
 tracer = trace.get_tracer(__name__)
@@ -239,7 +239,7 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
         return batch
 
 
-class VlmCausalLM(BaseFlashMistral):
+class VlmCausalLM(FlashMistral):
     @property
     def batch_type(self) -> Type[VlmCausalLMBatch]:
         return VlmCausalLMBatch