mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-24 16:32:12 +00:00)
Working loading state.
parent 7efcb5e0ed
commit 907906466a
server/text_generation_server/models/__init__.py

@@ -308,6 +308,12 @@ class ModelType(enum.Enum):
         "url": "https://huggingface.co/HuggingFaceM4/idefics-9b",
         "multimodal": True,
     }
+    MLLAMA = {
+        "type": "mllama",
+        "name": "Mllama",
+        "url": "https://huggingface.co/xxx/xx",
+        "multimodal": True,
+    }


 __GLOBALS = locals()
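The `__GLOBALS = locals()` line at the end of this hunk is what lets the new entry be used as a bare name in `get_model` (the next hunk compares `model_type == MLLAMA`). A minimal sketch of that registry trick, with illustrative dict values and assuming the enum's "type" strings are promoted to module-level constants:

import enum


class ModelType(enum.Enum):
    IDEFICS = {"type": "idefics", "multimodal": True}
    MLLAMA = {"type": "mllama", "multimodal": True}


# At module level, locals() is the module's globals(), so writing into
# it creates module-level constants such as MLLAMA == "mllama".
__GLOBALS = locals()
for entry in ModelType:
    __GLOBALS[entry.name] = entry.value["type"]

print(MLLAMA)  # noqa: F821 (injected via locals()) -> prints "mllama"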
@@ -1095,6 +1101,18 @@ def get_model(
             )
         else:
             raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics"))
+    if model_type == MLLAMA:
+        if FLASH_ATTENTION:
+            return IDEFICSSharded(
+                model_id,
+                revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
+        else:
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Mllama"))
     if model_type == IDEFICS2:
         if FLASH_ATTENTION:
             return VlmCausalLM(
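No dedicated loader class is added for Mllama: the branch reuses `IDEFICSSharded`, which the later hunks teach to dispatch on `config.model_type`. Like Idefics above it, the path is flash-attention-only. A stripped-down sketch of the guard shape, with an assumed error template rather than TGI's real wording:

FLASH_ATTENTION = True  # in TGI this reflects whether the kernels imported
FLASH_ATT_ERROR_MESSAGE = "{} requires flash attention"  # assumed wording


def load(model_type: str) -> str:
    # Every flash-only architecture gets the same guarded branch shape.
    if model_type == "mllama":
        if FLASH_ATTENTION:
            return "IDEFICSSharded"  # stands in for constructing the loader
        raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Mllama"))
    raise ValueError(f"Unknown model type {model_type}")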
server/text_generation_server/models/custom_modeling/mllama.py (new file, 1233 lines)
File diff suppressed because it is too large.
server/text_generation_server/models/idefics.py

@@ -4,14 +4,13 @@ import torch.distributed
 from typing import Optional


-from text_generation_server.models.custom_modeling.idefics_config import IdeficsConfig
-from text_generation_server.models.custom_modeling.idefics_processing import (
-    IdeficsProcessor,
-)
-from transformers import LlamaTokenizerFast
+from transformers import AutoConfig, AutoProcessor, AutoTokenizer
 from text_generation_server.models.custom_modeling.idefics_modeling import (
     IdeficsForVisionText2Text,
 )
+from text_generation_server.models.custom_modeling.mllama import (
+    MllamaForConditionalGeneration,
+)
 from text_generation_server.models.idefics_causal_lm import IdeficsCausalLM
 from text_generation_server.utils import (
     initialize_torch_distributed,
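Swapping the Idefics-specific classes for `AutoConfig`, `AutoProcessor`, and `AutoTokenizer` is what lets one loader serve both architectures: the Auto classes inspect the checkpoint and return the matching concrete class. A quick illustration with the Idefics checkpoint named in the first hunk:

from transformers import AutoConfig

# AutoConfig reads the "model_type" field from the checkpoint's
# config.json and instantiates the matching config class.
config = AutoConfig.from_pretrained("HuggingFaceM4/idefics-9b")
print(config.model_type)  # "idefics"; an mllama checkpoint yields "mllama"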
@@ -53,7 +52,7 @@ class IDEFICSSharded(IdeficsCausalLM):
         dtype = torch.float32 if dtype is None else dtype
         self.device, self.dtype = device, dtype

-        config = IdeficsConfig.from_pretrained(
+        config = AutoConfig.from_pretrained(
             model_id,
             revision=revision,
             trust_remote_code=trust_remote_code,
@@ -62,14 +61,14 @@ class IDEFICSSharded(IdeficsCausalLM):
         config.speculator = speculator
         config.vision_config.quantize = quantize

-        tokenizer = LlamaTokenizerFast.from_pretrained(
+        tokenizer = AutoTokenizer.from_pretrained(
             model_id,
             revision=revision,
             padding_side="left",
             truncation_side="left",
             trust_remote_code=trust_remote_code,
         )
-        self.processor = IdeficsProcessor.from_pretrained(
+        self.processor = AutoProcessor.from_pretrained(
             model_id,
             revision=revision,
             padding_side="left",
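The tokenizer and processor get the same Auto-class treatment. The left-side padding and truncation settings are kept: decoder-only batching wants each prompt's tail, where generation resumes, flush against the new tokens. Equivalent standalone calls:

from transformers import AutoProcessor, AutoTokenizer

checkpoint = "HuggingFaceM4/idefics-9b"

# padding_side/truncation_side="left" keep the end of every prompt
# adjacent to the generated tokens in a padded batch.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    padding_side="left",
    truncation_side="left",
)
processor = AutoProcessor.from_pretrained(checkpoint)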
@@ -90,7 +89,14 @@ class IDEFICSSharded(IdeficsCausalLM):
             weights_loader=weights_loader,
         )

-        model = IdeficsForVisionText2Text(config, weights)
+        if config.model_type == "idefics":
+            model = IdeficsForVisionText2Text(config, weights)
+        elif config.model_type == "mllama":
+            model = MllamaForConditionalGeneration(
+                prefix="", config=config, weights=weights
+            )
+        else:
+            raise RuntimeError(f"Unsupported model type {config.model_type}")

         torch.distributed.barrier(group=self.process_group)
         super(IdeficsCausalLM, self).__init__(
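One subtlety in the unchanged context: `super(IdeficsCausalLM, self).__init__(...)` starts the MRO lookup after `IdeficsCausalLM`, so that class's `__init__` (which would build a model on its own) is skipped and the grandparent's runs instead. A minimal illustration of the pattern:

class Model:
    def __init__(self) -> None:
        print("Model.__init__")


class IdeficsLike(Model):
    def __init__(self) -> None:
        print("IdeficsLike.__init__ (never runs below)")


class ShardedLike(IdeficsLike):
    def __init__(self) -> None:
        # Two-argument super() resolves to the class *after* IdeficsLike
        # in the MRO, i.e. Model, skipping IdeficsLike.__init__.
        super(IdeficsLike, self).__init__()


ShardedLike()  # prints only "Model.__init__"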