Non flash MPT.

2025-09-10 20:04:52 +00:00 · 2023-06-30 09:52:49 +00:00 · 2023-06-30 09:52:49 +00:00 · f33ad7ed98
commit f33ad7ed98
parent 2b53d71991
4 changed files with 1232 additions and 0 deletions
--- a/server/text_generation_server/models/init.py
+++ b/server/text_generation_server/models/init.py
@ -10,6 +10,7 @@ from text_generation_server.models.model import Model
 from text_generation_server.models.causal_lm import CausalLM
 from text_generation_server.models.flash_causal_lm import FlashCausalLM
 from text_generation_server.models.bloom import BLOOMSharded
+from text_generation_server.models.mpt import MPTSharded
 from text_generation_server.models.seq2seq_lm import Seq2SeqLM
 from text_generation_server.models.rw import RW
 from text_generation_server.models.opt import OPTSharded
@ -178,6 +179,10 @@ def get_model(
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
+    elif model_type == "mpt":
+        return MPTSharded(
+            model_id, revision, quantize=quantize, trust_remote_code=trust_remote_code
+        )

    elif model_type == "gpt_neox":
        if FLASH_ATTENTION:
--- a/server/text_generation_server/models/custom_modeling/mpt_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/mpt_modeling.py
--- a/server/text_generation_server/models/mpt.py
+++ b/server/text_generation_server/models/mpt.py
@ -0,0 +1,74 @@
+import torch
+import torch.distributed
+
+from opentelemetry import trace
+from transformers import AutoTokenizer, PretrainedConfig
+from typing import Optional
+from huggingface_hub import hf_hub_download
+import json
+
+from text_generation_server.models import CausalLM
+from text_generation_server.models.custom_modeling.mpt_modeling import (
+    MPTForCausalLM,
+)
+from text_generation_server.utils import (
+    initialize_torch_distributed,
+    weight_files,
+    Weights,
+)
+
+tracer = trace.get_tracer(__name__)
+
+
+class MPTSharded(CausalLM):
+    def __init__(
+        self,
+        model_id: str,
+        revision: Optional[str] = None,
+        quantize: Optional[str] = None,
+        trust_remote_code: bool = False,
+    ):
+        self.process_group, rank, world_size = initialize_torch_distributed()
+        if torch.cuda.is_available():
+            device = torch.device(f"cuda:{rank}")
+            dtype = torch.float16
+        else:
+            raise NotImplementedError("MPTSharded is only available on GPU")
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id,
+            revision=revision,
+            padding_side="left",
+            truncation_side="left",
+            trust_remote_code=trust_remote_code,
+        )
+        tokenizer.pad_token = tokenizer.eos_token
+
+        filename = hf_hub_download(model_id, revision=revision, filename="config.json")
+        with open(filename, "r") as f:
+            config = json.load(f)
+        config = PretrainedConfig(**config)
+        config.quantize = quantize
+        # config = AutoConfig.from_pretrained(
+        #     # model_id, revision=revision, trust_remote_code=trust_remote_code
+        #     model_id, revision=revision, trust_remote_code=False
+        # )
+
+        torch.distributed.barrier(group=self.process_group)
+
+        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
+        weights = Weights(filenames, device, dtype, process_group=self.process_group)
+
+        config.quantize = quantize
+        model = MPTForCausalLM(config, weights)
+
+        torch.distributed.barrier(group=self.process_group)
+        super(CausalLM, self).__init__(
+            model=model,
+            tokenizer=tokenizer,
+            requires_padding=False,
+            dtype=dtype,
+            device=device,
+            rank=rank,
+            world_size=world_size,
+        )
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@ -31,7 +31,19 @@ def load_layer_norm(cls, prefix, weights, eps):
    return ln


+@classmethod
+def load_layer_norm_no_bias(cls, prefix, weights, eps):
+    weight = weights.get_tensor(f"{prefix}.weight")
+    with init_empty_weights():
+        ln = cls(weight.shape, eps=eps)
+
+    ln.weight = nn.Parameter(weight)
+    ln.bias = None
+    return ln
+
+
 torch.nn.LayerNorm.load = load_layer_norm
+torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias


 class FastLinear(nn.Module):