mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-09 11:24:53 +00:00

commit 9541c8f146 (parent 2378529c15)

    wip
@@ -18,7 +18,6 @@ from text_generation_server.models.t5 import T5Sharded
 try:
     from text_generation_server.models.flash_neox import FlashNeoX, FlashNeoXSharded
-    from text_generation_server.models.flash_santacoder import FlashSantacoder
     from text_generation_server.models.flash_llama import FlashLlama, FlashLlamaSharded
+    from text_generation_server.models.flash_santacoder import FlashSantacoder, FlashSantacoderSharded
@@ -84,7 +83,9 @@ def get_model(
     if "bigcode" in model_id:
         if sharded:
             if not FLASH_ATTENTION:
-                raise NotImplementedError("sharded is not supported for Santacoder when FLASH_ATTENTION=0")
+                raise NotImplementedError(
+                    "sharded is not supported for Santacoder when FLASH_ATTENTION=0"
+                )
+            return FlashSantacoderSharded(model_id, revision=revision)
         else:
             santacoder_cls = FlashSantacoder if FLASH_ATTENTION else SantaCoder
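For context (a sketch, not code from this commit): FLASH_ATTENTION gates whether the custom-kernel model classes imported successfully, which is what the branch above falls back on. A minimal, assumed reduction of that guard; the exact flag logic in the repo may differ:

# Hypothetical reduction of the import guard behind FLASH_ATTENTION.
try:
    from text_generation_server.models.flash_santacoder import (
        FlashSantacoder,
        FlashSantacoderSharded,
    )

    FLASH_ATTENTION = True
except ImportError:
    FLASH_ATTENTION = False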
@@ -69,13 +69,13 @@ class FastLinear(nn.Linear):

 class TensorParallelColumnLinear(FastLinear):
     def __init__(
         self,
         in_features,
         out_features,
         process_group: torch.distributed.ProcessGroup,
         bias=True,
         device=None,
         dtype=None,
     ):
         self.process_group = process_group
         self.tp_world_size = process_group.size()
@@ -93,14 +93,14 @@ class TensorParallelColumnLinear(FastLinear):

 class TensorParallelRowLinear(FastLinear):
     def __init__(
         self,
         in_features,
         out_features,
         process_group: torch.distributed.ProcessGroup,
         reduce=True,
         bias=True,
         device=None,
         dtype=None,
     ):
         self.process_group = process_group
         self.tp_world_size = process_group.size()
@@ -126,19 +126,19 @@ class TensorParallelRowLinear(FastLinear):

 class TensorParallelEmbedding(nn.Embedding):
     def __init__(
         self,
         num_embeddings,
         embedding_dim,
         process_group: torch.distributed.ProcessGroup,
         reduce=True,
         padding_idx=None,
         max_norm=None,
         norm_type=2.0,
         scale_grad_by_freq=False,
         sparse=False,
         _weight=None,
         device=None,
         dtype=None,
     ):
         self.process_group = process_group
         self.tp_rank = process_group.rank()
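For orientation (a sketch, not code from this commit): a column-parallel linear shards the nn.Linear weight along the output dimension, while a row-parallel linear shards along the input dimension, each rank producing a partial product that is then summed across ranks (presumably what the reduce=True flag controls). The slicing arithmetic, assuming each dimension divides evenly by the world size:

import torch

def shard_linear_weight(weight: torch.Tensor, rank: int, world_size: int, column: bool) -> torch.Tensor:
    # nn.Linear stores weight as (out_features, in_features).
    # Column parallel: slice out_features (dim 0); row parallel: slice in_features (dim 1).
    dim = 0 if column else 1
    block_size = weight.shape[dim] // world_size  # assumes even divisibility
    start, stop = rank * block_size, (rank + 1) * block_size
    return weight[start:stop] if dim == 0 else weight[:, start:stop]

w = torch.randn(8, 6)
print(shard_linear_weight(w, rank=1, world_size=2, column=True).shape)   # torch.Size([4, 6])
print(shard_linear_weight(w, rank=1, world_size=2, column=False).shape)  # torch.Size([8, 3])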
@@ -207,11 +207,7 @@ class FlashMQAttention(torch.nn.Module):
             self.c_proj = FastLinear(hidden_size, hidden_size)
         else:
             self.num_heads = self.num_heads // process_group.size()
             self.hidden_size = self.hidden_size // process_group.size()
-            self.c_attn = FastLinear(
-                hidden_size,
-                self.head_size * (self.num_heads + 2)
-            )
+            self.c_attn = FastLinear(hidden_size, self.head_size * (self.num_heads + 2))
             self.c_proj = TensorParallelRowLinear(
                 hidden_size, hidden_size, process_group=process_group, reduce=True
             )
@@ -228,7 +224,9 @@ class FlashMQAttention(torch.nn.Module):
         qkv = self.c_attn(hidden_states)

         # Split query from key_value
-        query, key_value = qkv.split([self.head_size * self.num_heads, 2 * self.head_size], dim=1)
+        query, key_value = qkv.split(
+            [self.head_size * self.num_heads, 2 * self.head_size], dim=1
+        )

         # Prepare query and key_value for indexing
         query = query.view(-1, self.num_heads, self.head_size)
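The split sizes encode multi-query attention: c_attn projects to num_heads query heads plus a single key head and a single value head shared by all queries. A toy shape walk-through with made-up sizes; the final key_value reshape is an assumption about the surrounding code, not shown in this hunk:

import torch

head_size, num_heads, seq_len = 4, 3, 5
qkv = torch.randn(seq_len, head_size * (num_heads + 2))  # queries + one shared K and V head
query, key_value = qkv.split([head_size * num_heads, 2 * head_size], dim=1)
query = query.view(-1, num_heads, head_size)
key_value = key_value.view(-1, 2, 1, head_size)  # assumed: (seq, k/v, 1 shared head, head_size)
print(query.shape)      # torch.Size([5, 3, 4])
print(key_value.shape)  # torch.Size([5, 2, 1, 4])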
@@ -302,7 +300,7 @@ class MLP(nn.Module):
                 x,
                 approximate="tanh"
                 if act in ["gelu_fast", "gelu_pytorch_tanh"]
-                else None,
+                else "none",
             )
         )
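Changing else None to else "none" matches the torch.nn.functional.gelu signature, whose approximate parameter takes the strings "none" or "tanh" rather than an optional value. A quick standalone check:

import torch
import torch.nn.functional as F

x = torch.randn(4)
exact = F.gelu(x, approximate="none")  # erf-based gelu
tanh = F.gelu(x, approximate="tanh")   # tanh approximation used by gelu_fast / gelu_pytorch_tanh
print(torch.allclose(exact, tanh, atol=1e-2))  # close but not identical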
@@ -399,11 +397,13 @@ class FlashSantacoderModel(nn.Module):
             self.wte = TensorParallelEmbedding(
                 config.vocab_size,
                 config.hidden_size,
+                reduce=False,
                 process_group=process_group,
             )
             self.wpe = TensorParallelEmbedding(
                 config.max_position_embeddings,
                 config.hidden_size,
+                reduce=False,
                 process_group=process_group,
             )
         else:
@@ -195,7 +195,8 @@ class FlashSantacoderSharded(FlashSantacoder):
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")

         with init_empty_weights():
-            model = FlashSantacoderForCausalLM(config, self.process_group)
+            # model = FlashSantacoderForCausalLM(config, self.process_group)
+            model = FlashSantacoderForCausalLM(config)

         torch.distributed.barrier(group=self.process_group)
         self.load_weights(
@@ -204,7 +205,7 @@ class FlashSantacoderSharded(FlashSantacoder):
             device=device,
             rank=self.rank,
             world_size=self.world_size,
-            transpose=config.architectures[0].startswith("GPT2")
+            transpose=config.architectures[0].startswith("GPT2"),
         )
         self.model = model.eval().to(dtype)
         torch.distributed.barrier(group=self.process_group)
@@ -220,7 +221,7 @@ class FlashSantacoderSharded(FlashSantacoder):
         device: torch.device,
         rank: int,
         world_size: int,
-        transpose: bool
+        transpose: bool,
     ):
         for file in filenames:
             with safe_open(file, framework="pt", device=str(device)) as f:
@@ -240,44 +241,43 @@ class FlashSantacoderSharded(FlashSantacoder):
                module_name, param_name = final_name.rsplit(".", 1)
                module = model.get_submodule(module_name)

-                if isinstance(module, TensorParallelColumnLinear):
-                    size = slice_.get_shape()[0]
-                    block_size = size // world_size
-                    start = rank * block_size
-                    stop = (rank + 1) * block_size
-                    tensor = slice_[start:stop]
-                elif isinstance(module, TensorParallelRowLinear):
-                    if param_name == "weight":
-                        size = slice_.get_shape()[1]
-                        block_size = size // world_size
-                        start = rank * block_size
-                        stop = (rank + 1) * block_size
-                        tensor = slice_[:, start:stop]
-                    else:
-                        tensor = slice_[:]
-                        # XXX: Hack for Rowlinear to add the bias only once.
-                        if rank != 0:
-                            tensor = torch.zeros_like(tensor)
-                elif isinstance(module, TensorParallelEmbedding):
-                    size = slice_.get_shape()[0]
-                    block_size = size // world_size
-                    start = rank * block_size
-                    stop = (rank + 1) * block_size
-                    tensor = slice_[start:stop]
-                elif "c_attn" in name:
-                    size = slice_.get_shape()[0]
-                    raise ValueError
-                elif name == "lm_head.weight" and model.transformer.tp_embeddings:
-                    size = slice_.get_shape()[0]
-                    block_size = size // world_size
-                    start = rank * block_size
-                    stop = (rank + 1) * block_size
-                    tensor = slice_[start:stop]
-                else:
-                    try:
-                        tensor = slice_[:]
-                    except:
-                        tensor = f.get_tensor(name)
+                # if isinstance(module, TensorParallelColumnLinear):
+                #     dim = 1 if transpose and "weight" in param_name else 0
+                #     size = slice_.get_shape()[dim]
+                #     block_size = size // world_size
+                #     start = rank * block_size
+                #     stop = (rank + 1) * block_size
+                #     tensor = slice_[start:stop] if dim == 0 else slice_[:, start:stop]
+                # elif isinstance(module, TensorParallelRowLinear):
+                #     if param_name == "weight":
+                #         dim = 0 if transpose else 1
+                #         size = slice_.get_shape()[dim]
+                #         block_size = size // world_size
+                #         start = rank * block_size
+                #         stop = (rank + 1) * block_size
+                #         tensor = slice_[start:stop] if dim == 0 else slice_[:, start:stop]
+                #     else:
+                #         tensor = slice_[:]
+                #         # XXX: Hack for Rowlinear to add the bias only once.
+                #         if rank != 0:
+                #             tensor = torch.zeros_like(tensor)
+                # elif isinstance(module, TensorParallelEmbedding):
+                #     size = slice_.get_shape()[0]
+                #     block_size = size // world_size
+                #     start = rank * block_size
+                #     stop = (rank + 1) * block_size
+                #     tensor = slice_[start:stop]
+                # elif name == "lm_head.weight" and model.transformer.tp_embeddings:
+                #     size = slice_.get_shape()[0]
+                #     block_size = size // world_size
+                #     start = rank * block_size
+                #     stop = (rank + 1) * block_size
+                #     tensor = slice_[start:stop]
+                # else:
+                try:
+                    tensor = slice_[:]
+                except:
+                    tensor = f.get_tensor(name)

                tensor = tensor.contiguous()
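The loader leans on safetensors' lazy slices so each rank materializes only its own shard from disk. A self-contained sketch of that access pattern; the file and tensor names here are hypothetical:

import torch
from safetensors import safe_open
from safetensors.torch import save_file

# Hypothetical checkpoint with one weight, just to exercise the API.
save_file({"transformer.wte.weight": torch.randn(8, 4)}, "shard_demo.safetensors")

rank, world_size = 0, 2
with safe_open("shard_demo.safetensors", framework="pt", device="cpu") as f:
    slice_ = f.get_slice("transformer.wte.weight")  # lazy: nothing is read yet
    size = slice_.get_shape()[0]
    block_size = size // world_size
    tensor = slice_[rank * block_size : (rank + 1) * block_size]  # reads only this shard
print(tensor.shape)  # torch.Size([4, 4])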
@@ -295,7 +295,7 @@ class FlashSantacoderSharded(FlashSantacoder):
                    or "c_attn.weight" in name
                ):
                    # Tranpose as we use nn.Linear instead of Conv1D
-                    value = value.T
+                    tensor = tensor.T

                if current_parameter_tensor.device == torch.device("meta"):
                    # Init qkv
@@ -316,19 +316,54 @@ class FlashSantacoderSharded(FlashSantacoder):
                    )

                    # Copy to correct slice
+                    # if "q_attn" in name:
+                    #     size = tensor.shape[0]
+                    #     block_size = size // world_size
+                    #     start = rank * block_size
+                    #     stop = (rank + 1) * block_size
+                    #     tensor = tensor[start:stop]
+                    #     module._parameters[param_name][: tensor.shape[0]] = tensor
+                    # elif "kv_attn.weight" in name:
+                    #     module._parameters[param_name][
+                    #         model.transformer.head_size
+                    #         * model.transformer.num_heads :
+                    #     ] = tensor
+                    # elif "kv_attn.bias" in name:
+                    #     module._parameters[param_name][
+                    #         model.transformer.head_size
+                    #         * model.transformer.num_heads :
+                    #     ] = tensor
+                    # elif "c_attn" in name:
+                    #     q_tensor = tensor[: -2 * model.transformer.head_size]
+                    #     kv_tensor = tensor[-2 * model.transformer.head_size :]
+                    #     from loguru import logger
+                    #
+                    #     block_size = q_tensor.shape[0] // world_size
+                    #     start = rank * block_size
+                    #     stop = (rank + 1) * block_size
+                    #     q_tensor = q_tensor[start:stop]
+                    #     logger.error(q_tensor.shape)
+                    #     logger.error(kv_tensor.shape)
+                    #     module._parameters[param_name][
+                    #         : q_tensor.shape[0]
+                    #     ] = q_tensor
+                    #     module._parameters[param_name][
+                    #         q_tensor.shape[0] :
+                    #     ] = kv_tensor
+                    from loguru import logger
                    if "q_attn.weight" in name:
+                        logger.error(f"q - {module._parameters[param_name][: tensor.shape[0]].shape} - {tensor.shape}")
                        module._parameters[param_name][: tensor.shape[0]] = tensor
                    elif "q_attn.bias" in name:
                        module._parameters[param_name][: tensor.shape[0]] = tensor
                    elif "kv_attn.weight" in name:
+                        logger.error(f"kv - {module._parameters[param_name][model.transformer.head_size * model.transformer.num_heads:].shape} - {tensor.shape}")
                        module._parameters[param_name][
-                            model.transformer.head_size
-                            * model.transformer.num_heads :
+                            model.transformer.head_size * model.transformer.num_heads:
                        ] = tensor
                    elif "kv_attn.bias" in name:
                        module._parameters[param_name][
-                            model.transformer.head_size
-                            * model.transformer.num_heads :
+                            model.transformer.head_size * model.transformer.num_heads:
                        ] = tensor
                    else:
                        if current_parameter_tensor.shape != tensor.shape:
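For reference, a sketch (with made-up sizes) of the packed layout these copies assume: the first head_size * num_heads rows of the c_attn parameter receive the query projection, and the trailing 2 * head_size rows receive the shared key/value projection:

import torch

head_size, num_heads, hidden = 4, 3, 12
q_rows = head_size * num_heads                         # query projection rows
packed = torch.empty(q_rows + 2 * head_size, hidden)   # destination shaped like c_attn.weight

q_weight = torch.randn(q_rows, hidden)          # stands in for q_attn.weight
kv_weight = torch.randn(2 * head_size, hidden)  # stands in for kv_attn.weight

packed[: q_weight.shape[0]] = q_weight  # same copy as the "q_attn.weight" branch
packed[q_rows:] = kv_weight             # same copy as the "kv_attn.weight" branch
print(packed.shape)  # torch.Size([20, 12])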