wip

2025-09-10 03:44:54 +00:00 · 2023-04-06 16:13:32 +02:00 · 2023-04-06 16:13:32 +02:00 · 9541c8f146
commit 9541c8f146
parent 2378529c15
3 changed files with 119 additions and 83 deletions
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -18,7 +18,6 @@ from text_generation_server.models.t5 import T5Sharded
 try:
    from text_generation_server.models.flash_neox import FlashNeoX, FlashNeoXSharded
    from text_generation_server.models.flash_santacoder import FlashSantacoder
    from text_generation_server.models.flash_llama import FlashLlama, FlashLlamaSharded
    from text_generation_server.models.flash_santacoder import FlashSantacoder, FlashSantacoderSharded
@@ -84,7 +83,9 @@ def get_model(
    if "bigcode" in model_id:
        if sharded:
            if not FLASH_ATTENTION:
-                raise NotImplementedError("sharded is not supported for Santacoder when FLASH_ATTENTION=0")
+                raise NotImplementedError(
                    "sharded is not supported for Santacoder when FLASH_ATTENTION=0"
                )
            return FlashSantacoderSharded(model_id, revision=revision)
        else:
            santacoder_cls = FlashSantacoder if FLASH_ATTENTION else SantaCoder
--- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
@@ -207,11 +207,7 @@ class FlashMQAttention(torch.nn.Module):
            self.c_proj = FastLinear(hidden_size, hidden_size)
        else:
            self.num_heads = self.num_heads // process_group.size()
-            self.hidden_size = self.hidden_size // process_group.size()
+            self.c_attn = FastLinear(hidden_size, self.head_size * (self.num_heads + 2))
            self.c_attn = FastLinear(
                hidden_size,
                self.head_size * (self.num_heads + 2)
            )
            self.c_proj = TensorParallelRowLinear(
                hidden_size, hidden_size, process_group=process_group, reduce=True
            )
@@ -228,7 +224,9 @@ class FlashMQAttention(torch.nn.Module):
        qkv = self.c_attn(hidden_states)
        # Split query from key_value
-        query, key_value = qkv.split([self.head_size * self.num_heads, 2 * self.head_size], dim=1)
+        query, key_value = qkv.split(
            [self.head_size * self.num_heads, 2 * self.head_size], dim=1
        )
        # Prepare query and key_value for indexing
        query = query.view(-1, self.num_heads, self.head_size)
@@ -302,7 +300,7 @@ class MLP(nn.Module):
                x,
                approximate="tanh"
                if act in ["gelu_fast", "gelu_pytorch_tanh"]
-                else None,
+                else "none",
            )
        )
@@ -399,11 +397,13 @@ class FlashSantacoderModel(nn.Module):
            self.wte = TensorParallelEmbedding(
                config.vocab_size,
                config.hidden_size,
                reduce=False,
                process_group=process_group,
            )
            self.wpe = TensorParallelEmbedding(
                config.max_position_embeddings,
                config.hidden_size,
                reduce=False,
                process_group=process_group,
            )
        else:
--- a/server/text_generation_server/models/flash_santacoder.py
+++ b/server/text_generation_server/models/flash_santacoder.py
@@ -195,7 +195,8 @@ class FlashSantacoderSharded(FlashSantacoder):
        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
        with init_empty_weights():
-            model = FlashSantacoderForCausalLM(config, self.process_group)
+            # model = FlashSantacoderForCausalLM(config, self.process_group)
            model = FlashSantacoderForCausalLM(config)
        torch.distributed.barrier(group=self.process_group)
        self.load_weights(
@@ -204,7 +205,7 @@ class FlashSantacoderSharded(FlashSantacoder):
            device=device,
            rank=self.rank,
            world_size=self.world_size,
-            transpose=config.architectures[0].startswith("GPT2")
+            transpose=config.architectures[0].startswith("GPT2"),
        )
        self.model = model.eval().to(dtype)
        torch.distributed.barrier(group=self.process_group)
@@ -220,7 +221,7 @@ class FlashSantacoderSharded(FlashSantacoder):
        device: torch.device,
        rank: int,
        world_size: int,
-            transpose: bool
+        transpose: bool,
    ):
        for file in filenames:
            with safe_open(file, framework="pt", device=str(device)) as f:
@@ -240,40 +241,39 @@ class FlashSantacoderSharded(FlashSantacoder):
                    module_name, param_name = final_name.rsplit(".", 1)
                    module = model.get_submodule(module_name)
-                    if isinstance(module, TensorParallelColumnLinear):
+                    # if isinstance(module, TensorParallelColumnLinear):
-                        size = slice_.get_shape()[0]
+                    #     dim = 1 if transpose and "weight" in param_name else 0
-                        block_size = size // world_size
+                    #     size = slice_.get_shape()[dim]
-                        start = rank * block_size
+                    #     block_size = size // world_size
-                        stop = (rank + 1) * block_size
+                    #     start = rank * block_size
-                        tensor = slice_[start:stop]
+                    #     stop = (rank + 1) * block_size
-                    elif isinstance(module, TensorParallelRowLinear):
+                    #     tensor = slice_[start:stop] if dim == 0 else slice_[:, start:stop]
-                        if param_name == "weight":
+                    # elif isinstance(module, TensorParallelRowLinear):
-                            size = slice_.get_shape()[1]
+                    #     if param_name == "weight":
-                            block_size = size // world_size
+                    #         dim = 0 if transpose else 1
-                            start = rank * block_size
+                    #         size = slice_.get_shape()[dim]
-                            stop = (rank + 1) * block_size
+                    #         block_size = size // world_size
-                            tensor = slice_[:, start:stop]
+                    #         start = rank * block_size
-                        else:
+                    #         stop = (rank + 1) * block_size
-                            tensor = slice_[:]
+                    #         tensor = slice_[start:stop] if dim == 0 else slice_[:, start:stop]
-                            # XXX: Hack for Rowlinear to add the bias only once.
+                    #     else:
-                            if rank != 0:
+                    #         tensor = slice_[:]
-                                tensor = torch.zeros_like(tensor)
+                    #         # XXX: Hack for Rowlinear to add the bias only once.
-                    elif isinstance(module, TensorParallelEmbedding):
+                    #         if rank != 0:
-                        size = slice_.get_shape()[0]
+                    #             tensor = torch.zeros_like(tensor)
-                        block_size = size // world_size
+                    # elif isinstance(module, TensorParallelEmbedding):
-                        start = rank * block_size
+                    #     size = slice_.get_shape()[0]
-                        stop = (rank + 1) * block_size
+                    #     block_size = size // world_size
-                        tensor = slice_[start:stop]
+                    #     start = rank * block_size
-                    elif "c_attn" in name:
+                    #     stop = (rank + 1) * block_size
-                        size = slice_.get_shape()[0]
+                    #     tensor = slice_[start:stop]
-                        raise ValueError
+                    # elif name == "lm_head.weight" and model.transformer.tp_embeddings:
-                    elif name == "lm_head.weight" and model.transformer.tp_embeddings:
+                    #     size = slice_.get_shape()[0]
-                        size = slice_.get_shape()[0]
+                    #     block_size = size // world_size
-                        block_size = size // world_size
+                    #     start = rank * block_size
-                        start = rank * block_size
+                    #     stop = (rank + 1) * block_size
-                        stop = (rank + 1) * block_size
+                    #     tensor = slice_[start:stop]
-                        tensor = slice_[start:stop]
+                    # else:
                    else:
                    try:
                        tensor = slice_[:]
                    except:
@@ -295,7 +295,7 @@ class FlashSantacoderSharded(FlashSantacoder):
                            or "c_attn.weight" in name
                        ):
                             # Transpose as we use nn.Linear instead of Conv1D
-                            value = value.T
+                            tensor = tensor.T
                        if current_parameter_tensor.device == torch.device("meta"):
                            # Init qkv
@@ -316,19 +316,54 @@ class FlashSantacoderSharded(FlashSantacoder):
                                )
                        # Copy to correct slice
                        # if "q_attn" in name:
                        #     size = tensor.shape[0]
                        #     block_size = size // world_size
                        #     start = rank * block_size
                        #     stop = (rank + 1) * block_size
                        #     tensor = tensor[start:stop]
                        #     module._parameters[param_name][: tensor.shape[0]] = tensor
                        # elif "kv_attn.weight" in name:
                        #     module._parameters[param_name][
                        #         model.transformer.head_size
                        #         * model.transformer.num_heads :
                        #     ] = tensor
                        # elif "kv_attn.bias" in name:
                        #     module._parameters[param_name][
                        #         model.transformer.head_size
                        #         * model.transformer.num_heads :
                        #     ] = tensor
                        # elif "c_attn" in name:
                        #     q_tensor = tensor[: -2 * model.transformer.head_size]
                        #     kv_tensor = tensor[-2 * model.transformer.head_size :]
                        #     from loguru import logger
                        #
                        #     block_size = q_tensor.shape[0] // world_size
                        #     start = rank * block_size
                        #     stop = (rank + 1) * block_size
                        #     q_tensor = q_tensor[start:stop]
                        #     logger.error(q_tensor.shape)
                        #     logger.error(kv_tensor.shape)
                        #     module._parameters[param_name][
                        #         : q_tensor.shape[0]
                        #     ] = q_tensor
                        #     module._parameters[param_name][
                        #         q_tensor.shape[0] :
                        #     ] = kv_tensor
                        from loguru import logger
                        if "q_attn.weight" in name:
                            logger.error(f"q - {module._parameters[param_name][: tensor.shape[0]].shape} - {tensor.shape}")
                            module._parameters[param_name][: tensor.shape[0]] = tensor
                        elif "q_attn.bias" in name:
                            module._parameters[param_name][: tensor.shape[0]] = tensor
                        elif "kv_attn.weight" in name:
                            logger.error(f"kv - {module._parameters[param_name][model.transformer.head_size * model.transformer.num_heads:].shape} - {tensor.shape}")
                            module._parameters[param_name][
-                                model.transformer.head_size
+                            model.transformer.head_size * model.transformer.num_heads:
                                * model.transformer.num_heads :
                            ] = tensor
                        elif "kv_attn.bias" in name:
                            module._parameters[param_name][
-                                model.transformer.head_size
+                            model.transformer.head_size * model.transformer.num_heads:
                                * model.transformer.num_heads :
                            ] = tensor
                        else:
                            if current_parameter_tensor.shape != tensor.shape: