Fix rebase.

2025-09-10 03:44:54 +00:00 · 2023-06-06 11:20:53 +02:00 · 2023-06-06 11:20:53 +02:00 · 2a1ecf3863
commit 2a1ecf3863
parent 7fa79f02ca
1 changed file with 0 additions and 135 deletions
--- a/server/text_generation_server/models/bloom.py
+++ b/server/text_generation_server/models/bloom.py
@ -21,12 +21,6 @@ from text_generation_server.utils import (
    Weights,
 )
 # Availability flag for the optional bitsandbytes dependency; it is checked
 # before int8 parameters are built when `quantize == "bitsandbytes"`.
 # NOTE(review): the guarded statement is only `pass` in this view, so the
 # except branch cannot run as shown -- presumably the bitsandbytes imports
 # originally sat inside this `try`; confirm against the full file.
 HAS_BITS_AND_BYTES = True
 try:
    pass
 except Exception:
    HAS_BITS_AND_BYTES = False
 class BloomCausalLMBatch(CausalLMBatch):
    @classmethod
@ -95,138 +89,9 @@ class BLOOMSharded(CausalLM):
            world_size=world_size,
        )
 <<<<<<< HEAD
    @staticmethod
    def load_weights(
        model,
        filenames: List[str],
        quantize: Optional[str],
        device: torch.device,
        dtype: torch.dtype,
        rank: int,
        world_size: int,
    ):
        """Load this rank's shard of every checkpoint tensor into ``model``.

        Every tensor in every file of ``filenames`` is opened with ``safe_open``,
        sliced according to the type of the submodule that owns it, cast to
        ``dtype``, optionally wrapped for int8 inference, and stored directly in
        ``module._parameters``.

        Args:
            model: Module whose ``named_parameters()`` and ``get_submodule()``
                are used to locate the destination of each tensor.
            filenames: Checkpoint files to read with ``safe_open``.
            quantize: ``None`` for no quantization, ``"bitsandbytes"`` for int8
                linear weights, or ``"gptq"`` (rejected as not implemented).
            device: Device the final tensors are moved to.
            dtype: Dtype every loaded tensor is cast to before quantization.
            rank: Index of this process; selects which block of a sharded
                tensor is kept.
            world_size: Number of blocks each sharded tensor is split into.

        Raises:
            ValueError: If a sliced tensor's shape differs from the shape of
                the parameter it replaces, or ``quantize`` is an unknown value.
            ImportError: If ``quantize == "bitsandbytes"`` while
                ``HAS_BITS_AND_BYTES`` is false.
            NotImplementedError: If ``quantize == "gptq"``.
        """
        # Snapshot of the model's current parameters, used only for the shape check.
        parameters = dict(model.named_parameters())
        for file in filenames:
            # When quantizing, tensors are opened on CPU; otherwise directly on `device`.
            with safe_open(
                file, framework="pt", device=str(device) if quantize is None else "cpu"
            ) as f:
                for name in f.keys():
                    # Checkpoint keys without a "transformer." / "lm_head." prefix are
                    # mapped under "transformer." to match the model's parameter names.
                    if name.startswith("transformer.") or name.startswith("lm_head."):
                        full_name = name
                    else:
                        full_name = f"transformer.{name}"
                    module_name, param_name = full_name.rsplit(".", 1)
                    module = model.get_submodule(module_name)
                    current_tensor = parameters[full_name]
                    # Lazy slice handle: only the indexed range below is read.
                    slice_ = f.get_slice(name)
                    if isinstance(module, TensorParallelColumnLinear):
                        # Column-parallel: keep this rank's block along dim 0.
                        # NOTE(review): floor division means any remainder rows
                        # (size % world_size) are loaded by no rank.
                        size = slice_.get_shape()[0]
                        block_size = size // world_size
                        start = rank * block_size
                        stop = (rank + 1) * block_size
                        tensor = slice_[start:stop]
                    elif isinstance(module, TensorParallelRowLinear):
                        if param_name == "weight":
                            # Row-parallel weight: keep this rank's block along dim 1.
                            size = slice_.get_shape()[1]
                            block_size = size // world_size
                            start = rank * block_size
                            stop = (rank + 1) * block_size
                            tensor = slice_[:, start:stop]
                        else:
                            # Non-weight parameters of a row-parallel layer are read whole.
                            tensor = slice_[:]
                            # XXX: Hack for Rowlinear to add the bias only once.
                            if rank != 0:
                                tensor = torch.zeros_like(tensor)
                    elif (
                        isinstance(module, TensorParallelEmbedding)
                        or name == "lm_head.weight"
                    ):
                        # Embedding / LM head: keep this rank's block along dim 0.
                        size = slice_.get_shape()[0]
                        block_size = size // world_size
                        start = rank * block_size
                        stop = (rank + 1) * block_size
                        tensor = slice_[start:stop]
                    else:
                        # Everything else is loaded in full on every rank.
                        tensor = slice_[:]
                    # The shard must match the shape of the parameter it replaces.
                    if current_tensor.shape != tensor.shape:
                        raise ValueError(
                            f"Name {name} -- Current {current_tensor.shape} and got {tensor.shape}"
                        )
                    tensor = tensor.contiguous().to(dtype)
                    if quantize == "bitsandbytes":
                        if not HAS_BITS_AND_BYTES:
                            raise ImportError(
                                "bitsandbytes is not available on your machine either because it is not installed "
                                "or you don't have a GPU.\n"
                                "You can install it with `pip install bitsandbytes`."
                            )
                        # Only the weights of the two tensor-parallel linear classes are
                        # wrapped as int8; this is an exact type match, so subclasses of
                        # those classes take the plain `.to(device)` branch below.
                        if (
                            type(module)
                            in [TensorParallelRowLinear, TensorParallelColumnLinear]
                            and param_name == "weight"
                        ):
                            # NOTE(review): presumably the move to `device` is what
                            # populates `CB` / `SCB` on the Int8Params -- confirm
                            # against the bitsandbytes documentation.
                            tensor = Int8Params(
                                tensor,
                                has_fp16_weights=False,
                                requires_grad=False,
                            ).to(device)
                            state = bnb.MatmulLtState()
                            state.threshold = 6.0
                            state.has_fp16_weights = False
                            state.memory_efficient_backward = False
                            state.use_pool = True
                            # Hand the quantized buffers over to the matmul state and
                            # clear them on the parameter.
                            state.CB = tensor.CB
                            state.SCB = tensor.SCB
                            tensor.CB = None
                            tensor.SCB = None
                            # `state` is passed as an argument so each module's closure
                            # keeps its own state object rather than the loop variable.
                            def replace_linear(state):
                                def linear(input, weight, bias):
                                    out = bnb.matmul(
                                        input,
                                        weight,
                                        state=state,
                                        threshold=state.threshold,
                                        bias=bias,
                                    )
                                    if state.CB is not None:
                                        # we converted 8-bit row major to turing/ampere format
                                        # in the first inference pass
                                        # we no longer need the row-major weight
                                        del state.CB
                                        weight.data = state.CxB
                                    return out
                                return linear
                            # Replace the module's `linear` attribute with the int8 matmul closure.
                            module.linear = replace_linear(state)
                        else:
                            tensor = tensor.to(device)
                    elif quantize == "gptq":
                        raise NotImplementedError("`gptq` is not implemented for now")
                    elif quantize is None:
                        tensor = tensor.to(device)
                    else:
                        raise ValueError(f"Unexpected quantize `{quantize}`")
                    # Written straight into `_parameters`, replacing the existing entry.
                    module._parameters[param_name] = tensor
                    # The LM head reuses the very same tensor object as the word embeddings.
                    if name == "word_embeddings.weight":
                        model.lm_head._parameters["weight"] = tensor
 =======
    @property
    def batch_type(self) -> Type[CausalLMBatch]:
        """Return the batch class associated with this model: ``BloomCausalLMBatch``."""
        batch_cls = BloomCausalLMBatch
        return batch_cls
 >>>>>>> ba30033 (Fused all commits for saner rebase..)
    def forward(
        self, input_ids, attention_mask, position_ids, past_key_values: Optional = None