Fix rebase.

2025-09-10 03:44:54 +00:00 · 2023-06-06 11:20:53 +02:00 · 2023-06-06 11:20:53 +02:00 · 2a1ecf3863
commit 2a1ecf3863
parent 7fa79f02ca
1 changed files with 0 additions and 135 deletions
--- a/server/text_generation_server/models/bloom.py
+++ b/server/text_generation_server/models/bloom.py
@ -21,12 +21,6 @@ from text_generation_server.utils import (
    Weights,
 )

-HAS_BITS_AND_BYTES = True
-try:
-    pass
-except Exception:
-    HAS_BITS_AND_BYTES = False
-

 class BloomCausalLMBatch(CausalLMBatch):
    @classmethod
@ -95,138 +89,9 @@ class BLOOMSharded(CausalLM):
            world_size=world_size,
        )

-<<<<<<< HEAD
-    @staticmethod
-    def load_weights(
-        model,
-        filenames: List[str],
-        quantize: Optional[str],
-        device: torch.device,
-        dtype: torch.dtype,
-        rank: int,
-        world_size: int,
-    ):
-        parameters = dict(model.named_parameters())
-        for file in filenames:
-            with safe_open(
-                file, framework="pt", device=str(device) if quantize is None else "cpu"
-            ) as f:
-                for name in f.keys():
-                    if name.startswith("transformer.") or name.startswith("lm_head."):
-                        full_name = name
-                    else:
-                        full_name = f"transformer.{name}"
-
-                    module_name, param_name = full_name.rsplit(".", 1)
-                    module = model.get_submodule(module_name)
-                    current_tensor = parameters[full_name]
-
-                    slice_ = f.get_slice(name)
-
-                    if isinstance(module, TensorParallelColumnLinear):
-                        size = slice_.get_shape()[0]
-                        block_size = size // world_size
-                        start = rank * block_size
-                        stop = (rank + 1) * block_size
-                        tensor = slice_[start:stop]
-                    elif isinstance(module, TensorParallelRowLinear):
-                        if param_name == "weight":
-                            size = slice_.get_shape()[1]
-                            block_size = size // world_size
-                            start = rank * block_size
-                            stop = (rank + 1) * block_size
-                            tensor = slice_[:, start:stop]
-                        else:
-                            tensor = slice_[:]
-                            # XXX: Hack for Rowlinear to add the bias only once.
-                            if rank != 0:
-                                tensor = torch.zeros_like(tensor)
-                    elif (
-                        isinstance(module, TensorParallelEmbedding)
-                        or name == "lm_head.weight"
-                    ):
-                        size = slice_.get_shape()[0]
-                        block_size = size // world_size
-                        start = rank * block_size
-                        stop = (rank + 1) * block_size
-                        tensor = slice_[start:stop]
-                    else:
-                        tensor = slice_[:]
-
-                    if current_tensor.shape != tensor.shape:
-                        raise ValueError(
-                            f"Name {name} -- Current {current_tensor.shape} and got {tensor.shape}"
-                        )
-
-                    tensor = tensor.contiguous().to(dtype)
-
-                    if quantize == "bitsandbytes":
-                        if not HAS_BITS_AND_BYTES:
-                            raise ImportError(
-                                "bitsandbytes is not available on your machine either because it is not installed "
-                                "or you don't have a GPU.\n"
-                                "You can install it with `pip install bitsandbytes`."
-                            )
-
-                        if (
-                            type(module)
-                            in [TensorParallelRowLinear, TensorParallelColumnLinear]
-                            and param_name == "weight"
-                        ):
-                            tensor = Int8Params(
-                                tensor,
-                                has_fp16_weights=False,
-                                requires_grad=False,
-                            ).to(device)
-                            state = bnb.MatmulLtState()
-                            state.threshold = 6.0
-                            state.has_fp16_weights = False
-                            state.memory_efficient_backward = False
-                            state.use_pool = True
-                            state.CB = tensor.CB
-                            state.SCB = tensor.SCB
-                            tensor.CB = None
-                            tensor.SCB = None
-
-                            def replace_linear(state):
-                                def linear(input, weight, bias):
-                                    out = bnb.matmul(
-                                        input,
-                                        weight,
-                                        state=state,
-                                        threshold=state.threshold,
-                                        bias=bias,
-                                    )
-
-                                    if state.CB is not None:
-                                        # we converted 8-bit row major to turing/ampere format
-                                        # in the first inference pass
-                                        # we no longer need the row-major weight
-                                        del state.CB
-                                        weight.data = state.CxB
-
-                                    return out
-
-                                return linear
-
-                            module.linear = replace_linear(state)
-                        else:
-                            tensor = tensor.to(device)
-                    elif quantize == "gptq":
-                        raise NotImplementedError("`gptq` is not implemented for now")
-                    elif quantize is None:
-                        tensor = tensor.to(device)
-                    else:
-                        raise ValueError(f"Unexpected quantize `{quantize}`")
-
-                    module._parameters[param_name] = tensor
-                    if name == "word_embeddings.weight":
-                        model.lm_head._parameters["weight"] = tensor
-=======
    @property
    def batch_type(self) -> Type[CausalLMBatch]:
        return BloomCausalLMBatch
->>>>>>> ba30033 (Fused all commits for saner rebase..)

    def forward(
        self, input_ids, attention_mask, position_ids, past_key_values: Optional = None