diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py
index 8d0ceeb4..50b3b76a 100644
--- a/server/text_generation_server/models/bloom.py
+++ b/server/text_generation_server/models/bloom.py
@@ -21,12 +21,6 @@ from text_generation_server.utils import (
     Weights,
 )
 
-HAS_BITS_AND_BYTES = True
-try:
-    pass
-except Exception:
-    HAS_BITS_AND_BYTES = False
-
 
 class BloomCausalLMBatch(CausalLMBatch):
     @classmethod
@@ -95,138 +89,9 @@ class BLOOMSharded(CausalLM):
             world_size=world_size,
         )
 
-<<<<<<< HEAD
-    @staticmethod
-    def load_weights(
-        model,
-        filenames: List[str],
-        quantize: Optional[str],
-        device: torch.device,
-        dtype: torch.dtype,
-        rank: int,
-        world_size: int,
-    ):
-        parameters = dict(model.named_parameters())
-        for file in filenames:
-            with safe_open(
-                file, framework="pt", device=str(device) if quantize is None else "cpu"
-            ) as f:
-                for name in f.keys():
-                    if name.startswith("transformer.") or name.startswith("lm_head."):
-                        full_name = name
-                    else:
-                        full_name = f"transformer.{name}"
-
-                    module_name, param_name = full_name.rsplit(".", 1)
-                    module = model.get_submodule(module_name)
-                    current_tensor = parameters[full_name]
-
-                    slice_ = f.get_slice(name)
-
-                    if isinstance(module, TensorParallelColumnLinear):
-                        size = slice_.get_shape()[0]
-                        block_size = size // world_size
-                        start = rank * block_size
-                        stop = (rank + 1) * block_size
-                        tensor = slice_[start:stop]
-                    elif isinstance(module, TensorParallelRowLinear):
-                        if param_name == "weight":
-                            size = slice_.get_shape()[1]
-                            block_size = size // world_size
-                            start = rank * block_size
-                            stop = (rank + 1) * block_size
-                            tensor = slice_[:, start:stop]
-                        else:
-                            tensor = slice_[:]
-                            # XXX: Hack for Rowlinear to add the bias only once.
-                            if rank != 0:
-                                tensor = torch.zeros_like(tensor)
-                    elif (
-                        isinstance(module, TensorParallelEmbedding)
-                        or name == "lm_head.weight"
-                    ):
-                        size = slice_.get_shape()[0]
-                        block_size = size // world_size
-                        start = rank * block_size
-                        stop = (rank + 1) * block_size
-                        tensor = slice_[start:stop]
-                    else:
-                        tensor = slice_[:]
-
-                    if current_tensor.shape != tensor.shape:
-                        raise ValueError(
-                            f"Name {name} -- Current {current_tensor.shape} and got {tensor.shape}"
-                        )
-
-                    tensor = tensor.contiguous().to(dtype)
-
-                    if quantize == "bitsandbytes":
-                        if not HAS_BITS_AND_BYTES:
-                            raise ImportError(
-                                "bitsandbytes is not available on your machine either because it is not installed "
-                                "or you don't have a GPU.\n"
-                                "You can install it with `pip install bitsandbytes`."
-                            )
-
-                        if (
-                            type(module)
-                            in [TensorParallelRowLinear, TensorParallelColumnLinear]
-                            and param_name == "weight"
-                        ):
-                            tensor = Int8Params(
-                                tensor,
-                                has_fp16_weights=False,
-                                requires_grad=False,
-                            ).to(device)
-                            state = bnb.MatmulLtState()
-                            state.threshold = 6.0
-                            state.has_fp16_weights = False
-                            state.memory_efficient_backward = False
-                            state.use_pool = True
-                            state.CB = tensor.CB
-                            state.SCB = tensor.SCB
-                            tensor.CB = None
-                            tensor.SCB = None
-
-                            def replace_linear(state):
-                                def linear(input, weight, bias):
-                                    out = bnb.matmul(
-                                        input,
-                                        weight,
-                                        state=state,
-                                        threshold=state.threshold,
-                                        bias=bias,
-                                    )
-
-                                    if state.CB is not None:
-                                        # we converted 8-bit row major to turing/ampere format
-                                        # in the first inference pass
-                                        # we no longer need the row-major weight
-                                        del state.CB
-                                        weight.data = state.CxB
-
-                                    return out
-
-                                return linear
-
-                            module.linear = replace_linear(state)
-                        else:
-                            tensor = tensor.to(device)
-                    elif quantize == "gptq":
-                        raise NotImplementedError("`gptq` is not implemented for now")
-                    elif quantize is None:
-                        tensor = tensor.to(device)
-                    else:
-                        raise ValueError(f"Unexpected quantize `{quantize}`")
-
-                    module._parameters[param_name] = tensor
-                    if name == "word_embeddings.weight":
-                        model.lm_head._parameters["weight"] = tensor
-=======
     @property
     def batch_type(self) -> Type[CausalLMBatch]:
         return BloomCausalLMBatch
->>>>>>> ba30033 (Fused all commits for saner rebase..)
 
     def forward(
         self, input_ids, attention_mask, position_ids, past_key_values: Optional = None