diff --git a/server/text_generation_server/utils/gptq/quant_linear.py b/server/text_generation_server/utils/gptq/quant_linear.py
index f1ba9bf9..b01788bb 100644
--- a/server/text_generation_server/utils/gptq/quant_linear.py
+++ b/server/text_generation_server/utils/gptq/quant_linear.py
@@ -386,7 +386,7 @@ def ext_q4_matmul(x, q4, q4_width):
 class Ex4bitLinear:
     """Linear layer implementation with per-group 4-bit quantization of the weights"""
 
-    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize, device, world_size: int):
+    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize, device):
         assert bits == 4
 
         self.device = device
@@ -417,9 +417,6 @@ class Ex4bitLinear:
         # Infer groupsize from height of qzeros
         self.groupsize = None
         if self.qzeros.shape[0] > 1:
-            if world_size is None:
-                world_size = 1
-
-            # self.groupsize = (self.qweight.shape[0] * 8) // (self.qzeros.shape[0] // world_size)
             self.groupsize = (self.qweight.shape[0] * 8) // (self.qzeros.shape[0])
 
         assert groupsize == self.groupsize
diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
index dd399ee3..32de46f4 100644
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@@ -156,7 +156,7 @@ def get_linear(weight, bias, quantize, device = None):
                 f"The passed weight is not `gptq` compatible, loader needs to be updated."
             )
 
-        linear = Ex4bitLinear(qweight, qzeros, scales, g_idx, bias, bits, groupsize, device, world_size)
+        linear = Ex4bitLinear(qweight, qzeros, scales, g_idx, bias, bits, groupsize, device)
     else:
         raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.")
     return linear
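
The functional change above is that Ex4bitLinear drops its world_size parameter and infers the group size directly from the full height of qzeros, instead of scaling the group count by world_size (the removed commented-out line). Below is a minimal standalone sketch of that inference, assuming 4-bit GPTQ packing (eight values per int32 along dim 0 of qweight); the helper name and example shapes are hypothetical, not part of the patch.

import torch

def infer_groupsize(qweight: torch.Tensor, qzeros: torch.Tensor):
    # Hypothetical helper mirroring the patched Ex4bitLinear.__init__ logic.
    # qweight packs eight 4-bit weights per int32 row, so the unpacked
    # input dimension is qweight.shape[0] * 8; qzeros holds one packed
    # row of zero-points per quantization group, so its height is the
    # number of groups.
    if qzeros.shape[0] <= 1:
        # A single zero-point row means the weights are not per-group quantized.
        return None
    return (qweight.shape[0] * 8) // qzeros.shape[0]

# Hypothetical shapes: 4096 input features packed into 512 int32 rows,
# with 32 groups of zero-points, giving a group size of 128.
qweight = torch.zeros((512, 4096), dtype=torch.int32)
qzeros = torch.zeros((32, 512), dtype=torch.int32)
assert infer_groupsize(qweight, qzeros) == 128

After the patch, the inferred value is asserted against the groupsize passed in by the loader, so a qzeros tensor whose height does not match the expected number of groups fails fast at construction time.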