diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py index 63b9a406..b3fa2abb 100644 --- a/server/text_generation_server/utils/layers.py +++ b/server/text_generation_server/utils/layers.py @@ -151,7 +151,7 @@ def get_linear(weight, bias, quantize): f"The passed weight is not `gptq` compatible, loader needs to be updated." ) - if use_triton_kernel: + if use_triton_kernel or bits != 4: linear = QuantLinear( qweight, qzeros,