From f96d997494f328f3e1dabadc39c69b9c8ce29e72 Mon Sep 17 00:00:00 2001
From: Florian Zimmermeister
Date: Wed, 25 Oct 2023 12:26:20 +0200
Subject: [PATCH] use exllamav2QuantLinear instead of exllama1

---
 server/text_generation_server/utils/layers.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
index 7bb95dd2..78f2de8e 100644
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@@ -38,6 +38,7 @@ if os.getenv("DISABLE_EXLLAMA") == "True":
 elif CAN_EXLLAMA:
     try:
         from text_generation_server.utils.gptq.exllama import Ex4bitLinear
+        from text_generation_server.utils.gptq.exllamav2 import QuantLinear as exllamav2QuantLinear

         HAS_EXLLAMA = True
     except ImportError:
@@ -308,7 +309,7 @@ def get_linear(weight, bias, quantize):
         )

         if use_exllama:
-            linear = Ex4bitLinear(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
+            linear = exllamav2QuantLinear(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
         else:
             linear = QuantLinear(
                 qweight,