mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 04:14:52 +00:00
use exllamav2QuantLinear instead of exllama1
This commit is contained in:
parent
a02f6839e9
commit
f96d997494
@@ -38,6 +38,7 @@ if os.getenv("DISABLE_EXLLAMA") == "True":
|
||||
elif CAN_EXLLAMA:
|
||||
try:
|
||||
from text_generation_server.utils.gptq.exllama import Ex4bitLinear
|
||||
+ from text_generation_server.utils.gptq.exllamav2 import QuantLinear as exllamav2QuantLinear
|
||||
|
||||
HAS_EXLLAMA = True
|
||||
except ImportError:
|
||||
@@ -308,7 +309,7 @@ def get_linear(weight, bias, quantize):
|
||||
)
|
||||
|
||||
if use_exllama:
|
||||
- linear = Ex4bitLinear(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
+ linear = exllamav2QuantLinear(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
|
||||
else:
|
||||
linear = QuantLinear(
|
||||
qweight,
|
||||
|
Loading…
Reference in New Issue
Block a user