mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 04:14:52 +00:00
use exllamav2QuantLinear instead of exllama1
This commit is contained in:
parent
6a0a378c0c
commit
84dd432a43
@@ -38,6 +38,7 @@ if os.getenv("DISABLE_EXLLAMA") == "True":
|
|||||||
elif CAN_EXLLAMA:
|
elif CAN_EXLLAMA:
|
||||||
try:
|
try:
|
||||||
from text_generation_server.utils.gptq.exllama import Ex4bitLinear
|
from text_generation_server.utils.gptq.exllama import Ex4bitLinear
|
||||||
|
from text_generation_server.utils.gptq.exllamav2 import QuantLinear as exllamav2QuantLinear
|
||||||
|
|
||||||
HAS_EXLLAMA = True
|
HAS_EXLLAMA = True
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@@ -308,7 +309,7 @@ def get_linear(weight, bias, quantize):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if use_exllama:
|
if use_exllama:
|
||||||
linear = Ex4bitLinear(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
|
linear = exllamav2QuantLinear(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
|
||||||
else:
|
else:
|
||||||
linear = QuantLinear(
|
linear = QuantLinear(
|
||||||
qweight,
|
qweight,
|
||||||
|
Loading…
Reference in New Issue
Block a user