Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-11 12:24:53 +00:00)

commit 2629193efa
parent 76834c9989
@@ -163,20 +163,17 @@ class Weights:
             g_idx = self.get_tensor(f"{prefix}.g_idx")
         elif quantize == "gptq" and quant_method == "awq":
             log_once(
-                logger.info,
-                "Converting AWQ weights to Exllama/GPTQ packing format, "
-                "in order used with Exllama/GPTQ kernels.",
+                logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
             )
             from text_generation_server.utils.awq.conversion_utils import (
                 fast_awq_to_gptq,
             )
 
             qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
-            g_idx = torch.zeros(
-                (qweight.shape[0] * 32 // bits),
-                dtype=torch.int32,
-                device=qweight.device,
-            )
+            g_idx = (
+                torch.arange(qweight.shape[0] * (32 // bits), device=qweight.device)
+                // groupsize
+            ).to(dtype=torch.int32)
         else:
             g_idx = None
 
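Note: the replaced torch.zeros(...) tagged every input row of the repacked weight as group 0, while the Exllama/GPTQ kernels expect row i to carry its group index i // groupsize. A minimal standalone sketch of the difference follows; the bits and groupsize values are illustrative assumptions, not taken from this diff.

    import torch

    bits, groupsize = 4, 128  # illustrative values
    qweight = torch.empty(256, 512, dtype=torch.int32)  # 32 // bits values packed per int32 row

    # Old behavior: every input row mapped to group 0.
    g_idx_old = torch.zeros((qweight.shape[0] * 32 // bits), dtype=torch.int32)

    # New behavior: input row i belongs to group i // groupsize.
    g_idx_new = (
        torch.arange(qweight.shape[0] * (32 // bits)) // groupsize
    ).to(dtype=torch.int32)

    assert g_idx_old.shape == g_idx_new.shape
    assert int(g_idx_new[groupsize]) == 1  # second group starts at row `groupsize`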
@@ -230,20 +227,17 @@ class Weights:
             g_idx = w[0]
         elif quantize == "gptq" and quant_method == "awq":
             log_once(
-                logger.info,
-                "Converting AWQ weights to Exllama/GPTQ packing format, "
-                "in order used with Exllama/GPTQ kernels.",
+                logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
             )
             from text_generation_server.utils.awq.conversion_utils import (
                 fast_awq_to_gptq,
             )
 
             qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
-            g_idx = torch.zeros(
-                (qweight.shape[0] * 32 // bits),
-                dtype=torch.int32,
-                device=qweight.device,
-            )
+            g_idx = (
+                torch.arange(qweight.shape[0] * (32 // bits), device=qweight.device)
+                // groupsize
+            ).to(dtype=torch.int32)
         else:
             g_idx = None
 
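Note on log_once: all three call sites pass a logger method plus the message, which suggests the helper deduplicates on its arguments so the conversion notice is emitted only once. Its real implementation is not part of this diff; a minimal sketch under that assumption:

    from functools import lru_cache

    @lru_cache(maxsize=None)
    def log_once(log, msg: str):
        # First call with a given (log, msg) pair emits the message;
        # later identical calls hit the cache and do nothing.
        log(msg)

With this, log_once(logger.info, "Converting AWQ model to Exllama/GPTQ packing format.") logs a single line even though the conversion is reachable from three separate code paths.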
@@ -340,20 +334,17 @@ class Weights:
 
             if quant_method == "awq":
                 log_once(
-                    logger.info,
-                    "Converting AWQ weights to Exllama/GPTQ packing format, "
-                    "in order used with Exllama/GPTQ kernels.",
+                    logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
                 )
                 from text_generation_server.utils.awq.conversion_utils import (
                     fast_awq_to_gptq,
                 )
 
                 qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
-                g_idx = torch.zeros(
-                    (qweight.shape[0] * 32 // bits),
-                    dtype=torch.int32,
-                    device=qweight.device,
-                )
+                g_idx = (
+                    torch.arange(qweight.shape[0] * (32 // bits), device=qweight.device)
+                    // groupsize
+                ).to(dtype=torch.int32)
 
             weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
         elif quantize == "awq":
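Note: the corrected g_idx has to line up with the per-group scales that end up in the (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama) tuple: with in_features = qweight.shape[0] * (32 // bits) input rows, g_idx takes values 0 .. in_features // groupsize - 1, one per scale row. A quick standalone consistency check (all shapes are illustrative assumptions):

    import torch

    bits, groupsize, in_features, out_features = 4, 128, 2048, 4096  # assumed values
    qweight = torch.empty(in_features * bits // 32, out_features, dtype=torch.int32)
    scales = torch.empty(in_features // groupsize, out_features, dtype=torch.float16)

    g_idx = (
        torch.arange(qweight.shape[0] * (32 // bits)) // groupsize
    ).to(dtype=torch.int32)

    assert g_idx.numel() == in_features             # one group id per input row
    assert int(g_idx.max()) + 1 == scales.shape[0]  # one scale row per group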