mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-11 12:24:53 +00:00)

revert changes

parent 0b5b858779
commit 8665ab07ac
@@ -63,27 +63,20 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
         return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb())
 
     async def Warmup(self, request, context):
-        if self.quantize in ["gptq", "awq"]:
-            has_exllama_layers = False
-            for _, module in self.model.model.named_modules():
-                if hasattr(module, "QUANT_TYPE"):
-                    has_exllama_layers = True
-                    break
-
-            if has_exllama_layers:
-                try:
-                    # When using GPTQ or AWQ, Exllama kernels need some global kernels
-                    # For which we have the finale shapes only after the model has loaded
-                    # This will allocate those buffers.
-                    from text_generation_server.utils.layers import (
-                        create_exllama_buffers,
-                        set_device,
-                    )
-
-                    set_device(self.model.device)
-                    create_exllama_buffers(request.max_prefill_tokens)
-                except ImportError:
-                    pass
+        if self.quantize == "gptq":
+            try:
+                # When using GPTQ, Exllama kernels need some global kernels
+                # For which we have the finale shapes only after the model has loaded
+                # This will allocate those buffers.
+                from text_generation_server.utils.layers import (
+                    create_exllama_buffers,
+                    set_device,
+                )
+
+                set_device(self.model.device)
+                create_exllama_buffers(request.max_prefill_tokens)
+            except ImportError:
+                pass
 
         if (
             self.model.batch_type == IdeficsCausalLMBatch
@@ -1,98 +0,0 @@
-import torch
-from typing import List
-
-
-AWQ_PACK_ORDER = [0, 2, 4, 6, 1, 3, 5, 7]
-REVERSE_AWQ_PACK_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
-
-
-def pack(imatrix: torch.Tensor, direction: str = "column"):
-    """
-    Packs a 4-bit integer matrix into a packed 32-bit integer matrix.
-    Args:
-        imatrix (torch.Tensor): matrix of integers
-        direction (str): direction of packing, either "column" or "row"
-    Returns:
-        qmatrix (torch.Tensor): packed matrix of integers
-    """
-    imatrix = imatrix.to(torch.int8)
-    imatrix = torch.bitwise_and(imatrix, 0x0F)  # eventually correct overflow
-
-    shifts = torch.arange(0, 32, 4, device=imatrix.device)
-
-    if direction == "column":
-        imatrix = imatrix.view(-1, imatrix.shape[1] // (32 // 4), (32 // 4))
-        qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, None, :]).sum(dim=-1)
-
-    elif direction == "row":
-        imatrix = imatrix.view(imatrix.shape[0] // (32 // 4), (32 // 4), -1)
-        qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, :, None]).sum(dim=1)
-
-    qmatrix = qmatrix.to(torch.int32)
-
-    return qmatrix
-
-
-def unpack(qmatrix: torch.Tensor, direction: str = "column"):
-    """
-    Unpacks a 32-bit packed integer matrix into a 4-bit integer matrix.
-    Args:
-        qmatrix (torch.Tensor): matrix of packed integers
-        direction (str): direction of unpacking, either "column" or "row"
-    Returns:
-        imatrix (torch.Tensor): matrix of integers
-    """
-    shifts = torch.arange(0, 32, 4, device=qmatrix.device)
-
-    if direction == "column":
-        imatrix = torch.bitwise_right_shift(
-            qmatrix[:, :, None], shifts[None, None, :]
-        ).view(qmatrix.shape[0], -1)
-
-    elif direction == "row":
-        imatrix = torch.bitwise_right_shift(
-            qmatrix[:, None, :], shifts[None, :, None]
-        ).view(-1, qmatrix.shape[-1])
-
-    imatrix = imatrix.to(torch.int8) & 0x0F  # eventually correct overflow
-
-    return imatrix
-
-
-def apply_order(
-    imatrix: torch.Tensor,
-    direction: str = "column",
-    order: List[int] = AWQ_PACK_ORDER,
-):
-    """
-    Applies the order to a 4-bit integer matrix.
-    Args:
-        imatrix (torch.Tensor): matrix of integers
-        direction (str): direction of applying order, either "column" or "row"
-        order (List[int]): order to apply, default is AWQ_PACK_ORDER
-    Returns:
-        imatrix (torch.Tensor): matrix of integers
-    """
-    if direction == "column":
-        imatrix = imatrix.view(-1, (32 // 4))[:, order].view(imatrix.shape)
-    elif direction == "row":
-        imatrix = imatrix.view((32 // 4), -1)[order, :].view(imatrix.shape)
-
-    return imatrix
-
-
-def fast_awq_to_gptq(qweight, qzeros):
-    # awq uses column packing for both weights and zeros
-    izeros = unpack(qzeros, direction="column")
-    iweights = unpack(qweight, direction="column")
-
-    # Reverse the order of the iweight and izeros tensors
-    izeros = apply_order(izeros, direction="column", order=REVERSE_AWQ_PACK_ORDER)
-    iweights = apply_order(iweights, direction="column", order=REVERSE_AWQ_PACK_ORDER)
-    # Subtract 1 from the izeros tensor (gptq adds 1 to the zeros)
-    izeros = izeros - 1
-    # exllama uses row packing for weights and column packing for zeros
-    qzeros = pack(izeros, direction="column")
-    qweight = pack(iweights, direction="row")
-
-    return qweight, qzeros
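
For context on the deleted helpers: pack() folds each group of eight 4-bit values into one int32 by shifting the i-th nibble left by 4*i bits and summing, unpack() reverses that, and AWQ_PACK_ORDER / REVERSE_AWQ_PACK_ORDER are inverse permutations of the eight nibble slots. A small standalone check of both facts, using toy values that are not from the repository:

import torch

# Eight 4-bit values, i.e. exactly one int32 worth, packed column-wise the
# same way the deleted pack()/unpack() helpers do it.
nibbles = torch.tensor([[1, 2, 3, 4, 5, 6, 7, 0]], dtype=torch.int8)
shifts = torch.arange(0, 32, 4)  # 0, 4, 8, ..., 28

packed = (
    torch.bitwise_left_shift(nibbles.to(torch.int64), shifts[None, :])
    .sum(dim=-1)
    .to(torch.int32)
)  # shape (1,): one int32 holding all eight nibbles

unpacked = (
    torch.bitwise_right_shift(packed[:, None], shifts[None, :]).to(torch.int8) & 0x0F
)
assert torch.equal(unpacked, nibbles)

# The two pack orders undo each other: applying AWQ_PACK_ORDER and then
# REVERSE_AWQ_PACK_ORDER to the nibble columns is the identity permutation.
AWQ_PACK_ORDER = [0, 2, 4, 6, 1, 3, 5, 7]
REVERSE_AWQ_PACK_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
assert [AWQ_PACK_ORDER[i] for i in REVERSE_AWQ_PACK_ORDER] == list(range(8))
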
@@ -25,8 +25,6 @@ HAS_AWQ = True
 try:
     from text_generation_server.utils.awq.quantize.qmodule import WQLinear
 except ImportError:
-    from text_generation_server.utils.awq.pack_utils import fast_awq_to_gptq
-
     HAS_AWQ = False
 
 try:
@@ -351,23 +349,19 @@ def get_linear(weight, bias, quantize):
             raise NotImplementedError(
                 f"The passed weight is not `awq` compatible, loader needs to be updated."
             )
-        if HAS_AWQ:
-            linear = WQLinear(
-                w_bit=bits,
-                group_size=groupsize,
-                qweight=qweight,
-                qzeros=qzeros,
-                scales=scales,
-                bias=bias is not None,
-            )
-        elif HAS_EXLLAMA:
-            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
-            linear = ExllamaQuantLinear(
-                qweight, qzeros, scales, None, bias, bits, groupsize
-            )
-        else:
-            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
-            linear = QuantLinear(qweight, qzeros, scales, None, bias, bits, groupsize)
+        if IS_ROCM_SYSTEM:
+            raise NotImplementedError(
+                "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead "
+                "to use Exllama/GPTQ kernels for AWQ inference."
+            )
+        linear = WQLinear(
+            w_bit=bits,
+            group_size=groupsize,
+            qweight=qweight,
+            qzeros=qzeros,
+            scales=scales,
+            bias=bias is not None,
+        )
     else:
         raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.")
     return linear
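
The removed branches above relied on fast_awq_to_gptq from the deleted pack_utils module, whose `izeros = izeros - 1` step encodes the zero-point convention noted in its comments: GPTQ kernels add 1 back to the stored zero point, so the converter stores z - 1 and the dequantized weights stay identical. A toy check with made-up numbers:

scale = 0.05
q = 9        # a quantized 4-bit weight value
z_awq = 4    # the AWQ zero point for its group

# AWQ-style dequantization: (q - z) * scale
w_awq = (q - z_awq) * scale

# GPTQ kernels add 1 to the stored zero point before using it, so the
# converter stores z_awq - 1 to keep the dequantized value unchanged.
z_gptq_stored = z_awq - 1
w_gptq = (q - (z_gptq_stored + 1)) * scale

assert w_awq == w_gptq
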