mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-20 22:32:07 +00:00
# Add AWQ quantization inference support
Fixes
https://github.com/huggingface/text-generation-inference/issues/781
This PR (partially) adds support for AWQ quantization for inference.
More information on AWQ [here](https://arxiv.org/abs/2306.00978). In
general, AWQ is faster and more accurate than GPTQ, which is currently
supported by TGI.
This PR installs 4-bit GEMM custom CUDA kernels released by AWQ authors
(in `requirements.txt`, just one line change).
A quick way to test this PR is to bring up TGI as follows:
```
text-generation-server download-weights abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq
text-generation-launcher \
--huggingface-hub-cache ~/.cache/huggingface/hub/ \
--model-id abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq \
--trust-remote-code --port 8080 \
--max-input-length 2048 --max-total-tokens 4096 --max-batch-prefill-tokens 4096 \
--quantize awq
```
Please note:
* This PR was tested with FlashAttention v2 and vLLM.
* This PR adds support for AWQ inference, not quantizing the models.
That needs to be done outside of TGI; instructions are
[here](https://github.com/mit-han-lab/llm-awq/tree/f084f40bd996f3cf3a0633c1ad7d9d476c318aaa).
* This PR only adds support for `FlashLlama` models for now.
* Multi-GPU setup has not been tested.
* No integration tests have been added so far, will add later if
maintainers are interested in this change.
* This PR can be tested on any of the models released
[here](https://huggingface.co/abhinavkulkarni?sort_models=downloads#models).
Please refer to the linked issue for benchmarks for
[abhinavkulkarni/meta-llama-Llama-2-7b-chat-hf-w4-g128-awq](https://huggingface.co/abhinavkulkarni/meta-llama-Llama-2-7b-chat-hf-w4-g128-awq)
vs
[TheBloke/Llama-2-7b-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ).
Please note, AWQ has released faster (and in case of Llama, fused)
kernels for 4-bit GEMM, currently at the top of the `main` branch at
https://github.com/mit-han-lab/llm-awq, but this PR uses an older commit
that has been tested to work. We can switch to latest commit later on.
## Who can review?
@OlivierDehaene OR @Narsil
---------
Co-authored-by: Abhinav Kulkarni <abhinav@concentric.ai>
54 lines
1.9 KiB
Python
54 lines
1.9 KiB
Python
# Copied logic from https://github.com/mit-han-lab/llm-awq/blob/f084f40bd996f3cf3a0633c1ad7d9d476c318aaa/awq/quantize/qmodule.py
|
|
|
|
import math
|
|
import torch
|
|
import torch.nn as nn
|
|
import awq_inference_engine # with CUDA kernels
|
|
|
|
|
|
class ScaledActivation(nn.Module):
    """Wrap an activation and divide its output by a per-feature scale.

    The scale tensor is stored as an ``nn.Parameter`` and is broadcast over
    the last dimension of the activation output.
    """

    def __init__(self, module, scales):
        """Store the wrapped activation ``module`` and its ``scales`` tensor."""
        super().__init__()
        self.act = module
        self.scales = nn.Parameter(scales.data)

    def forward(self, x):
        """Return ``act(x)`` divided element-wise by the broadcast scales."""
        activated = self.act(x)
        # Reshape to (1, 1, N) so the scales broadcast along the last axis,
        # and move them to the input's device before dividing.
        divisor = self.scales.view(1, 1, -1).to(x.device)
        return activated / divisor
|
|
|
|
|
|
class WQLinear(nn.Module):
    """Linear layer that runs on AWQ-quantized weights via a CUDA GEMM kernel.

    Args:
        w_bit: Weight bit-width. Only ``4`` is accepted.
        group_size: Quantization group size along the input dimension;
            ``-1`` means one group spanning all input features.
        qweight: Quantized weight tensor. ``qweight.shape[0]`` is taken as
            ``in_features`` and ``qweight.shape[1] * 32 // w_bit`` as
            ``out_features`` (presumably int32-packed values; confirm
            against the kernel).
        qzeros: Quantized zero points, passed through to the kernel.
        scales: Quantization scales, passed through to the kernel.
        bias: Optional bias tensor added to the output. ``None`` (or
            ``False``) means the layer has no bias.

    Raises:
        NotImplementedError: If ``w_bit`` is not 4.
    """

    def __init__(self, w_bit, group_size, qweight, qzeros, scales, bias):
        super().__init__()

        if w_bit not in [4]:
            raise NotImplementedError("Only 4-bit are supported for now.")

        self.in_features = qweight.shape[0]
        self.out_features = qweight.shape[1] * 32 // w_bit

        self.w_bit = w_bit
        self.group_size = group_size if group_size != -1 else self.in_features
        # quick sanity check (make sure alignment)
        assert self.in_features % self.group_size == 0
        assert self.out_features % (32 // self.w_bit) == 0

        self.register_buffer('qweight', qweight)
        self.register_buffer('qzeros', qzeros)
        self.register_buffer('scales', scales)
        # Do not rely on tensor truthiness here: ``if bias:`` raises for any
        # tensor with more than one element. ``False`` is still accepted as a
        # "no bias" sentinel so falsy non-tensor arguments behave as before.
        if bias is None or bias is False:
            self.bias = None
        else:
            self.register_buffer('bias', bias)

    @torch.no_grad()
    def forward(self, x):
        """Apply the quantized linear transform to the last dimension of ``x``."""
        out_shape = x.shape[:-1] + (self.out_features, )
        # The kernel takes a 2-D input, so all leading dimensions are flattened.
        # NOTE(review): the trailing 8 looks like the split-k iteration count
        # used by upstream llm-awq; confirm against the kernel signature.
        out = awq_inference_engine.gemm_forward_cuda(
            x.reshape(-1, x.shape[-1]), self.qweight, self.scales, self.qzeros, 8
        )
        out = out + self.bias if self.bias is not None else out
        return out.reshape(out_shape)

    def extra_repr(self) -> str:
        """Return the layer summary shown in the module's ``repr``."""
        return 'in_features={}, out_features={}, bias={}, w_bit={}, group_size={}'.format(
            self.in_features, self.out_features, self.bias is not None, self.w_bit, self.group_size
        )
|