text-generation-inference/server/text_generation_server/utils/awq/quantize/qmodule.py

# Copied logic from https://github.com/mit-han-lab/llm-awq/blob/f084f40bd996f3cf3a0633c1ad7d9d476c318aaa/awq/quantize/qmodule.py

import math
import torch
import torch.nn as nn
import awq_inference_engine  # with CUDA kernels


class ScaledActivation(nn.Module):
    """Wrap an activation module and divide its output by learned scales.

    Args:
        module: The activation module to wrap; stored as ``self.act``.
        scales: Tensor of scale factors. Its ``.data`` is wrapped in an
            ``nn.Parameter`` so it is registered on this module.
    """

    def __init__(self, module, scales):
        super().__init__()
        self.act = module
        self.scales = nn.Parameter(scales.data)

    def forward(self, x):
        """Apply the wrapped activation to ``x`` and divide by the scales.

        The scales are viewed as ``(1, 1, -1)`` so they broadcast along the
        last dimension of the activation output, and are moved to ``x``'s
        device before the division.
        """
        activated = self.act(x)
        divisor = self.scales.view(1, 1, -1).to(x.device)
        return activated / divisor


class WQLinear(nn.Module):
    """Linear layer whose weights are stored in AWQ-quantized form.

    The forward pass delegates the matrix multiply to the
    ``awq_inference_engine.gemm_forward_cuda`` kernel.

    Args:
        w_bit: Bit width of the quantized weights. Only ``4`` is accepted.
        group_size: Quantization group size along the input dimension.
            ``-1`` means a single group spanning all input features.
        qweight: Packed quantized weight tensor. ``in_features`` is read from
            ``qweight.shape[0]`` and ``out_features`` is computed as
            ``qweight.shape[1] * 32 // w_bit`` (presumably each 32-bit element
            packs ``32 // w_bit`` weights -- confirm against the quantizer).
        qzeros: Quantized zero points, passed through to the kernel unchanged.
        scales: Quantization scales, passed through to the kernel unchanged.
        bias: Optional bias tensor added to the kernel output, or ``None``.

    Raises:
        NotImplementedError: If ``w_bit`` is not 4.
        AssertionError: If ``in_features`` is not a multiple of the effective
            group size.
    """

    def __init__(self, w_bit, group_size, qweight, qzeros, scales, bias):
        super().__init__()

        if w_bit not in [4]:
            raise NotImplementedError("Only 4-bit are supported for now.")

        self.in_features = qweight.shape[0]
        self.out_features = qweight.shape[1] * 32 // w_bit

        self.w_bit = w_bit
        self.group_size = group_size if group_size != -1 else self.in_features
        # quick sanity check (make sure alignment)
        assert self.in_features % self.group_size == 0
        assert self.out_features % (32 // self.w_bit) == 0

        self.register_buffer('qweight', qweight)
        self.register_buffer('qzeros', qzeros)
        self.register_buffer('scales', scales)
        # Compare against None explicitly: truth-testing a tensor raises for
        # multi-element tensors and would silently drop a one-element zero bias.
        if bias is not None:
            self.register_buffer('bias', bias)
        else:
            self.bias = None

    @torch.no_grad()
    def forward(self, x):
        """Multiply ``x`` by the quantized weights and add the bias, if any.

        ``x`` is flattened to 2-D (all leading dimensions merged) for the
        kernel call; the result is reshaped back to the leading dimensions of
        ``x`` with ``out_features`` as the last dimension.
        """
        out_shape = x.shape[:-1] + (self.out_features, )
        flat_x = x.reshape(-1, x.shape[-1])
        # NOTE(review): the trailing 8 looks like the kernel's split-k
        # iteration count in upstream llm-awq -- confirm before changing.
        out = awq_inference_engine.gemm_forward_cuda(
            flat_x, self.qweight, self.scales, self.qzeros, 8
        )
        if self.bias is not None:
            out = out + self.bias
        return out.reshape(out_shape)

    def extra_repr(self) -> str:
        """Return the layer configuration summary shown in the module repr."""
        return 'in_features={}, out_features={}, bias={}, w_bit={}, group_size={}'.format(
            self.in_features, self.out_features, self.bias is not None, self.w_bit, self.group_size
        )
Add AWQ quantization inference support (#1019) # Add AWQ quantization inference support Fixes https://github.com/huggingface/text-generation-inference/issues/781 This PR (partially) adds support for AWQ quantization for inference. More information on AWQ is available [here](https://arxiv.org/abs/2306.00978). In general, AWQ is faster and more accurate than GPTQ, which is currently supported by TGI. This PR installs the 4-bit GEMM custom CUDA kernels released by the AWQ authors (in `requirements.txt`, just a one-line change). A quick way to test this PR is to bring up TGI as follows: ``` text-generation-server download-weights abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq text-generation-launcher \ --huggingface-hub-cache ~/.cache/huggingface/hub/ \ --model-id abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq \ --trust-remote-code --port 8080 \ --max-input-length 2048 --max-total-tokens 4096 --max-batch-prefill-tokens 4096 \ --quantize awq ``` Please note: * This PR was tested with FlashAttention v2 and vLLM. * This PR adds support for AWQ inference, not for quantizing the models. That needs to be done outside of TGI; instructions are [here](https://github.com/mit-han-lab/llm-awq/tree/f084f40bd996f3cf3a0633c1ad7d9d476c318aaa). * This PR only adds support for `FlashLlama` models for now. * Multi-GPU setup has not been tested. * No integration tests have been added so far; they will be added later if maintainers are interested in this change. * This PR can be tested on any of the models released [here](https://huggingface.co/abhinavkulkarni?sort_models=downloads#models). Please refer to the linked issue for benchmarks for [abhinavkulkarni/meta-llama-Llama-2-7b-chat-hf-w4-g128-awq](https://huggingface.co/abhinavkulkarni/meta-llama-Llama-2-7b-chat-hf-w4-g128-awq) vs [TheBloke/Llama-2-7b-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ). 
Please note, AWQ has released faster (and in case of Llama, fused) kernels for 4-bit GEMM, currently at the top of the `main` branch at https://github.com/mit-han-lab/llm-awq, but this PR uses an older commit that has been tested to work. We can switch to latest commit later on. ## Who can review? @OlivierDehaene OR @Narsil --------- Co-authored-by: Abhinav Kulkarni <abhinav@concentric.ai> 2023-09-25 07:58:02 +00:00			`# Copied logic from https://github.com/mit-han-lab/llm-awq/blob/f084f40bd996f3cf3a0633c1ad7d9d476c318aaa/awq/quantize/qmodule.py`

			`import math`
			`import torch`
			`import torch.nn as nn`
			`import awq_inference_engine # with CUDA kernels`


			`class ScaledActivation(nn.Module):`
			`def __init__(self, module, scales):`
			`super().__init__()`
			`self.act = module`
			`self.scales = nn.Parameter(scales.data)`

			`def forward(self, x):`
			`return self.act(x) / self.scales.view(1, 1, -1).to(x.device)`


			`class WQLinear(nn.Module):`
			`def __init__(self, w_bit, group_size, qweight, qzeros, scales, bias):`
			`super().__init__()`

			`if w_bit not in [4]:`
			`raise NotImplementedError("Only 4-bit are supported for now.")`

			`self.in_features = qweight.shape[0]`
			`self.out_features = qweight.shape[1] * 32 // w_bit`

			`self.w_bit = w_bit`
			`self.group_size = group_size if group_size != -1 else self.in_features`
			`# quick sanity check (make sure alignment)`
			`assert self.in_features % self.group_size == 0`
			`assert self.out_features % (32 // self.w_bit) == 0`

			`self.register_buffer('qweight', qweight)`
			`self.register_buffer('qzeros', qzeros)`
			`self.register_buffer('scales', scales)`
			`if bias:`
			`self.register_buffer('bias', bias)`
			`else:`
			`self.bias = None`

			`@torch.no_grad()`
			`def forward(self, x):`
			`out_shape = x.shape[:-1] + (self.out_features, )`
			`out = awq_inference_engine.gemm_forward_cuda(x.reshape(-1, x.shape[-1]), self.qweight, self.scales, self.qzeros, 8)`
			`out = out + self.bias if self.bias is not None else out`
			`return out.reshape(out_shape)`

			`def extra_repr(self) -> str:`
			`return 'in_features={}, out_features={}, bias={}, w_bit={}, group_size={}'.format(`
			`self.in_features, self.out_features, self.bias is not None, self.w_bit, self.group_size`
			`)`