mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-05-04 16:52:06 +00:00
# Add AWQ quantization inference support
Fixes
https://github.com/huggingface/text-generation-inference/issues/781
This PR (partially) adds support for AWQ quantization for inference.
More information on AWQ [here](https://arxiv.org/abs/2306.00978). In
general, AWQ is faster and more accurate than GPTQ, which is currently
supported by TGI.
This PR installs 4-bit GEMM custom CUDA kernels released by AWQ authors
(in `requirements.txt`, just one line change).
A quick way to test this PR would be to bring up TGI as follows:
```
text-generation-server download-weights abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq
text-generation-launcher \
--huggingface-hub-cache ~/.cache/huggingface/hub/ \
--model-id abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq \
--trust-remote-code --port 8080 \
--max-input-length 2048 --max-total-tokens 4096 --max-batch-prefill-tokens 4096 \
--quantize awq
```
Please note:
* This PR was tested with FlashAttention v2 and vLLM.
* This PR adds support for AWQ inference, not quantizing the models.
That needs to be done outside of TGI, instructions
[here](https://github.com/mit-han-lab/llm-awq/tree/f084f40bd996f3cf3a0633c1ad7d9d476c318aaa).
* This PR only adds support for `FlashLlama` models for now.
* Multi-GPU setup has not been tested.
* No integration tests have been added so far, will add later if
maintainers are interested in this change.
* This PR can be tested on any of the models released
[here](https://huggingface.co/abhinavkulkarni?sort_models=downloads#models).
Please refer to the linked issue for benchmarks for
[abhinavkulkarni/meta-llama-Llama-2-7b-chat-hf-w4-g128-awq](https://huggingface.co/abhinavkulkarni/meta-llama-Llama-2-7b-chat-hf-w4-g128-awq)
vs
[TheBloke/Llama-2-7b-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ).
Please note, AWQ has released faster (and in case of Llama, fused)
kernels for 4-bit GEMM, currently at the top of the `main` branch at
https://github.com/mit-han-lab/llm-awq, but this PR uses an older commit
that has been tested to work. We can switch to latest commit later on.
## Who can review?
@OlivierDehaene OR @Narsil
---------
Co-authored-by: Abhinav Kulkarni <abhinav@concentric.ai>
78 lines
5.6 KiB
Plaintext
78 lines
5.6 KiB
Plaintext
accelerate==0.20.3 ; python_version >= "3.9" and python_version < "3.13"
|
|
aiohttp==3.8.5 ; python_version >= "3.9" and python_version < "3.13"
|
|
aiosignal==1.3.1 ; python_version >= "3.9" and python_version < "3.13"
|
|
async-timeout==4.0.3 ; python_version >= "3.9" and python_version < "3.13"
|
|
attrs==23.1.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
backoff==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
|
|
bitsandbytes==0.41.1 ; python_version >= "3.9" and python_version < "3.13"
|
|
certifi==2023.7.22 ; python_version >= "3.9" and python_version < "3.13"
|
|
charset-normalizer==3.2.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
click==8.1.7 ; python_version >= "3.9" and python_version < "3.13"
|
|
colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows")
|
|
datasets==2.14.4 ; python_version >= "3.9" and python_version < "3.13"
|
|
deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
|
|
dill==0.3.7 ; python_version >= "3.9" and python_version < "3.13"
|
|
einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
|
|
filelock==3.12.3 ; python_version >= "3.9" and python_version < "3.13"
|
|
frozenlist==1.4.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
fsspec==2023.6.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
fsspec[http]==2023.6.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
googleapis-common-protos==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
grpc-interceptor==0.15.3 ; python_version >= "3.9" and python_version < "3.13"
|
|
grpcio-reflection==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
grpcio-status==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
grpcio==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
hf-transfer==0.1.3 ; python_version >= "3.9" and python_version < "3.13"
|
|
huggingface-hub==0.16.4 ; python_version >= "3.9" and python_version < "3.13"
|
|
idna==3.4 ; python_version >= "3.9" and python_version < "3.13"
|
|
jinja2==3.1.2 ; python_version >= "3.9" and python_version < "3.13"
|
|
loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
markupsafe==2.1.3 ; python_version >= "3.9" and python_version < "3.13"
|
|
mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
multidict==6.0.4 ; python_version >= "3.9" and python_version < "3.13"
|
|
multiprocess==0.70.15 ; python_version >= "3.9" and python_version < "3.13"
|
|
networkx==3.1 ; python_version >= "3.9" and python_version < "3.13"
|
|
numpy==1.25.2 ; python_version >= "3.9" and python_version < "3.13"
|
|
opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
opentelemetry-exporter-otlp-proto-grpc==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
opentelemetry-exporter-otlp-proto-http==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
opentelemetry-exporter-otlp==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
opentelemetry-instrumentation-grpc==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
|
|
opentelemetry-instrumentation==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
|
|
opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
|
|
packaging==23.1 ; python_version >= "3.9" and python_version < "3.13"
|
|
pandas==2.0.3 ; python_version >= "3.9" and python_version < "3.13"
|
|
peft==0.4.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
pillow==10.0.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
protobuf==4.24.2 ; python_version >= "3.9" and python_version < "3.13"
|
|
psutil==5.9.5 ; python_version >= "3.9" and python_version < "3.13"
|
|
pyarrow==13.0.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "3.13"
|
|
pytz==2023.3 ; python_version >= "3.9" and python_version < "3.13"
|
|
pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
|
|
regex==2023.8.8 ; python_version >= "3.9" and python_version < "3.13"
|
|
requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
safetensors==0.3.3 ; python_version >= "3.9" and python_version < "3.13"
|
|
scipy==1.11.2 ; python_version >= "3.9" and python_version < "3.13"
|
|
sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
|
|
setuptools==68.1.2 ; python_version >= "3.9" and python_version < "3.13"
|
|
six==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
sympy==1.12 ; python_version >= "3.9" and python_version < "3.13"
|
|
texttable==1.6.7 ; python_version >= "3.9" and python_version < "3.13"
|
|
tokenizers==0.13.3 ; python_version >= "3.9" and python_version < "3.13"
|
|
torch==2.0.1 ; python_version >= "3.9" and python_version < "3.13"
|
|
tqdm==4.66.1 ; python_version >= "3.9" and python_version < "3.13"
|
|
transformers==4.32.1 ; python_version >= "3.9" and python_version < "3.13"
|
|
typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
|
|
typing-extensions==4.7.1 ; python_version >= "3.9" and python_version < "3.13"
|
|
tzdata==2023.3 ; python_version >= "3.9" and python_version < "3.13"
|
|
urllib3==2.0.4 ; python_version >= "3.9" and python_version < "3.13"
|
|
win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
|
|
wrapt==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
xxhash==3.3.0 ; python_version >= "3.9" and python_version < "3.13"
|
|
yarl==1.9.2 ; python_version >= "3.9" and python_version < "3.13"
|
|
# Custom 4-bit GEMM AWQ kernels
|
|
git+https://github.com/mit-han-lab/llm-awq.git@f084f40bd996f3cf3a0633c1ad7d9d476c318aaa#subdirectory=awq/kernels
|