diff --git a/Dockerfile b/Dockerfile
index 39c9f66b..2d769c15 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -121,13 +121,6 @@ COPY server/Makefile-awq Makefile
 # Build specific version of transformers
 RUN . .venv/bin/activate && make build-awq
 
-# Build eetq kernels
-FROM kernel-builder AS eetq-kernels-builder
-WORKDIR /usr/src
-COPY server/Makefile-eetq Makefile
-# Build specific version of transformers
-RUN . .venv/bin/activate && make build-eetq
-
 # Build Lorax Punica kernels
 FROM kernel-builder AS lorax-punica-builder
 WORKDIR /usr/src
@@ -216,8 +209,6 @@ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311
 COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 # Copy build artifacts from awq kernels builder
 COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
-# Copy build artifacts from eetq kernels builder
-COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 # Copy build artifacts from lorax punica kernels builder
 COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 # Copy build artifacts from mamba builder
diff --git a/flake.lock b/flake.lock
index 3a9d9c7c..5d6ee463 100644
--- a/flake.lock
+++ b/flake.lock
@@ -978,15 +978,16 @@
         "nixpkgs": "nixpkgs_6"
       },
       "locked": {
-        "lastModified": 1738769628,
-        "narHash": "sha256-hgHf1mscFbH9XtT3dYtFQcxRfict9N+Vi6QSW1c+FjU=",
+        "lastModified": 1739803255,
+        "narHash": "sha256-lreIfcjSt6D0wOuZ6jm3WEBYvYvED63T+pOKmOgBLi8=",
         "owner": "huggingface",
         "repo": "text-generation-inference-nix",
-        "rev": "9a5a58219dead9704d83d9d32f105b6b90bd31f2",
+        "rev": "30ab7423277fc93c8fc0ca4df737478ebfdb8eec",
         "type": "github"
       },
       "original": {
         "owner": "huggingface",
+        "ref": "eetq-0.0.1",
         "repo": "text-generation-inference-nix",
         "type": "github"
       }
diff --git a/flake.nix b/flake.nix
index 83cedfa6..5ba50114 100644
--- a/flake.nix
+++ b/flake.nix
@@ -5,7 +5,7 @@
       inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
     };
     nix-filter.url = "github:numtide/nix-filter";
-    tgi-nix.url = "github:huggingface/text-generation-inference-nix";
+    tgi-nix.url = "github:huggingface/text-generation-inference-nix/eetq-0.0.1";
     nixpkgs.follows = "tgi-nix/nixpkgs";
     flake-utils.url = "github:numtide/flake-utils";
     rust-overlay = {
diff --git a/nix/server.nix b/nix/server.nix
index b638449b..98193cac 100644
--- a/nix/server.nix
+++ b/nix/server.nix
@@ -6,7 +6,6 @@
   awq-inference-engine,
   causal-conv1d,
   compressed-tensors,
-  eetq,
   einops,
   exllamav2,
   flashinfer,
@@ -36,6 +35,7 @@
   py-cpuinfo,
   pydantic,
   quantization,
+  quantization-eetq,
   safetensors,
   tokenizers,
   torch,
@@ -80,7 +80,6 @@ buildPythonPackage {
 
   dependencies = [
     awq-inference-engine
-    eetq
     causal-conv1d
     compressed-tensors
     einops
@@ -111,6 +110,7 @@ buildPythonPackage {
     py-cpuinfo
     pydantic
     quantization
+    quantization-eetq
     safetensors
     sentencepiece
     tokenizers
diff --git a/server/Makefile b/server/Makefile
index 746b7faa..0db6f89b 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -2,7 +2,6 @@ include Makefile-flash-att
 include Makefile-flash-att-v2
 include Makefile-vllm
 include Makefile-awq
-include Makefile-eetq
 include Makefile-selective-scan
 include Makefile-lorax-punica
 include Makefile-exllamav2
diff --git a/server/hf-kernels.lock b/server/hf-kernels.lock
index 43f7f17d..5254cb0c 100644
--- a/server/hf-kernels.lock
+++ b/server/hf-kernels.lock
@@ -6736,5 +6736,203 @@
         "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
       }
     ]
+  },
+  {
+    "repo_id": "kernels-community/quantization-eetq",
+    "sha": "a80ce846d6270ddddeee109523ed947f594f246b",
+    "files": [
+      {
+        "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "9c191845fb7acbd7ea6bae36ce8c237b168557e1"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_v7rnpcck3kry4.abi3.so",
+        "blob_id": "9edc9126b9ec8ce4f47a8e6688a5f0329c905329"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "ccec58b06a2282da51356fe5d04dd1e2757ce80c"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/_quantization_eetq_zcfiojfkx55be.abi3.so",
+        "blob_id": "ea27fb040515267ec631cec5545b878da680e7cc"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "bb409419898138ffa9ade9ba505a167a067ea378"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_btymam4x7xvs6.abi3.so",
+        "blob_id": "0395dd048ccf10ed020a77fa04bcb026ba369d73"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "f250a00832d2044f7bbb87557a1c878d9c8dd24d"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_yy3p6bsf622sq.abi3.so",
+        "blob_id": "c98d156835e442b039d38a82e9f111036750329c"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "b5259247e8fb3ed9429cf005a525edc8bcae4903"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/_quantization_eetq_imijtykkseqze.abi3.so",
+        "blob_id": "c46908ce00d02376ae8e18efebb7fee55afbc3ac"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "79f8d42700ad34b9b46e6e328f90885d1ee9beab"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_4qerj3t7ddiry.abi3.so",
+        "blob_id": "9ba519d2fd4e347b784c21f4c171cbbab57c7774"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "805ec785b7f5196f78dfe77b6cd7c2603c02490e"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_j23ltbqvrnixg.abi3.so",
+        "blob_id": "77d53c16e57c658e8f9caa37b0084c4a3a7ffda1"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "7b590a5a6ede67e0ae13f97dbd7a82a4674e1b23"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_p5neqtnhdgxv2.abi3.so",
+        "blob_id": "e3e5fbd8ce3232b6e9a7c3077eab9665b95bef49"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "0be7ffcb2e9590899683a197b977ec0b39ca7cb7"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/_quantization_eetq_idk3dezy35dfk.abi3.so",
+        "blob_id": "61aa67cbe7ce810bf9792e6e8f19219c757ff181"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "998eba3eddd0520769a2b4ecb3402c024bde44ea"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_fpjoxzd7nm2qa.abi3.so",
+        "blob_id": "31d835db1d0348e3f35c23e6a8f2532fd7e9fea7"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "6d5320b05b03f2f3ddfd299d6e2a72aa6116264f"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_k7mlunxe2ye4s.abi3.so",
+        "blob_id": "1946e4c2fab63243d051012cb12e19895828145f"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "9b15d85f44e4223ce1f16df987feafd6640dcc62"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/_quantization_eetq_7m7hz3sbwkaio.abi3.so",
+        "blob_id": "eb1536ccd1dfa2655ea7de4445aa3c6790f3a0ae"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      }
+    ]
   }
 ]
diff --git a/server/pyproject.toml b/server/pyproject.toml
index 3f02b4ec..37cb6b1a 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -42,6 +42,7 @@ build-backend = "setuptools.build_meta"
 "kernels-community/paged-attention" = ">=0.0.2"
 "kernels-community/moe" = ">=0.1.1"
 "kernels-community/quantization" = ">=0.0.3"
+"kernels-community/quantization-eetq" = ">=0.0.1"
 
 [project.scripts]
 text-generation-server = "text_generation_server.cli:app"
diff --git a/server/text_generation_server/layers/eetq.py b/server/text_generation_server/layers/eetq.py
index b1e5235a..da1e8f0e 100644
--- a/server/text_generation_server/layers/eetq.py
+++ b/server/text_generation_server/layers/eetq.py
@@ -1,9 +1,13 @@
 from dataclasses import dataclass
 
 import torch
-from EETQ import quant_weights, w8_a16_gemm
+from text_generation_server.utils.kernels import load_kernel
 from text_generation_server.utils.weights import UnquantizedWeight
 
+quantization_eetq = load_kernel(
+    module="quantization_eetq", repo_id="kernels-community/quantization-eetq"
+)
+
 
 @dataclass
 class EETQWeight(UnquantizedWeight):
@@ -31,13 +35,13 @@ class EETQLinear(torch.nn.Module):
         if weight.dtype != torch.float16:
             weight = weight.to(dtype=torch.float16)
         weight = torch.t(weight).contiguous().cpu()
-        weight, scale = quant_weights(weight, torch.int8, False)
+        weight, scale = quantization_eetq.quant_weights(weight, torch.int8, False)
 
         self.weight = weight.cuda(device)
         self.scale = scale.cuda(device)
         self.bias = bias.cuda(device) if bias is not None else None
 
     def forward(self, input: torch.Tensor) -> torch.Tensor:
-        output = w8_a16_gemm(input, self.weight, self.scale)
+        output = quantization_eetq.w8_a16_gemm(input, self.weight, self.scale)
         output = output + self.bias if self.bias is not None else output
         return output