Use Hub kernels for Marlin and cutlass quantization kernels

This commit is contained in:
Daniël de Kok 2025-01-27 14:13:48 +00:00
parent 4b8cda684b
commit aab6141b92
12 changed files with 742 additions and 29 deletions

View File

@@ -853,11 +853,11 @@
]
},
"locked": {
"lastModified": 1737685583,
"narHash": "sha256-p+NVABRpGi+pT+xxf9HcLcFVxG6L+vEEy+NwzB9T0f8=",
"lastModified": 1738549608,
"narHash": "sha256-GdyT9QEUSx5k/n8kILuNy83vxxdyUfJ8jL5mMpQZWfw=",
"owner": "oxalica",
"repo": "rust-overlay",
"rev": "eb64cbcc8eee0fa87ebded92805280d2ec97415a",
"rev": "35c6f8c4352f995ecd53896200769f80a3e8f22d",
"type": "github"
},
"original": {
@@ -978,15 +978,16 @@
"nixpkgs": "nixpkgs_6"
},
"locked": {
"lastModified": 1738323634,
"narHash": "sha256-lKPzgEm7pEuQJVhacsxFHqg1MOtrUMZvr+9IuJzC5J4=",
"lastModified": 1737970302,
"narHash": "sha256-uoArelKpaixLDozNTrXii2hOWXwJzonPqAgxwZyjzM0=",
"owner": "huggingface",
"repo": "text-generation-inference-nix",
"rev": "eb5fede2756f544f75e01f55a4097f9c9a8c5005",
"rev": "f43f30042a435e22ab0dbdda8a3d62ad05ff0ada",
"type": "github"
},
"original": {
"owner": "huggingface",
"ref": "hf-kernels",
"repo": "text-generation-inference-nix",
"type": "github"
}

View File

@@ -5,7 +5,7 @@
inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
};
nix-filter.url = "github:numtide/nix-filter";
tgi-nix.url = "github:huggingface/text-generation-inference-nix";
tgi-nix.url = "github:huggingface/text-generation-inference-nix/hf-kernels";
nixpkgs.follows = "tgi-nix/nixpkgs";
flake-utils.url = "github:numtide/flake-utils";
rust-overlay = {

View File

@@ -90,7 +90,7 @@ mkShell {
postVenvCreation = ''
unset SOURCE_DATE_EPOCH
( cd server ; python -m pip install --no-dependencies -e . )
( cd server ; python -m pip install --no-build-isolation --no-dependencies -e . )
( cd clients/python ; python -m pip install --no-dependencies -e . )
'';

View File

@@ -19,10 +19,10 @@
grpcio-reflection,
grpcio-status,
grpcio-tools,
hf-kernels,
hf-transfer,
loguru,
mamba-ssm,
marlin-kernels,
moe-kernels,
opentelemetry-api,
opentelemetry-exporter-otlp,
@@ -93,10 +93,10 @@ buildPythonPackage {
grpcio-reflection
grpcio-status
grpcio-tools
hf-kernels
hf-transfer
loguru
mamba-ssm
marlin-kernels
moe-kernels
opentelemetry-api
opentelemetry-exporter-otlp

680
server/hf-kernels.lock Normal file
View File

@@ -0,0 +1,680 @@
[
{
"repo_id": "kernels-community/quantization",
"sha": "329999a39b1f1329972d2e478c5466b0464441a3",
"files": [
{
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py",
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
},
{
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/_ops.py",
"blob_id": "fc64e2f1be8e8c8c79acb4e85c43aceb317a50c6"
},
{
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/_quantization_nikdbevszxcfa.abi3.so",
"blob_id": "83c971b999a73e3802b2e2a52c9e52575c0f441d"
},
{
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/compressed_tensors.py",
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
},
{
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/cutlass.py",
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
},
{
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/marlin.py",
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
},
{
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/scalar_type.py",
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
},
{
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/__init__.py",
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
},
{
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils.py",
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
},
{
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
},
{
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py",
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
},
{
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
},
{
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
},
{
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/quant_utils.py",
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
},
{
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/__init__.py",
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
},
{
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/_ops.py",
"blob_id": "2d997a763f85e5a06d8314ba639516e3480afbea"
},
{
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/_quantization_uqov56fwnow5m.abi3.so",
"blob_id": "c6c8af5c698fd6ceabee534d6f6fdcef0e1c75c6"
},
{
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/compressed_tensors.py",
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
},
{
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/cutlass.py",
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
},
{
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/marlin.py",
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
},
{
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/scalar_type.py",
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
},
{
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/__init__.py",
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
},
{
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils.py",
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
},
{
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
},
{
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py",
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
},
{
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
},
{
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
},
{
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/quant_utils.py",
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
},
{
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/__init__.py",
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
},
{
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/_ops.py",
"blob_id": "55fe31652a63808926dfc4f8121624a58ac70127"
},
{
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/_quantization_fsprdzcdnww3s.abi3.so",
"blob_id": "44c36167d50fc259a9bcddd096f17b5753b4aa06"
},
{
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/compressed_tensors.py",
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
},
{
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/cutlass.py",
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
},
{
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/marlin.py",
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
},
{
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/scalar_type.py",
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
},
{
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/__init__.py",
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
},
{
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils.py",
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
},
{
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
},
{
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py",
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
},
{
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
},
{
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
},
{
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/quant_utils.py",
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
},
{
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/__init__.py",
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
},
{
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/_ops.py",
"blob_id": "7a17663ca7f3ca9a384f37596cde7d40d27e8abb"
},
{
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/_quantization_3atrjdvtz7xxc.abi3.so",
"blob_id": "9580cfcf2ecf031764586c88bf923cb1033c26e7"
},
{
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/compressed_tensors.py",
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
},
{
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/cutlass.py",
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
},
{
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/marlin.py",
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
},
{
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/scalar_type.py",
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
},
{
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/__init__.py",
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
},
{
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils.py",
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
},
{
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
},
{
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py",
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
},
{
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
},
{
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
},
{
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/quant_utils.py",
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
},
{
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/__init__.py",
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
},
{
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/_ops.py",
"blob_id": "36513c1a6720abd21530e214cb95be46c8765f31"
},
{
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/_quantization_ni5tfi7up4a5a.abi3.so",
"blob_id": "dc46548259a6007c83a21fb2beb4d54373e27c2c"
},
{
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/compressed_tensors.py",
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
},
{
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/cutlass.py",
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
},
{
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/marlin.py",
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
},
{
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/scalar_type.py",
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
},
{
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/__init__.py",
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
},
{
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils.py",
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
},
{
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
},
{
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py",
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
},
{
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
},
{
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
},
{
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/quant_utils.py",
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
},
{
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/__init__.py",
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
},
{
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/_ops.py",
"blob_id": "7292bace29cb7b73f20e18617bdf132eddbf5151"
},
{
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/_quantization_ydkmqrh62v5ym.abi3.so",
"blob_id": "8a22e3667dac661b0dc38aefb8747a7b94f02582"
},
{
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/compressed_tensors.py",
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
},
{
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/cutlass.py",
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
},
{
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/marlin.py",
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
},
{
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/scalar_type.py",
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
},
{
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/__init__.py",
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
},
{
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils.py",
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
},
{
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
},
{
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py",
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
},
{
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
},
{
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
},
{
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/quant_utils.py",
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/__init__.py",
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/_ops.py",
"blob_id": "16ab5cbbb5c096fd8b4ef2a774778d69b41ebeb7"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/_quantization_2upd4ndb7oxlw.abi3.so",
"blob_id": "5fd1c287bcdcbb26aae4536dbc7e2e2a4142c672"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/compressed_tensors.py",
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/cutlass.py",
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/marlin.py",
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/scalar_type.py",
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/__init__.py",
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils.py",
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py",
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
},
{
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/quant_utils.py",
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/__init__.py",
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/_ops.py",
"blob_id": "d46aed00defa6d8ebb2eae3489f6615b236cd9c3"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/_quantization_srrzik3uw7tac.abi3.so",
"blob_id": "bf391756a4554227510604235c9b356f423b4270"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/compressed_tensors.py",
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/cutlass.py",
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/marlin.py",
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/scalar_type.py",
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/__init__.py",
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils.py",
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py",
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
},
{
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/quant_utils.py",
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/__init__.py",
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/_ops.py",
"blob_id": "028a3e5cd871b9a0044a25489a4a0902d95ec229"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/_quantization_jotyxqrlkrs6e.abi3.so",
"blob_id": "de00fd61cc6c9d11d0db7a400feb5f3fe2ff33bb"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/compressed_tensors.py",
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/cutlass.py",
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/marlin.py",
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/scalar_type.py",
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/__init__.py",
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils.py",
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py",
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
},
{
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/quant_utils.py",
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/__init__.py",
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/_ops.py",
"blob_id": "a01b8fc5fefec490194fe5244e70055c47c1eb7b"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/_quantization_nxw74vvv2coka.abi3.so",
"blob_id": "d536e2ac72e5baf6f3537e3aadc24f57551a1dc5"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/compressed_tensors.py",
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/cutlass.py",
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/marlin.py",
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/scalar_type.py",
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/__init__.py",
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils.py",
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py",
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
},
{
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/quant_utils.py",
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/__init__.py",
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/_ops.py",
"blob_id": "3a4ef6a12f4eef0b3383338788645fafa9a9661e"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/_quantization_7fvkxfdjondcu.abi3.so",
"blob_id": "c0a36487f427f3c55fb8965a5b65147bc9e8cef6"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/compressed_tensors.py",
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/cutlass.py",
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/marlin.py",
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/scalar_type.py",
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/__init__.py",
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils.py",
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py",
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
},
{
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/quant_utils.py",
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/__init__.py",
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/_ops.py",
"blob_id": "a424f0f1664801162e520d128baddadd317dfa68"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/_quantization_cf2wponrik3xk.abi3.so",
"blob_id": "2ce34c9cd1a364ee5064d18b2cb92e21e3c2fc52"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/compressed_tensors.py",
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/cutlass.py",
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/marlin.py",
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/scalar_type.py",
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/__init__.py",
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils.py",
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py",
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
},
{
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/quant_utils.py",
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
}
]
}
]

View File

@@ -33,6 +33,13 @@ dependencies = [
"transformers>=4.48.0"
]
[build-system]
requires = ["hf-kernels", "setuptools"]
build-backend = "setuptools.build_meta"
[tool.kernels.dependencies]
"kernels-community/quantization" = ">=0.0.2"
[project.scripts]
text-generation-server = "text_generation_server.cli:app"
@@ -62,21 +69,24 @@ quantize = [
]
moe = [ "moe-kernels" ]
attention = [ "attention-kernels" ]
marlin = [ "marlin-kernels" ]
gen = [
"grpcio-tools>=1.69.0",
"mypy-protobuf>=3.6.0",
]
[tool.uv.sources]
attention-kernels.url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl"
marlin-kernels = [
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
attention-kernels = [
{ url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
{ url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
{ url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
{ url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
]
moe-kernels = [
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
]
moe-kernels.url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl"
[tool.pytest.ini_options]
markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"]

View File

@@ -1,6 +1,7 @@
from typing import List, Optional, Union, TypeVar
from dataclasses import dataclass
from hf_kernels import load_kernel
from loguru import logger
import torch
from compressed_tensors.quantization import QuantizationArgs, QuantizationType
@@ -10,7 +11,7 @@ from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import Weight, Weights, WeightsLoader
try:
import marlin_kernels
marlin_kernels = load_kernel("kernels-community/quantization")
except ImportError:
marlin_kernels = None

View File

@@ -2,6 +2,7 @@ from dataclasses import dataclass
import os
from typing import Optional, Tuple, Type, Union, List
from hf_kernels import load_kernel
import torch
from loguru import logger
@@ -15,7 +16,7 @@ from text_generation_server.utils.weights import (
from text_generation_server.utils.log import log_once
try:
import marlin_kernels
marlin_kernels = load_kernel("kernels-community/quantization")
except ImportError:
marlin_kernels = None

View File

@@ -2,6 +2,7 @@ from typing import Optional
import torch
import torch.nn as nn
from hf_kernels import load_kernel
from text_generation_server.layers.fp8 import fp8_quantize
from text_generation_server.layers.marlin.gptq import _check_valid_shape
from text_generation_server.layers.marlin.util import (
@@ -10,7 +11,7 @@ from text_generation_server.layers.marlin.util import (
)
try:
import marlin_kernels
marlin_kernels = load_kernel("kernels-community/quantization")
except ImportError:
marlin_kernels = None

View File

@@ -4,6 +4,7 @@ from typing import List, Optional, Union
import numpy
import torch
import torch.nn as nn
from hf_kernels import load_kernel
from loguru import logger
from text_generation_server.layers.marlin.util import (
_check_marlin_kernels,
@@ -16,7 +17,7 @@ from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import Weight, Weights, WeightsLoader
try:
import marlin_kernels
marlin_kernels = load_kernel("kernels-community/quantization")
except ImportError:
marlin_kernels = None
@@ -385,7 +386,20 @@ class GPTQMarlinLinear(nn.Module):
out_features = weight.scales.shape[1]
_check_valid_shape(in_features=in_features, out_features=out_features)
self.bits = weight.bits
if weight.bits not in (4, 8):
raise ValueError("GPTQMarlinLinear only supports 4 and 8-bit quantization")
if weight.qzeros.numel() > 0:
if weight.bits == 4:
self.quant_type = marlin_kernels.scalar_types.uint4
else:
self.quant_type = marlin_kernels.scalar_types.uint8
else:
if weight.bits == 4:
self.quant_type = marlin_kernels.scalar_types.uint4b8
else:
self.quant_type = marlin_kernels.scalar_types.uint8b128
self.is_full_k = weight.is_full_k
self.qweight = weight.qweight
@@ -414,7 +428,7 @@ class GPTQMarlinLinear(nn.Module):
self.g_idx,
self.perm,
self.workspace,
self.bits,
self.quant_type,
A_flat.shape[0],
self.scales.shape[1],
A_flat.shape[1],

View File

@@ -1,13 +1,14 @@
from dataclasses import dataclass
from typing import List, Optional, Union
from hf_kernels import load_kernel
import torch
import torch.nn as nn
from text_generation_server.layers.marlin.util import _check_marlin_kernels
from text_generation_server.utils.weights import Weight, Weights, WeightsLoader
try:
import marlin_kernels
marlin_kernels = load_kernel("kernels-community/quantization")
except ImportError:
marlin_kernels = None
@@ -303,8 +304,11 @@ class GPTQMarlin24Linear(nn.Module):
f"Group size {groupsize} is not supported, must be one of: {supported_sizes}"
)
self.bits = weight.bits
weights_per_int32 = 32 // self.bits
if weight.bits == 4:
self.quant_type = marlin_kernels.scalar_types.uint4b8
else:
self.quant_type = marlin_kernels.scalar_types.uint8b128
weights_per_int32 = 32 // weight.bits
assert (
out_features % GPTQ_MARLIN_24_MIN_THREAD_N == 0
@@ -344,7 +348,7 @@ class GPTQMarlin24Linear(nn.Module):
self.meta,
self.scale_packed,
self.workspace,
self.bits,
self.quant_type,
A.shape[0],
self.scale_packed.shape[1],
A.shape[1],

View File

@@ -1,12 +1,13 @@
import functools
from typing import List, Tuple
from hf_kernels import load_kernel
import numpy
import torch
from text_generation_server.utils.import_utils import SYSTEM
try:
import marlin_kernels
marlin_kernels = load_kernel("kernels-community/quantization")
except ImportError:
marlin_kernels = None