diff --git a/flake.lock b/flake.lock index 3ad3d698..8c5b3ef3 100644 --- a/flake.lock +++ b/flake.lock @@ -853,11 +853,11 @@ ] }, "locked": { - "lastModified": 1737685583, - "narHash": "sha256-p+NVABRpGi+pT+xxf9HcLcFVxG6L+vEEy+NwzB9T0f8=", + "lastModified": 1738549608, + "narHash": "sha256-GdyT9QEUSx5k/n8kILuNy83vxxdyUfJ8jL5mMpQZWfw=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "eb64cbcc8eee0fa87ebded92805280d2ec97415a", + "rev": "35c6f8c4352f995ecd53896200769f80a3e8f22d", "type": "github" }, "original": { @@ -978,15 +978,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1738323634, - "narHash": "sha256-lKPzgEm7pEuQJVhacsxFHqg1MOtrUMZvr+9IuJzC5J4=", + "lastModified": 1737970302, + "narHash": "sha256-uoArelKpaixLDozNTrXii2hOWXwJzonPqAgxwZyjzM0=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "eb5fede2756f544f75e01f55a4097f9c9a8c5005", + "rev": "f43f30042a435e22ab0dbdda8a3d62ad05ff0ada", "type": "github" }, "original": { "owner": "huggingface", + "ref": "hf-kernels", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index 83cedfa6..9c221645 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix/hf-kernels"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { diff --git a/nix/impure-shell.nix b/nix/impure-shell.nix index aebdff84..9b8d7e35 100644 --- a/nix/impure-shell.nix +++ b/nix/impure-shell.nix @@ -90,7 +90,7 @@ mkShell { postVenvCreation = '' unset SOURCE_DATE_EPOCH - ( cd server ; python -m pip install --no-dependencies -e . ) + ( cd server ; python -m pip install --no-build-isolation --no-dependencies -e . ) ( cd clients/python ; python -m pip install --no-dependencies -e . ) ''; diff --git a/nix/server.nix b/nix/server.nix index 237102a8..9505c7d8 100644 --- a/nix/server.nix +++ b/nix/server.nix @@ -19,10 +19,10 @@ grpcio-reflection, grpcio-status, grpcio-tools, + hf-kernels, hf-transfer, loguru, mamba-ssm, - marlin-kernels, moe-kernels, opentelemetry-api, opentelemetry-exporter-otlp, @@ -93,10 +93,10 @@ buildPythonPackage { grpcio-reflection grpcio-status grpcio-tools + hf-kernels hf-transfer loguru mamba-ssm - marlin-kernels moe-kernels opentelemetry-api opentelemetry-exporter-otlp diff --git a/server/hf-kernels.lock b/server/hf-kernels.lock new file mode 100644 index 00000000..2e563237 --- /dev/null +++ b/server/hf-kernels.lock @@ -0,0 +1,680 @@ +[ + { + "repo_id": "kernels-community/quantization", + "sha": "329999a39b1f1329972d2e478c5466b0464441a3", + "files": [ + { + "filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/_ops.py", + "blob_id": "fc64e2f1be8e8c8c79acb4e85c43aceb317a50c6" + }, + { + "filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/_quantization_nikdbevszxcfa.abi3.so", + "blob_id": "83c971b999a73e3802b2e2a52c9e52575c0f441d" + }, + { + "filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/_ops.py", + "blob_id": "2d997a763f85e5a06d8314ba639516e3480afbea" + }, + { + "filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/_quantization_uqov56fwnow5m.abi3.so", + "blob_id": "c6c8af5c698fd6ceabee534d6f6fdcef0e1c75c6" + }, + { + "filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/_ops.py", + "blob_id": "55fe31652a63808926dfc4f8121624a58ac70127" + }, + { + "filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/_quantization_fsprdzcdnww3s.abi3.so", + "blob_id": "44c36167d50fc259a9bcddd096f17b5753b4aa06" + }, + { + "filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/_ops.py", + "blob_id": "7a17663ca7f3ca9a384f37596cde7d40d27e8abb" + }, + { + "filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/_quantization_3atrjdvtz7xxc.abi3.so", + "blob_id": "9580cfcf2ecf031764586c88bf923cb1033c26e7" + }, + { + "filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/_ops.py", + "blob_id": "36513c1a6720abd21530e214cb95be46c8765f31" + }, + { + "filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/_quantization_ni5tfi7up4a5a.abi3.so", + "blob_id": "dc46548259a6007c83a21fb2beb4d54373e27c2c" + }, + { + "filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/_ops.py", + "blob_id": "7292bace29cb7b73f20e18617bdf132eddbf5151" + }, + { + "filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/_quantization_ydkmqrh62v5ym.abi3.so", + "blob_id": "8a22e3667dac661b0dc38aefb8747a7b94f02582" + }, + { + "filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/_ops.py", + "blob_id": "16ab5cbbb5c096fd8b4ef2a774778d69b41ebeb7" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/_quantization_2upd4ndb7oxlw.abi3.so", + "blob_id": "5fd1c287bcdcbb26aae4536dbc7e2e2a4142c672" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/_ops.py", + "blob_id": "d46aed00defa6d8ebb2eae3489f6615b236cd9c3" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/_quantization_srrzik3uw7tac.abi3.so", + "blob_id": "bf391756a4554227510604235c9b356f423b4270" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/_ops.py", + "blob_id": "028a3e5cd871b9a0044a25489a4a0902d95ec229" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/_quantization_jotyxqrlkrs6e.abi3.so", + "blob_id": "de00fd61cc6c9d11d0db7a400feb5f3fe2ff33bb" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/_ops.py", + "blob_id": "a01b8fc5fefec490194fe5244e70055c47c1eb7b" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/_quantization_nxw74vvv2coka.abi3.so", + "blob_id": "d536e2ac72e5baf6f3537e3aadc24f57551a1dc5" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/_ops.py", + "blob_id": "3a4ef6a12f4eef0b3383338788645fafa9a9661e" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/_quantization_7fvkxfdjondcu.abi3.so", + "blob_id": "c0a36487f427f3c55fb8965a5b65147bc9e8cef6" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/_ops.py", + "blob_id": "a424f0f1664801162e520d128baddadd317dfa68" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/_quantization_cf2wponrik3xk.abi3.so", + "blob_id": "2ce34c9cd1a364ee5064d18b2cb92e21e3c2fc52" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + } + ] + } +] \ No newline at end of file diff --git a/server/pyproject.toml b/server/pyproject.toml index d64a143f..ebc3e129 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -33,6 +33,13 @@ dependencies = [ "transformers>=4.48.0" ] +[build-system] +requires = ["hf-kernels", "setuptools"] +build-backend = "setuptools.build_meta" + +[tool.kernels.dependencies] +"kernels-community/quantization" = ">=0.0.2" + [project.scripts] text-generation-server = "text_generation_server.cli:app" @@ -62,21 +69,24 @@ quantize = [ ] moe = [ "moe-kernels" ] attention = [ "attention-kernels" ] -marlin = [ "marlin-kernels" ] gen = [ "grpcio-tools>=1.69.0", "mypy-protobuf>=3.6.0", ] [tool.uv.sources] -attention-kernels.url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" -marlin-kernels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, +attention-kernels = [ + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" }, + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" }, + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, +] +moe-kernels = [ + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, ] -moe-kernels.url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" [tool.pytest.ini_options] markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"] diff --git a/server/text_generation_server/layers/compressed_tensors/w8a8_int.py b/server/text_generation_server/layers/compressed_tensors/w8a8_int.py index fc6d81e4..8da8c8a0 100644 --- a/server/text_generation_server/layers/compressed_tensors/w8a8_int.py +++ b/server/text_generation_server/layers/compressed_tensors/w8a8_int.py @@ -1,6 +1,7 @@ from typing import List, Optional, Union, TypeVar from dataclasses import dataclass +from hf_kernels import load_kernel from loguru import logger import torch from compressed_tensors.quantization import QuantizationArgs, QuantizationType @@ -10,7 +11,7 @@ from text_generation_server.utils.log import log_once from text_generation_server.utils.weights import Weight, Weights, WeightsLoader try: - import marlin_kernels + marlin_kernels = load_kernel("kernels-community/quantization") except ImportError: marlin_kernels = None diff --git a/server/text_generation_server/layers/fp8.py b/server/text_generation_server/layers/fp8.py index ae20235d..818e3c09 100644 --- a/server/text_generation_server/layers/fp8.py +++ b/server/text_generation_server/layers/fp8.py @@ -2,6 +2,7 @@ from dataclasses import dataclass import os from typing import Optional, Tuple, Type, Union, List +from hf_kernels import load_kernel import torch from loguru import logger @@ -15,7 +16,7 @@ from text_generation_server.utils.weights import ( from text_generation_server.utils.log import log_once try: - import marlin_kernels + marlin_kernels = load_kernel("kernels-community/quantization") except ImportError: marlin_kernels = None diff --git a/server/text_generation_server/layers/marlin/fp8.py b/server/text_generation_server/layers/marlin/fp8.py index e07b9fc6..851b5edb 100644 --- a/server/text_generation_server/layers/marlin/fp8.py +++ b/server/text_generation_server/layers/marlin/fp8.py @@ -2,6 +2,7 @@ from typing import Optional import torch import torch.nn as nn +from hf_kernels import load_kernel from text_generation_server.layers.fp8 import fp8_quantize from text_generation_server.layers.marlin.gptq import _check_valid_shape from text_generation_server.layers.marlin.util import ( @@ -10,7 +11,7 @@ from text_generation_server.layers.marlin.util import ( ) try: - import marlin_kernels + marlin_kernels = load_kernel("kernels-community/quantization") except ImportError: marlin_kernels = None diff --git a/server/text_generation_server/layers/marlin/gptq.py b/server/text_generation_server/layers/marlin/gptq.py index 5c1bb549..88f12bde 100644 --- a/server/text_generation_server/layers/marlin/gptq.py +++ b/server/text_generation_server/layers/marlin/gptq.py @@ -4,6 +4,7 @@ from typing import List, Optional, Union import numpy import torch import torch.nn as nn +from hf_kernels import load_kernel from loguru import logger from text_generation_server.layers.marlin.util import ( _check_marlin_kernels, @@ -16,7 +17,7 @@ from text_generation_server.utils.log import log_once from text_generation_server.utils.weights import Weight, Weights, WeightsLoader try: - import marlin_kernels + marlin_kernels = load_kernel("kernels-community/quantization") except ImportError: marlin_kernels = None @@ -385,7 +386,20 @@ class GPTQMarlinLinear(nn.Module): out_features = weight.scales.shape[1] _check_valid_shape(in_features=in_features, out_features=out_features) - self.bits = weight.bits + if weight.bits not in (4, 8): + raise ValueError("GPTQMarlinLinear only supports 4 and 8-bit quantization") + + if weight.qzeros.numel() > 0: + if weight.bits == 4: + self.quant_type = marlin_kernels.scalar_types.uint4 + else: + self.quant_type = marlin_kernels.scalar_types.uint8 + else: + if weight.bits == 4: + self.quant_type = marlin_kernels.scalar_types.uint4b8 + else: + self.quant_type = marlin_kernels.scalar_types.uint8b128 + self.is_full_k = weight.is_full_k self.qweight = weight.qweight @@ -414,7 +428,7 @@ class GPTQMarlinLinear(nn.Module): self.g_idx, self.perm, self.workspace, - self.bits, + self.quant_type, A_flat.shape[0], self.scales.shape[1], A_flat.shape[1], diff --git a/server/text_generation_server/layers/marlin/marlin.py b/server/text_generation_server/layers/marlin/marlin.py index 1c80e31e..5e828fce 100644 --- a/server/text_generation_server/layers/marlin/marlin.py +++ b/server/text_generation_server/layers/marlin/marlin.py @@ -1,13 +1,14 @@ from dataclasses import dataclass from typing import List, Optional, Union +from hf_kernels import load_kernel import torch import torch.nn as nn from text_generation_server.layers.marlin.util import _check_marlin_kernels from text_generation_server.utils.weights import Weight, Weights, WeightsLoader try: - import marlin_kernels + marlin_kernels = load_kernel("kernels-community/quantization") except ImportError: marlin_kernels = None @@ -303,8 +304,11 @@ class GPTQMarlin24Linear(nn.Module): f"Group size {groupsize} is not supported, must be one of: {supported_sizes}" ) - self.bits = weight.bits - weights_per_int32 = 32 // self.bits + if weight.bits == 4: + self.quant_type = marlin_kernels.scalar_types.uint4b8 + else: + self.quant_type = marlin_kernels.scalar_types.uint8b128 + weights_per_int32 = 32 // weight.bits assert ( out_features % GPTQ_MARLIN_24_MIN_THREAD_N == 0 @@ -344,7 +348,7 @@ class GPTQMarlin24Linear(nn.Module): self.meta, self.scale_packed, self.workspace, - self.bits, + self.quant_type, A.shape[0], self.scale_packed.shape[1], A.shape[1], diff --git a/server/text_generation_server/layers/marlin/util.py b/server/text_generation_server/layers/marlin/util.py index 250d1714..380a3246 100644 --- a/server/text_generation_server/layers/marlin/util.py +++ b/server/text_generation_server/layers/marlin/util.py @@ -1,12 +1,13 @@ import functools from typing import List, Tuple +from hf_kernels import load_kernel import numpy import torch from text_generation_server.utils.import_utils import SYSTEM try: - import marlin_kernels + marlin_kernels = load_kernel("kernels-community/quantization") except ImportError: marlin_kernels = None