mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 04:14:52 +00:00
Use Hub kernels for Marlin and cutlass quantization kernels
This commit is contained in:
parent
4b8cda684b
commit
aab6141b92
13
flake.lock
13
flake.lock
@ -853,11 +853,11 @@
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1737685583,
|
||||
"narHash": "sha256-p+NVABRpGi+pT+xxf9HcLcFVxG6L+vEEy+NwzB9T0f8=",
|
||||
"lastModified": 1738549608,
|
||||
"narHash": "sha256-GdyT9QEUSx5k/n8kILuNy83vxxdyUfJ8jL5mMpQZWfw=",
|
||||
"owner": "oxalica",
|
||||
"repo": "rust-overlay",
|
||||
"rev": "eb64cbcc8eee0fa87ebded92805280d2ec97415a",
|
||||
"rev": "35c6f8c4352f995ecd53896200769f80a3e8f22d",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -978,15 +978,16 @@
|
||||
"nixpkgs": "nixpkgs_6"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1738323634,
|
||||
"narHash": "sha256-lKPzgEm7pEuQJVhacsxFHqg1MOtrUMZvr+9IuJzC5J4=",
|
||||
"lastModified": 1737970302,
|
||||
"narHash": "sha256-uoArelKpaixLDozNTrXii2hOWXwJzonPqAgxwZyjzM0=",
|
||||
"owner": "huggingface",
|
||||
"repo": "text-generation-inference-nix",
|
||||
"rev": "eb5fede2756f544f75e01f55a4097f9c9a8c5005",
|
||||
"rev": "f43f30042a435e22ab0dbdda8a3d62ad05ff0ada",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "huggingface",
|
||||
"ref": "hf-kernels",
|
||||
"repo": "text-generation-inference-nix",
|
||||
"type": "github"
|
||||
}
|
||||
|
@ -5,7 +5,7 @@
|
||||
inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
|
||||
};
|
||||
nix-filter.url = "github:numtide/nix-filter";
|
||||
tgi-nix.url = "github:huggingface/text-generation-inference-nix";
|
||||
tgi-nix.url = "github:huggingface/text-generation-inference-nix/hf-kernels";
|
||||
nixpkgs.follows = "tgi-nix/nixpkgs";
|
||||
flake-utils.url = "github:numtide/flake-utils";
|
||||
rust-overlay = {
|
||||
|
@ -90,7 +90,7 @@ mkShell {
|
||||
|
||||
postVenvCreation = ''
|
||||
unset SOURCE_DATE_EPOCH
|
||||
( cd server ; python -m pip install --no-dependencies -e . )
|
||||
( cd server ; python -m pip install --no-build-isolation --no-dependencies -e . )
|
||||
( cd clients/python ; python -m pip install --no-dependencies -e . )
|
||||
'';
|
||||
|
||||
|
@ -19,10 +19,10 @@
|
||||
grpcio-reflection,
|
||||
grpcio-status,
|
||||
grpcio-tools,
|
||||
hf-kernels,
|
||||
hf-transfer,
|
||||
loguru,
|
||||
mamba-ssm,
|
||||
marlin-kernels,
|
||||
moe-kernels,
|
||||
opentelemetry-api,
|
||||
opentelemetry-exporter-otlp,
|
||||
@ -93,10 +93,10 @@ buildPythonPackage {
|
||||
grpcio-reflection
|
||||
grpcio-status
|
||||
grpcio-tools
|
||||
hf-kernels
|
||||
hf-transfer
|
||||
loguru
|
||||
mamba-ssm
|
||||
marlin-kernels
|
||||
moe-kernels
|
||||
opentelemetry-api
|
||||
opentelemetry-exporter-otlp
|
||||
|
680
server/hf-kernels.lock
Normal file
680
server/hf-kernels.lock
Normal file
@ -0,0 +1,680 @@
|
||||
[
|
||||
{
|
||||
"repo_id": "kernels-community/quantization",
|
||||
"sha": "329999a39b1f1329972d2e478c5466b0464441a3",
|
||||
"files": [
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py",
|
||||
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/_ops.py",
|
||||
"blob_id": "fc64e2f1be8e8c8c79acb4e85c43aceb317a50c6"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/_quantization_nikdbevszxcfa.abi3.so",
|
||||
"blob_id": "83c971b999a73e3802b2e2a52c9e52575c0f441d"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/compressed_tensors.py",
|
||||
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/cutlass.py",
|
||||
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/marlin.py",
|
||||
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/scalar_type.py",
|
||||
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/__init__.py",
|
||||
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils.py",
|
||||
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
|
||||
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py",
|
||||
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
|
||||
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
|
||||
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu118-x86_64-linux/quantization/utils/quant_utils.py",
|
||||
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/__init__.py",
|
||||
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/_ops.py",
|
||||
"blob_id": "2d997a763f85e5a06d8314ba639516e3480afbea"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/_quantization_uqov56fwnow5m.abi3.so",
|
||||
"blob_id": "c6c8af5c698fd6ceabee534d6f6fdcef0e1c75c6"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/compressed_tensors.py",
|
||||
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/cutlass.py",
|
||||
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/marlin.py",
|
||||
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/scalar_type.py",
|
||||
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/__init__.py",
|
||||
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils.py",
|
||||
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
|
||||
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py",
|
||||
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
|
||||
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
|
||||
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu121-x86_64-linux/quantization/utils/quant_utils.py",
|
||||
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/__init__.py",
|
||||
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/_ops.py",
|
||||
"blob_id": "55fe31652a63808926dfc4f8121624a58ac70127"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/_quantization_fsprdzcdnww3s.abi3.so",
|
||||
"blob_id": "44c36167d50fc259a9bcddd096f17b5753b4aa06"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/compressed_tensors.py",
|
||||
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/cutlass.py",
|
||||
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/marlin.py",
|
||||
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/scalar_type.py",
|
||||
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/__init__.py",
|
||||
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils.py",
|
||||
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
|
||||
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py",
|
||||
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
|
||||
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
|
||||
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx11-cu124-x86_64-linux/quantization/utils/quant_utils.py",
|
||||
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/__init__.py",
|
||||
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/_ops.py",
|
||||
"blob_id": "7a17663ca7f3ca9a384f37596cde7d40d27e8abb"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/_quantization_3atrjdvtz7xxc.abi3.so",
|
||||
"blob_id": "9580cfcf2ecf031764586c88bf923cb1033c26e7"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/compressed_tensors.py",
|
||||
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/cutlass.py",
|
||||
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/marlin.py",
|
||||
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/scalar_type.py",
|
||||
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/__init__.py",
|
||||
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils.py",
|
||||
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
|
||||
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py",
|
||||
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
|
||||
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
|
||||
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu118-x86_64-linux/quantization/utils/quant_utils.py",
|
||||
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/__init__.py",
|
||||
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/_ops.py",
|
||||
"blob_id": "36513c1a6720abd21530e214cb95be46c8765f31"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/_quantization_ni5tfi7up4a5a.abi3.so",
|
||||
"blob_id": "dc46548259a6007c83a21fb2beb4d54373e27c2c"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/compressed_tensors.py",
|
||||
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/cutlass.py",
|
||||
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/marlin.py",
|
||||
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/scalar_type.py",
|
||||
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/__init__.py",
|
||||
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils.py",
|
||||
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
|
||||
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py",
|
||||
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
|
||||
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
|
||||
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu121-x86_64-linux/quantization/utils/quant_utils.py",
|
||||
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/__init__.py",
|
||||
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/_ops.py",
|
||||
"blob_id": "7292bace29cb7b73f20e18617bdf132eddbf5151"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/_quantization_ydkmqrh62v5ym.abi3.so",
|
||||
"blob_id": "8a22e3667dac661b0dc38aefb8747a7b94f02582"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/compressed_tensors.py",
|
||||
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/cutlass.py",
|
||||
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/marlin.py",
|
||||
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/scalar_type.py",
|
||||
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/__init__.py",
|
||||
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils.py",
|
||||
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
|
||||
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py",
|
||||
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
|
||||
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
|
||||
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch24-cxx98-cu124-x86_64-linux/quantization/utils/quant_utils.py",
|
||||
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/__init__.py",
|
||||
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/_ops.py",
|
||||
"blob_id": "16ab5cbbb5c096fd8b4ef2a774778d69b41ebeb7"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/_quantization_2upd4ndb7oxlw.abi3.so",
|
||||
"blob_id": "5fd1c287bcdcbb26aae4536dbc7e2e2a4142c672"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/compressed_tensors.py",
|
||||
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/cutlass.py",
|
||||
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/marlin.py",
|
||||
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/scalar_type.py",
|
||||
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/__init__.py",
|
||||
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils.py",
|
||||
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
|
||||
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py",
|
||||
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
|
||||
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
|
||||
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/quant_utils.py",
|
||||
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/__init__.py",
|
||||
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/_ops.py",
|
||||
"blob_id": "d46aed00defa6d8ebb2eae3489f6615b236cd9c3"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/_quantization_srrzik3uw7tac.abi3.so",
|
||||
"blob_id": "bf391756a4554227510604235c9b356f423b4270"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/compressed_tensors.py",
|
||||
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/cutlass.py",
|
||||
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/marlin.py",
|
||||
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/scalar_type.py",
|
||||
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/__init__.py",
|
||||
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils.py",
|
||||
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
|
||||
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py",
|
||||
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
|
||||
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
|
||||
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/quant_utils.py",
|
||||
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/__init__.py",
|
||||
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/_ops.py",
|
||||
"blob_id": "028a3e5cd871b9a0044a25489a4a0902d95ec229"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/_quantization_jotyxqrlkrs6e.abi3.so",
|
||||
"blob_id": "de00fd61cc6c9d11d0db7a400feb5f3fe2ff33bb"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/compressed_tensors.py",
|
||||
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/cutlass.py",
|
||||
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/marlin.py",
|
||||
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/scalar_type.py",
|
||||
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/__init__.py",
|
||||
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils.py",
|
||||
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
|
||||
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py",
|
||||
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
|
||||
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
|
||||
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/quant_utils.py",
|
||||
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/__init__.py",
|
||||
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/_ops.py",
|
||||
"blob_id": "a01b8fc5fefec490194fe5244e70055c47c1eb7b"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/_quantization_nxw74vvv2coka.abi3.so",
|
||||
"blob_id": "d536e2ac72e5baf6f3537e3aadc24f57551a1dc5"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/compressed_tensors.py",
|
||||
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/cutlass.py",
|
||||
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/marlin.py",
|
||||
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/scalar_type.py",
|
||||
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/__init__.py",
|
||||
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils.py",
|
||||
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
|
||||
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py",
|
||||
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
|
||||
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
|
||||
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/quant_utils.py",
|
||||
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/__init__.py",
|
||||
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/_ops.py",
|
||||
"blob_id": "3a4ef6a12f4eef0b3383338788645fafa9a9661e"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/_quantization_7fvkxfdjondcu.abi3.so",
|
||||
"blob_id": "c0a36487f427f3c55fb8965a5b65147bc9e8cef6"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/compressed_tensors.py",
|
||||
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/cutlass.py",
|
||||
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/marlin.py",
|
||||
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/scalar_type.py",
|
||||
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/__init__.py",
|
||||
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils.py",
|
||||
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
|
||||
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py",
|
||||
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
|
||||
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
|
||||
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/quant_utils.py",
|
||||
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/__init__.py",
|
||||
"blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/_ops.py",
|
||||
"blob_id": "a424f0f1664801162e520d128baddadd317dfa68"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/_quantization_cf2wponrik3xk.abi3.so",
|
||||
"blob_id": "2ce34c9cd1a364ee5064d18b2cb92e21e3c2fc52"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/compressed_tensors.py",
|
||||
"blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/cutlass.py",
|
||||
"blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/marlin.py",
|
||||
"blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/scalar_type.py",
|
||||
"blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/__init__.py",
|
||||
"blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils.py",
|
||||
"blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py",
|
||||
"blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py",
|
||||
"blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py",
|
||||
"blob_id": "927fa9016ba25f381c09d768db0c468066193a76"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py",
|
||||
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9"
|
||||
},
|
||||
{
|
||||
"filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/quant_utils.py",
|
||||
"blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
@ -33,6 +33,13 @@ dependencies = [
|
||||
"transformers>=4.48.0"
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hf-kernels", "setuptools"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[tool.kernels.dependencies]
|
||||
"kernels-community/quantization" = ">=0.0.2"
|
||||
|
||||
[project.scripts]
|
||||
text-generation-server = "text_generation_server.cli:app"
|
||||
|
||||
@ -62,21 +69,24 @@ quantize = [
|
||||
]
|
||||
moe = [ "moe-kernels" ]
|
||||
attention = [ "attention-kernels" ]
|
||||
marlin = [ "marlin-kernels" ]
|
||||
gen = [
|
||||
"grpcio-tools>=1.69.0",
|
||||
"mypy-protobuf>=3.6.0",
|
||||
]
|
||||
|
||||
[tool.uv.sources]
|
||||
attention-kernels.url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl"
|
||||
marlin-kernels = [
|
||||
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
|
||||
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
|
||||
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
|
||||
{ url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
|
||||
attention-kernels = [
|
||||
{ url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
|
||||
{ url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
|
||||
{ url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
|
||||
{ url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
|
||||
]
|
||||
moe-kernels = [
|
||||
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" },
|
||||
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" },
|
||||
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" },
|
||||
{ url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" },
|
||||
]
|
||||
moe-kernels.url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl"
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"]
|
||||
|
@ -1,6 +1,7 @@
|
||||
from typing import List, Optional, Union, TypeVar
|
||||
from dataclasses import dataclass
|
||||
|
||||
from hf_kernels import load_kernel
|
||||
from loguru import logger
|
||||
import torch
|
||||
from compressed_tensors.quantization import QuantizationArgs, QuantizationType
|
||||
@ -10,7 +11,7 @@ from text_generation_server.utils.log import log_once
|
||||
from text_generation_server.utils.weights import Weight, Weights, WeightsLoader
|
||||
|
||||
try:
|
||||
import marlin_kernels
|
||||
marlin_kernels = load_kernel("kernels-community/quantization")
|
||||
except ImportError:
|
||||
marlin_kernels = None
|
||||
|
||||
|
@ -2,6 +2,7 @@ from dataclasses import dataclass
|
||||
import os
|
||||
from typing import Optional, Tuple, Type, Union, List
|
||||
|
||||
from hf_kernels import load_kernel
|
||||
import torch
|
||||
from loguru import logger
|
||||
|
||||
@ -15,7 +16,7 @@ from text_generation_server.utils.weights import (
|
||||
from text_generation_server.utils.log import log_once
|
||||
|
||||
try:
|
||||
import marlin_kernels
|
||||
marlin_kernels = load_kernel("kernels-community/quantization")
|
||||
except ImportError:
|
||||
marlin_kernels = None
|
||||
|
||||
|
@ -2,6 +2,7 @@ from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from hf_kernels import load_kernel
|
||||
from text_generation_server.layers.fp8 import fp8_quantize
|
||||
from text_generation_server.layers.marlin.gptq import _check_valid_shape
|
||||
from text_generation_server.layers.marlin.util import (
|
||||
@ -10,7 +11,7 @@ from text_generation_server.layers.marlin.util import (
|
||||
)
|
||||
|
||||
try:
|
||||
import marlin_kernels
|
||||
marlin_kernels = load_kernel("kernels-community/quantization")
|
||||
except ImportError:
|
||||
marlin_kernels = None
|
||||
|
||||
|
@ -4,6 +4,7 @@ from typing import List, Optional, Union
|
||||
import numpy
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from hf_kernels import load_kernel
|
||||
from loguru import logger
|
||||
from text_generation_server.layers.marlin.util import (
|
||||
_check_marlin_kernels,
|
||||
@ -16,7 +17,7 @@ from text_generation_server.utils.log import log_once
|
||||
from text_generation_server.utils.weights import Weight, Weights, WeightsLoader
|
||||
|
||||
try:
|
||||
import marlin_kernels
|
||||
marlin_kernels = load_kernel("kernels-community/quantization")
|
||||
except ImportError:
|
||||
marlin_kernels = None
|
||||
|
||||
@ -385,7 +386,20 @@ class GPTQMarlinLinear(nn.Module):
|
||||
out_features = weight.scales.shape[1]
|
||||
_check_valid_shape(in_features=in_features, out_features=out_features)
|
||||
|
||||
self.bits = weight.bits
|
||||
if weight.bits not in (4, 8):
|
||||
raise ValueError("GPTQMarlinLinear only supports 4 and 8-bit quantization")
|
||||
|
||||
if weight.qzeros.numel() > 0:
|
||||
if weight.bits == 4:
|
||||
self.quant_type = marlin_kernels.scalar_types.uint4
|
||||
else:
|
||||
self.quant_type = marlin_kernels.scalar_types.uint8
|
||||
else:
|
||||
if weight.bits == 4:
|
||||
self.quant_type = marlin_kernels.scalar_types.uint4b8
|
||||
else:
|
||||
self.quant_type = marlin_kernels.scalar_types.uint8b128
|
||||
|
||||
self.is_full_k = weight.is_full_k
|
||||
|
||||
self.qweight = weight.qweight
|
||||
@ -414,7 +428,7 @@ class GPTQMarlinLinear(nn.Module):
|
||||
self.g_idx,
|
||||
self.perm,
|
||||
self.workspace,
|
||||
self.bits,
|
||||
self.quant_type,
|
||||
A_flat.shape[0],
|
||||
self.scales.shape[1],
|
||||
A_flat.shape[1],
|
||||
|
@ -1,13 +1,14 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from hf_kernels import load_kernel
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from text_generation_server.layers.marlin.util import _check_marlin_kernels
|
||||
from text_generation_server.utils.weights import Weight, Weights, WeightsLoader
|
||||
|
||||
try:
|
||||
import marlin_kernels
|
||||
marlin_kernels = load_kernel("kernels-community/quantization")
|
||||
except ImportError:
|
||||
marlin_kernels = None
|
||||
|
||||
@ -303,8 +304,11 @@ class GPTQMarlin24Linear(nn.Module):
|
||||
f"Group size {groupsize} is not supported, must be one of: {supported_sizes}"
|
||||
)
|
||||
|
||||
self.bits = weight.bits
|
||||
weights_per_int32 = 32 // self.bits
|
||||
if weight.bits == 4:
|
||||
self.quant_type = marlin_kernels.scalar_types.uint4b8
|
||||
else:
|
||||
self.quant_type = marlin_kernels.scalar_types.uint8b128
|
||||
weights_per_int32 = 32 // weight.bits
|
||||
|
||||
assert (
|
||||
out_features % GPTQ_MARLIN_24_MIN_THREAD_N == 0
|
||||
@ -344,7 +348,7 @@ class GPTQMarlin24Linear(nn.Module):
|
||||
self.meta,
|
||||
self.scale_packed,
|
||||
self.workspace,
|
||||
self.bits,
|
||||
self.quant_type,
|
||||
A.shape[0],
|
||||
self.scale_packed.shape[1],
|
||||
A.shape[1],
|
||||
|
@ -1,12 +1,13 @@
|
||||
import functools
|
||||
from typing import List, Tuple
|
||||
|
||||
from hf_kernels import load_kernel
|
||||
import numpy
|
||||
import torch
|
||||
from text_generation_server.utils.import_utils import SYSTEM
|
||||
|
||||
try:
|
||||
import marlin_kernels
|
||||
marlin_kernels = load_kernel("kernels-community/quantization")
|
||||
except ImportError:
|
||||
marlin_kernels = None
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user