diff --git a/flake.lock b/flake.lock index e57990c8..bad8cb11 100644 --- a/flake.lock +++ b/flake.lock @@ -586,15 +586,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1747919133, - "narHash": "sha256-VvF1naQOvv7yulQ5/cDiaxkNxlh1Y84QMZnderv1szk=", + "lastModified": 1751978820, + "narHash": "sha256-MzZszOGoIsn/wleAuHbF7xd5tRuMxXQMbaDYijDfPEY=", "owner": "huggingface", "repo": "hf-nix", - "rev": "9c71e026d6c7c8588ef85a5f7c77f57d598e038c", + "rev": "8fd72d076f14ee0d6a3800d9e4434a70a7ea48f7", "type": "github" }, "original": { "owner": "huggingface", + "ref": "quantization-0.1.0", "repo": "hf-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index b5b13cad..4afff8b4 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "hf-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - hf-nix.url = "github:huggingface/hf-nix"; + hf-nix.url = "github:huggingface/hf-nix/quantization-0.1.0"; nixpkgs.follows = "hf-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { @@ -33,7 +33,7 @@ }; pkgs = import nixpkgs { inherit system; - inherit (hf-nix.lib) config; + config = hf-nix.lib.config system; overlays = [ rust-overlay.overlays.default hf-nix.overlays.default diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json index 7dbfc627..13c1272f 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json @@ -1,469 +1,613 @@ { "details": { "best_of_sequences": null, - "finish_reason": "eos_token", - "generated_tokens": 76, + "finish_reason": "length", + "generated_tokens": 100, "prefill": [], "seed": null, "tokens": [ { "id": 18183, - "logprob": -1.5195312, + "logprob": -1.5371094, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.06817627, + "logprob": -0.08483887, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.13122559, + "logprob": -0.13378906, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.13415527, + "logprob": -0.14562988, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.8769531, + "logprob": -0.78222656, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0011396408, + "logprob": -0.0013389587, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.16442871, + "logprob": -0.15234375, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0026416779, + "logprob": -0.0018444061, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.48754883, + "logprob": -0.45507812, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2294922, + "logprob": -1.1435547, "special": false, "text": " uses" }, { "id": 29728, - "logprob": -0.66503906, + "logprob": -0.78515625, "special": false, "text": " neural" }, { "id": 14155, - "logprob": -0.02960205, + "logprob": -0.022445679, "special": false, "text": " networks" }, { "id": 311, - "logprob": -0.7236328, + "logprob": -0.6767578, "special": false, "text": " to" }, { "id": 3960, - "logprob": -1.1914062, + "logprob": -1.1796875, "special": false, "text": " learn" }, { "id": 504, - "logprob": -0.7089844, + "logprob": -0.77441406, "special": false, "text": " from" }, { "id": 821, - "logprob": -0.7729492, + "logprob": -0.67089844, "special": false, "text": " data" }, { "id": 13, - "logprob": -0.7836914, + "logprob": -0.64404297, "special": false, "text": "." }, { "id": 1084, - "logprob": -0.9941406, + "logprob": -1.1582031, "special": false, "text": " It" }, { "id": 374, - "logprob": -0.52441406, + "logprob": -0.5810547, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.9511719, + "logprob": -1.1416016, "special": false, "text": " a" }, { "id": 943, - "logprob": -0.8642578, + "logprob": -0.9873047, "special": false, "text": " type" }, { "id": 315, - "logprob": -0.00030231476, + "logprob": -0.0001975298, "special": false, "text": " of" }, { "id": 20443, - "logprob": -0.14416504, + "logprob": -0.22302246, "special": false, "text": " artificial" }, { "id": 11229, - "logprob": -0.013824463, + "logprob": -0.012550354, "special": false, "text": " intelligence" }, { "id": 429, - "logprob": -0.18762207, + "logprob": -0.2130127, "special": false, "text": " that" }, { "id": 646, - "logprob": -1.0087891, + "logprob": -1.1347656, "special": false, "text": " can" }, { "id": 3960, - "logprob": -0.90234375, + "logprob": -0.97802734, "special": false, "text": " learn" }, { "id": 504, - "logprob": -0.54345703, + "logprob": -0.4489746, "special": false, "text": " from" }, { "id": 323, - "logprob": -1.0400391, + "logprob": -0.9038086, "special": false, "text": " and" }, { "id": 1281, - "logprob": -0.072509766, + "logprob": -0.10961914, "special": false, "text": " make" }, { "id": 19898, - "logprob": -0.16516113, + "logprob": -0.3503418, "special": false, "text": " predictions" }, { "id": 389, - "logprob": -0.4416504, + "logprob": -0.62939453, "special": false, "text": " on" }, { "id": 3460, - "logprob": -0.5385742, + "logprob": -0.9458008, "special": false, "text": " large" }, { "id": 14713, - "logprob": -0.4387207, + "logprob": -0.33813477, "special": false, "text": " amounts" }, { "id": 315, - "logprob": -0.00015091896, + "logprob": -0.00013554096, "special": false, "text": " of" }, { "id": 821, - "logprob": -0.061431885, + "logprob": -0.06390381, "special": false, "text": " data" }, { "id": 13, - "logprob": -0.71875, + "logprob": -0.6826172, "special": false, "text": "." }, { "id": 18183, - "logprob": -0.23632812, + "logprob": -0.3503418, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.0017204285, + "logprob": -0.0020923615, "special": false, "text": " learning" }, { "id": 374, - "logprob": -1.1738281, + "logprob": -1.1357422, "special": false, "text": " is" }, { "id": 1483, - "logprob": -0.61083984, + "logprob": -0.76416016, "special": false, "text": " used" }, { "id": 304, - "logprob": -0.035003662, + "logprob": -0.04458618, "special": false, "text": " in" }, { "id": 264, - "logprob": -0.118652344, + "logprob": -0.09295654, "special": false, "text": " a" }, { "id": 8045, - "logprob": -0.42016602, + "logprob": -0.54003906, "special": false, "text": " variety" }, { "id": 315, - "logprob": -1.6212463e-05, + "logprob": -1.6450882e-05, "special": false, "text": " of" }, { "id": 8357, - "logprob": -0.1315918, + "logprob": -0.095947266, "special": false, "text": " applications" }, { "id": 11, - "logprob": -0.12915039, + "logprob": -0.10650635, "special": false, "text": "," }, { "id": 2670, - "logprob": -0.12463379, + "logprob": -0.079589844, "special": false, "text": " including" }, { "id": 2168, - "logprob": -0.37402344, + "logprob": -0.40551758, "special": false, "text": " image" }, { "id": 323, - "logprob": -0.1451416, + "logprob": -0.13012695, "special": false, "text": " and" }, { "id": 8806, - "logprob": -0.028869629, + "logprob": -0.02720642, "special": false, "text": " speech" }, { "id": 17843, - "logprob": -0.00024068356, + "logprob": -0.00020062923, "special": false, "text": " recognition" }, { "id": 11, - "logprob": -0.00031018257, + "logprob": -0.00056505203, "special": false, "text": "," }, { "id": 5810, - "logprob": -0.019821167, + "logprob": -0.022247314, "special": false, "text": " natural" }, { "id": 4128, - "logprob": -0.00012528896, + "logprob": -0.00017559528, "special": false, "text": " language" }, { "id": 8692, - "logprob": -0.00089263916, + "logprob": -0.0007171631, "special": false, "text": " processing" }, { "id": 11, - "logprob": -0.00073862076, + "logprob": -0.0007882118, "special": false, "text": "," }, { "id": 323, - "logprob": -0.040161133, + "logprob": -0.027862549, "special": false, "text": " and" }, { "id": 38193, - "logprob": -0.4519043, + "logprob": -0.39111328, "special": false, "text": " autonomous" }, { "id": 11474, - "logprob": -0.39941406, + "logprob": -0.38427734, "special": false, "text": " vehicles" }, { "id": 13, - "logprob": -0.21166992, + "logprob": -0.23461914, "special": false, "text": "." }, { "id": 1084, - "logprob": -0.9082031, + "logprob": -1.0439453, "special": false, "text": " It" }, { "id": 374, - "logprob": -0.44213867, + "logprob": -0.44580078, "special": false, "text": " is" }, { "id": 264, - "logprob": -1.2177734, + "logprob": -0.86865234, "special": false, "text": " a" }, { "id": 18512, - "logprob": -0.5205078, + "logprob": -0.5263672, "special": false, "text": " rapidly" }, { "id": 7826, - "logprob": -0.15332031, + "logprob": -0.15881348, "special": false, "text": " growing" }, { "id": 2070, - "logprob": -0.0039978027, + "logprob": -0.0044059753, "special": false, "text": " field" }, { "id": 448, - "logprob": -0.9091797, + "logprob": -0.921875, "special": false, "text": " with" }, { "id": 1657, - "logprob": -0.17114258, + "logprob": -0.18737793, "special": false, "text": " many" }, { "id": 4650, - "logprob": -0.70703125, + "logprob": -0.8857422, "special": false, "text": " potential" }, { "id": 8357, - "logprob": -0.025131226, + "logprob": -0.036193848, "special": false, "text": " applications" }, { "id": 304, - "logprob": -0.6699219, + "logprob": -0.65283203, "special": false, "text": " in" }, { "id": 279, - "logprob": -0.35205078, + "logprob": -0.4411621, "special": false, "text": " the" }, { "id": 3853, - "logprob": -0.049194336, + "logprob": -0.059326172, "special": false, "text": " future" }, { "id": 13, - "logprob": -0.21972656, + "logprob": -0.23278809, "special": false, "text": "." }, { - "id": 151643, - "logprob": -2.0019531, - "special": true, - "text": "<|endoftext|>" + "id": 3555, + "logprob": -1.90625, + "special": false, + "text": " What" + }, + { + "id": 525, + "logprob": -0.48291016, + "special": false, + "text": " are" + }, + { + "id": 1045, + "logprob": -0.1484375, + "special": false, + "text": " some" + }, + { + "id": 10295, + "logprob": -1.4072266, + "special": false, + "text": " examples" + }, + { + "id": 315, + "logprob": -0.00091028214, + "special": false, + "text": " of" + }, + { + "id": 5538, + "logprob": -0.47192383, + "special": false, + "text": " deep" + }, + { + "id": 6832, + "logprob": -0.0005393028, + "special": false, + "text": " learning" + }, + { + "id": 8357, + "logprob": -0.33569336, + "special": false, + "text": " applications" + }, + { + "id": 30, + "logprob": -0.19299316, + "special": false, + "text": "?" + }, + { + "id": 2619, + "logprob": -1.3320312, + "special": false, + "text": " There" + }, + { + "id": 525, + "logprob": -0.0027637482, + "special": false, + "text": " are" + }, + { + "id": 1657, + "logprob": -0.0574646, + "special": false, + "text": " many" + }, + { + "id": 10295, + "logprob": -0.093811035, + "special": false, + "text": " examples" + }, + { + "id": 315, + "logprob": -0.000106692314, + "special": false, + "text": " of" + }, + { + "id": 5538, + "logprob": -0.013023376, + "special": false, + "text": " deep" + }, + { + "id": 6832, + "logprob": -7.081032e-05, + "special": false, + "text": " learning" + }, + { + "id": 8357, + "logprob": -0.010604858, + "special": false, + "text": " applications" + }, + { + "id": 11, + "logprob": -0.28125, + "special": false, + "text": "," + }, + { + "id": 2670, + "logprob": -0.5209961, + "special": false, + "text": " including" + }, + { + "id": 1447, + "logprob": -0.8300781, + "special": false, + "text": ":\n\n" + }, + { + "id": 220, + "logprob": -0.8071289, + "special": false, + "text": " " + }, + { + "id": 353, + "logprob": -0.07385254, + "special": false, + "text": " *" + }, + { + "id": 4654, + "logprob": -0.12548828, + "special": false, + "text": " Image" + }, + { + "id": 17843, + "logprob": -0.4790039, + "special": false, + "text": " recognition" + }, + { + "id": 25, + "logprob": -0.25634766, + "special": false, + "text": ":" } ], "top_tokens": null }, - "generated_text": " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future." + "generated_text": " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future. What are some examples of deep learning applications? There are many examples of deep learning applications, including:\n\n * Image recognition:" } diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json index 2c840e67..08343ade 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json @@ -8,25 +8,25 @@ "tokens": [ { "id": 5267, - "logprob": -1.1464844, + "logprob": -1.0410156, "special": false, "text": "?\n" }, { "id": 33464, - "logprob": -0.83203125, + "logprob": -0.6147461, "special": false, "text": "Deep" }, { "id": 20909, - "logprob": -0.5625, + "logprob": -0.5229492, "special": false, "text": " Learning" }, { "id": 320, - "logprob": -2.1464844, + "logprob": -1.7451172, "special": false, "text": " (" }, @@ -38,36 +38,36 @@ }, { "id": 701, - "logprob": -2.2089844, + "logprob": -2.2382812, "special": false, "text": ")," }, { "id": 476, - "logprob": -0.27368164, + "logprob": -0.22546387, "special": false, "text": " or" }, { "id": 20443, - "logprob": -0.09442139, + "logprob": -0.16967773, "special": false, "text": " artificial" }, { - "id": 29728, - "logprob": 0.0, + "id": 11229, + "logprob": -2.265625, "special": false, - "text": " neural" + "text": " intelligence" }, { - "id": 14155, + "id": 11, "logprob": 0.0, "special": false, - "text": " networks" + "text": "," } ], "top_tokens": null }, - "generated_text": "What is deep learning?\nDeep Learning (DL), or artificial neural networks" + "generated_text": "What is deep learning?\nDeep Learning (DL), or artificial intelligence," } diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json index aee5698b..57ae816c 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json @@ -9,61 +9,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.5195312, + "logprob": -1.5371094, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.06817627, + "logprob": -0.08483887, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.13122559, + "logprob": -0.13378906, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.13415527, + "logprob": -0.14562988, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.87353516, + "logprob": -0.78222656, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0011396408, + "logprob": -0.0013389587, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.16442871, + "logprob": -0.15234375, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0026416779, + "logprob": -0.0018444061, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.48754883, + "logprob": -0.45507812, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2294922, + "logprob": -1.1435547, "special": false, "text": " uses" } @@ -82,61 +82,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.5195312, + "logprob": -1.5371094, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.06817627, + "logprob": -0.08483887, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.13122559, + "logprob": -0.13378906, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.13415527, + "logprob": -0.14562988, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.87353516, + "logprob": -0.78222656, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0011396408, + "logprob": -0.0013389587, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.16442871, + "logprob": -0.15234375, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0026416779, + "logprob": -0.0018444061, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.48754883, + "logprob": -0.45507812, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2294922, + "logprob": -1.1435547, "special": false, "text": " uses" } @@ -155,61 +155,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.5195312, + "logprob": -1.5371094, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.06817627, + "logprob": -0.08483887, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.13122559, + "logprob": -0.13378906, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.13415527, + "logprob": -0.14562988, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.87353516, + "logprob": -0.78222656, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0011396408, + "logprob": -0.0013389587, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.16442871, + "logprob": -0.15234375, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0026416779, + "logprob": -0.0018444061, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.48754883, + "logprob": -0.45507812, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2294922, + "logprob": -1.1435547, "special": false, "text": " uses" } @@ -228,61 +228,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.5195312, + "logprob": -1.5371094, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.06817627, + "logprob": -0.08483887, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.13122559, + "logprob": -0.13378906, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.13415527, + "logprob": -0.14562988, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.87353516, + "logprob": -0.78222656, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0011396408, + "logprob": -0.0013389587, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.16442871, + "logprob": -0.15234375, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0026416779, + "logprob": -0.0018444061, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.48754883, + "logprob": -0.45507812, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2294922, + "logprob": -1.1435547, "special": false, "text": " uses" } diff --git a/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json b/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json index adae8e63..5bd164ae 100644 --- a/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json +++ b/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json @@ -14,25 +14,25 @@ }, { "id": 34564, - "logprob": -0.12512207, + "logprob": -0.12695312, "special": false, "text": "Deep" }, { "id": 6975, - "logprob": 0.0, + "logprob": -0.10632324, "special": false, "text": " learning" }, { "id": 320, - "logprob": -0.23840332, + "logprob": -0.22546387, "special": false, "text": " (" }, { "id": 16931, - "logprob": -2.0175781, + "logprob": -1.8837891, "special": false, "text": "DL" }, @@ -44,7 +44,7 @@ }, { "id": 374, - "logprob": -0.8613281, + "logprob": -0.8798828, "special": false, "text": " is" }, @@ -56,7 +56,7 @@ }, { "id": 1207, - "logprob": -1.2451172, + "logprob": -1.2675781, "special": false, "text": " sub" }, diff --git a/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py b/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py index 17e12c22..fd29e1ab 100644 --- a/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py +++ b/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py @@ -28,15 +28,15 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight( response = await compressed_tensors_w8a8_int_dynamic_weight.generate( "What is deep learning?", # prefer a longer response than the default, allow the llm to end generation - max_new_tokens=1000, + max_new_tokens=100, decoder_input_details=True, ) assert ( response.generated_text - == " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future." + == " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future. What are some examples of deep learning applications? There are many examples of deep learning applications, including:\n\n * Image recognition:" ) - assert response.details.generated_tokens == 76 + assert response.details.generated_tokens == 100 assert response == response_snapshot @@ -65,7 +65,7 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight_all_params( assert response.details.generated_tokens == 10 assert ( response.generated_text - == "What is deep learning?\nDeep Learning (DL), or artificial neural networks" + == "What is deep learning?\nDeep Learning (DL), or artificial intelligence," ) assert response == response_snapshot diff --git a/server/kernels.lock b/server/kernels.lock index a06cbff3..f18af45b 100644 --- a/server/kernels.lock +++ b/server/kernels.lock @@ -223,82 +223,58 @@ }, { "repo_id": "kernels-community/quantization", - "sha": "6470f9b005797e00279eb9103463dfe0f8b7da00", + "sha": "b65f8ab1edf7f2f3174f9583de38e3c594c80d20", "variants": { - "torch25-cxx11-cu118-x86_64-linux": { - "hash": "sha256-f52c9b1a7cd98fb389c6d2a0b22a293cb36eb96af3a624f5aec761735861c96d", - "hash_type": "git_lfs_concat" - }, - "torch25-cxx11-cu121-x86_64-linux": { - "hash": "sha256-e5f0da343363a562ce52f147a9534cd54a3efa90e70671f606cc2516f02a3876", - "hash_type": "git_lfs_concat" - }, - "torch25-cxx11-cu124-x86_64-linux": { - "hash": "sha256-caad9300c155faf79c26426f10951ba75f931a05e741a5b39a24b064daabc040", - "hash_type": "git_lfs_concat" - }, - "torch25-cxx98-cu118-x86_64-linux": { - "hash": "sha256-4fc87893de14a29ba4b55f5026ea05ec5901c0b52abd5ebae681ea0b791e858c", - "hash_type": "git_lfs_concat" - }, - "torch25-cxx98-cu121-x86_64-linux": { - "hash": "sha256-72c975ea63fc524a38fcee5b2dbdb566eff0a0ea546ee5756441d04908e4e896", - "hash_type": "git_lfs_concat" - }, - "torch25-cxx98-cu124-x86_64-linux": { - "hash": "sha256-28c5510e3b07eae2b3846b880f6111da65df024e1f24f81077d187a97c015364", - "hash_type": "git_lfs_concat" - }, "torch26-cxx11-cu118-x86_64-linux": { - "hash": "sha256-8444cf77686578a6b0f7e2fd29bf2783ba120ebf7df41573f61d2521fd0acc10", + "hash": "sha256-86c9ea9d12090a1bd457350acf85cbabac7af7253b6ce2603f8a7c6a3f03058b", "hash_type": "git_lfs_concat" }, "torch26-cxx11-cu124-x86_64-linux": { - "hash": "sha256-6ea8e00625b5fe799fbe407e7de0fc08228cac26f9bbed2d70a6500026fe3bab", + "hash": "sha256-03a2d88dd1d725e6c7cb7781f53d92006570fd12d226e1d9f3b2c1aed980eed2", "hash_type": "git_lfs_concat" }, "torch26-cxx11-cu126-aarch64-linux": { - "hash": "sha256-0b8b8afbdaf9aa533895cb9e884e3ad3e9a34d483f05a1bbde1b8902f9dbeb0f", + "hash": "sha256-c2907b0538618ec896d495b9a6bb62c9076d5c0aa7233d5524aa408379042b29", "hash_type": "git_lfs_concat" }, "torch26-cxx11-cu126-x86_64-linux": { - "hash": "sha256-e115e855d7ca4b97787f04c88e128432256c6b43d4823fb8889ab9985dc4cf36", + "hash": "sha256-e5a1790e97648bf90dd269606298bbf1ee17e8504a6c4d2a6416190266c9f57a", "hash_type": "git_lfs_concat" }, "torch26-cxx98-cu118-x86_64-linux": { - "hash": "sha256-509f08c48a05584cc85c058607277fcbe3193e6cc61846dd2416d39e27c1d68e", + "hash": "sha256-6934e8f14d95fad43603b08be07e37102bf70890d634c062fd6064cc7b83d718", "hash_type": "git_lfs_concat" }, "torch26-cxx98-cu124-x86_64-linux": { - "hash": "sha256-a10236bffd435296c736ae2762ab0836da2421297e46b377368a17b39d70c27b", + "hash": "sha256-9adda61755e617db96fa30542dd305dc612ccac0ddcf9fb4906f74ed51770354", "hash_type": "git_lfs_concat" }, "torch26-cxx98-cu126-aarch64-linux": { - "hash": "sha256-ca2cb56f3eea4c399a61e21ba9b577d718b250aa60a13f42f01019ddd5cd8b0c", + "hash": "sha256-9c389fd1c556edd2db7080bff0f78b03a63ac8382c73c79671c557b28a1c7e72", "hash_type": "git_lfs_concat" }, "torch26-cxx98-cu126-x86_64-linux": { - "hash": "sha256-8fcd62d8243a30b63a03751cc0c15d24f6e00e43eae79f7281627f24e078bf9a", + "hash": "sha256-4a86f8a5609fd35b81bebee4eb25c2d0733537e2a09011b8e704271470f3b0a3", "hash_type": "git_lfs_concat" }, "torch27-cxx11-cu118-x86_64-linux": { - "hash": "sha256-60f5807ee3da937c57c1b6080c30632305aa4875ed5a52bf4e81968770b61b13", + "hash": "sha256-39f6740ee44bf8fae873bbe8aed6f3596e59ca069e73c0b77d5e344ff2d5f7b7", "hash_type": "git_lfs_concat" }, "torch27-cxx11-cu126-aarch64-linux": { - "hash": "sha256-64298b1713dc1d950915dc6569a06e2f541de3ed80aa5b32084246c1fdc7a958", + "hash": "sha256-34dadff3669c3b256bf2637b76bac68991e3b6013f8a4ecaa51454cf2c80fa86", "hash_type": "git_lfs_concat" }, "torch27-cxx11-cu126-x86_64-linux": { - "hash": "sha256-d9e219890dc28e8582ef21d6f81f2ebc361de218a86b742be63bc4714f102e5e", + "hash": "sha256-f568e601daede1173e9952d32a835658f41791affe64069d7fd2d722002922e2", "hash_type": "git_lfs_concat" }, "torch27-cxx11-cu128-aarch64-linux": { - "hash": "sha256-d72549f51aefcf020bc74262bbbccb78094638c5ab9adc8667873d247c1cce86", + "hash": "sha256-f58b0b96c23b5340da08cd0db0221875c15b757640a783c0ad836bfc176799fa", "hash_type": "git_lfs_concat" }, "torch27-cxx11-cu128-x86_64-linux": { - "hash": "sha256-d31ac5f87d7c7f62c63c72946479193aed467c9417c0acead5137e0e1fa968f8", + "hash": "sha256-b2c9f8e42541fd24d553bb52172ac3bea8407c0bb32a3448632e4763a68f21f3", "hash_type": "git_lfs_concat" } } diff --git a/server/pyproject.toml b/server/pyproject.toml index 4e174a59..5f9722f3 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -59,7 +59,7 @@ build-backend = "setuptools.build_meta" "kernels-community/paged-attention" = ">=0.0.2" "kernels-community/moe" = ">=0.1.1" "kernels-community/punica-sgmv" = ">=0.0.1" -"kernels-community/quantization" = ">=0.0.3" +"kernels-community/quantization" = ">=0.1.2" "kernels-community/quantization-eetq" = ">=0.0.1" "kernels-community/rotary" = ">=0.0.1" diff --git a/server/text_generation_server/layers/compressed_tensors/w8a8_int.py b/server/text_generation_server/layers/compressed_tensors/w8a8_int.py index b66057ec..d65fdc3c 100644 --- a/server/text_generation_server/layers/compressed_tensors/w8a8_int.py +++ b/server/text_generation_server/layers/compressed_tensors/w8a8_int.py @@ -144,17 +144,6 @@ class W8A8IntLoader(WeightsLoader): OtherT = TypeVar("OtherT") -def _get_tensor_or_else( - weights: Weights, prefix: str, other: OtherT -) -> Union[torch.Tensor, OtherT]: - # Even if a checkpoint uses e.g. zero-points, they can be elided: - # https://github.com/neuralmagic/compressed-tensors/blob/db6ccb25b265e8370813ecab5e95714a6728b5a6/src/compressed_tensors/compressors/quantized_compressors/base.py#L105 - if weights.has_tensor(prefix): - return weights.get_tensor(prefix, to_dtype=False) - else: - return other - - @dataclass class Int8Weight(Weight): input_symmetric: bool diff --git a/server/text_generation_server/layers/marlin/fp8.py b/server/text_generation_server/layers/marlin/fp8.py index 10751a05..39f0efb6 100644 --- a/server/text_generation_server/layers/marlin/fp8.py +++ b/server/text_generation_server/layers/marlin/fp8.py @@ -76,15 +76,21 @@ class GPTQMarlinFP8Linear(nn.Module): assert quantization is not None A_flat = A.view(-1, A.shape[-1]) - C = quantization.fp8_marlin_gemm( - A_flat, - self.qweight, - self.scales, - self.workspace, - 8, - A_flat.shape[0], - self.scales.shape[1], - A_flat.shape[1], + C = quantization.gptq_marlin_gemm( + a=A_flat, + c=None, + b_q_weight=self.qweight, + b_scales=self.scales, + global_scale=None, + b_zeros=None, + g_idx=None, + perm=None, + workspace=self.workspace, + b_q_type=quantization.scalar_type.scalar_types.float8_e4m3fn, + size_m=A_flat.shape[0], + size_n=self.scales.shape[1], + size_k=A_flat.shape[1], + use_fp32_reduce=True, ) C = C.reshape(A.shape[:-1] + (self.scales.shape[1],)) @@ -143,5 +149,6 @@ def repack_fp8_for_marlin(weight: torch.Tensor, scales: torch.Tensor): ) scales = permute_scales(scales) + scales = quantization.marlin_utils_fp8.fp8_fused_exponent_bias_into_scales(scales) return repacked, scales diff --git a/server/text_generation_server/layers/marlin/gptq.py b/server/text_generation_server/layers/marlin/gptq.py index e85c8333..cec0b9dc 100644 --- a/server/text_generation_server/layers/marlin/gptq.py +++ b/server/text_generation_server/layers/marlin/gptq.py @@ -256,7 +256,7 @@ class GPTQMarlinWeight(Weight): """ qweight: torch.Tensor - qzeros: torch.Tensor + qzeros: Optional[torch.Tensor] scales: torch.Tensor g_idx: torch.Tensor perm: torch.Tensor @@ -268,6 +268,7 @@ class GPTQMarlinWeight(Weight): assert self.scales.dtype in (torch.float16, torch.bfloat16) assert self.g_idx.dtype == torch.int32 assert self.perm.dtype == torch.int32 + assert self.qzeros is None or self.qzeros.numel() > 0 def get_linear(self, bias: torch.Tensor): return GPTQMarlinLinear( @@ -350,9 +351,6 @@ def repack_gptq_for_marlin( qweight, perm, in_features, out_features, bits ) - if qzeros is None: - qzeros = torch.empty(0, dtype=torch.int, device=qweight.device) - scales = permute_scales(scales) is_full_k = not (desc_act and groupsize != -1 and sharded_infeatures) @@ -392,7 +390,7 @@ class GPTQMarlinLinear(nn.Module): if weight.bits not in (4, 8): raise ValueError("GPTQMarlinLinear only supports 4 and 8-bit quantization") - if weight.qzeros.numel() > 0: + if weight.qzeros is not None: if weight.bits == 4: self.quant_type = quantization.scalar_types.uint4 else: @@ -424,20 +422,21 @@ class GPTQMarlinLinear(nn.Module): A_flat = A.view(-1, A.shape[-1]) C = quantization.gptq_marlin_gemm( - A_flat, - self.qweight, - self.scales, - self.qzeros, - self.g_idx, - self.perm, - self.workspace, - self.quant_type, - A_flat.shape[0], - self.scales.shape[1], - A_flat.shape[1], - self.is_full_k, - self.qzeros.numel() > 0, - True, + a=A_flat, + c=None, + b_q_weight=self.qweight, + b_scales=self.scales, + global_scale=None, + b_zeros=self.qzeros, + g_idx=self.g_idx, + perm=self.perm, + workspace=self.workspace, + b_q_type=self.quant_type, + size_m=A_flat.shape[0], + size_n=self.scales.shape[1], + size_k=A_flat.shape[1], + is_k_full=self.is_full_k, + use_fp32_reduce=True, ) C = C.reshape(A.shape[:-1] + (self.scales.shape[1],)) diff --git a/server/text_generation_server/layers/moe/gptq_marlin.py b/server/text_generation_server/layers/moe/gptq_marlin.py index d1ce4f3e..6c5b7256 100644 --- a/server/text_generation_server/layers/moe/gptq_marlin.py +++ b/server/text_generation_server/layers/moe/gptq_marlin.py @@ -202,9 +202,13 @@ def _pack_weight( device=weight.qweight.device, ) qzeros = torch.empty( - (n_experts,) + weight.qzeros.shape, - dtype=weight.qzeros.dtype, - device=weight.qzeros.device, + (n_experts,) + ((0,) if weight.qzeros is None else weight.qzeros.shape), + dtype=( + weight.qweight.dtype if weight.qzeros is None else weight.qzeros.dtype + ), + device=( + weight.qweight.device if weight.qzeros is None else weight.qzeros.device + ), ) scales = torch.empty( (n_experts,) + weight.scales.shape, @@ -232,7 +236,13 @@ def _pack_weight( ) moe_weight.qweight[expert] = weight.qweight - moe_weight.qzeros[expert] = weight.qzeros + moe_weight.qzeros[expert] = ( + torch.zeros( + (0,), device=moe_weight.qzeros.device, dtype=moe_weight.qzeros.dtype + ) + if weight.qzeros is None + else weight.qzeros + ) moe_weight.scales[expert] = weight.scales moe_weight.g_idx[expert] = weight.g_idx moe_weight.perm[expert] = weight.perm