mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-08 19:04:52 +00:00
Merge 75ebb228f4
into 06d9d88b95
This commit is contained in:
commit
f858028741
@ -586,15 +586,16 @@
|
||||
"nixpkgs": "nixpkgs_6"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1747919133,
|
||||
"narHash": "sha256-VvF1naQOvv7yulQ5/cDiaxkNxlh1Y84QMZnderv1szk=",
|
||||
"lastModified": 1751978820,
|
||||
"narHash": "sha256-MzZszOGoIsn/wleAuHbF7xd5tRuMxXQMbaDYijDfPEY=",
|
||||
"owner": "huggingface",
|
||||
"repo": "hf-nix",
|
||||
"rev": "9c71e026d6c7c8588ef85a5f7c77f57d598e038c",
|
||||
"rev": "8fd72d076f14ee0d6a3800d9e4434a70a7ea48f7",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "huggingface",
|
||||
"ref": "quantization-0.1.0",
|
||||
"repo": "hf-nix",
|
||||
"type": "github"
|
||||
}
|
||||
|
@ -5,7 +5,7 @@
|
||||
inputs.nixpkgs.follows = "hf-nix/nixpkgs";
|
||||
};
|
||||
nix-filter.url = "github:numtide/nix-filter";
|
||||
hf-nix.url = "github:huggingface/hf-nix";
|
||||
hf-nix.url = "github:huggingface/hf-nix/quantization-0.1.0";
|
||||
nixpkgs.follows = "hf-nix/nixpkgs";
|
||||
flake-utils.url = "github:numtide/flake-utils";
|
||||
rust-overlay = {
|
||||
@ -33,7 +33,7 @@
|
||||
};
|
||||
pkgs = import nixpkgs {
|
||||
inherit system;
|
||||
inherit (hf-nix.lib) config;
|
||||
config = hf-nix.lib.config system;
|
||||
overlays = [
|
||||
rust-overlay.overlays.default
|
||||
hf-nix.overlays.default
|
||||
|
@ -1,469 +1,613 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "eos_token",
|
||||
"generated_tokens": 76,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 100,
|
||||
"prefill": [],
|
||||
"seed": null,
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18183,
|
||||
"logprob": -1.5195312,
|
||||
"logprob": -1.5371094,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.06817627,
|
||||
"logprob": -0.08483887,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.13122559,
|
||||
"logprob": -0.13378906,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.13415527,
|
||||
"logprob": -0.14562988,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 25993,
|
||||
"logprob": -0.8769531,
|
||||
"logprob": -0.78222656,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.0011396408,
|
||||
"logprob": -0.0013389587,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5662,
|
||||
"logprob": -0.16442871,
|
||||
"logprob": -0.15234375,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.0026416779,
|
||||
"logprob": -0.0018444061,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 429,
|
||||
"logprob": -0.48754883,
|
||||
"logprob": -0.45507812,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 5711,
|
||||
"logprob": -1.2294922,
|
||||
"logprob": -1.1435547,
|
||||
"special": false,
|
||||
"text": " uses"
|
||||
},
|
||||
{
|
||||
"id": 29728,
|
||||
"logprob": -0.66503906,
|
||||
"logprob": -0.78515625,
|
||||
"special": false,
|
||||
"text": " neural"
|
||||
},
|
||||
{
|
||||
"id": 14155,
|
||||
"logprob": -0.02960205,
|
||||
"logprob": -0.022445679,
|
||||
"special": false,
|
||||
"text": " networks"
|
||||
},
|
||||
{
|
||||
"id": 311,
|
||||
"logprob": -0.7236328,
|
||||
"logprob": -0.6767578,
|
||||
"special": false,
|
||||
"text": " to"
|
||||
},
|
||||
{
|
||||
"id": 3960,
|
||||
"logprob": -1.1914062,
|
||||
"logprob": -1.1796875,
|
||||
"special": false,
|
||||
"text": " learn"
|
||||
},
|
||||
{
|
||||
"id": 504,
|
||||
"logprob": -0.7089844,
|
||||
"logprob": -0.77441406,
|
||||
"special": false,
|
||||
"text": " from"
|
||||
},
|
||||
{
|
||||
"id": 821,
|
||||
"logprob": -0.7729492,
|
||||
"logprob": -0.67089844,
|
||||
"special": false,
|
||||
"text": " data"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.7836914,
|
||||
"logprob": -0.64404297,
|
||||
"special": false,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 1084,
|
||||
"logprob": -0.9941406,
|
||||
"logprob": -1.1582031,
|
||||
"special": false,
|
||||
"text": " It"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.52441406,
|
||||
"logprob": -0.5810547,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.9511719,
|
||||
"logprob": -1.1416016,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 943,
|
||||
"logprob": -0.8642578,
|
||||
"logprob": -0.9873047,
|
||||
"special": false,
|
||||
"text": " type"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.00030231476,
|
||||
"logprob": -0.0001975298,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 20443,
|
||||
"logprob": -0.14416504,
|
||||
"logprob": -0.22302246,
|
||||
"special": false,
|
||||
"text": " artificial"
|
||||
},
|
||||
{
|
||||
"id": 11229,
|
||||
"logprob": -0.013824463,
|
||||
"logprob": -0.012550354,
|
||||
"special": false,
|
||||
"text": " intelligence"
|
||||
},
|
||||
{
|
||||
"id": 429,
|
||||
"logprob": -0.18762207,
|
||||
"logprob": -0.2130127,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 646,
|
||||
"logprob": -1.0087891,
|
||||
"logprob": -1.1347656,
|
||||
"special": false,
|
||||
"text": " can"
|
||||
},
|
||||
{
|
||||
"id": 3960,
|
||||
"logprob": -0.90234375,
|
||||
"logprob": -0.97802734,
|
||||
"special": false,
|
||||
"text": " learn"
|
||||
},
|
||||
{
|
||||
"id": 504,
|
||||
"logprob": -0.54345703,
|
||||
"logprob": -0.4489746,
|
||||
"special": false,
|
||||
"text": " from"
|
||||
},
|
||||
{
|
||||
"id": 323,
|
||||
"logprob": -1.0400391,
|
||||
"logprob": -0.9038086,
|
||||
"special": false,
|
||||
"text": " and"
|
||||
},
|
||||
{
|
||||
"id": 1281,
|
||||
"logprob": -0.072509766,
|
||||
"logprob": -0.10961914,
|
||||
"special": false,
|
||||
"text": " make"
|
||||
},
|
||||
{
|
||||
"id": 19898,
|
||||
"logprob": -0.16516113,
|
||||
"logprob": -0.3503418,
|
||||
"special": false,
|
||||
"text": " predictions"
|
||||
},
|
||||
{
|
||||
"id": 389,
|
||||
"logprob": -0.4416504,
|
||||
"logprob": -0.62939453,
|
||||
"special": false,
|
||||
"text": " on"
|
||||
},
|
||||
{
|
||||
"id": 3460,
|
||||
"logprob": -0.5385742,
|
||||
"logprob": -0.9458008,
|
||||
"special": false,
|
||||
"text": " large"
|
||||
},
|
||||
{
|
||||
"id": 14713,
|
||||
"logprob": -0.4387207,
|
||||
"logprob": -0.33813477,
|
||||
"special": false,
|
||||
"text": " amounts"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.00015091896,
|
||||
"logprob": -0.00013554096,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 821,
|
||||
"logprob": -0.061431885,
|
||||
"logprob": -0.06390381,
|
||||
"special": false,
|
||||
"text": " data"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.71875,
|
||||
"logprob": -0.6826172,
|
||||
"special": false,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 18183,
|
||||
"logprob": -0.23632812,
|
||||
"logprob": -0.3503418,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.0017204285,
|
||||
"logprob": -0.0020923615,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.1738281,
|
||||
"logprob": -1.1357422,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 1483,
|
||||
"logprob": -0.61083984,
|
||||
"logprob": -0.76416016,
|
||||
"special": false,
|
||||
"text": " used"
|
||||
},
|
||||
{
|
||||
"id": 304,
|
||||
"logprob": -0.035003662,
|
||||
"logprob": -0.04458618,
|
||||
"special": false,
|
||||
"text": " in"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.118652344,
|
||||
"logprob": -0.09295654,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 8045,
|
||||
"logprob": -0.42016602,
|
||||
"logprob": -0.54003906,
|
||||
"special": false,
|
||||
"text": " variety"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -1.6212463e-05,
|
||||
"logprob": -1.6450882e-05,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 8357,
|
||||
"logprob": -0.1315918,
|
||||
"logprob": -0.095947266,
|
||||
"special": false,
|
||||
"text": " applications"
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"logprob": -0.12915039,
|
||||
"logprob": -0.10650635,
|
||||
"special": false,
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 2670,
|
||||
"logprob": -0.12463379,
|
||||
"logprob": -0.079589844,
|
||||
"special": false,
|
||||
"text": " including"
|
||||
},
|
||||
{
|
||||
"id": 2168,
|
||||
"logprob": -0.37402344,
|
||||
"logprob": -0.40551758,
|
||||
"special": false,
|
||||
"text": " image"
|
||||
},
|
||||
{
|
||||
"id": 323,
|
||||
"logprob": -0.1451416,
|
||||
"logprob": -0.13012695,
|
||||
"special": false,
|
||||
"text": " and"
|
||||
},
|
||||
{
|
||||
"id": 8806,
|
||||
"logprob": -0.028869629,
|
||||
"logprob": -0.02720642,
|
||||
"special": false,
|
||||
"text": " speech"
|
||||
},
|
||||
{
|
||||
"id": 17843,
|
||||
"logprob": -0.00024068356,
|
||||
"logprob": -0.00020062923,
|
||||
"special": false,
|
||||
"text": " recognition"
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"logprob": -0.00031018257,
|
||||
"logprob": -0.00056505203,
|
||||
"special": false,
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 5810,
|
||||
"logprob": -0.019821167,
|
||||
"logprob": -0.022247314,
|
||||
"special": false,
|
||||
"text": " natural"
|
||||
},
|
||||
{
|
||||
"id": 4128,
|
||||
"logprob": -0.00012528896,
|
||||
"logprob": -0.00017559528,
|
||||
"special": false,
|
||||
"text": " language"
|
||||
},
|
||||
{
|
||||
"id": 8692,
|
||||
"logprob": -0.00089263916,
|
||||
"logprob": -0.0007171631,
|
||||
"special": false,
|
||||
"text": " processing"
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"logprob": -0.00073862076,
|
||||
"logprob": -0.0007882118,
|
||||
"special": false,
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 323,
|
||||
"logprob": -0.040161133,
|
||||
"logprob": -0.027862549,
|
||||
"special": false,
|
||||
"text": " and"
|
||||
},
|
||||
{
|
||||
"id": 38193,
|
||||
"logprob": -0.4519043,
|
||||
"logprob": -0.39111328,
|
||||
"special": false,
|
||||
"text": " autonomous"
|
||||
},
|
||||
{
|
||||
"id": 11474,
|
||||
"logprob": -0.39941406,
|
||||
"logprob": -0.38427734,
|
||||
"special": false,
|
||||
"text": " vehicles"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.21166992,
|
||||
"logprob": -0.23461914,
|
||||
"special": false,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 1084,
|
||||
"logprob": -0.9082031,
|
||||
"logprob": -1.0439453,
|
||||
"special": false,
|
||||
"text": " It"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.44213867,
|
||||
"logprob": -0.44580078,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -1.2177734,
|
||||
"logprob": -0.86865234,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 18512,
|
||||
"logprob": -0.5205078,
|
||||
"logprob": -0.5263672,
|
||||
"special": false,
|
||||
"text": " rapidly"
|
||||
},
|
||||
{
|
||||
"id": 7826,
|
||||
"logprob": -0.15332031,
|
||||
"logprob": -0.15881348,
|
||||
"special": false,
|
||||
"text": " growing"
|
||||
},
|
||||
{
|
||||
"id": 2070,
|
||||
"logprob": -0.0039978027,
|
||||
"logprob": -0.0044059753,
|
||||
"special": false,
|
||||
"text": " field"
|
||||
},
|
||||
{
|
||||
"id": 448,
|
||||
"logprob": -0.9091797,
|
||||
"logprob": -0.921875,
|
||||
"special": false,
|
||||
"text": " with"
|
||||
},
|
||||
{
|
||||
"id": 1657,
|
||||
"logprob": -0.17114258,
|
||||
"logprob": -0.18737793,
|
||||
"special": false,
|
||||
"text": " many"
|
||||
},
|
||||
{
|
||||
"id": 4650,
|
||||
"logprob": -0.70703125,
|
||||
"logprob": -0.8857422,
|
||||
"special": false,
|
||||
"text": " potential"
|
||||
},
|
||||
{
|
||||
"id": 8357,
|
||||
"logprob": -0.025131226,
|
||||
"logprob": -0.036193848,
|
||||
"special": false,
|
||||
"text": " applications"
|
||||
},
|
||||
{
|
||||
"id": 304,
|
||||
"logprob": -0.6699219,
|
||||
"logprob": -0.65283203,
|
||||
"special": false,
|
||||
"text": " in"
|
||||
},
|
||||
{
|
||||
"id": 279,
|
||||
"logprob": -0.35205078,
|
||||
"logprob": -0.4411621,
|
||||
"special": false,
|
||||
"text": " the"
|
||||
},
|
||||
{
|
||||
"id": 3853,
|
||||
"logprob": -0.049194336,
|
||||
"logprob": -0.059326172,
|
||||
"special": false,
|
||||
"text": " future"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"logprob": -0.21972656,
|
||||
"logprob": -0.23278809,
|
||||
"special": false,
|
||||
"text": "."
|
||||
},
|
||||
{
|
||||
"id": 151643,
|
||||
"logprob": -2.0019531,
|
||||
"special": true,
|
||||
"text": "<|endoftext|>"
|
||||
"id": 3555,
|
||||
"logprob": -1.90625,
|
||||
"special": false,
|
||||
"text": " What"
|
||||
},
|
||||
{
|
||||
"id": 525,
|
||||
"logprob": -0.48291016,
|
||||
"special": false,
|
||||
"text": " are"
|
||||
},
|
||||
{
|
||||
"id": 1045,
|
||||
"logprob": -0.1484375,
|
||||
"special": false,
|
||||
"text": " some"
|
||||
},
|
||||
{
|
||||
"id": 10295,
|
||||
"logprob": -1.4072266,
|
||||
"special": false,
|
||||
"text": " examples"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.00091028214,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5538,
|
||||
"logprob": -0.47192383,
|
||||
"special": false,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.0005393028,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 8357,
|
||||
"logprob": -0.33569336,
|
||||
"special": false,
|
||||
"text": " applications"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -0.19299316,
|
||||
"special": false,
|
||||
"text": "?"
|
||||
},
|
||||
{
|
||||
"id": 2619,
|
||||
"logprob": -1.3320312,
|
||||
"special": false,
|
||||
"text": " There"
|
||||
},
|
||||
{
|
||||
"id": 525,
|
||||
"logprob": -0.0027637482,
|
||||
"special": false,
|
||||
"text": " are"
|
||||
},
|
||||
{
|
||||
"id": 1657,
|
||||
"logprob": -0.0574646,
|
||||
"special": false,
|
||||
"text": " many"
|
||||
},
|
||||
{
|
||||
"id": 10295,
|
||||
"logprob": -0.093811035,
|
||||
"special": false,
|
||||
"text": " examples"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.000106692314,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5538,
|
||||
"logprob": -0.013023376,
|
||||
"special": false,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -7.081032e-05,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 8357,
|
||||
"logprob": -0.010604858,
|
||||
"special": false,
|
||||
"text": " applications"
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"logprob": -0.28125,
|
||||
"special": false,
|
||||
"text": ","
|
||||
},
|
||||
{
|
||||
"id": 2670,
|
||||
"logprob": -0.5209961,
|
||||
"special": false,
|
||||
"text": " including"
|
||||
},
|
||||
{
|
||||
"id": 1447,
|
||||
"logprob": -0.8300781,
|
||||
"special": false,
|
||||
"text": ":\n\n"
|
||||
},
|
||||
{
|
||||
"id": 220,
|
||||
"logprob": -0.8071289,
|
||||
"special": false,
|
||||
"text": " "
|
||||
},
|
||||
{
|
||||
"id": 353,
|
||||
"logprob": -0.07385254,
|
||||
"special": false,
|
||||
"text": " *"
|
||||
},
|
||||
{
|
||||
"id": 4654,
|
||||
"logprob": -0.12548828,
|
||||
"special": false,
|
||||
"text": " Image"
|
||||
},
|
||||
{
|
||||
"id": 17843,
|
||||
"logprob": -0.4790039,
|
||||
"special": false,
|
||||
"text": " recognition"
|
||||
},
|
||||
{
|
||||
"id": 25,
|
||||
"logprob": -0.25634766,
|
||||
"special": false,
|
||||
"text": ":"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future."
|
||||
"generated_text": " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future. What are some examples of deep learning applications? There are many examples of deep learning applications, including:\n\n * Image recognition:"
|
||||
}
|
||||
|
@ -8,25 +8,25 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 5267,
|
||||
"logprob": -1.1464844,
|
||||
"logprob": -1.0410156,
|
||||
"special": false,
|
||||
"text": "?\n"
|
||||
},
|
||||
{
|
||||
"id": 33464,
|
||||
"logprob": -0.83203125,
|
||||
"logprob": -0.6147461,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 20909,
|
||||
"logprob": -0.5625,
|
||||
"logprob": -0.5229492,
|
||||
"special": false,
|
||||
"text": " Learning"
|
||||
},
|
||||
{
|
||||
"id": 320,
|
||||
"logprob": -2.1464844,
|
||||
"logprob": -1.7451172,
|
||||
"special": false,
|
||||
"text": " ("
|
||||
},
|
||||
@ -38,36 +38,36 @@
|
||||
},
|
||||
{
|
||||
"id": 701,
|
||||
"logprob": -2.2089844,
|
||||
"logprob": -2.2382812,
|
||||
"special": false,
|
||||
"text": "),"
|
||||
},
|
||||
{
|
||||
"id": 476,
|
||||
"logprob": -0.27368164,
|
||||
"logprob": -0.22546387,
|
||||
"special": false,
|
||||
"text": " or"
|
||||
},
|
||||
{
|
||||
"id": 20443,
|
||||
"logprob": -0.09442139,
|
||||
"logprob": -0.16967773,
|
||||
"special": false,
|
||||
"text": " artificial"
|
||||
},
|
||||
{
|
||||
"id": 29728,
|
||||
"logprob": 0.0,
|
||||
"id": 11229,
|
||||
"logprob": -2.265625,
|
||||
"special": false,
|
||||
"text": " neural"
|
||||
"text": " intelligence"
|
||||
},
|
||||
{
|
||||
"id": 14155,
|
||||
"id": 11,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " networks"
|
||||
"text": ","
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "What is deep learning?\nDeep Learning (DL), or artificial neural networks"
|
||||
"generated_text": "What is deep learning?\nDeep Learning (DL), or artificial intelligence,"
|
||||
}
|
||||
|
@ -9,61 +9,61 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18183,
|
||||
"logprob": -1.5195312,
|
||||
"logprob": -1.5371094,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.06817627,
|
||||
"logprob": -0.08483887,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.13122559,
|
||||
"logprob": -0.13378906,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.13415527,
|
||||
"logprob": -0.14562988,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 25993,
|
||||
"logprob": -0.87353516,
|
||||
"logprob": -0.78222656,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.0011396408,
|
||||
"logprob": -0.0013389587,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5662,
|
||||
"logprob": -0.16442871,
|
||||
"logprob": -0.15234375,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.0026416779,
|
||||
"logprob": -0.0018444061,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 429,
|
||||
"logprob": -0.48754883,
|
||||
"logprob": -0.45507812,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 5711,
|
||||
"logprob": -1.2294922,
|
||||
"logprob": -1.1435547,
|
||||
"special": false,
|
||||
"text": " uses"
|
||||
}
|
||||
@ -82,61 +82,61 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18183,
|
||||
"logprob": -1.5195312,
|
||||
"logprob": -1.5371094,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.06817627,
|
||||
"logprob": -0.08483887,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.13122559,
|
||||
"logprob": -0.13378906,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.13415527,
|
||||
"logprob": -0.14562988,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 25993,
|
||||
"logprob": -0.87353516,
|
||||
"logprob": -0.78222656,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.0011396408,
|
||||
"logprob": -0.0013389587,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5662,
|
||||
"logprob": -0.16442871,
|
||||
"logprob": -0.15234375,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.0026416779,
|
||||
"logprob": -0.0018444061,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 429,
|
||||
"logprob": -0.48754883,
|
||||
"logprob": -0.45507812,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 5711,
|
||||
"logprob": -1.2294922,
|
||||
"logprob": -1.1435547,
|
||||
"special": false,
|
||||
"text": " uses"
|
||||
}
|
||||
@ -155,61 +155,61 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18183,
|
||||
"logprob": -1.5195312,
|
||||
"logprob": -1.5371094,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.06817627,
|
||||
"logprob": -0.08483887,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.13122559,
|
||||
"logprob": -0.13378906,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.13415527,
|
||||
"logprob": -0.14562988,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 25993,
|
||||
"logprob": -0.87353516,
|
||||
"logprob": -0.78222656,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.0011396408,
|
||||
"logprob": -0.0013389587,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5662,
|
||||
"logprob": -0.16442871,
|
||||
"logprob": -0.15234375,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.0026416779,
|
||||
"logprob": -0.0018444061,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 429,
|
||||
"logprob": -0.48754883,
|
||||
"logprob": -0.45507812,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 5711,
|
||||
"logprob": -1.2294922,
|
||||
"logprob": -1.1435547,
|
||||
"special": false,
|
||||
"text": " uses"
|
||||
}
|
||||
@ -228,61 +228,61 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18183,
|
||||
"logprob": -1.5195312,
|
||||
"logprob": -1.5371094,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.06817627,
|
||||
"logprob": -0.08483887,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.13122559,
|
||||
"logprob": -0.13378906,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.13415527,
|
||||
"logprob": -0.14562988,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 25993,
|
||||
"logprob": -0.87353516,
|
||||
"logprob": -0.78222656,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.0011396408,
|
||||
"logprob": -0.0013389587,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5662,
|
||||
"logprob": -0.16442871,
|
||||
"logprob": -0.15234375,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6832,
|
||||
"logprob": -0.0026416779,
|
||||
"logprob": -0.0018444061,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 429,
|
||||
"logprob": -0.48754883,
|
||||
"logprob": -0.45507812,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 5711,
|
||||
"logprob": -1.2294922,
|
||||
"logprob": -1.1435547,
|
||||
"special": false,
|
||||
"text": " uses"
|
||||
}
|
||||
|
@ -14,25 +14,25 @@
|
||||
},
|
||||
{
|
||||
"id": 34564,
|
||||
"logprob": -0.12512207,
|
||||
"logprob": -0.12695312,
|
||||
"special": false,
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": 0.0,
|
||||
"logprob": -0.10632324,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 320,
|
||||
"logprob": -0.23840332,
|
||||
"logprob": -0.22546387,
|
||||
"special": false,
|
||||
"text": " ("
|
||||
},
|
||||
{
|
||||
"id": 16931,
|
||||
"logprob": -2.0175781,
|
||||
"logprob": -1.8837891,
|
||||
"special": false,
|
||||
"text": "DL"
|
||||
},
|
||||
@ -44,7 +44,7 @@
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.8613281,
|
||||
"logprob": -0.8798828,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
@ -56,7 +56,7 @@
|
||||
},
|
||||
{
|
||||
"id": 1207,
|
||||
"logprob": -1.2451172,
|
||||
"logprob": -1.2675781,
|
||||
"special": false,
|
||||
"text": " sub"
|
||||
},
|
||||
|
@ -28,15 +28,15 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight(
|
||||
response = await compressed_tensors_w8a8_int_dynamic_weight.generate(
|
||||
"What is deep learning?",
|
||||
# prefer a longer response than the default, allow the llm to end generation
|
||||
max_new_tokens=1000,
|
||||
max_new_tokens=100,
|
||||
decoder_input_details=True,
|
||||
)
|
||||
|
||||
assert (
|
||||
response.generated_text
|
||||
== " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future."
|
||||
== " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future. What are some examples of deep learning applications? There are many examples of deep learning applications, including:\n\n * Image recognition:"
|
||||
)
|
||||
assert response.details.generated_tokens == 76
|
||||
assert response.details.generated_tokens == 100
|
||||
assert response == response_snapshot
|
||||
|
||||
|
||||
@ -65,7 +65,7 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight_all_params(
|
||||
assert response.details.generated_tokens == 10
|
||||
assert (
|
||||
response.generated_text
|
||||
== "What is deep learning?\nDeep Learning (DL), or artificial neural networks"
|
||||
== "What is deep learning?\nDeep Learning (DL), or artificial intelligence,"
|
||||
)
|
||||
assert response == response_snapshot
|
||||
|
||||
|
@ -223,82 +223,58 @@
|
||||
},
|
||||
{
|
||||
"repo_id": "kernels-community/quantization",
|
||||
"sha": "6470f9b005797e00279eb9103463dfe0f8b7da00",
|
||||
"sha": "b65f8ab1edf7f2f3174f9583de38e3c594c80d20",
|
||||
"variants": {
|
||||
"torch25-cxx11-cu118-x86_64-linux": {
|
||||
"hash": "sha256-f52c9b1a7cd98fb389c6d2a0b22a293cb36eb96af3a624f5aec761735861c96d",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch25-cxx11-cu121-x86_64-linux": {
|
||||
"hash": "sha256-e5f0da343363a562ce52f147a9534cd54a3efa90e70671f606cc2516f02a3876",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch25-cxx11-cu124-x86_64-linux": {
|
||||
"hash": "sha256-caad9300c155faf79c26426f10951ba75f931a05e741a5b39a24b064daabc040",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch25-cxx98-cu118-x86_64-linux": {
|
||||
"hash": "sha256-4fc87893de14a29ba4b55f5026ea05ec5901c0b52abd5ebae681ea0b791e858c",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch25-cxx98-cu121-x86_64-linux": {
|
||||
"hash": "sha256-72c975ea63fc524a38fcee5b2dbdb566eff0a0ea546ee5756441d04908e4e896",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch25-cxx98-cu124-x86_64-linux": {
|
||||
"hash": "sha256-28c5510e3b07eae2b3846b880f6111da65df024e1f24f81077d187a97c015364",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch26-cxx11-cu118-x86_64-linux": {
|
||||
"hash": "sha256-8444cf77686578a6b0f7e2fd29bf2783ba120ebf7df41573f61d2521fd0acc10",
|
||||
"hash": "sha256-86c9ea9d12090a1bd457350acf85cbabac7af7253b6ce2603f8a7c6a3f03058b",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch26-cxx11-cu124-x86_64-linux": {
|
||||
"hash": "sha256-6ea8e00625b5fe799fbe407e7de0fc08228cac26f9bbed2d70a6500026fe3bab",
|
||||
"hash": "sha256-03a2d88dd1d725e6c7cb7781f53d92006570fd12d226e1d9f3b2c1aed980eed2",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch26-cxx11-cu126-aarch64-linux": {
|
||||
"hash": "sha256-0b8b8afbdaf9aa533895cb9e884e3ad3e9a34d483f05a1bbde1b8902f9dbeb0f",
|
||||
"hash": "sha256-c2907b0538618ec896d495b9a6bb62c9076d5c0aa7233d5524aa408379042b29",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch26-cxx11-cu126-x86_64-linux": {
|
||||
"hash": "sha256-e115e855d7ca4b97787f04c88e128432256c6b43d4823fb8889ab9985dc4cf36",
|
||||
"hash": "sha256-e5a1790e97648bf90dd269606298bbf1ee17e8504a6c4d2a6416190266c9f57a",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch26-cxx98-cu118-x86_64-linux": {
|
||||
"hash": "sha256-509f08c48a05584cc85c058607277fcbe3193e6cc61846dd2416d39e27c1d68e",
|
||||
"hash": "sha256-6934e8f14d95fad43603b08be07e37102bf70890d634c062fd6064cc7b83d718",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch26-cxx98-cu124-x86_64-linux": {
|
||||
"hash": "sha256-a10236bffd435296c736ae2762ab0836da2421297e46b377368a17b39d70c27b",
|
||||
"hash": "sha256-9adda61755e617db96fa30542dd305dc612ccac0ddcf9fb4906f74ed51770354",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch26-cxx98-cu126-aarch64-linux": {
|
||||
"hash": "sha256-ca2cb56f3eea4c399a61e21ba9b577d718b250aa60a13f42f01019ddd5cd8b0c",
|
||||
"hash": "sha256-9c389fd1c556edd2db7080bff0f78b03a63ac8382c73c79671c557b28a1c7e72",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch26-cxx98-cu126-x86_64-linux": {
|
||||
"hash": "sha256-8fcd62d8243a30b63a03751cc0c15d24f6e00e43eae79f7281627f24e078bf9a",
|
||||
"hash": "sha256-4a86f8a5609fd35b81bebee4eb25c2d0733537e2a09011b8e704271470f3b0a3",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch27-cxx11-cu118-x86_64-linux": {
|
||||
"hash": "sha256-60f5807ee3da937c57c1b6080c30632305aa4875ed5a52bf4e81968770b61b13",
|
||||
"hash": "sha256-39f6740ee44bf8fae873bbe8aed6f3596e59ca069e73c0b77d5e344ff2d5f7b7",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch27-cxx11-cu126-aarch64-linux": {
|
||||
"hash": "sha256-64298b1713dc1d950915dc6569a06e2f541de3ed80aa5b32084246c1fdc7a958",
|
||||
"hash": "sha256-34dadff3669c3b256bf2637b76bac68991e3b6013f8a4ecaa51454cf2c80fa86",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch27-cxx11-cu126-x86_64-linux": {
|
||||
"hash": "sha256-d9e219890dc28e8582ef21d6f81f2ebc361de218a86b742be63bc4714f102e5e",
|
||||
"hash": "sha256-f568e601daede1173e9952d32a835658f41791affe64069d7fd2d722002922e2",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch27-cxx11-cu128-aarch64-linux": {
|
||||
"hash": "sha256-d72549f51aefcf020bc74262bbbccb78094638c5ab9adc8667873d247c1cce86",
|
||||
"hash": "sha256-f58b0b96c23b5340da08cd0db0221875c15b757640a783c0ad836bfc176799fa",
|
||||
"hash_type": "git_lfs_concat"
|
||||
},
|
||||
"torch27-cxx11-cu128-x86_64-linux": {
|
||||
"hash": "sha256-d31ac5f87d7c7f62c63c72946479193aed467c9417c0acead5137e0e1fa968f8",
|
||||
"hash": "sha256-b2c9f8e42541fd24d553bb52172ac3bea8407c0bb32a3448632e4763a68f21f3",
|
||||
"hash_type": "git_lfs_concat"
|
||||
}
|
||||
}
|
||||
|
@ -59,7 +59,7 @@ build-backend = "setuptools.build_meta"
|
||||
"kernels-community/paged-attention" = ">=0.0.2"
|
||||
"kernels-community/moe" = ">=0.1.1"
|
||||
"kernels-community/punica-sgmv" = ">=0.0.1"
|
||||
"kernels-community/quantization" = ">=0.0.3"
|
||||
"kernels-community/quantization" = ">=0.1.2"
|
||||
"kernels-community/quantization-eetq" = ">=0.0.1"
|
||||
"kernels-community/rotary" = ">=0.0.1"
|
||||
|
||||
|
@ -144,17 +144,6 @@ class W8A8IntLoader(WeightsLoader):
|
||||
OtherT = TypeVar("OtherT")
|
||||
|
||||
|
||||
def _get_tensor_or_else(
|
||||
weights: Weights, prefix: str, other: OtherT
|
||||
) -> Union[torch.Tensor, OtherT]:
|
||||
# Even if a checkpoint uses e.g. zero-points, they can be elided:
|
||||
# https://github.com/neuralmagic/compressed-tensors/blob/db6ccb25b265e8370813ecab5e95714a6728b5a6/src/compressed_tensors/compressors/quantized_compressors/base.py#L105
|
||||
if weights.has_tensor(prefix):
|
||||
return weights.get_tensor(prefix, to_dtype=False)
|
||||
else:
|
||||
return other
|
||||
|
||||
|
||||
@dataclass
|
||||
class Int8Weight(Weight):
|
||||
input_symmetric: bool
|
||||
|
@ -76,15 +76,21 @@ class GPTQMarlinFP8Linear(nn.Module):
|
||||
assert quantization is not None
|
||||
|
||||
A_flat = A.view(-1, A.shape[-1])
|
||||
C = quantization.fp8_marlin_gemm(
|
||||
A_flat,
|
||||
self.qweight,
|
||||
self.scales,
|
||||
self.workspace,
|
||||
8,
|
||||
A_flat.shape[0],
|
||||
self.scales.shape[1],
|
||||
A_flat.shape[1],
|
||||
C = quantization.gptq_marlin_gemm(
|
||||
a=A_flat,
|
||||
c=None,
|
||||
b_q_weight=self.qweight,
|
||||
b_scales=self.scales,
|
||||
global_scale=None,
|
||||
b_zeros=None,
|
||||
g_idx=None,
|
||||
perm=None,
|
||||
workspace=self.workspace,
|
||||
b_q_type=quantization.scalar_type.scalar_types.float8_e4m3fn,
|
||||
size_m=A_flat.shape[0],
|
||||
size_n=self.scales.shape[1],
|
||||
size_k=A_flat.shape[1],
|
||||
use_fp32_reduce=True,
|
||||
)
|
||||
C = C.reshape(A.shape[:-1] + (self.scales.shape[1],))
|
||||
|
||||
@ -143,5 +149,6 @@ def repack_fp8_for_marlin(weight: torch.Tensor, scales: torch.Tensor):
|
||||
)
|
||||
|
||||
scales = permute_scales(scales)
|
||||
scales = quantization.marlin_utils_fp8.fp8_fused_exponent_bias_into_scales(scales)
|
||||
|
||||
return repacked, scales
|
||||
|
@ -256,7 +256,7 @@ class GPTQMarlinWeight(Weight):
|
||||
"""
|
||||
|
||||
qweight: torch.Tensor
|
||||
qzeros: torch.Tensor
|
||||
qzeros: Optional[torch.Tensor]
|
||||
scales: torch.Tensor
|
||||
g_idx: torch.Tensor
|
||||
perm: torch.Tensor
|
||||
@ -268,6 +268,7 @@ class GPTQMarlinWeight(Weight):
|
||||
assert self.scales.dtype in (torch.float16, torch.bfloat16)
|
||||
assert self.g_idx.dtype == torch.int32
|
||||
assert self.perm.dtype == torch.int32
|
||||
assert self.qzeros is None or self.qzeros.numel() > 0
|
||||
|
||||
def get_linear(self, bias: torch.Tensor):
|
||||
return GPTQMarlinLinear(
|
||||
@ -350,9 +351,6 @@ def repack_gptq_for_marlin(
|
||||
qweight, perm, in_features, out_features, bits
|
||||
)
|
||||
|
||||
if qzeros is None:
|
||||
qzeros = torch.empty(0, dtype=torch.int, device=qweight.device)
|
||||
|
||||
scales = permute_scales(scales)
|
||||
|
||||
is_full_k = not (desc_act and groupsize != -1 and sharded_infeatures)
|
||||
@ -392,7 +390,7 @@ class GPTQMarlinLinear(nn.Module):
|
||||
if weight.bits not in (4, 8):
|
||||
raise ValueError("GPTQMarlinLinear only supports 4 and 8-bit quantization")
|
||||
|
||||
if weight.qzeros.numel() > 0:
|
||||
if weight.qzeros is not None:
|
||||
if weight.bits == 4:
|
||||
self.quant_type = quantization.scalar_types.uint4
|
||||
else:
|
||||
@ -424,20 +422,21 @@ class GPTQMarlinLinear(nn.Module):
|
||||
|
||||
A_flat = A.view(-1, A.shape[-1])
|
||||
C = quantization.gptq_marlin_gemm(
|
||||
A_flat,
|
||||
self.qweight,
|
||||
self.scales,
|
||||
self.qzeros,
|
||||
self.g_idx,
|
||||
self.perm,
|
||||
self.workspace,
|
||||
self.quant_type,
|
||||
A_flat.shape[0],
|
||||
self.scales.shape[1],
|
||||
A_flat.shape[1],
|
||||
self.is_full_k,
|
||||
self.qzeros.numel() > 0,
|
||||
True,
|
||||
a=A_flat,
|
||||
c=None,
|
||||
b_q_weight=self.qweight,
|
||||
b_scales=self.scales,
|
||||
global_scale=None,
|
||||
b_zeros=self.qzeros,
|
||||
g_idx=self.g_idx,
|
||||
perm=self.perm,
|
||||
workspace=self.workspace,
|
||||
b_q_type=self.quant_type,
|
||||
size_m=A_flat.shape[0],
|
||||
size_n=self.scales.shape[1],
|
||||
size_k=A_flat.shape[1],
|
||||
is_k_full=self.is_full_k,
|
||||
use_fp32_reduce=True,
|
||||
)
|
||||
C = C.reshape(A.shape[:-1] + (self.scales.shape[1],))
|
||||
|
||||
|
@ -202,9 +202,13 @@ def _pack_weight(
|
||||
device=weight.qweight.device,
|
||||
)
|
||||
qzeros = torch.empty(
|
||||
(n_experts,) + weight.qzeros.shape,
|
||||
dtype=weight.qzeros.dtype,
|
||||
device=weight.qzeros.device,
|
||||
(n_experts,) + ((0,) if weight.qzeros is None else weight.qzeros.shape),
|
||||
dtype=(
|
||||
weight.qweight.dtype if weight.qzeros is None else weight.qzeros.dtype
|
||||
),
|
||||
device=(
|
||||
weight.qweight.device if weight.qzeros is None else weight.qzeros.device
|
||||
),
|
||||
)
|
||||
scales = torch.empty(
|
||||
(n_experts,) + weight.scales.shape,
|
||||
@ -232,7 +236,13 @@ def _pack_weight(
|
||||
)
|
||||
|
||||
moe_weight.qweight[expert] = weight.qweight
|
||||
moe_weight.qzeros[expert] = weight.qzeros
|
||||
moe_weight.qzeros[expert] = (
|
||||
torch.zeros(
|
||||
(0,), device=moe_weight.qzeros.device, dtype=moe_weight.qzeros.dtype
|
||||
)
|
||||
if weight.qzeros is None
|
||||
else weight.qzeros
|
||||
)
|
||||
moe_weight.scales[expert] = weight.scales
|
||||
moe_weight.g_idx[expert] = weight.g_idx
|
||||
moe_weight.perm[expert] = weight.perm
|
||||
|
Loading…
Reference in New Issue
Block a user