diff --git a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json index 0d8c05ed..dcd37cb9 100644 --- a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json +++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json @@ -10,80 +10,95 @@ "text": "" }, { - "id": 4321, - "logprob": -8.515625, - "text": "Test" + "id": 1724, + "logprob": -7.703125, + "text": "What" }, { - "id": 2009, - "logprob": -15.4140625, - "text": "request" + "id": 338, + "logprob": -1.4765625, + "text": "is" + }, + { + "id": 21784, + "logprob": -9.390625, + "text": "Deep" + }, + { + "id": 29257, + "logprob": -1.8583984, + "text": "Learning" + }, + { + "id": 29973, + "logprob": -0.7548828, + "text": "?" } ], "seed": null, "tokens": [ - { - "id": 29896, - "logprob": -2.0292969, - "special": false, - "text": "1" - }, { "id": 13, - "logprob": -2.2597656, + "logprob": -1.9306641, "special": false, "text": "\n" }, { - "id": 30166, - "logprob": -3.8671875, + "id": 5618, + "logprob": -2.4550781, "special": false, - "text": "​" + "text": "What" }, { - "id": 30166, - "logprob": -1.0488281, + "id": 338, + "logprob": -0.5732422, "special": false, - "text": "​" + "text": " is" }, { - "id": 30166, - "logprob": -0.24523926, + "id": 278, + "logprob": -1.5761719, "special": false, - "text": "​" + "text": " the" }, { - "id": 30166, - "logprob": -0.07897949, + "id": 4328, + "logprob": -1.5888672, "special": false, - "text": "​" + "text": " difference" }, { - "id": 30166, - "logprob": -0.023513794, + "id": 1546, + "logprob": -0.026504517, "special": false, - "text": "​" + "text": " between" }, { - "id": 30166, - "logprob": -0.011444092, + "id": 21784, + "logprob": -1.4287109, "special": false, - "text": "​" + "text": " Deep" }, { - "id": 30166, - "logprob": -0.008430481, + "id": 29257, + "logprob": -0.15856934, "special": false, - "text": "​" + "text": " Learning" }, { - "id": 30166, - "logprob": -0.007648468, + "id": 322, + "logprob": -0.17456055, "special": false, - "text": "​" + "text": " and" + }, + { + "id": 6189, + "logprob": -0.62646484, + "special": false, + "text": " Machine" } ], "top_tokens": null }, - "generated_text": "1\n​​​​​​​​" + "generated_text": "\nWhat is the difference between Deep Learning and Machine" } diff --git a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json index 8e3f5571..d16d34f9 100644 --- a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json +++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json @@ -10,80 +10,90 @@ "text": "" }, { - "id": 4321, - "logprob": -8.515625, - "text": "Test" + "id": 338, + "logprob": -9.0859375, + "text": "is" }, { - "id": 2009, - "logprob": -15.4140625, - "text": "request" + "id": 21784, + "logprob": -10.90625, + "text": "Deep" + }, + { + "id": 29257, + "logprob": -2.65625, + "text": "Learning" + }, + { + "id": 29973, + "logprob": -4.8085938, + "text": "?" } ], "seed": 0, "tokens": [ - { - "id": 29896, - "logprob": 0.0, - "special": false, - "text": "1" - }, { "id": 13, - "logprob": -0.6254883, + "logprob": -0.19958496, "special": false, "text": "\n" }, { - "id": 30166, + "id": 4013, + "logprob": -2.203125, + "special": false, + "text": "This" + }, + { + "id": 1139, + "logprob": -0.23693848, + "special": false, + "text": " question" + }, + { + "id": 756, "logprob": 0.0, "special": false, - "text": "​" + "text": " has" }, { - "id": 29918, - "logprob": -0.20141602, + "id": 1063, + "logprob": -0.076538086, "special": false, - "text": "_" + "text": " been" }, { - "id": 29906, - "logprob": -0.6254883, - "special": false, - "text": "2" - }, - { - "id": 29871, + "id": 4433, "logprob": 0.0, "special": false, - "text": " " + "text": " asked" }, { - "id": 30166, - "logprob": 0.0, + "id": 1784, + "logprob": -1.1367188, "special": false, - "text": "​" + "text": " many" }, { - "id": 30166, + "id": 3064, "logprob": 0.0, "special": false, - "text": "​" + "text": " times" }, { - "id": 30166, - "logprob": 0.0, + "id": 322, + "logprob": -1.7460938, "special": false, - "text": "​" + "text": " and" }, { - "id": 30166, + "id": 306, "logprob": 0.0, "special": false, - "text": "​" + "text": " I" } ], "top_tokens": null }, - "generated_text": "Test request1\n​_2 ​​​​" + "generated_text": "What is Deep Learning?\nThis question has been asked many times and I" } diff --git a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json index 42b085f8..e6fb3dc0 100644 --- a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json +++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json @@ -11,82 +11,97 @@ "text": "" }, { - "id": 4321, - "logprob": -8.515625, - "text": "Test" + "id": 1724, + "logprob": -7.703125, + "text": "What" }, { - "id": 2009, - "logprob": -15.4140625, - "text": "request" + "id": 338, + "logprob": -1.4765625, + "text": "is" + }, + { + "id": 21784, + "logprob": -9.390625, + "text": "Deep" + }, + { + "id": 29257, + "logprob": -1.8652344, + "text": "Learning" + }, + { + "id": 29973, + "logprob": -0.7548828, + "text": "?" } ], "seed": null, "tokens": [ - { - "id": 29896, - "logprob": -2.0292969, - "special": false, - "text": "1" - }, { "id": 13, - "logprob": -2.2617188, + "logprob": -1.9306641, "special": false, "text": "\n" }, { - "id": 30166, - "logprob": -3.8671875, + "id": 5618, + "logprob": -2.4550781, "special": false, - "text": "​" + "text": "What" }, { - "id": 30166, - "logprob": -1.0498047, + "id": 338, + "logprob": -0.5732422, "special": false, - "text": "​" + "text": " is" }, { - "id": 30166, - "logprob": -0.24523926, + "id": 278, + "logprob": -1.5761719, "special": false, - "text": "​" + "text": " the" }, { - "id": 30166, - "logprob": -0.07897949, + "id": 4328, + "logprob": -1.5888672, "special": false, - "text": "​" + "text": " difference" }, { - "id": 30166, - "logprob": -0.023529053, + "id": 1546, + "logprob": -0.026504517, "special": false, - "text": "​" + "text": " between" }, { - "id": 30166, - "logprob": -0.011444092, + "id": 21784, + "logprob": -1.4287109, "special": false, - "text": "​" + "text": " Deep" }, { - "id": 30166, - "logprob": -0.008300781, + "id": 29257, + "logprob": -0.15856934, "special": false, - "text": "​" + "text": " Learning" }, { - "id": 30166, - "logprob": -0.007648468, + "id": 322, + "logprob": -0.17456055, "special": false, - "text": "​" + "text": " and" + }, + { + "id": 6189, + "logprob": -0.62646484, + "special": false, + "text": " Machine" } ], "top_tokens": null }, - "generated_text": "1\n​​​​​​​​" + "generated_text": "\nWhat is the difference between Deep Learning and Machine" }, { "details": { @@ -100,82 +115,97 @@ "text": "" }, { - "id": 4321, - "logprob": -8.515625, - "text": "Test" + "id": 1724, + "logprob": -7.703125, + "text": "What" }, { - "id": 2009, - "logprob": -15.4140625, - "text": "request" + "id": 338, + "logprob": -1.4765625, + "text": "is" + }, + { + "id": 21784, + "logprob": -9.390625, + "text": "Deep" + }, + { + "id": 29257, + "logprob": -1.8583984, + "text": "Learning" + }, + { + "id": 29973, + "logprob": -0.7548828, + "text": "?" } ], "seed": null, "tokens": [ - { - "id": 29896, - "logprob": -2.0292969, - "special": false, - "text": "1" - }, { "id": 13, - "logprob": -2.2617188, + "logprob": -1.9306641, "special": false, "text": "\n" }, { - "id": 30166, - "logprob": -3.8671875, + "id": 5618, + "logprob": -2.4550781, "special": false, - "text": "​" + "text": "What" }, { - "id": 30166, - "logprob": -1.0498047, + "id": 338, + "logprob": -0.5732422, "special": false, - "text": "​" + "text": " is" }, { - "id": 30166, - "logprob": -0.24523926, + "id": 278, + "logprob": -1.5761719, "special": false, - "text": "​" + "text": " the" }, { - "id": 30166, - "logprob": -0.07897949, + "id": 4328, + "logprob": -1.5888672, "special": false, - "text": "​" + "text": " difference" }, { - "id": 30166, - "logprob": -0.023529053, + "id": 1546, + "logprob": -0.026504517, "special": false, - "text": "​" + "text": " between" }, { - "id": 30166, - "logprob": -0.011444092, + "id": 21784, + "logprob": -1.4287109, "special": false, - "text": "​" + "text": " Deep" }, { - "id": 30166, - "logprob": -0.008300781, + "id": 29257, + "logprob": -0.15856934, "special": false, - "text": "​" + "text": " Learning" }, { - "id": 30166, - "logprob": -0.007648468, + "id": 322, + "logprob": -0.17456055, "special": false, - "text": "​" + "text": " and" + }, + { + "id": 6189, + "logprob": -0.62646484, + "special": false, + "text": " Machine" } ], "top_tokens": null }, - "generated_text": "1\n​​​​​​​​" + "generated_text": "\nWhat is the difference between Deep Learning and Machine" }, { "details": { @@ -189,82 +219,97 @@ "text": "" }, { - "id": 4321, - "logprob": -8.515625, - "text": "Test" + "id": 1724, + "logprob": -7.703125, + "text": "What" }, { - "id": 2009, - "logprob": -15.4140625, - "text": "request" + "id": 338, + "logprob": -1.4765625, + "text": "is" + }, + { + "id": 21784, + "logprob": -9.390625, + "text": "Deep" + }, + { + "id": 29257, + "logprob": -1.8652344, + "text": "Learning" + }, + { + "id": 29973, + "logprob": -0.7548828, + "text": "?" } ], "seed": null, "tokens": [ - { - "id": 29896, - "logprob": -2.0292969, - "special": false, - "text": "1" - }, { "id": 13, - "logprob": -2.2617188, + "logprob": -1.9306641, "special": false, "text": "\n" }, { - "id": 30166, - "logprob": -3.8671875, + "id": 5618, + "logprob": -2.4550781, "special": false, - "text": "​" + "text": "What" }, { - "id": 30166, - "logprob": -1.0498047, + "id": 338, + "logprob": -0.5732422, "special": false, - "text": "​" + "text": " is" }, { - "id": 30166, - "logprob": -0.24523926, + "id": 278, + "logprob": -1.5761719, "special": false, - "text": "​" + "text": " the" }, { - "id": 30166, - "logprob": -0.07897949, + "id": 4328, + "logprob": -1.5888672, "special": false, - "text": "​" + "text": " difference" }, { - "id": 30166, - "logprob": -0.023529053, + "id": 1546, + "logprob": -0.026504517, "special": false, - "text": "​" + "text": " between" }, { - "id": 30166, - "logprob": -0.011444092, + "id": 21784, + "logprob": -1.4287109, "special": false, - "text": "​" + "text": " Deep" }, { - "id": 30166, - "logprob": -0.008300781, + "id": 29257, + "logprob": -0.15856934, "special": false, - "text": "​" + "text": " Learning" }, { - "id": 30166, - "logprob": -0.007648468, + "id": 322, + "logprob": -0.17456055, "special": false, - "text": "​" + "text": " and" + }, + { + "id": 6189, + "logprob": -0.62646484, + "special": false, + "text": " Machine" } ], "top_tokens": null }, - "generated_text": "1\n​​​​​​​​" + "generated_text": "\nWhat is the difference between Deep Learning and Machine" }, { "details": { @@ -278,81 +323,96 @@ "text": "" }, { - "id": 4321, - "logprob": -8.515625, - "text": "Test" + "id": 1724, + "logprob": -7.703125, + "text": "What" }, { - "id": 2009, - "logprob": -15.4140625, - "text": "request" + "id": 338, + "logprob": -1.4765625, + "text": "is" + }, + { + "id": 21784, + "logprob": -9.390625, + "text": "Deep" + }, + { + "id": 29257, + "logprob": -1.8652344, + "text": "Learning" + }, + { + "id": 29973, + "logprob": -0.7548828, + "text": "?" } ], "seed": null, "tokens": [ - { - "id": 29896, - "logprob": -2.0292969, - "special": false, - "text": "1" - }, { "id": 13, - "logprob": -2.2617188, + "logprob": -1.9306641, "special": false, "text": "\n" }, { - "id": 30166, - "logprob": -3.8671875, + "id": 5618, + "logprob": -2.4550781, "special": false, - "text": "​" + "text": "What" }, { - "id": 30166, - "logprob": -1.0498047, + "id": 338, + "logprob": -0.5732422, "special": false, - "text": "​" + "text": " is" }, { - "id": 30166, - "logprob": -0.24523926, + "id": 278, + "logprob": -1.5761719, "special": false, - "text": "​" + "text": " the" }, { - "id": 30166, - "logprob": -0.07897949, + "id": 4328, + "logprob": -1.5888672, "special": false, - "text": "​" + "text": " difference" }, { - "id": 30166, - "logprob": -0.023529053, + "id": 1546, + "logprob": -0.026504517, "special": false, - "text": "​" + "text": " between" }, { - "id": 30166, - "logprob": -0.011444092, + "id": 21784, + "logprob": -1.4287109, "special": false, - "text": "​" + "text": " Deep" }, { - "id": 30166, - "logprob": -0.008300781, + "id": 29257, + "logprob": -0.15856934, "special": false, - "text": "​" + "text": " Learning" }, { - "id": 30166, - "logprob": -0.007648468, + "id": 322, + "logprob": -0.17456055, "special": false, - "text": "​" + "text": " and" + }, + { + "id": 6189, + "logprob": -0.62646484, + "special": false, + "text": " Machine" } ], "top_tokens": null }, - "generated_text": "1\n​​​​​​​​" + "generated_text": "\nWhat is the difference between Deep Learning and Machine" } ] diff --git a/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_load_sharded.json b/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_load_sharded.json new file mode 100644 index 00000000..f1d9129d --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_load_sharded.json @@ -0,0 +1,418 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 1724, + "logprob": -7.6914062, + "text": "What" + }, + { + "id": 338, + "logprob": -1.4746094, + "text": "is" + }, + { + "id": 21784, + "logprob": -9.390625, + "text": "Deep" + }, + { + "id": 29257, + "logprob": -1.8623047, + "text": "Learning" + }, + { + "id": 29973, + "logprob": -0.7558594, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -1.9228516, + "special": false, + "text": "\n" + }, + { + "id": 5618, + "logprob": -2.4609375, + "special": false, + "text": "What" + }, + { + "id": 338, + "logprob": -0.57177734, + "special": false, + "text": " is" + }, + { + "id": 278, + "logprob": -1.5722656, + "special": false, + "text": " the" + }, + { + "id": 4328, + "logprob": -1.5859375, + "special": false, + "text": " difference" + }, + { + "id": 1546, + "logprob": -0.02633667, + "special": false, + "text": " between" + }, + { + "id": 21784, + "logprob": -1.4335938, + "special": false, + "text": " Deep" + }, + { + "id": 29257, + "logprob": -0.15991211, + "special": false, + "text": " Learning" + }, + { + "id": 322, + "logprob": -0.17456055, + "special": false, + "text": " and" + }, + { + "id": 6189, + "logprob": -0.62060547, + "special": false, + "text": " Machine" + } + ], + "top_tokens": null + }, + "generated_text": "\nWhat is the difference between Deep Learning and Machine" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 1724, + "logprob": -7.6914062, + "text": "What" + }, + { + "id": 338, + "logprob": -1.4746094, + "text": "is" + }, + { + "id": 21784, + "logprob": -9.390625, + "text": "Deep" + }, + { + "id": 29257, + "logprob": -1.8623047, + "text": "Learning" + }, + { + "id": 29973, + "logprob": -0.7558594, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -1.9228516, + "special": false, + "text": "\n" + }, + { + "id": 5618, + "logprob": -2.4609375, + "special": false, + "text": "What" + }, + { + "id": 338, + "logprob": -0.57177734, + "special": false, + "text": " is" + }, + { + "id": 278, + "logprob": -1.5722656, + "special": false, + "text": " the" + }, + { + "id": 4328, + "logprob": -1.5859375, + "special": false, + "text": " difference" + }, + { + "id": 1546, + "logprob": -0.02633667, + "special": false, + "text": " between" + }, + { + "id": 21784, + "logprob": -1.4335938, + "special": false, + "text": " Deep" + }, + { + "id": 29257, + "logprob": -0.15991211, + "special": false, + "text": " Learning" + }, + { + "id": 322, + "logprob": -0.17456055, + "special": false, + "text": " and" + }, + { + "id": 6189, + "logprob": -0.62060547, + "special": false, + "text": " Machine" + } + ], + "top_tokens": null + }, + "generated_text": "\nWhat is the difference between Deep Learning and Machine" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 1724, + "logprob": -7.6914062, + "text": "What" + }, + { + "id": 338, + "logprob": -1.4746094, + "text": "is" + }, + { + "id": 21784, + "logprob": -9.390625, + "text": "Deep" + }, + { + "id": 29257, + "logprob": -1.8623047, + "text": "Learning" + }, + { + "id": 29973, + "logprob": -0.7558594, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -1.9228516, + "special": false, + "text": "\n" + }, + { + "id": 5618, + "logprob": -2.4609375, + "special": false, + "text": "What" + }, + { + "id": 338, + "logprob": -0.57177734, + "special": false, + "text": " is" + }, + { + "id": 278, + "logprob": -1.5722656, + "special": false, + "text": " the" + }, + { + "id": 4328, + "logprob": -1.5859375, + "special": false, + "text": " difference" + }, + { + "id": 1546, + "logprob": -0.02633667, + "special": false, + "text": " between" + }, + { + "id": 21784, + "logprob": -1.4335938, + "special": false, + "text": " Deep" + }, + { + "id": 29257, + "logprob": -0.15991211, + "special": false, + "text": " Learning" + }, + { + "id": 322, + "logprob": -0.17456055, + "special": false, + "text": " and" + }, + { + "id": 6189, + "logprob": -0.62060547, + "special": false, + "text": " Machine" + } + ], + "top_tokens": null + }, + "generated_text": "\nWhat is the difference between Deep Learning and Machine" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 1724, + "logprob": -7.6914062, + "text": "What" + }, + { + "id": 338, + "logprob": -1.4746094, + "text": "is" + }, + { + "id": 21784, + "logprob": -9.390625, + "text": "Deep" + }, + { + "id": 29257, + "logprob": -1.8623047, + "text": "Learning" + }, + { + "id": 29973, + "logprob": -0.7558594, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -1.9228516, + "special": false, + "text": "\n" + }, + { + "id": 5618, + "logprob": -2.4609375, + "special": false, + "text": "What" + }, + { + "id": 338, + "logprob": -0.57177734, + "special": false, + "text": " is" + }, + { + "id": 278, + "logprob": -1.5722656, + "special": false, + "text": " the" + }, + { + "id": 4328, + "logprob": -1.5859375, + "special": false, + "text": " difference" + }, + { + "id": 1546, + "logprob": -0.02633667, + "special": false, + "text": " between" + }, + { + "id": 21784, + "logprob": -1.4335938, + "special": false, + "text": " Deep" + }, + { + "id": 29257, + "logprob": -0.15991211, + "special": false, + "text": " Learning" + }, + { + "id": 322, + "logprob": -0.17456055, + "special": false, + "text": " and" + }, + { + "id": 6189, + "logprob": -0.62060547, + "special": false, + "text": " Machine" + } + ], + "top_tokens": null + }, + "generated_text": "\nWhat is the difference between Deep Learning and Machine" + } +] diff --git a/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_sharded.json b/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_sharded.json new file mode 100644 index 00000000..0f91eb36 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_sharded.json @@ -0,0 +1,104 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 1724, + "logprob": -7.6914062, + "text": "What" + }, + { + "id": 338, + "logprob": -1.4746094, + "text": "is" + }, + { + "id": 21784, + "logprob": -9.390625, + "text": "Deep" + }, + { + "id": 29257, + "logprob": -1.8623047, + "text": "Learning" + }, + { + "id": 29973, + "logprob": -0.7558594, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -1.9228516, + "special": false, + "text": "\n" + }, + { + "id": 5618, + "logprob": -2.4609375, + "special": false, + "text": "What" + }, + { + "id": 338, + "logprob": -0.57177734, + "special": false, + "text": " is" + }, + { + "id": 278, + "logprob": -1.5722656, + "special": false, + "text": " the" + }, + { + "id": 4328, + "logprob": -1.5927734, + "special": false, + "text": " difference" + }, + { + "id": 1546, + "logprob": -0.026428223, + "special": false, + "text": " between" + }, + { + "id": 21784, + "logprob": -1.4267578, + "special": false, + "text": " Deep" + }, + { + "id": 29257, + "logprob": -0.16015625, + "special": false, + "text": " Learning" + }, + { + "id": 322, + "logprob": -0.17382812, + "special": false, + "text": " and" + }, + { + "id": 6189, + "logprob": -0.62060547, + "special": false, + "text": " Machine" + } + ], + "top_tokens": null + }, + "generated_text": "\nWhat is the difference between Deep Learning and Machine" +} diff --git a/integration-tests/models/test_flash_awq.py b/integration-tests/models/test_flash_awq.py index ca474d37..f0b99a3b 100644 --- a/integration-tests/models/test_flash_awq.py +++ b/integration-tests/models/test_flash_awq.py @@ -3,7 +3,7 @@ import pytest @pytest.fixture(scope="module") def flash_llama_awq_handle(launcher): - with launcher("abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq", num_shard=2, quantize="awq") as handle: + with launcher("abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq", num_shard=1, quantize="awq") as handle: yield handle @@ -12,23 +12,24 @@ async def flash_llama_awq(flash_llama_awq_handle): await flash_llama_awq_handle.health(300) return flash_llama_awq_handle.client - @pytest.mark.asyncio @pytest.mark.private async def test_flash_llama_awq(flash_llama_awq, response_snapshot): response = await flash_llama_awq.generate( - "Test request", max_new_tokens=10, decoder_input_details=True + "What is Deep Learning?", max_new_tokens=10, decoder_input_details=True ) assert response.details.generated_tokens == 10 + assert response.generated_text == "\nWhat is the difference between Deep Learning and Machine" assert response == response_snapshot + @pytest.mark.asyncio @pytest.mark.private async def test_flash_llama_awq_all_params(flash_llama_awq, response_snapshot): response = await flash_llama_awq.generate( - "Test request", + "What is Deep Learning?", max_new_tokens=10, repetition_penalty=1.2, return_full_text=True, @@ -52,10 +53,12 @@ async def test_flash_llama_awq_load( flash_llama_awq, generate_load, response_snapshot ): responses = await generate_load( - flash_llama_awq, "Test request", max_new_tokens=10, n=4 + flash_llama_awq, "What is Deep Learning?", max_new_tokens=10, n=4 ) assert len(responses) == 4 - assert all([r.generated_text == responses[0].generated_text for r in responses]) + assert all([r.generated_text == "\nWhat is the difference between Deep Learning and Machine" for r in responses]) assert responses == response_snapshot + + diff --git a/integration-tests/models/test_flash_awq_sharded.py b/integration-tests/models/test_flash_awq_sharded.py new file mode 100644 index 00000000..39ea464a --- /dev/null +++ b/integration-tests/models/test_flash_awq_sharded.py @@ -0,0 +1,36 @@ +import pytest + +@pytest.fixture(scope="module") +def flash_llama_awq_handle_sharded(launcher): + with launcher("abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq", num_shard=2, quantize="awq") as handle: + yield handle + +@pytest.fixture(scope="module") +async def flash_llama_awq_sharded(flash_llama_awq_handle_sharded): + await flash_llama_awq_handle_sharded.health(300) + return flash_llama_awq_handle_sharded.client + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_awq_sharded(flash_llama_awq_sharded, response_snapshot): + response = await flash_llama_awq_sharded.generate( + "What is Deep Learning?", max_new_tokens=10, decoder_input_details=True + ) + + assert response.details.generated_tokens == 10 + assert response.generated_text == "\nWhat is the difference between Deep Learning and Machine" + assert response == response_snapshot + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_awq_load_sharded( + flash_llama_awq_sharded, generate_load, response_snapshot +): + responses = await generate_load( + flash_llama_awq_sharded, "What is Deep Learning?", max_new_tokens=10, n=4 + ) + + assert len(responses) == 4 + assert all([r.generated_text == "\nWhat is the difference between Deep Learning and Machine" for r in responses]) + + assert responses == response_snapshot diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py index 45807949..266fcccb 100644 --- a/server/text_generation_server/utils/weights.py +++ b/server/text_generation_server/utils/weights.py @@ -299,8 +299,8 @@ class Weights: "Cannot load `awq` weight, make sure the model is already quantized" ) - qzeros = self.get_tensor(f"{prefix}.qzeros") - scales = self.get_tensor(f"{prefix}.scales") + qzeros = self.get_sharded(f"{prefix}.qzeros", dim=0) + scales = self.get_sharded(f"{prefix}.scales", dim=0) g_idx = None use_exllama = False