mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 20:34:54 +00:00
Update FP8 KV cache test to use checkpoint with scales
This commit is contained in:
parent
ba4ac96399
commit
1f18cb6aa6
@ -11,27 +11,27 @@
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -5.6328125,
|
||||
"logprob": -6.1875,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.2265625,
|
||||
"logprob": -0.93359375,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -9.1015625,
|
||||
"logprob": -9.875,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.8085938,
|
||||
"logprob": -1.1796875,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -1.0439453,
|
||||
"logprob": -1.75,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
@ -39,66 +39,66 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18682,
|
||||
"logprob": -2.1992188,
|
||||
"logprob": -1.109375,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.079956055,
|
||||
"logprob": -0.005432129,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.2763672,
|
||||
"logprob": -0.028808594,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.37548828,
|
||||
"logprob": -0.013671875,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 27084,
|
||||
"logprob": -1.4628906,
|
||||
"logprob": -0.69921875,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.02885437,
|
||||
"logprob": -0.0005874634,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5780,
|
||||
"logprob": -0.2565918,
|
||||
"logprob": -0.026855469,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.0063438416,
|
||||
"logprob": -0.00020885468,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 430,
|
||||
"logprob": -1.3056641,
|
||||
"logprob": -0.17773438,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.6035156,
|
||||
"id": 18065,
|
||||
"logprob": -0.703125,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
"text": " involves"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " Deep learning is a subset of machine learning that is"
|
||||
"generated_text": " Deep learning is a subset of machine learning that involves"
|
||||
}
|
||||
|
@ -1,8 +1,8 @@
|
||||
{
|
||||
"details": {
|
||||
"best_of_sequences": null,
|
||||
"finish_reason": "eos_token",
|
||||
"generated_tokens": 3,
|
||||
"finish_reason": "length",
|
||||
"generated_tokens": 10,
|
||||
"prefill": [
|
||||
{
|
||||
"id": 128000,
|
||||
@ -11,22 +11,22 @@
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -22.96875,
|
||||
"logprob": -18.0,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -10.71875,
|
||||
"logprob": -11.75,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -2.6992188,
|
||||
"logprob": -2.0625,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -4.8398438,
|
||||
"logprob": -6.0,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
@ -34,24 +34,66 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 720,
|
||||
"logprob": -0.4411621,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " \n"
|
||||
},
|
||||
{
|
||||
"id": 220,
|
||||
"logprob": -0.35864258,
|
||||
"id": 34564,
|
||||
"logprob": -0.11279297,
|
||||
"special": false,
|
||||
"text": " "
|
||||
"text": "Deep"
|
||||
},
|
||||
{
|
||||
"id": 128001,
|
||||
"id": 6975,
|
||||
"logprob": -0.16015625,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 320,
|
||||
"logprob": -0.25195312,
|
||||
"special": false,
|
||||
"text": " ("
|
||||
},
|
||||
{
|
||||
"id": 16931,
|
||||
"logprob": -1.703125,
|
||||
"special": false,
|
||||
"text": "DL"
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"logprob": 0.0,
|
||||
"special": true,
|
||||
"text": "<|end_of_text|>"
|
||||
"special": false,
|
||||
"text": ")"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.140625,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 1207,
|
||||
"logprob": -1.3125,
|
||||
"special": false,
|
||||
"text": " sub"
|
||||
},
|
||||
{
|
||||
"id": 2630,
|
||||
"logprob": 0.0,
|
||||
"special": false,
|
||||
"text": "field"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": "What is deep learning? \n "
|
||||
"generated_text": "What is deep learning? \nDeep learning (DL) is a subfield"
|
||||
}
|
||||
|
@ -12,27 +12,27 @@
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -5.6328125,
|
||||
"logprob": -6.1875,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.2265625,
|
||||
"logprob": -0.93359375,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -9.1015625,
|
||||
"logprob": -9.875,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.8085938,
|
||||
"logprob": -1.1796875,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -1.0439453,
|
||||
"logprob": -1.75,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
@ -40,68 +40,68 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18682,
|
||||
"logprob": -2.1992188,
|
||||
"logprob": -1.109375,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.07897949,
|
||||
"logprob": -0.0047912598,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.27734375,
|
||||
"logprob": -0.025512695,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.37402344,
|
||||
"logprob": -0.012145996,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 27084,
|
||||
"logprob": -1.4511719,
|
||||
"logprob": -0.72265625,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.02909851,
|
||||
"logprob": -0.0005760193,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5780,
|
||||
"logprob": -0.25854492,
|
||||
"logprob": -0.02722168,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.0061798096,
|
||||
"logprob": -0.00023651123,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 430,
|
||||
"logprob": -1.3046875,
|
||||
"logprob": -0.17285156,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.5537109,
|
||||
"id": 18065,
|
||||
"logprob": -0.703125,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
"text": " involves"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " Deep learning is a subset of machine learning that is"
|
||||
"generated_text": " Deep learning is a subset of machine learning that involves"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
@ -116,27 +116,27 @@
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -5.6328125,
|
||||
"logprob": -6.21875,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.2265625,
|
||||
"logprob": -0.95703125,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -9.1015625,
|
||||
"logprob": -9.9375,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.8085938,
|
||||
"logprob": -1.1328125,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -1.0439453,
|
||||
"logprob": -1.75,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
@ -144,68 +144,68 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18682,
|
||||
"logprob": -2.1992188,
|
||||
"logprob": -1.1796875,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.07897949,
|
||||
"logprob": -0.005432129,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.27734375,
|
||||
"logprob": -0.02758789,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.37402344,
|
||||
"logprob": -0.013366699,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 27084,
|
||||
"logprob": -1.4511719,
|
||||
"logprob": -0.6953125,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.02909851,
|
||||
"logprob": -0.0004863739,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5780,
|
||||
"logprob": -0.25854492,
|
||||
"logprob": -0.02709961,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.0061798096,
|
||||
"logprob": -0.00022506714,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 430,
|
||||
"logprob": -1.3046875,
|
||||
"logprob": -0.19726562,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.5537109,
|
||||
"id": 18065,
|
||||
"logprob": -0.77734375,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
"text": " involves"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " Deep learning is a subset of machine learning that is"
|
||||
"generated_text": " Deep learning is a subset of machine learning that involves"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
@ -220,27 +220,27 @@
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -5.6328125,
|
||||
"logprob": -6.21875,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.2265625,
|
||||
"logprob": -0.95703125,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -9.1015625,
|
||||
"logprob": -9.9375,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.8085938,
|
||||
"logprob": -1.1328125,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -1.0439453,
|
||||
"logprob": -1.75,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
@ -248,68 +248,68 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18682,
|
||||
"logprob": -2.1992188,
|
||||
"logprob": -1.1796875,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.07897949,
|
||||
"logprob": -0.005432129,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.27734375,
|
||||
"logprob": -0.02758789,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.37402344,
|
||||
"logprob": -0.013366699,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 27084,
|
||||
"logprob": -1.4511719,
|
||||
"logprob": -0.6953125,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.02909851,
|
||||
"logprob": -0.0004863739,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5780,
|
||||
"logprob": -0.25854492,
|
||||
"logprob": -0.02709961,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.0061798096,
|
||||
"logprob": -0.00022506714,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 430,
|
||||
"logprob": -1.3046875,
|
||||
"logprob": -0.19726562,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.5537109,
|
||||
"id": 18065,
|
||||
"logprob": -0.77734375,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
"text": " involves"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " Deep learning is a subset of machine learning that is"
|
||||
"generated_text": " Deep learning is a subset of machine learning that involves"
|
||||
},
|
||||
{
|
||||
"details": {
|
||||
@ -324,27 +324,27 @@
|
||||
},
|
||||
{
|
||||
"id": 3923,
|
||||
"logprob": -5.6328125,
|
||||
"logprob": -6.21875,
|
||||
"text": "What"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.2265625,
|
||||
"logprob": -0.95703125,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 5655,
|
||||
"logprob": -9.1015625,
|
||||
"logprob": -9.9375,
|
||||
"text": " deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -1.8085938,
|
||||
"logprob": -1.1328125,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"logprob": -1.0439453,
|
||||
"logprob": -1.75,
|
||||
"text": "?"
|
||||
}
|
||||
],
|
||||
@ -352,67 +352,67 @@
|
||||
"tokens": [
|
||||
{
|
||||
"id": 18682,
|
||||
"logprob": -2.1992188,
|
||||
"logprob": -1.1796875,
|
||||
"special": false,
|
||||
"text": " Deep"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.07897949,
|
||||
"logprob": -0.005432129,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -0.27734375,
|
||||
"logprob": -0.02758789,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"logprob": -0.37402344,
|
||||
"logprob": -0.013366699,
|
||||
"special": false,
|
||||
"text": " a"
|
||||
},
|
||||
{
|
||||
"id": 27084,
|
||||
"logprob": -1.4511719,
|
||||
"logprob": -0.6953125,
|
||||
"special": false,
|
||||
"text": " subset"
|
||||
},
|
||||
{
|
||||
"id": 315,
|
||||
"logprob": -0.02909851,
|
||||
"logprob": -0.0004863739,
|
||||
"special": false,
|
||||
"text": " of"
|
||||
},
|
||||
{
|
||||
"id": 5780,
|
||||
"logprob": -0.25854492,
|
||||
"logprob": -0.02709961,
|
||||
"special": false,
|
||||
"text": " machine"
|
||||
},
|
||||
{
|
||||
"id": 6975,
|
||||
"logprob": -0.0061798096,
|
||||
"logprob": -0.00022506714,
|
||||
"special": false,
|
||||
"text": " learning"
|
||||
},
|
||||
{
|
||||
"id": 430,
|
||||
"logprob": -1.3046875,
|
||||
"logprob": -0.19726562,
|
||||
"special": false,
|
||||
"text": " that"
|
||||
},
|
||||
{
|
||||
"id": 374,
|
||||
"logprob": -1.5537109,
|
||||
"id": 18065,
|
||||
"logprob": -0.77734375,
|
||||
"special": false,
|
||||
"text": " is"
|
||||
"text": " involves"
|
||||
}
|
||||
],
|
||||
"top_tokens": null
|
||||
},
|
||||
"generated_text": " Deep learning is a subset of machine learning that is"
|
||||
"generated_text": " Deep learning is a subset of machine learning that involves"
|
||||
}
|
||||
]
|
||||
|
@ -4,7 +4,9 @@ import pytest
|
||||
@pytest.fixture(scope="module")
|
||||
def flash_llama_fp8_kv_cache_handle(launcher):
|
||||
with launcher(
|
||||
"meta-llama/Meta-Llama-3-8B", num_shard=2, kv_cache_dtype="fp8_e5m2"
|
||||
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
|
||||
num_shard=2,
|
||||
kv_cache_dtype="fp8_e4m3fn",
|
||||
) as handle:
|
||||
yield handle
|
||||
|
||||
@ -25,7 +27,7 @@ async def test_flash_llama_fp8_kv_cache(flash_llama_fp8_kv_cache, response_snaps
|
||||
|
||||
assert (
|
||||
response.generated_text
|
||||
== " Deep learning is a subset of machine learning that is"
|
||||
== " Deep learning is a subset of machine learning that involves"
|
||||
)
|
||||
assert response.details.generated_tokens == 10
|
||||
assert response == response_snapshot
|
||||
@ -69,7 +71,7 @@ async def test_flash_llama_fp8_kv_cache_load(
|
||||
assert len(responses) == 4
|
||||
assert (
|
||||
responses[0].generated_text
|
||||
== " Deep learning is a subset of machine learning that is"
|
||||
== " Deep learning is a subset of machine learning that involves"
|
||||
)
|
||||
assert all(
|
||||
[r.generated_text == responses[0].generated_text for r in responses]
|
||||
|
Loading…
Reference in New Issue
Block a user