Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-06-18 15:22:09 +00:00.
* Attempt at automatic max batch prefill. * Taking into account number of shards. * Adding more cards. * Adding A100 + H100 * Adding a few more cards. * Logprobs cost too much. * h100 better name, and keep factor of 2 * Damn inflated sparse tflops. * Typo in h100. * Updated the flops calculation (checked with fvcore). * chunking by default. * Fix prefix caching for chat completion since we removed logprobs. * More tests. * Dropping all the prefill logprobs. * Add a flag that enables users to get logprobs back. * Repairing prompt token counting. * Fixing a few tests. * Remove some scaffolding. * Attempting to reduces the issues (workarounds for now).
73 lines
1.4 KiB
JSON
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 42,
        "logprob": -0.86279297,
        "special": false,
        "text": "I"
      },
      {
        "id": 1353,
        "logprob": -0.94921875,
        "special": false,
        "text": "'m"
      },
      {
        "id": 7016,
        "logprob": -2.1835938,
        "special": false,
        "text": " sorry"
      },
      {
        "id": 13,
        "logprob": -0.074035645,
        "special": false,
        "text": ","
      },
      {
        "id": 1394,
        "logprob": -0.86376953,
        "special": false,
        "text": "You"
      },
      {
        "id": 452,
        "logprob": -1.2070312,
        "special": false,
        "text": " have"
      },
      {
        "id": 247,
        "logprob": -1.4365234,
        "special": false,
        "text": " a"
      },
      {
        "id": 4327,
        "logprob": -1.109375,
        "special": false,
        "text": " choice"
      },
      {
        "id": 273,
        "logprob": -0.93408203,
        "special": false,
        "text": " of"
      },
      {
        "id": 752,
        "logprob": -1.8808594,
        "special": false,
        "text": " what"
      }
    ]
  },
  "generated_text": "I'm sorry,You have a choice of what"
}