change ChatCompletionChunk to align with "OpenAI Chat Completions streaming API"

Moving after tool_calls2

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

add in Buffering..

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

fix: handle usage outside of stream state and add tests

Simplifying everything quite a bit.

Remove the unused model_dump.

Clippy.

Clippy ?

Ruff.

Uppgrade the flake for latest transformers.

Upgrade after rebase.

Remove potential footgun.

Fix completion test.
This commit is contained in:
Nicolas Patry 2025-03-06 16:24:11 +01:00
parent 622908deab
commit 818c8db29a
No known key found for this signature in database
GPG Key ID: 4242CEF24CB6DBF9
13 changed files with 581 additions and 708 deletions

View File

@ -2,10 +2,16 @@
"nodes": {
"cachix": {
"inputs": {
"devenv": ["crate2nix"],
"flake-compat": ["crate2nix"],
"devenv": [
"crate2nix"
],
"flake-compat": [
"crate2nix"
],
"nixpkgs": "nixpkgs",
"pre-commit-hooks": ["crate2nix"]
"pre-commit-hooks": [
"crate2nix"
]
},
"locked": {
"lastModified": 1709700175,
@ -24,10 +30,19 @@
},
"cachix_2": {
"inputs": {
"devenv": ["crate2nix", "crate2nix_stable"],
"flake-compat": ["crate2nix", "crate2nix_stable"],
"devenv": [
"crate2nix",
"crate2nix_stable"
],
"flake-compat": [
"crate2nix",
"crate2nix_stable"
],
"nixpkgs": "nixpkgs_2",
"pre-commit-hooks": ["crate2nix", "crate2nix_stable"]
"pre-commit-hooks": [
"crate2nix",
"crate2nix_stable"
]
},
"locked": {
"lastModified": 1716549461,
@ -46,8 +61,16 @@
},
"cachix_3": {
"inputs": {
"devenv": ["crate2nix", "crate2nix_stable", "crate2nix_stable"],
"flake-compat": ["crate2nix", "crate2nix_stable", "crate2nix_stable"],
"devenv": [
"crate2nix",
"crate2nix_stable",
"crate2nix_stable"
],
"flake-compat": [
"crate2nix",
"crate2nix_stable",
"crate2nix_stable"
],
"nixpkgs": "nixpkgs_3",
"pre-commit-hooks": [
"crate2nix",
@ -78,15 +101,18 @@
"flake-compat": "flake-compat_3",
"flake-parts": "flake-parts_3",
"nix-test-runner": "nix-test-runner_3",
"nixpkgs": ["tgi-nix", "nixpkgs"],
"nixpkgs": [
"tgi-nix",
"nixpkgs"
],
"pre-commit-hooks": "pre-commit-hooks_3"
},
"locked": {
"lastModified": 1734429562,
"narHash": "sha256-V2XNs3Ir8WXNHdocfzkR/fu0FzkZ9uTDJkVecxJrGmQ=",
"lastModified": 1739473963,
"narHash": "sha256-ItAhpjNUzEWd/cgZVyW/jvoGbCec4TK29e1Mnmn1oJE=",
"owner": "nix-community",
"repo": "crate2nix",
"rev": "8537c2d7cb623679aaeff62c4c4c43a91566ab09",
"rev": "be31feae9a82c225c0fd1bdf978565dc452a483a",
"type": "github"
},
"original": {
@ -193,7 +219,11 @@
"devshell_2": {
"inputs": {
"flake-utils": "flake-utils_3",
"nixpkgs": ["crate2nix", "crate2nix_stable", "nixpkgs"]
"nixpkgs": [
"crate2nix",
"crate2nix_stable",
"nixpkgs"
]
},
"locked": {
"lastModified": 1717408969,
@ -212,7 +242,10 @@
"devshell_3": {
"inputs": {
"flake-utils": "flake-utils_4",
"nixpkgs": ["crate2nix", "nixpkgs"]
"nixpkgs": [
"crate2nix",
"nixpkgs"
]
},
"locked": {
"lastModified": 1711099426,
@ -310,7 +343,11 @@
},
"flake-parts_2": {
"inputs": {
"nixpkgs-lib": ["crate2nix", "crate2nix_stable", "nixpkgs"]
"nixpkgs-lib": [
"crate2nix",
"crate2nix_stable",
"nixpkgs"
]
},
"locked": {
"lastModified": 1719745305,
@ -328,7 +365,10 @@
},
"flake-parts_3": {
"inputs": {
"nixpkgs-lib": ["crate2nix", "nixpkgs"]
"nixpkgs-lib": [
"crate2nix",
"nixpkgs"
]
},
"locked": {
"lastModified": 1712014858,
@ -519,7 +559,11 @@
},
"gitignore_3": {
"inputs": {
"nixpkgs": ["crate2nix", "pre-commit-hooks", "nixpkgs"]
"nixpkgs": [
"crate2nix",
"pre-commit-hooks",
"nixpkgs"
]
},
"locked": {
"lastModified": 1709087332,
@ -726,10 +770,22 @@
},
"pre-commit-hooks_2": {
"inputs": {
"flake-compat": ["crate2nix", "crate2nix_stable", "flake-compat"],
"flake-compat": [
"crate2nix",
"crate2nix_stable",
"flake-compat"
],
"gitignore": "gitignore_2",
"nixpkgs": ["crate2nix", "crate2nix_stable", "nixpkgs"],
"nixpkgs-stable": ["crate2nix", "crate2nix_stable", "nixpkgs"]
"nixpkgs": [
"crate2nix",
"crate2nix_stable",
"nixpkgs"
],
"nixpkgs-stable": [
"crate2nix",
"crate2nix_stable",
"nixpkgs"
]
},
"locked": {
"lastModified": 1719259945,
@ -747,11 +803,20 @@
},
"pre-commit-hooks_3": {
"inputs": {
"flake-compat": ["crate2nix", "flake-compat"],
"flake-compat": [
"crate2nix",
"flake-compat"
],
"flake-utils": "flake-utils_5",
"gitignore": "gitignore_3",
"nixpkgs": ["crate2nix", "nixpkgs"],
"nixpkgs-stable": ["crate2nix", "nixpkgs"]
"nixpkgs": [
"crate2nix",
"nixpkgs"
],
"nixpkgs-stable": [
"crate2nix",
"nixpkgs"
]
},
"locked": {
"lastModified": 1712055707,
@ -772,21 +837,27 @@
"crate2nix": "crate2nix",
"flake-utils": "flake-utils_6",
"nix-filter": "nix-filter",
"nixpkgs": ["tgi-nix", "nixpkgs"],
"nixpkgs": [
"tgi-nix",
"nixpkgs"
],
"rust-overlay": "rust-overlay",
"tgi-nix": "tgi-nix"
}
},
"rust-overlay": {
"inputs": {
"nixpkgs": ["tgi-nix", "nixpkgs"]
"nixpkgs": [
"tgi-nix",
"nixpkgs"
]
},
"locked": {
"lastModified": 1738549608,
"narHash": "sha256-GdyT9QEUSx5k/n8kILuNy83vxxdyUfJ8jL5mMpQZWfw=",
"lastModified": 1741141853,
"narHash": "sha256-FauVtC+FbOgkKpGVuQTNxSqrvgbmVc7hFkjn/DacwMo=",
"owner": "oxalica",
"repo": "rust-overlay",
"rev": "35c6f8c4352f995ecd53896200769f80a3e8f22d",
"rev": "02edad1f19d6dec824e0812e4cdc0aa7930ff8ae",
"type": "github"
},
"original": {

View File

@ -39,7 +39,13 @@ from typing import Dict, List, Optional
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
from docker.errors import NotFound
from syrupy.extensions.json import JSONSnapshotExtension
from huggingface_hub.inference._generated.types.chat_completion import (
ChatCompletionStreamOutput,
ChatCompletionOutput,
)
from openai.types.chat.chat_completion_chunk import (
ChatCompletionChunk as OAIChatCompletionChunk,
)
from text_generation import AsyncClient
from text_generation.types import (
BestOfSequence,

View File

@ -0,0 +1,62 @@
[
{
"choices": [
{
"delta": {
"content": "OK",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741265520,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": "!",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741265520,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": "",
"role": "assistant",
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"created": 1741265520,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
}
]

View File

@ -0,0 +1,75 @@
[
{
"choices": [
{
"delta": {
"content": "OK",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741266005,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": "!",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741266005,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": "",
"role": "assistant",
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"created": 1741266005,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [],
"created": 1741266005,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": {
"completion_tokens": 3,
"prompt_tokens": 39,
"total_tokens": 42
}
}
]

View File

@ -0,0 +1,71 @@
[
{
"choices": [
{
"delta": {
"content": "OK",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741265134,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": "!",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741265134,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"created": 1741265134,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
}
]

View File

@ -0,0 +1,87 @@
[
{
"choices": [
{
"delta": {
"content": "OK",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741265133,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": "!",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741265133,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"created": 1741265133,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [],
"created": 1741265133,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "3.1.2-dev0-native",
"usage": {
"completion_tokens": 3,
"completion_tokens_details": null,
"prompt_tokens": 39,
"prompt_tokens_details": null,
"total_tokens": 42
}
}
]

View File

@ -1,17 +1,17 @@
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"text": " A Beginners Guide\nDeep learning is a subset"
},
{
"finish_reason": "length",
"index": 1,
"logprobs": null,
"text": " This is a question that has puzzled many people for"
},
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"text": " A Beginners Guide\nDeep learning is a subset"
},
{
"finish_reason": "length",
"index": 3,
@ -25,11 +25,11 @@
"text": " Paris\nWhat is the capital of France?\nThe"
}
],
"created": 1725877154,
"created": 1741264813,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native",
"system_fingerprint": "3.1.2-dev0-native",
"usage": {
"completion_tokens": 40,
"prompt_tokens": 22,

View File

@ -1,602 +0,0 @@
[
{
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " A"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 1,
"logprobs": null,
"text": " This"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 2,
"logprobs": null,
"text": " Paris"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 3,
"logprobs": null,
"text": "us"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " Beginner"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 1,
"logprobs": null,
"text": " is"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 2,
"logprobs": null,
"text": "\n"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 3,
"logprobs": null,
"text": "cul"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": "s"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 1,
"logprobs": null,
"text": " a"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 2,
"logprobs": null,
"text": "What"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 3,
"logprobs": null,
"text": "as"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " Guide"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 1,
"logprobs": null,
"text": " question"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 2,
"logprobs": null,
"text": " is"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 3,
"logprobs": null,
"text": "_minus"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": "\n"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 1,
"logprobs": null,
"text": " that"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 2,
"logprobs": null,
"text": " the"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 3,
"logprobs": null,
"text": "cul"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": "Deep"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 1,
"logprobs": null,
"text": " has"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 2,
"logprobs": null,
"text": " capital"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 3,
"logprobs": null,
"text": "as"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " learning"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 1,
"logprobs": null,
"text": " puzzled"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 2,
"logprobs": null,
"text": " of"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 3,
"logprobs": null,
"text": "(s"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " is"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 1,
"logprobs": null,
"text": " many"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 2,
"logprobs": null,
"text": " France"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 3,
"logprobs": null,
"text": "):\n"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " a"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 1,
"logprobs": null,
"text": " people"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 2,
"logprobs": null,
"text": "?\n"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "",
"index": 3,
"logprobs": null,
"text": " "
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"text": " subset"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "length",
"index": 1,
"logprobs": null,
"text": " for"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "length",
"index": 2,
"logprobs": null,
"text": "The"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
},
{
"choices": [
{
"finish_reason": "length",
"index": 3,
"logprobs": null,
"text": " \"\"\"\n"
}
],
"created": 1725883643,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native"
}
]

View File

@ -7,11 +7,11 @@
"text": " A Beginners Guide\nDeep learning is a subset"
}
],
"created": 1725876621,
"created": 1741264812,
"id": "",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "text_completion",
"system_fingerprint": "2.2.1-dev0-native",
"system_fingerprint": "3.1.2-dev0-native",
"usage": {
"completion_tokens": 10,
"prompt_tokens": 6,

View File

@ -0,0 +1,16 @@
import pytest
@pytest.fixture(scope="module")
def chat_handle(launcher):
with launcher(
"meta-llama/Meta-Llama-3.1-8B-Instruct",
) as handle:
yield handle
@pytest.fixture(scope="module")
async def chat_client(chat_handle):
await chat_handle.health(300)
return chat_handle.client

View File

@ -2,8 +2,8 @@ import pytest
import requests
import json
from aiohttp import ClientSession
from openai import OpenAI
from huggingface_hub import InferenceClient
from text_generation.types import Completion
@ -158,47 +158,30 @@ def test_flash_llama_completion_many_prompts(flash_llama_completion, response_sn
async def test_flash_llama_completion_many_prompts_stream(
flash_llama_completion, response_snapshot
):
request = {
"model": "tgi",
"prompt": [
client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1")
stream = client.completion(
model="tgi",
prompt=[
"What is Deep Learning?",
"Is water wet?",
"What is the capital of France?",
"def mai",
],
"max_tokens": 10,
"seed": 0,
"temperature": 0.0,
"stream": True,
}
max_tokens=10,
seed=0,
temperature=0.0,
stream=True,
)
url = f"{flash_llama_completion.base_url}/v1/completions"
chunks = []
strings = [""] * 4
async with ClientSession(headers=flash_llama_completion.headers) as session:
async with session.post(url, json=request) as response:
# iterate over the stream
async for chunk in response.content.iter_any():
# remove "data:"
chunk = chunk.decode().split("\n\n")
# remove "data:" if present
chunk = [c.replace("data:", "") for c in chunk]
# remove empty strings
chunk = [c for c in chunk if c]
# remove completion marking chunk
chunk = [c for c in chunk if c != " [DONE]"]
# parse json
chunk = [json.loads(c) for c in chunk]
chunks = []
for chunk in stream:
chunks.append(chunk)
assert "choices" in chunk
index = chunk.choices[0].index
assert 0 <= index <= 4
strings[index] += chunk.choices[0].text
for c in chunk:
chunks.append(Completion(**c))
assert "choices" in c
index = c["choices"][0]["index"]
assert 0 <= index <= 4
strings[index] += c["choices"][0]["text"]
assert response.status == 200
assert list(strings) == [
" A Beginners Guide\nDeep learning is a subset",
" This is a question that has puzzled many people for",
@ -206,3 +189,92 @@ async def test_flash_llama_completion_many_prompts_stream(
'usculas_minusculas(s):\n """\n',
]
assert chunks == response_snapshot
@pytest.mark.release
async def test_chat_openai_usage(flash_llama_completion, response_snapshot):
client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1")
stream = client.chat.completions.create(
model="tgi",
messages=[{"role": "user", "content": "Say 'OK!'"}],
stream=True,
max_tokens=10,
seed=42,
stream_options={"include_usage": True},
)
chunks = []
for chunk in stream:
chunks.append(chunk)
for chunk in chunks[:-1]:
assert chunk.usage is None
for chunk in chunks[-1:]:
assert chunk.usage is not None
assert chunks == response_snapshot
@pytest.mark.release
async def test_chat_openai_nousage(flash_llama_completion, response_snapshot):
client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1")
stream = client.chat.completions.create(
model="tgi",
messages=[{"role": "user", "content": "Say 'OK!'"}],
stream=True,
max_tokens=10,
seed=42,
stream_options={"include_usage": False},
)
chunks = []
for chunk in stream:
assert chunk.usage is None
chunks.append(chunk)
assert chunks == response_snapshot
@pytest.mark.release
async def test_chat_hfhub_usage(flash_llama_completion, response_snapshot):
client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1")
stream = client.chat_completion(
model="tgi",
messages=[{"role": "user", "content": "Say 'OK!'"}],
stream=True,
max_tokens=10,
seed=42,
stream_options={"include_usage": True},
)
chunks = []
for chunk in stream:
chunks.append(chunk)
for chunk in chunks[:-1]:
assert chunk.usage is None
for chunk in chunks[-1:]:
assert chunk.usage is not None
assert chunks == response_snapshot
@pytest.mark.release
async def test_chat_hfhub_nousage(flash_llama_completion, response_snapshot):
client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1")
stream = client.chat_completion(
model="tgi",
messages=[{"role": "user", "content": "Say 'OK!'"}],
stream=True,
max_tokens=10,
seed=42,
stream_options={"include_usage": False},
)
chunks = []
for chunk in stream:
assert chunk.usage is None
chunks.append(chunk)
assert chunks == response_snapshot

View File

@ -764,7 +764,6 @@ impl ChatCompletionChunk {
created: u64,
logprobs: Option<ChatCompletionLogprobs>,
finish_reason: Option<String>,
usage: Option<Usage>,
) -> Self {
let delta = match (delta, tool_calls) {
(Some(delta), _) => ChatCompletionDelta::Chat(TextMessage {
@ -801,7 +800,7 @@ impl ChatCompletionChunk {
logprobs,
finish_reason,
}],
usage,
usage: None,
}
}
}

View File

@ -1124,7 +1124,6 @@ enum StreamState {
fn create_event_from_stream_token(
stream_token: &StreamResponse,
logprobs: bool,
stream_options: Option<StreamOptions>,
inner_using_tools: bool,
system_fingerprint: String,
model_id: String,
@ -1151,30 +1150,10 @@ fn create_event_from_stream_token(
(content, None)
};
let (usage, finish_reason) = match &stream_token.details {
Some(details) => {
let usage = if stream_options
.as_ref()
.map(|s| s.include_usage)
.unwrap_or(false)
{
let completion_tokens = details.generated_tokens;
let prompt_tokens = details.input_length;
let total_tokens = prompt_tokens + completion_tokens;
Some(Usage {
completion_tokens,
prompt_tokens,
total_tokens,
})
} else {
None
};
(usage, Some(details.finish_reason.format(true)))
}
None => (None, None),
};
let finish_reason = stream_token
.details
.as_ref()
.map(|details| details.finish_reason.format(true));
let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk::new(
model_id.clone(),
system_fingerprint.clone(),
@ -1183,7 +1162,6 @@ fn create_event_from_stream_token(
current_time,
logprobs,
finish_reason,
usage,
));
event.json_data(chat_complete).unwrap_or_else(|e| {
@ -1287,6 +1265,17 @@ pub(crate) async fn chat_completions(
match result{
Ok(stream_token) => {
let token_text = &stream_token.token.text.clone();
let usage = stream_token.details.as_ref().map(|details| {
let completion_tokens = details.generated_tokens;
let prompt_tokens = details.input_length;
let total_tokens = prompt_tokens + completion_tokens;
Usage {
completion_tokens,
prompt_tokens,
total_tokens,
}
});
match state {
StreamState::Buffering => {
json_buffer.push_str(&token_text.replace(" ", ""));
@ -1307,7 +1296,6 @@ pub(crate) async fn chat_completions(
let event = create_event_from_stream_token(
stream_token,
logprobs,
stream_options.clone(),
response_as_tool,
system_fingerprint.clone(),
model_id.clone(),
@ -1347,7 +1335,6 @@ pub(crate) async fn chat_completions(
current_time,
None,
None,
None,
));
yield Ok(event.json_data(chat_complete).unwrap_or_else(|e| {
InferError::StreamSerializationError(e.to_string()).into()
@ -1369,7 +1356,6 @@ pub(crate) async fn chat_completions(
let event = create_event_from_stream_token(
&stream_token,
logprobs,
stream_options.clone(),
response_as_tool,
system_fingerprint.clone(),
model_id.clone(),
@ -1378,6 +1364,36 @@ pub(crate) async fn chat_completions(
yield Ok::<Event, Infallible>(event);
}
}
let should_send_usage = usage.is_some()
&& stream_options
.as_ref()
.is_some_and(|opts| opts.include_usage);
if should_send_usage {
let usage_data = usage.unwrap();
let current_time = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_else(|_| std::time::Duration::from_secs(0))
.as_secs();
let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk {
id: String::new(),
created: current_time,
model: model_id.clone(),
system_fingerprint: system_fingerprint.clone(),
choices: vec![],
usage: Some(Usage {
prompt_tokens: usage_data.prompt_tokens,
completion_tokens: usage_data.completion_tokens,
total_tokens: usage_data.total_tokens,
}),
});
yield Ok(Event::default()
.json_data(chat_complete)
.unwrap_or_else(|e| InferError::StreamSerializationError(e.to_string()).into()));
}
}
Err(err) => yield Ok(err.into_openai_event())
}