From 818c8db29a0fa60b0d88f6274c7105cda2192fde Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 6 Mar 2025 16:24:11 +0100 Subject: [PATCH] change ChatCompletionChunk to align with "OpenAI Chat Completions streaming API" Moving after tool_calls2 Signed-off-by: Wang, Yi A add in Buffering.. Signed-off-by: Wang, Yi A fix: handle usage outside of stream state and add tests Simplifying everything quite a bit. Remove the unused model_dump. Clippy. Clippy ? Ruff. Uppgrade the flake for latest transformers. Upgrade after rebase. Remove potential footgun. Fix completion test. --- flake.lock | 127 +++- integration-tests/conftest.py | 8 +- .../test_chat_hfhub_nousage.json | 62 ++ .../test_chat_hfhub_usage.json | 75 +++ .../test_chat_openai_nousage.json | 71 +++ .../test_chat_openai_usage.json | 87 +++ ...t_flash_llama_completion_many_prompts.json | 18 +- ..._llama_completion_many_prompts_stream.json | 602 ------------------ ..._flash_llama_completion_single_prompt.json | 6 +- .../models/test_chat_stream_options.py | 16 + .../models/test_completion_prompts.py | 140 +++- router/src/lib.rs | 3 +- router/src/server.rs | 74 ++- 13 files changed, 581 insertions(+), 708 deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_nousage.json create mode 100644 integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_usage.json create mode 100644 integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_nousage.json create mode 100644 integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_usage.json delete mode 100644 integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json create mode 100644 integration-tests/models/test_chat_stream_options.py diff --git a/flake.lock b/flake.lock index b6cf7e53..719cdeea 100644 --- a/flake.lock +++ b/flake.lock @@ -2,10 +2,16 @@ "nodes": { "cachix": { "inputs": { - "devenv": ["crate2nix"], - "flake-compat": ["crate2nix"], + "devenv": [ + "crate2nix" + ], + "flake-compat": [ + "crate2nix" + ], "nixpkgs": "nixpkgs", - "pre-commit-hooks": ["crate2nix"] + "pre-commit-hooks": [ + "crate2nix" + ] }, "locked": { "lastModified": 1709700175, @@ -24,10 +30,19 @@ }, "cachix_2": { "inputs": { - "devenv": ["crate2nix", "crate2nix_stable"], - "flake-compat": ["crate2nix", "crate2nix_stable"], + "devenv": [ + "crate2nix", + "crate2nix_stable" + ], + "flake-compat": [ + "crate2nix", + "crate2nix_stable" + ], "nixpkgs": "nixpkgs_2", - "pre-commit-hooks": ["crate2nix", "crate2nix_stable"] + "pre-commit-hooks": [ + "crate2nix", + "crate2nix_stable" + ] }, "locked": { "lastModified": 1716549461, @@ -46,8 +61,16 @@ }, "cachix_3": { "inputs": { - "devenv": ["crate2nix", "crate2nix_stable", "crate2nix_stable"], - "flake-compat": ["crate2nix", "crate2nix_stable", "crate2nix_stable"], + "devenv": [ + "crate2nix", + "crate2nix_stable", + "crate2nix_stable" + ], + "flake-compat": [ + "crate2nix", + "crate2nix_stable", + "crate2nix_stable" + ], "nixpkgs": "nixpkgs_3", "pre-commit-hooks": [ "crate2nix", @@ -78,15 +101,18 @@ "flake-compat": "flake-compat_3", "flake-parts": "flake-parts_3", "nix-test-runner": "nix-test-runner_3", - "nixpkgs": ["tgi-nix", "nixpkgs"], + "nixpkgs": [ + "tgi-nix", + "nixpkgs" + ], "pre-commit-hooks": "pre-commit-hooks_3" }, "locked": { - "lastModified": 1734429562, - "narHash": "sha256-V2XNs3Ir8WXNHdocfzkR/fu0FzkZ9uTDJkVecxJrGmQ=", + "lastModified": 1739473963, + "narHash": 
"sha256-ItAhpjNUzEWd/cgZVyW/jvoGbCec4TK29e1Mnmn1oJE=", "owner": "nix-community", "repo": "crate2nix", - "rev": "8537c2d7cb623679aaeff62c4c4c43a91566ab09", + "rev": "be31feae9a82c225c0fd1bdf978565dc452a483a", "type": "github" }, "original": { @@ -193,7 +219,11 @@ "devshell_2": { "inputs": { "flake-utils": "flake-utils_3", - "nixpkgs": ["crate2nix", "crate2nix_stable", "nixpkgs"] + "nixpkgs": [ + "crate2nix", + "crate2nix_stable", + "nixpkgs" + ] }, "locked": { "lastModified": 1717408969, @@ -212,7 +242,10 @@ "devshell_3": { "inputs": { "flake-utils": "flake-utils_4", - "nixpkgs": ["crate2nix", "nixpkgs"] + "nixpkgs": [ + "crate2nix", + "nixpkgs" + ] }, "locked": { "lastModified": 1711099426, @@ -310,7 +343,11 @@ }, "flake-parts_2": { "inputs": { - "nixpkgs-lib": ["crate2nix", "crate2nix_stable", "nixpkgs"] + "nixpkgs-lib": [ + "crate2nix", + "crate2nix_stable", + "nixpkgs" + ] }, "locked": { "lastModified": 1719745305, @@ -328,7 +365,10 @@ }, "flake-parts_3": { "inputs": { - "nixpkgs-lib": ["crate2nix", "nixpkgs"] + "nixpkgs-lib": [ + "crate2nix", + "nixpkgs" + ] }, "locked": { "lastModified": 1712014858, @@ -519,7 +559,11 @@ }, "gitignore_3": { "inputs": { - "nixpkgs": ["crate2nix", "pre-commit-hooks", "nixpkgs"] + "nixpkgs": [ + "crate2nix", + "pre-commit-hooks", + "nixpkgs" + ] }, "locked": { "lastModified": 1709087332, @@ -726,10 +770,22 @@ }, "pre-commit-hooks_2": { "inputs": { - "flake-compat": ["crate2nix", "crate2nix_stable", "flake-compat"], + "flake-compat": [ + "crate2nix", + "crate2nix_stable", + "flake-compat" + ], "gitignore": "gitignore_2", - "nixpkgs": ["crate2nix", "crate2nix_stable", "nixpkgs"], - "nixpkgs-stable": ["crate2nix", "crate2nix_stable", "nixpkgs"] + "nixpkgs": [ + "crate2nix", + "crate2nix_stable", + "nixpkgs" + ], + "nixpkgs-stable": [ + "crate2nix", + "crate2nix_stable", + "nixpkgs" + ] }, "locked": { "lastModified": 1719259945, @@ -747,11 +803,20 @@ }, "pre-commit-hooks_3": { "inputs": { - "flake-compat": ["crate2nix", "flake-compat"], + "flake-compat": [ + "crate2nix", + "flake-compat" + ], "flake-utils": "flake-utils_5", "gitignore": "gitignore_3", - "nixpkgs": ["crate2nix", "nixpkgs"], - "nixpkgs-stable": ["crate2nix", "nixpkgs"] + "nixpkgs": [ + "crate2nix", + "nixpkgs" + ], + "nixpkgs-stable": [ + "crate2nix", + "nixpkgs" + ] }, "locked": { "lastModified": 1712055707, @@ -772,21 +837,27 @@ "crate2nix": "crate2nix", "flake-utils": "flake-utils_6", "nix-filter": "nix-filter", - "nixpkgs": ["tgi-nix", "nixpkgs"], + "nixpkgs": [ + "tgi-nix", + "nixpkgs" + ], "rust-overlay": "rust-overlay", "tgi-nix": "tgi-nix" } }, "rust-overlay": { "inputs": { - "nixpkgs": ["tgi-nix", "nixpkgs"] + "nixpkgs": [ + "tgi-nix", + "nixpkgs" + ] }, "locked": { - "lastModified": 1738549608, - "narHash": "sha256-GdyT9QEUSx5k/n8kILuNy83vxxdyUfJ8jL5mMpQZWfw=", + "lastModified": 1741141853, + "narHash": "sha256-FauVtC+FbOgkKpGVuQTNxSqrvgbmVc7hFkjn/DacwMo=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "35c6f8c4352f995ecd53896200769f80a3e8f22d", + "rev": "02edad1f19d6dec824e0812e4cdc0aa7930ff8ae", "type": "github" }, "original": { diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index 6490f833..e7e64072 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -39,7 +39,13 @@ from typing import Dict, List, Optional from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError from docker.errors import NotFound from syrupy.extensions.json import JSONSnapshotExtension - +from 
huggingface_hub.inference._generated.types.chat_completion import ( + ChatCompletionStreamOutput, + ChatCompletionOutput, +) +from openai.types.chat.chat_completion_chunk import ( + ChatCompletionChunk as OAIChatCompletionChunk, +) from text_generation import AsyncClient from text_generation.types import ( BestOfSequence, diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_nousage.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_nousage.json new file mode 100644 index 00000000..a05b685e --- /dev/null +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_nousage.json @@ -0,0 +1,62 @@ +[ + { + "choices": [ + { + "delta": { + "content": "OK", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265520, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "!", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265520, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1741265520, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + } +] diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_usage.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_usage.json new file mode 100644 index 00000000..d2c969b2 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_usage.json @@ -0,0 +1,75 @@ +[ + { + "choices": [ + { + "delta": { + "content": "OK", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741266005, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "!", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741266005, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1741266005, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [], + "created": 1741266005, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": { + "completion_tokens": 3, + "prompt_tokens": 39, + "total_tokens": 42 + } + } +] diff --git 
a/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_nousage.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_nousage.json new file mode 100644 index 00000000..6c362059 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_nousage.json @@ -0,0 +1,71 @@ +[ + { + "choices": [ + { + "delta": { + "content": "OK", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265134, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "!", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265134, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1741265134, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + } +] diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_usage.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_usage.json new file mode 100644 index 00000000..feb32567 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_usage.json @@ -0,0 +1,87 @@ +[ + { + "choices": [ + { + "delta": { + "content": "OK", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265133, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "!", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265133, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1741265133, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [], + "created": 1741265133, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": { + "completion_tokens": 3, + 
"completion_tokens_details": null, + "prompt_tokens": 39, + "prompt_tokens_details": null, + "total_tokens": 42 + } + } +] diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json index 25b8120d..5bef4172 100644 --- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json @@ -1,17 +1,17 @@ { "choices": [ - { - "finish_reason": "length", - "index": 0, - "logprobs": null, - "text": " A Beginner’s Guide\nDeep learning is a subset" - }, { "finish_reason": "length", "index": 1, "logprobs": null, "text": " This is a question that has puzzled many people for" }, + { + "finish_reason": "length", + "index": 0, + "logprobs": null, + "text": " A Beginner’s Guide\nDeep learning is a subset" + }, { "finish_reason": "length", "index": 3, @@ -25,11 +25,11 @@ "text": " Paris\nWhat is the capital of France?\nThe" } ], - "created": 1725877154, + "created": 1741264813, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": { "completion_tokens": 40, "prompt_tokens": 22, diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json deleted file mode 100644 index dd22ceae..00000000 --- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json +++ /dev/null @@ -1,602 +0,0 @@ -[ - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " A" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " This" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": " Paris" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "us" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " Beginner" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " is" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - 
"system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": "\n" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "cul" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": "’s" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " a" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": "What" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "as" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " Guide" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " question" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": " is" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "_minus" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": "\n" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " that" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": " the" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": 
"text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "cul" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": "Deep" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " has" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": " capital" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "as" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " learning" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " puzzled" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": " of" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "(s" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " is" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " many" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": " France" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "):\n" - } - ], - "created": 1725883643, - "id": "", - "model": 
"meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " a" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " people" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": "?\n" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": " " - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "length", - "index": 0, - "logprobs": null, - "text": " subset" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "length", - "index": 1, - "logprobs": null, - "text": " for" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "length", - "index": 2, - "logprobs": null, - "text": "The" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "length", - "index": 3, - "logprobs": null, - "text": " \"\"\"\n" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - } -] diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json index 7ad56271..1cb8c103 100644 --- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json @@ -7,11 +7,11 @@ "text": " A Beginner’s Guide\nDeep learning is a subset" } ], - "created": 1725876621, + "created": 1741264812, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": { "completion_tokens": 10, "prompt_tokens": 6, diff --git a/integration-tests/models/test_chat_stream_options.py b/integration-tests/models/test_chat_stream_options.py new file mode 100644 index 00000000..41f4f741 --- /dev/null +++ b/integration-tests/models/test_chat_stream_options.py @@ -0,0 +1,16 @@ +import pytest + + 
+@pytest.fixture(scope="module") +def chat_handle(launcher): + with launcher( + "meta-llama/Meta-Llama-3.1-8B-Instruct", + ) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def chat_client(chat_handle): + await chat_handle.health(300) + return chat_handle.client + diff --git a/integration-tests/models/test_completion_prompts.py b/integration-tests/models/test_completion_prompts.py index 27988ef9..de04d85b 100644 --- a/integration-tests/models/test_completion_prompts.py +++ b/integration-tests/models/test_completion_prompts.py @@ -2,8 +2,8 @@ import pytest import requests import json from aiohttp import ClientSession +from openai import OpenAI from huggingface_hub import InferenceClient - from text_generation.types import Completion @@ -158,47 +158,30 @@ def test_flash_llama_completion_many_prompts(flash_llama_completion, response_sn async def test_flash_llama_completion_many_prompts_stream( flash_llama_completion, response_snapshot ): - request = { - "model": "tgi", - "prompt": [ + client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1") + stream = client.completion( + model="tgi", + prompt=[ "What is Deep Learning?", "Is water wet?", "What is the capital of France?", "def mai", ], - "max_tokens": 10, - "seed": 0, - "temperature": 0.0, - "stream": True, - } + max_tokens=10, + seed=0, + temperature=0.0, + stream=True, + ) - url = f"{flash_llama_completion.base_url}/v1/completions" - - chunks = [] strings = [""] * 4 - async with ClientSession(headers=flash_llama_completion.headers) as session: - async with session.post(url, json=request) as response: - # iterate over the stream - async for chunk in response.content.iter_any(): - # remove "data:" - chunk = chunk.decode().split("\n\n") - # remove "data:" if present - chunk = [c.replace("data:", "") for c in chunk] - # remove empty strings - chunk = [c for c in chunk if c] - # remove completion marking chunk - chunk = [c for c in chunk if c != " [DONE]"] - # parse json - chunk = [json.loads(c) for c in chunk] + chunks = [] + for chunk in stream: + chunks.append(chunk) + assert "choices" in chunk + index = chunk.choices[0].index + assert 0 <= index <= 4 + strings[index] += chunk.choices[0].text - for c in chunk: - chunks.append(Completion(**c)) - assert "choices" in c - index = c["choices"][0]["index"] - assert 0 <= index <= 4 - strings[index] += c["choices"][0]["text"] - - assert response.status == 200 assert list(strings) == [ " A Beginner’s Guide\nDeep learning is a subset", " This is a question that has puzzled many people for", @@ -206,3 +189,92 @@ async def test_flash_llama_completion_many_prompts_stream( 'usculas_minusculas(s):\n """\n', ] assert chunks == response_snapshot + + +@pytest.mark.release +async def test_chat_openai_usage(flash_llama_completion, response_snapshot): + client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1") + + stream = client.chat.completions.create( + model="tgi", + messages=[{"role": "user", "content": "Say 'OK!'"}], + stream=True, + max_tokens=10, + seed=42, + stream_options={"include_usage": True}, + ) + + chunks = [] + for chunk in stream: + chunks.append(chunk) + for chunk in chunks[:-1]: + assert chunk.usage is None + for chunk in chunks[-1:]: + assert chunk.usage is not None + + assert chunks == response_snapshot + + +@pytest.mark.release +async def test_chat_openai_nousage(flash_llama_completion, response_snapshot): + client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1") + + stream = client.chat.completions.create( 
+ model="tgi", + messages=[{"role": "user", "content": "Say 'OK!'"}], + stream=True, + max_tokens=10, + seed=42, + stream_options={"include_usage": False}, + ) + + chunks = [] + for chunk in stream: + assert chunk.usage is None + chunks.append(chunk) + + assert chunks == response_snapshot + + +@pytest.mark.release +async def test_chat_hfhub_usage(flash_llama_completion, response_snapshot): + client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1") + stream = client.chat_completion( + model="tgi", + messages=[{"role": "user", "content": "Say 'OK!'"}], + stream=True, + max_tokens=10, + seed=42, + stream_options={"include_usage": True}, + ) + + chunks = [] + for chunk in stream: + chunks.append(chunk) + + for chunk in chunks[:-1]: + assert chunk.usage is None + for chunk in chunks[-1:]: + assert chunk.usage is not None + + assert chunks == response_snapshot + + +@pytest.mark.release +async def test_chat_hfhub_nousage(flash_llama_completion, response_snapshot): + client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1") + stream = client.chat_completion( + model="tgi", + messages=[{"role": "user", "content": "Say 'OK!'"}], + stream=True, + max_tokens=10, + seed=42, + stream_options={"include_usage": False}, + ) + + chunks = [] + for chunk in stream: + assert chunk.usage is None + chunks.append(chunk) + + assert chunks == response_snapshot diff --git a/router/src/lib.rs b/router/src/lib.rs index a7923c4c..08c31b64 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -764,7 +764,6 @@ impl ChatCompletionChunk { created: u64, logprobs: Option, finish_reason: Option, - usage: Option, ) -> Self { let delta = match (delta, tool_calls) { (Some(delta), _) => ChatCompletionDelta::Chat(TextMessage { @@ -801,7 +800,7 @@ impl ChatCompletionChunk { logprobs, finish_reason, }], - usage, + usage: None, } } } diff --git a/router/src/server.rs b/router/src/server.rs index 6e55d2bc..9f312316 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1124,7 +1124,6 @@ enum StreamState { fn create_event_from_stream_token( stream_token: &StreamResponse, logprobs: bool, - stream_options: Option, inner_using_tools: bool, system_fingerprint: String, model_id: String, @@ -1151,30 +1150,10 @@ fn create_event_from_stream_token( (content, None) }; - - let (usage, finish_reason) = match &stream_token.details { - Some(details) => { - let usage = if stream_options - .as_ref() - .map(|s| s.include_usage) - .unwrap_or(false) - { - let completion_tokens = details.generated_tokens; - let prompt_tokens = details.input_length; - let total_tokens = prompt_tokens + completion_tokens; - Some(Usage { - completion_tokens, - prompt_tokens, - total_tokens, - }) - } else { - None - }; - (usage, Some(details.finish_reason.format(true))) - } - None => (None, None), - }; - + let finish_reason = stream_token + .details + .as_ref() + .map(|details| details.finish_reason.format(true)); let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk::new( model_id.clone(), system_fingerprint.clone(), @@ -1183,7 +1162,6 @@ fn create_event_from_stream_token( current_time, logprobs, finish_reason, - usage, )); event.json_data(chat_complete).unwrap_or_else(|e| { @@ -1287,6 +1265,17 @@ pub(crate) async fn chat_completions( match result{ Ok(stream_token) => { let token_text = &stream_token.token.text.clone(); + let usage = stream_token.details.as_ref().map(|details| { + let completion_tokens = details.generated_tokens; + let prompt_tokens = details.input_length; + let total_tokens = 
prompt_tokens + completion_tokens; + + Usage { + completion_tokens, + prompt_tokens, + total_tokens, + } + }); match state { StreamState::Buffering => { json_buffer.push_str(&token_text.replace(" ", "")); @@ -1307,7 +1296,6 @@ pub(crate) async fn chat_completions( let event = create_event_from_stream_token( stream_token, logprobs, - stream_options.clone(), response_as_tool, system_fingerprint.clone(), model_id.clone(), @@ -1347,7 +1335,6 @@ pub(crate) async fn chat_completions( current_time, None, None, - None, )); yield Ok(event.json_data(chat_complete).unwrap_or_else(|e| { InferError::StreamSerializationError(e.to_string()).into() @@ -1369,7 +1356,6 @@ pub(crate) async fn chat_completions( let event = create_event_from_stream_token( &stream_token, logprobs, - stream_options.clone(), response_as_tool, system_fingerprint.clone(), model_id.clone(), @@ -1378,6 +1364,36 @@ pub(crate) async fn chat_completions( yield Ok::(event); } } + + let should_send_usage = usage.is_some() + && stream_options + .as_ref() + .is_some_and(|opts| opts.include_usage); + + if should_send_usage { + let usage_data = usage.unwrap(); + let current_time = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_else(|_| std::time::Duration::from_secs(0)) + .as_secs(); + + let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk { + id: String::new(), + created: current_time, + model: model_id.clone(), + system_fingerprint: system_fingerprint.clone(), + choices: vec![], + usage: Some(Usage { + prompt_tokens: usage_data.prompt_tokens, + completion_tokens: usage_data.completion_tokens, + total_tokens: usage_data.total_tokens, + }), + }); + + yield Ok(Event::default() + .json_data(chat_complete) + .unwrap_or_else(|e| InferError::StreamSerializationError(e.to_string()).into())); + } } Err(err) => yield Ok(err.into_openai_event()) }
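
Taken together, the router changes above mean every content-bearing chunk now serializes "usage": null (even the one carrying finish_reason), and the aggregated counts travel in a single trailing chunk with an empty "choices" array, emitted only when the client opted in via stream_options.include_usage — exactly what the new snapshot fixtures record. A minimal client-side sketch of that contract, modeled on the test_chat_openai_usage test in this patch; the base_url and api_key below are placeholders for a locally running TGI instance, not values defined by the patch:

from openai import OpenAI

# Placeholders: point these at a running TGI server with the /v1 routes.
client = OpenAI(api_key="-", base_url="http://localhost:8080/v1")

stream = client.chat.completions.create(
    model="tgi",
    messages=[{"role": "user", "content": "Say 'OK!'"}],
    max_tokens=10,
    seed=42,
    stream=True,
    stream_options={"include_usage": True},
)

last = None
for chunk in stream:
    if chunk.choices:
        # Content-bearing chunks always report usage as null now,
        # including the final one that carries finish_reason == "stop".
        assert chunk.usage is None
        print(chunk.choices[0].delta.content or "", end="")
    last = chunk

# With include_usage set, the stream ends with one extra chunk whose
# choices list is empty and whose usage holds the aggregated counts.
assert last is not None and not last.choices and last.usage is not None
print()
print(last.usage)  # per the snapshot: prompt_tokens=39, completion_tokens=3, total_tokens=42

Against the "Say 'OK!'" prompt this prints the streamed tokens ("OK", "!") followed by the usage totals from the snapshot; with include_usage unset or false, the should_send_usage guard in chat_completions means the trailing empty-choices chunk is never sent, matching the nousage snapshots.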