From 8e92942a18f51b3670c5285baef1885526b64da0 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 5 Mar 2025 22:32:31 +0100 Subject: [PATCH] Making `tool_calls` a vector. (#3075) * Making `tool_calls` a vector. * Update doc. * Fixing the nix overlay with updated version. * Add openai dependency. * Updating the old tests. * Trying to reduce the logs in the case of errors. * Less spammy logs too. --- backends/v3/src/queue.rs | 4 +- clients/python/text_generation/types.py | 2 +- docs/openapi.json | 5 +- integration-tests/conftest.py | 14 +- .../test_flash_llama_grammar_tools.json | 4 +- .../test_flash_llama_grammar_tools_auto.json | 4 +- ...test_flash_llama_grammar_tools_choice.json | 20 +- ...rammar_tools_insufficient_information.json | 10 +- ...tools_insufficient_information_stream.json | 4 +- ...test_flash_llama_grammar_tools_openai.json | 992 ++++++++++++++++++ ...ma_grammar_tools_sea_creatures_stream.json | 4 +- ..._sea_creatures_stream_function_object.json | 24 +- ...ammar_tools_sea_creatures_stream_none.json | 4 +- ...r_tools_sea_creatures_stream_required.json | 24 +- ...test_flash_llama_grammar_tools_stream.json | 24 +- .../test_flash_llama_tool_reply_response.json | 4 +- integration-tests/models/test_tools_llama.py | 45 +- integration-tests/pyproject.toml | 1 + integration-tests/requirements.txt | 38 +- nix/overlay.nix | 4 +- router/src/lib.rs | 6 +- 21 files changed, 1158 insertions(+), 79 deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json diff --git a/backends/v3/src/queue.rs b/backends/v3/src/queue.rs index 249eebf76..d3bf4b9c0 100644 --- a/backends/v3/src/queue.rs +++ b/backends/v3/src/queue.rs @@ -311,7 +311,7 @@ impl State { + entry.request.stopping_parameters.max_new_tokens + self.speculate - 1; - tracing::debug!("Allocating {tokens} with {input_ids:?}"); + // tracing::debug!("Allocating {tokens} with {input_ids:?}"); let block_allocation = match 
block_allocator.allocate(tokens, input_ids).await { None => { @@ -322,7 +322,7 @@ impl State { break 'entry_loop; } Some(mut block_allocation) => { - tracing::debug!("Allocation: {block_allocation:?}"); + // tracing::debug!("Allocation: {block_allocation:?}"); max_blocks = max(max_blocks, block_allocation.blocks.len() as u32); if block_allocation.prefix_len == entry.request.input_length { diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py index 1085075e4..6f51c153e 100644 --- a/clients/python/text_generation/types.py +++ b/clients/python/text_generation/types.py @@ -67,7 +67,7 @@ class ChoiceDeltaToolCall(BaseModel): class ChoiceDelta(BaseModel): role: str content: Optional[str] = None - tool_calls: Optional[ChoiceDeltaToolCall] = None + tool_calls: Optional[List[ChoiceDeltaToolCall]] = None class Choice(BaseModel): diff --git a/docs/openapi.json b/docs/openapi.json index e16ca7f97..e1ce234ed 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -2302,7 +2302,10 @@ "example": "assistant" }, "tool_calls": { - "$ref": "#/components/schemas/DeltaToolCall" + "type": "array", + "items": { + "$ref": "#/components/schemas/DeltaToolCall" + } } } }, diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index 0ffcd162b..01250ce2d 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -98,7 +98,7 @@ def pytest_collection_modifyitems(config, items): selector(item) -@pytest.fixture(autouse=True) +@pytest.fixture(autouse=True, scope="module") def container_log(request: SubRequest): error_log = request.getfixturevalue("error_log") assert error_log is not None @@ -269,7 +269,17 @@ class ResponseComparator(JSONSnapshotExtension): def eq_chat_complete_chunk( response: ChatCompletionChunk, other: ChatCompletionChunk ) -> bool: - return response.choices[0].delta.content == other.choices[0].delta.content + if response.choices[0].delta.content is not None: + return ( + 
response.choices[0].delta.content == other.choices[0].delta.content + ) + elif response.choices[0].delta.tool_calls is not None: + return ( + response.choices[0].delta.tool_calls + == other.choices[0].delta.tool_calls + ) + else: + raise RuntimeError(f"Invalid empty chat chunk {response} vs {other}") def eq_response(response: Response, other: Response) -> bool: return response.generated_text == other.generated_text and eq_details( diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json index 33e223ba4..7445099f2 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json @@ -26,11 +26,11 @@ "usage": null } ], - "created": 1732293383, + "created": 1741195536, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", - "system_fingerprint": "2.4.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": { "completion_tokens": 30, "prompt_tokens": 615, diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json index 92ffbbc15..99018f96e 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json @@ -26,11 +26,11 @@ "usage": null } ], - "created": 1732293384, + "created": 1741195538, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", - "system_fingerprint": "2.4.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": { "completion_tokens": 30, "prompt_tokens": 615, diff --git 
a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json index 603c90afa..a80a6a23d 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json @@ -1,7 +1,7 @@ { "choices": [ { - "finish_reason": "eos_token", + "finish_reason": "stop", "index": 0, "logprobs": null, "message": { @@ -13,12 +13,12 @@ "function": { "arguments": { "format": "celsius", - "location": "New York, NY" + "location": "Brooklyn, New York" }, "description": null, "name": "get_current_weather" }, - "id": 0, + "id": "0", "type": "function" } ] @@ -26,14 +26,14 @@ "usage": null } ], - "created": 1712852394, + "created": 1741195540, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.0.1-native", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion", + "system_fingerprint": "3.1.2-dev0-native", "usage": { - "completion_tokens": 48, - "prompt_tokens": 320, - "total_tokens": 368 + "completion_tokens": 30, + "prompt_tokens": 326, + "total_tokens": 356 } } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json index 3ed893fa9..9cfea7912 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json @@ -13,14 +13,14 @@ "usage": null } ], - "created": 1728497062, + "created": 1741195542, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", 
"object": "chat.completion", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": { - "completion_tokens": 23, - "prompt_tokens": 604, - "total_tokens": 627 + "completion_tokens": 22, + "prompt_tokens": 608, + "total_tokens": 630 } } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json index b134004a1..34615f8ed 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json @@ -11,10 +11,10 @@ "logprobs": null } ], - "created": 1728497531, + "created": 1741195542, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json new file mode 100644 index 000000000..e6d789248 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json @@ -0,0 +1,992 @@ +[ + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "{\"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, 
+ "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "function", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " {\"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "_", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + 
"logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "name", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + 
{ + "function": { + "arguments": "get", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "_current", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "_weather", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\",", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": 
"3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "location", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + 
"created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "Bro", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "oklyn", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": ",", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + 
"arguments": " New", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " York", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\",", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + 
}, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "format", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195537, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195537, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "c", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195537, + "id": "", + 
"model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "elsius", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195537, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\"}}", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195537, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "<|eot_id|>", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1741195537, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + } +] diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json 
b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json index 1362b4721..116441908 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json @@ -11,10 +11,10 @@ "logprobs": null } ], - "created": 1728497461, + "created": 1741195545, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json index bb8d61c8e..713e7a56c 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json @@ -3,25 +3,27 @@ { "delta": { "role": "assistant", - "tool_calls": { - "function": { - "arguments": "<|eot_id|>", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } + "tool_calls": [ + { + "function": { + "arguments": "<|eot_id|>", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] }, "finish_reason": "stop", "index": 0, "logprobs": null } ], - "created": 1732293254, + "created": 1741195554, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.4.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null } diff --git 
a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json index 2ccab4a9d..bde28149b 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json @@ -11,10 +11,10 @@ "logprobs": null } ], - "created": 1729262528, + "created": 1741195551, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.3.2-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json index dbced5b8e..7896607a7 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json @@ -4,25 +4,27 @@ "delta": { "content": null, "role": "assistant", - "tool_calls": { - "function": { - "arguments": "<|eot_id|>", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } + "tool_calls": [ + { + "function": { + "arguments": "<|eot_id|>", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] }, "finish_reason": "stop", "index": 0, "logprobs": null } ], - "created": 1732293246, + "created": 1741195548, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.4.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", 
"usage": null } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json index 27d2f9cae..92d27f614 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json @@ -4,25 +4,27 @@ "delta": { "content": null, "role": "assistant", - "tool_calls": { - "function": { - "arguments": "<|eot_id|>", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } + "tool_calls": [ + { + "function": { + "arguments": "<|eot_id|>", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] }, "finish_reason": "stop", "index": 0, "logprobs": null } ], - "created": 1732293235, + "created": 1741195541, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.4.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json index 4f10aa3b9..33a3bb430 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json @@ -13,11 +13,11 @@ "usage": null } ], - "created": 1739932427, + "created": 1741195556, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", - "system_fingerprint": "3.1.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": { "completion_tokens": 79, "prompt_tokens": 103, diff --git a/integration-tests/models/test_tools_llama.py 
b/integration-tests/models/test_tools_llama.py index b8a90cff9..7fd6cadd8 100644 --- a/integration-tests/models/test_tools_llama.py +++ b/integration-tests/models/test_tools_llama.py @@ -1,6 +1,7 @@ import pytest import requests import json +from openai import OpenAI @pytest.fixture(scope="module") @@ -108,6 +109,38 @@ async def test_flash_llama_grammar_tools(flash_llama_grammar_tools, response_sna assert response == response_snapshot +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_grammar_tools_openai( + flash_llama_grammar_tools, response_snapshot +): + client = OpenAI(api_key="xx", base_url=f"{flash_llama_grammar_tools.base_url}/v1") + stream = client.chat.completions.create( + model="tgi", + max_tokens=100, + seed=1, + tools=tools, + stream=True, + temperature=0.0, + messages=[ + { + "role": "system", + "content": "Youre a helpful assistant! Answer the users question best you can.", + }, + { + "role": "user", + "content": "What is the weather like in Brooklyn, New York?", + }, + ], + ) + + chunks = [] + for chunk in stream: + chunks.append(chunk) + + assert chunks == response_snapshot + + @pytest.mark.asyncio @pytest.mark.private async def test_flash_llama_grammar_tools_auto( @@ -213,7 +246,9 @@ async def test_flash_llama_grammar_tools_stream( last_response = None async for response in responses: count += 1 - tool_calls_generated += response.choices[0].delta.tool_calls.function.arguments + tool_calls_generated += ( + response.choices[0].delta.tool_calls[0].function.arguments + ) last_response = response assert response.choices[0].delta.content is None @@ -360,7 +395,9 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_required( async for response in responses: count += 1 assert response.choices[0].delta.content is None - tool_calls_generated += response.choices[0].delta.tool_calls.function.arguments + tool_calls_generated += ( + response.choices[0].delta.tool_calls[0].function.arguments + ) last_response = response assert 
count == 29 @@ -458,8 +495,8 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_function_object( break response = json.loads(line) tool_calls_generated += response["choices"][0]["delta"]["tool_calls"][ - "function" - ]["arguments"] + 0 + ]["function"]["arguments"] last_response = response assert count == 39 diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml index 1838995ed..37003440f 100644 --- a/integration-tests/pyproject.toml +++ b/integration-tests/pyproject.toml @@ -13,6 +13,7 @@ dependencies = [ "pytest-asyncio>=0.23.1", "docker>=7", "numpy>=2.0", + "openai>=1.65", ] [tool.isort] diff --git a/integration-tests/requirements.txt b/integration-tests/requirements.txt index c5a918259..d419d4b36 100644 --- a/integration-tests/requirements.txt +++ b/integration-tests/requirements.txt @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile pyproject.toml --output-file requirements.txt +# uv pip compile pyproject.toml -o requirements.txt aiohappyeyeballs==2.4.6 # via aiohttp aiohttp==3.11.12 @@ -8,12 +8,21 @@ aiosignal==1.3.2 # via aiohttp annotated-types==0.7.0 # via pydantic +anyio==4.8.0 + # via + # httpx + # openai attrs==25.1.0 # via aiohttp certifi==2025.1.31 - # via requests + # via + # httpcore + # httpx + # requests charset-normalizer==3.4.1 # via requests +distro==1.9.0 + # via openai docker==7.1.0 # via text-generation-integration-tests (pyproject.toml) filelock==3.17.0 @@ -24,20 +33,32 @@ frozenlist==1.5.0 # aiosignal fsspec==2025.2.0 # via huggingface-hub +h11==0.14.0 + # via httpcore +httpcore==1.0.7 + # via httpx +httpx==0.28.1 + # via openai huggingface-hub==0.29.0 # via text-generation idna==3.10 # via + # anyio + # httpx # requests # yarl iniconfig==2.0.0 # via pytest +jiter==0.8.2 + # via openai multidict==6.1.0 # via # aiohttp # yarl numpy==2.2.3 # via text-generation-integration-tests (pyproject.toml) +openai==1.65.3 + # via text-generation-integration-tests 
(pyproject.toml) packaging==24.2 # via # huggingface-hub @@ -51,6 +72,7 @@ propcache==0.2.1 pydantic==2.10.6 # via # text-generation-integration-tests (pyproject.toml) + # openai # text-generation pydantic-core==2.27.2 # via pydantic @@ -67,15 +89,23 @@ requests==2.32.3 # via # docker # huggingface-hub +sniffio==1.3.1 + # via + # anyio + # openai syrupy==4.8.1 # via text-generation-integration-tests (pyproject.toml) text-generation==0.7.0 # via text-generation-integration-tests (pyproject.toml) tqdm==4.67.1 - # via huggingface-hub -typing-extensions==4.12.2 # via # huggingface-hub + # openai +typing-extensions==4.12.2 + # via + # anyio + # huggingface-hub + # openai # pydantic # pydantic-core urllib3==2.3.0 diff --git a/nix/overlay.nix b/nix/overlay.nix index d90478192..63398f075 100644 --- a/nix/overlay.nix +++ b/nix/overlay.nix @@ -18,8 +18,8 @@ final: prev: { src = final.fetchFromGitHub { owner = "huggingface"; repo = "transformers"; - rev = "8d73a38606bc342b370afe1f42718b4828d95aaa"; - hash = "sha256-MxroG6CWqrcmRS+eFt7Ej87TDOInN15aRPBUcaycKTI="; + rev = "v4.49.0"; + hash = "sha256-drq7RWoRaRejiQjCUHIYuzaKa9rA4eQZI2do74scp1c="; }; } ); diff --git a/router/src/lib.rs b/router/src/lib.rs index 637e6c56e..60f1f73a4 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -730,7 +730,7 @@ pub(crate) struct ChatCompletionChoice { pub struct ToolCallDelta { #[schema(example = "assistant")] role: String, - tool_calls: DeltaToolCall, + tool_calls: Vec<DeltaToolCall>, } #[derive(Clone, Debug, Serialize, ToSchema)] @@ -774,7 +774,7 @@ impl ChatCompletionChunk { }), (None, Some(tool_calls)) => ChatCompletionDelta::Tool(ToolCallDelta { role: "assistant".to_string(), - tool_calls: DeltaToolCall { + tool_calls: vec![DeltaToolCall { index: 0, id: String::new(), r#type: "function".to_string(), @@ -782,7 +782,7 @@ impl ChatCompletionChunk { name: None, arguments: tool_calls[0].to_string(), }, - }, + }], }), (None, None) => ChatCompletionDelta::Chat(TextMessage { role: "assistant".to_string(),