From 5c5528e36287c2207fdbc6958f565da883f8d5d8 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 12 Mar 2025 09:28:47 +0100 Subject: [PATCH] Fix tool call4 (#3094) * Removing the no_tool content information. * Removing a lot of NO_TOOL shenanigans. * Update the tests. --- ...ols_insufficient_information_nostream.json | 10 +- ...tools_insufficient_information_stream.json | 320 ++++- ...ammar_tools_sea_creatures_stream_auto.json | 1250 +---------------- integration-tests/models/test_tools_llama.py | 18 +- router/src/chat.rs | 516 ++----- router/src/infer/tool_grammar.rs | 14 +- router/src/server.rs | 72 +- 7 files changed, 526 insertions(+), 1674 deletions(-) diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_nostream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_nostream.json index 797c95788..6d8417479 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_nostream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_nostream.json @@ -5,20 +5,20 @@ "index": 0, "logprobs": null, "message": { - "content": "I am a helpful assistant!", + "content": "I'm an artificial intelligence model known as a large language model (LLM) or conversational AI", "role": "assistant", "tool_calls": null } } ], - "created": 1741263686, + "created": 1741693957, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.2-dev0-native", "usage": { - "completion_tokens": 23, - "prompt_tokens": 494, - "total_tokens": 517 + "completion_tokens": 12, + "prompt_tokens": 53, + "total_tokens": 65 } } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json index dc969ceea..47f23f4c9 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json @@ -1,24 +1,4 @@ [ - { - "choices": [ - { - "delta": { - "content": "", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741364571, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, { "choices": [ { @@ -32,7 +12,7 @@ "logprobs": null } ], - "created": 1741364571, + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -43,7 +23,7 @@ "choices": [ { "delta": { - "content": " am", + "content": "'m", "role": "assistant", "tool_calls": null }, @@ -52,7 +32,127 @@ "logprobs": null } ], - "created": 1741364571, + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " an", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " artificial", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " intelligence", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " model", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " known", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " as", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -72,7 +172,7 @@ "logprobs": null } ], - "created": 1741364571, + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -83,7 +183,7 @@ "choices": [ { "delta": { - "content": " helpful", + "content": " large", "role": "assistant", "tool_calls": null }, @@ -92,7 +192,7 @@ "logprobs": null } ], - "created": 1741364571, + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -103,7 +203,7 @@ "choices": [ { "delta": { - "content": " assistant", + "content": " language", "role": "assistant", "tool_calls": null }, @@ -112,7 +212,7 @@ "logprobs": null } ], - "created": 1741364571, + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -123,7 +223,7 @@ "choices": [ { "delta": { - "content": "!", + "content": " model", "role": "assistant", "tool_calls": null }, @@ -132,7 +232,167 @@ "logprobs": null } ], - "created": 1741364571, + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " (", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "LL", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "M", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ")", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " or", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " convers", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "ational", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " AI", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": "length", + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json index cdac9bc4a..30f039200 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json @@ -3,7 +3,7 @@ "choices": [ { "delta": { - "content": "", + "content": "Once", "role": "assistant", "tool_calls": null }, @@ -12,7 +12,7 @@ "logprobs": null } ], - "created": 1741371722, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -23,7 +23,7 @@ "choices": [ { "delta": { - "content": "There", + "content": " upon", "role": "assistant", "tool_calls": null }, @@ -32,27 +32,7 @@ "logprobs": null } ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " was", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -72,7 +52,7 @@ "logprobs": null } ], - "created": 1741371722, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -83,7 +63,7 @@ "choices": [ { "delta": { - "content": " wise", + "content": " time", "role": "assistant", "tool_calls": null }, @@ -92,7 +72,7 @@ "logprobs": null } ], - "created": 1741371722, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -103,7 +83,7 @@ "choices": [ { "delta": { - "content": " old", + "content": ",", "role": "assistant", "tool_calls": null }, @@ -112,147 +92,7 @@ "logprobs": null } ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " oct", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "opus", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " named", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " Oracle", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ".", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " He", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " lived", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -272,7 +112,7 @@ "logprobs": null } ], - "created": 1741371722, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -292,7 +132,7 @@ "logprobs": null } ], - "created": 1741371722, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -303,7 +143,7 @@ "choices": [ { "delta": { - "content": " cozy", + "content": " vibrant", "role": "assistant", "tool_calls": null }, @@ -312,887 +152,7 @@ "logprobs": null } ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " little", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " cave", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " beneath", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " the", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " waves", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " with", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " his", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " best", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " friend", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ",", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " a", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " curious", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " se", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "ah", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "orse", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " named", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " Fin", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "ley", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ".", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " One", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " day", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ",", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " Fin", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "ley", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " met", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " a", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " playful", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " dolphin", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " named", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " Daisy", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ",", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " and", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " the", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " three", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " became", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " inse", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "parable", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ".", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " They", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " spent", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " their", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " days", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " exploring", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " the", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1212,7 +172,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1223,7 +183,7 @@ "choices": [ { "delta": { - "content": ",", + "content": " filled", "role": "assistant", "tool_calls": null }, @@ -1232,7 +192,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1243,7 +203,7 @@ "choices": [ { "delta": { - "content": " playing", + "content": " with", "role": "assistant", "tool_calls": null }, @@ -1252,7 +212,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1263,7 +223,7 @@ "choices": [ { "delta": { - "content": " hide", + "content": " coral", "role": "assistant", "tool_calls": null }, @@ -1272,7 +232,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1283,7 +243,7 @@ "choices": [ { "delta": { - "content": "-and", + "content": " reefs", "role": "assistant", "tool_calls": null }, @@ -1292,67 +252,7 @@ "logprobs": null } ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "-se", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "ek", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ",", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1372,7 +272,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1383,7 +283,7 @@ "choices": [ { "delta": { - "content": " learning", + "content": " schools", "role": "assistant", "tool_calls": null }, @@ -1392,67 +292,7 @@ "logprobs": null } ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " about", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " the", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " wonders", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1472,7 +312,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1483,7 +323,7 @@ "choices": [ { "delta": { - "content": " the", + "content": " shimmer", "role": "assistant", "tool_calls": null }, @@ -1492,7 +332,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1503,7 +343,7 @@ "choices": [ { "delta": { - "content": " sea", + "content": "ing", "role": "assistant", "tool_calls": null }, @@ -1512,7 +352,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1523,7 +363,7 @@ "choices": [ { "delta": { - "content": " from", + "content": " fish", "role": "assistant", "tool_calls": null }, @@ -1532,7 +372,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1543,36 +383,16 @@ "choices": [ { "delta": { - "content": " Oracle", + "content": ",", "role": "assistant", "tool_calls": null }, - "finish_reason": null, + "finish_reason": "length", "index": 0, "logprobs": null } ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ".", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371725, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", diff --git a/integration-tests/models/test_tools_llama.py b/integration-tests/models/test_tools_llama.py index bb4b308bd..612fa6bd9 100644 --- a/integration-tests/models/test_tools_llama.py +++ b/integration-tests/models/test_tools_llama.py @@ -279,7 +279,7 @@ async def test_flash_llama_grammar_tools_insufficient_information_nostream( ): client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") response = client.chat_completion( - max_tokens=100, + max_tokens=20, seed=24, tools=tools, tool_choice="auto", @@ -299,7 +299,10 @@ async def test_flash_llama_grammar_tools_insufficient_information_nostream( content_generated = response.choices[0].message.content assert response.choices[0].message.tool_calls is None - assert content_generated == "I am a helpful assistant!" + assert ( + content_generated + == "I'm an artificial intelligence model known as a large language model (LLM) or conversational AI" + ) assert response == response_snapshot @@ -310,7 +313,7 @@ async def test_flash_llama_grammar_tools_insufficient_information_stream( ): client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") stream = client.chat_completion( - max_tokens=100, + max_tokens=20, seed=24, tools=tools, tool_choice="auto", @@ -335,7 +338,10 @@ async def test_flash_llama_grammar_tools_insufficient_information_stream( assert chunk.choices[0].delta.tool_calls is None ######## This is exactly the same as the non streaming case - assert content_generated == "I am a helpful assistant!" + assert ( + content_generated + == "I'm an artificial intelligence model known as a large language model (LLM) or conversational AI" + ) assert chunks == response_snapshot @@ -346,7 +352,7 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_auto( ): client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") stream = client.chat_completion( - max_tokens=100, + max_tokens=20, seed=24, tools=tools, tool_choice="auto", @@ -372,7 +378,7 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_auto( assert ( content_generated - == "There was a wise old octopus named Oracle. He lived in a cozy little cave beneath the waves with his best friend, a curious seahorse named Finley. One day, Finley met a playful dolphin named Daisy, and the three became inseparable. They spent their days exploring the ocean, playing hide-and-seek, and learning about the wonders of the sea from Oracle." + == "Once upon a time, in a vibrant ocean filled with coral reefs and schools of shimmering fish," ) assert chunks == response_snapshot diff --git a/router/src/chat.rs b/router/src/chat.rs index 63bd53bf1..d5824fea0 100644 --- a/router/src/chat.rs +++ b/router/src/chat.rs @@ -6,22 +6,6 @@ use crate::{ use serde::Deserialize; use serde_json::Value; -#[derive(Debug, Deserialize)] -#[serde(rename_all = "snake_case")] -enum _NoTool { - NoTool, -} - -#[derive(Debug, Deserialize)] -struct NoToolCall { - _name: _NoTool, - content: String, -} -#[derive(Debug, Deserialize)] -struct NoTool { - function: NoToolCall, -} - #[derive(Debug, Deserialize)] struct ToolCall { _name: String, @@ -34,9 +18,19 @@ struct Call { function: ToolCall, } -pub(crate) fn parse_output( - generated_text: &str, -) -> Result<(Option>, Option), InferError> { +#[cfg_attr(test, derive(Debug))] +pub(crate) enum ChatEvent { + NoTool, + Events(Vec), +} + +#[cfg_attr(test, derive(Debug))] +pub(crate) enum ChatChoice { + NoTool, + ToolCalls(Vec), +} + +pub(crate) fn parse_output(generated_text: &str) -> Result { let call: Call = serde_json::from_str(generated_text).map_err(|e| { InferError::ToolError(format!( "Failed to parse generated text: {} {:?}", @@ -48,16 +42,7 @@ pub(crate) fn parse_output( match &name[..] { "no_tool" => { // parse the content message - let content_message = call - .function - .arguments - .get("content") - .and_then(Value::as_str) - .ok_or_else(|| { - InferError::ToolError("No `content` found in generated text".to_string()) - })? - .to_string(); - Ok((None, Some(content_message))) + Ok(ChatChoice::NoTool) } name => { let tool_calls = vec![crate::ToolCall { @@ -73,7 +58,7 @@ pub(crate) fn parse_output( })?, }, }]; - Ok((Some(tool_calls), None)) + Ok(ChatChoice::ToolCalls(tool_calls)) } } } @@ -158,10 +143,6 @@ enum StreamState { Buffering, /// We detected a tool call here Tool, - /// During the `content` part of the tool call - NoTool, - /// Finishing frames of the ToolCall - NoToolFinish, /// This is without tool calling Content, } @@ -202,34 +183,16 @@ impl ChatState { } } - pub fn push(&mut self, mut stream_token: StreamResponse) -> Vec { + pub fn push(&mut self, mut stream_token: StreamResponse) -> ChatEvent { let mut events = vec![]; let token_text = &stream_token.token.text; match self.state { StreamState::Buffering => { self.text.push_str(token_text); - // We have a special match for `no_tool` in order to capture directly the `content` - // key which should be re-emitted as raw text. - if let Ok(value) = serde_json::from_str::(&format!("{}\"}}}}", self.text)) { - self.state = StreamState::NoTool; - // Modifiy the content of the token to be whatever was captured by the JSON - stream_token.token.text = value.function.content; - let chat_complete = create_event_from_stream_token( - &stream_token, - self.logprobs, - false, - self.fingerprint.clone(), - self.model_id.clone(), - None, - self.id.clone(), - ); - - events.push(chat_complete); - } - // XXX Caution, here we do not postfix the quote, so that the current output - // Is necessarily finished with quotes for us to be able to parse. + tracing::info!("Current text {:?}", self.text); let partial = &self.text; - let partial = partial.trim_end_matches(|c: char| c.is_whitespace() || c == ','); + let partial = + partial.trim_end_matches(|c: char| c.is_whitespace() || c == ',' || c == '}'); if let Ok(call) = serde_json::from_str::(&format!("{}}}}}", partial)) { // This can be no_tool before the content has been emitted if call.function._name != "no_tool" { @@ -246,6 +209,8 @@ impl ChatState { events.push(chat_complete); self.state = StreamState::Tool; + } else { + return ChatEvent::NoTool; } } } @@ -282,50 +247,6 @@ impl ChatState { events.push(chat_complete); } } - // if we skipped sending the buffer we need to avoid sending the following json key and quotes - // We have remainder tokens, ignore everying, - StreamState::NoToolFinish => {} - StreamState::NoTool => { - self.text.push_str(token_text); - if token_text.contains("\"") { - let mut text = self - .text - .trim_end_matches(|c: char| c.is_whitespace() || c == '}'); - // Trim once - if text.ends_with("\"") { - // Verify we have actually trimmed something - // The opposite can happen if the model is outputting inline JSON. - text = &text[..text.len() - 1]; - if let Ok(_value) = - serde_json::from_str::(&format!("{}\"}}}}", text)) - { - let mut text = token_text - .trim_end_matches(|c: char| c.is_whitespace() || c == '}'); - // Effectively trim_end_match('"', 1) - // because we do not want to eventually trim finishing escaped quotes - // {{"\"Something\""}} - if text.ends_with("\"") { - text = &text[..text.len() - 1]; - } - stream_token.token.text = text.to_string(); - self.state = StreamState::NoToolFinish; - } - } - } - // This escaping is usually inline json escaping and we can therefore remove it. - stream_token.token.text = stream_token.token.text.replace("\\", ""); - let chat_complete = create_event_from_stream_token( - &stream_token, - self.logprobs, - false, - self.fingerprint.clone(), - self.model_id.clone(), - None, - self.id.clone(), - ); - - events.push(chat_complete); - } StreamState::Content => { let chat_complete = create_event_from_stream_token( &stream_token, @@ -373,7 +294,7 @@ impl ChatState { events.push(chat_complete); } } - events + ChatEvent::Events(events) } } @@ -385,24 +306,6 @@ mod tests { use super::*; - fn get_text_content(event: &CompletionType) -> &String { - match event { - CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { - assert_eq!(choices.len(), 1); - if let ChatCompletionChoice { - delta: ChatCompletionDelta::Chat(TextMessage { content, .. }), - .. - } = &choices[0] - { - content - } else { - panic!("Expected plain message"); - } - } - _ => panic!("Unexpected chunk"), - } - } - fn get_tool_call_content(event: &CompletionType) -> (Option<&String>, &String) { match event { CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { @@ -456,24 +359,28 @@ mod tests { index: 0, details: None, }); - assert_eq!(events.len(), 1); - match &events[0] { - CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { - assert_eq!( - choices, - &[ChatCompletionChoice { - index: 0, - delta: ChatCompletionDelta::Chat(TextMessage { - role: "assistant".to_string(), - content: "Hi".to_string(), - tool_call_id: None, - }), - logprobs: None, - finish_reason: None, - }] - ); + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 1); + match &events[0] { + CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { + assert_eq!( + choices, + &[ChatCompletionChoice { + index: 0, + delta: ChatCompletionDelta::Chat(TextMessage { + role: "assistant".to_string(), + content: "Hi".to_string(), + tool_call_id: None, + }), + logprobs: None, + finish_reason: None, + }] + ); + } + _ => panic!("Unexpected chunk"), } - _ => panic!("Unexpected chunk"), + } else { + panic!("Expected chat events"); } } @@ -507,43 +414,47 @@ mod tests { finish_reason: FinishReason::Length, }), }); - assert_eq!(events.len(), 2); - match &events[0] { - CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { - assert_eq!( - choices, - &[ChatCompletionChoice { - index: 0, - delta: ChatCompletionDelta::Chat(TextMessage { - role: "assistant".to_string(), - content: "Hi".to_string(), - tool_call_id: None, - }), - logprobs: None, - // HAS A FINISH REASON - finish_reason: Some("length".to_string()), - }] - ); + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 2); + match &events[0] { + CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { + assert_eq!( + choices, + &[ChatCompletionChoice { + index: 0, + delta: ChatCompletionDelta::Chat(TextMessage { + role: "assistant".to_string(), + content: "Hi".to_string(), + tool_call_id: None, + }), + logprobs: None, + // HAS A FINISH REASON + finish_reason: Some("length".to_string()), + }] + ); + } + _ => panic!("Unexpected chunk"), } - _ => panic!("Unexpected chunk"), - } - match &events[1] { - CompletionType::ChatCompletionChunk(ChatCompletionChunk { usage, .. }) => { - assert_eq!( - *usage, - Some(Usage { - prompt_tokens: 2, - completion_tokens: 10, - total_tokens: 12, - }) - ); + match &events[1] { + CompletionType::ChatCompletionChunk(ChatCompletionChunk { usage, .. }) => { + assert_eq!( + *usage, + Some(Usage { + prompt_tokens: 2, + completion_tokens: 10, + total_tokens: 12, + }) + ); + } + _ => panic!("Unexpected chunk"), } - _ => panic!("Unexpected chunk"), + } else { + panic!("Expected chat events"); } } #[test] - fn test_chat_stream_tool_no_tool() { + fn test_chat_stream_tool_no_tool_simple() { let mut chat_state = ChatState::new( true, StreamOptions { @@ -597,217 +508,21 @@ mod tests { .collect(); // Initial ignored output - for token in &tokens[..14] { + for token in &tokens[..10] { let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 0); - } - - // No tool output - let mut output = String::new(); - for token in &tokens[14..14 + 7] { - let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 1); - let content = get_text_content(&events[0]); - output.push_str(content); - } - - assert_eq!(output, "I am a helpful assistant!"); - - // No tool finish - for token in &tokens[14 + 7..] { - let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 0); - } - } - - #[test] - fn test_chat_stream_tool_no_tool_many_quotes() { - let mut chat_state = ChatState::new( - true, - StreamOptions { - include_usage: true, - }, - "fingerprint".to_string(), - "model_id".to_string(), - false, - "0".to_string(), - ); - - let tokens = vec![ - "{\"".to_string(), - "function".to_string(), - "\":".to_string(), - " {\"".to_string(), - "_".to_string(), - "name".to_string(), - "\":".to_string(), - " \"".to_string(), - "no".to_string(), - "_tool".to_string(), - "\",".to_string(), - " \"".to_string(), - "content".to_string(), - "\":".to_string(), - " \"".to_string(), // Token 14 - "I".to_string(), // Event 1 - " am".to_string(), // Event 2 - " a".to_string(), // Event 3 - " helpful".to_string(), // Event 4 - " assistant".to_string(), // Event 5 - "!\\\"\"".to_string(), // Extra inside the string quote that would get removed - "}".to_string(), - "}".to_string(), - ]; - - // Initial ignored output - for text in &tokens[..14] { - let events = chat_state.push(StreamResponse { - generated_text: None, - token: Token { - id: 42, - text: text.to_string(), - logprob: 0.0, - special: false, - }, - top_tokens: vec![], - index: 0, - details: None, - }); - assert_eq!(events.len(), 0); - } - - // No tool output - let mut output = String::new(); - for text in &tokens[14..14 + 7] { - let events = chat_state.push(StreamResponse { - generated_text: None, - token: Token { - id: 42, - text: text.to_string(), - logprob: 0.0, - special: false, - }, - top_tokens: vec![], - index: 0, - details: None, - }); - assert_eq!(events.len(), 1); - match &events[0] { - CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { - assert_eq!(choices.len(), 1); - if let ChatCompletionChoice { - delta: ChatCompletionDelta::Chat(TextMessage { content, .. }), - .. - } = &choices[0] - { - output.push_str(content); - } else { - panic!("Expected plain message"); - } - } - _ => panic!("Unexpected chunk"), + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 0, "{events:?}"); + } else { + panic!("Expected chat events"); } } - assert_eq!(output, "I am a helpful assistant!\""); - - // No tool finish - for text in &tokens[14 + 7..] { - let events = chat_state.push(StreamResponse { - generated_text: None, - token: Token { - id: 42, - text: text.to_string(), - logprob: 0.0, - special: false, - }, - top_tokens: vec![], - index: 0, - details: None, - }); - assert_eq!(events.len(), 0); - } - } - - #[test] - fn test_chat_stream_tool_no_tool_inline_json() { - let mut chat_state = ChatState::new( - true, - StreamOptions { - include_usage: true, - }, - "fingerprint".to_string(), - "model_id".to_string(), - false, - "0".to_string(), - ); - - let tokens = vec![ - "{\"".to_string(), - "function".to_string(), - "\":".to_string(), - " {\"".to_string(), - "_".to_string(), - "name".to_string(), - "\":".to_string(), - " \"".to_string(), - "no".to_string(), - "_tool".to_string(), - "\",".to_string(), - " \"".to_string(), - "content".to_string(), - "\":".to_string(), - " \"".to_string(), // Token 14 - "{\\\"".to_string(), // Event 1 - "a".to_string(), // Event 1 - "\\\":".to_string(), // Event 1 - "2".to_string(), // Event 2 - ",\\".to_string(), // Event 2 - "\"".to_string(), // Event 2 - "b".to_string(), // Event 3 - "\\\": ".to_string(), // Event 4 - "1".to_string(), // Event 5 - "}".to_string(), // Event 5 - "\"}".to_string(), // Extra inside the string quote that would get removed - "}".to_string(), - ]; - let tokens: Vec<_> = tokens - .into_iter() - .map(|text| StreamResponse { - generated_text: None, - token: Token { - id: 42, - text: text.to_string(), - logprob: 0.0, - special: false, - }, - top_tokens: vec![], - index: 0, - details: None, - }) - .collect(); - - // Initial ignored output - for token in &tokens[..14] { - let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 0); - } - // No tool output - let mut output = String::new(); - for token in &tokens[14..14 + 12] { - let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 1, "Current text is {output:?}"); - let content = get_text_content(&events[0]); - output.push_str(content); - } - - assert_eq!(output, "{\"a\":2,\"b\": 1}"); - - // No tool finish - for token in &tokens[14 + 12..] { - let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 0, "Extra events {events:?}"); + let events = chat_state.push(tokens[10].clone()); + if let ChatEvent::NoTool = events { + assert!(true); + } else { + panic!("Expected chat events"); } } @@ -859,26 +574,21 @@ mod tests { .collect(); // Initial ignored output - for token in &tokens[..13] { + for token in &tokens[..10] { let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 0); + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 0, "{events:?}"); + } else { + panic!("Expected chat events"); + } } // No tool output - let mut output = String::new(); - for token in &tokens[13..13 + 2] { - let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 1, "Current text is {output:?}"); - let content = get_text_content(&events[0]); - output.push_str(content); - } - - assert_eq!(output, ""); - - // No tool finish - for token in &tokens[13 + 2..] { - let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 0, "Extra events {events:?}"); + let events = chat_state.push(tokens[10].clone()); + if let ChatEvent::NoTool = events { + assert!(true); + } else { + panic!("Expected chat events"); } } @@ -946,7 +656,11 @@ mod tests { // Initial ignored output for token in &tokens[..11] { let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 0, "{events:?}"); + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 0, "{events:?}"); + } else { + panic!("Expected chat events"); + } } // No tool output @@ -954,13 +668,17 @@ mod tests { let mut output_name = String::new(); for token in &tokens[11..11 + 17] { let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 1); - let (name, arguments) = get_tool_call_content(&events[0]); - if let Some(name) = name { - assert_eq!(name, "get_current_weather"); - output_name.push_str(&name); + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 1); + let (name, arguments) = get_tool_call_content(&events[0]); + if let Some(name) = name { + assert_eq!(name, "get_current_weather"); + output_name.push_str(&name); + } + output.push_str(arguments); + } else { + panic!("Expected chat events"); } - output.push_str(arguments); } assert_eq!(output_name, "get_current_weather"); @@ -972,7 +690,11 @@ mod tests { // No tool finish for token in &tokens[11 + 17..] { let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 0); + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 0, "{events:?}"); + } else { + panic!("Expected chat events"); + } } } } diff --git a/router/src/infer/tool_grammar.rs b/router/src/infer/tool_grammar.rs index 7770cd9d7..e4e208598 100644 --- a/router/src/infer/tool_grammar.rs +++ b/router/src/infer/tool_grammar.rs @@ -40,13 +40,13 @@ impl ToolGrammar { ), arguments: json!({ "type": "object", - "properties": { - "content": { - "type": "string", - "description": "The response content", - } - }, - "required": ["content"] + // "properties": { + // "content": { + // "type": "string", + // "description": "The response content", + // } + // }, + // "required": ["content"] }), }, })) diff --git a/router/src/server.rs b/router/src/server.rs index d0fa15d5f..0346b1f19 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1,4 +1,4 @@ -use crate::chat::ChatState; +use crate::chat::{ChatChoice, ChatEvent, ChatState}; /// HTTP Server logic use crate::config::Config; use crate::infer::{Backend, Infer, InferError, InferResponse, InferStreamResponse}; @@ -1151,7 +1151,7 @@ pub(crate) async fn chat_completions( Extension(infer): Extension, Extension(compute_type): Extension, Extension(info): Extension, - Json(chat): Json, + Json(mut chat): Json, ) -> Result)> { let span = tracing::Span::current(); metrics::counter!("tgi_request_count").increment(1); @@ -1166,7 +1166,7 @@ pub(crate) async fn chat_completions( tracing::debug!("Got chat_template {:?}", infer.chat_template); let id = chat.next_tool_call_id(); let (generate_request, using_tools): (GenerateRequest, bool) = - chat.try_into_generate(&infer)?; + chat.clone().try_into_generate(&infer)?; span.record("parameters", format!("{:?}", generate_request.parameters)); let logprobs = logprobs.unwrap_or_default(); @@ -1178,21 +1178,41 @@ pub(crate) async fn chat_completions( let system_fingerprint = format!("{}-{}", info.version, info.docker_label.unwrap_or("native")); // switch on stream if stream { - let (headers, response_stream) = - generate_stream_internal(infer, compute_type, Json(generate_request), span).await; + let (headers, response_stream) = generate_stream_internal( + infer.clone(), + compute_type.clone(), + Json(generate_request), + span.clone(), + ) + .await; let response_stream = async_stream::stream! { let mut response_stream = Box::pin(response_stream); - let mut state = ChatState::new(using_tools, stream_options, system_fingerprint, model_id, logprobs, id); + let mut state = ChatState::new(using_tools, stream_options.clone(), system_fingerprint.clone(), model_id.clone(), logprobs, id.clone()); while let Some(result) = response_stream.next().await { match result{ Ok(stream_token) => { let events = state.push(stream_token); - for chat_complete in events{ - yield Ok(Event::default().json_data(chat_complete).unwrap_or_else(|e| { - tracing::error!("Failed to serialize ChatCompletionChunk: {:?}", e); - Event::default() - })); + match events{ + ChatEvent::NoTool => { + chat.tools = None; + chat.response_format = None; + let (generate_request, using_tools): (GenerateRequest, bool) = + chat.clone().try_into_generate(&infer).unwrap(); + assert!(!using_tools); + let (_headers, response_stream2) = + generate_stream_internal(infer.clone(), compute_type.clone(), Json(generate_request), span.clone()).await; + state = ChatState::new(using_tools, stream_options.clone(), system_fingerprint.clone(), model_id.clone(), logprobs, id.clone()); + response_stream = Box::pin(response_stream2); + } + ChatEvent::Events(events) => { + for chat_complete in events{ + yield Ok(Event::default().json_data(chat_complete).unwrap_or_else(|e| { + tracing::error!("Failed to serialize ChatCompletionChunk: {:?}", e); + Event::default() + })); + } + } } } Err(err) => yield Ok(err.into_openai_event()) @@ -1204,8 +1224,13 @@ pub(crate) async fn chat_completions( let sse = Sse::new(response_stream).keep_alive(KeepAlive::default()); Ok((headers, sse).into_response()) } else { - let (headers, input_length, Json(generation)) = - generate_internal(Extension(infer), compute_type, Json(generate_request), span).await?; + let (mut headers, mut input_length, Json(generation)) = generate_internal( + Extension(infer.clone()), + compute_type.clone(), + Json(generate_request), + span.clone(), + ) + .await?; let current_time = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) @@ -1213,7 +1238,26 @@ pub(crate) async fn chat_completions( .as_secs(); let (tool_calls, output) = if using_tools { - crate::chat::parse_output(&generation.generated_text)? + match crate::chat::parse_output(&generation.generated_text)? { + ChatChoice::NoTool => { + chat.tools = None; + chat.response_format = None; + let (generate_request, using_tools): (GenerateRequest, bool) = + chat.clone().try_into_generate(&infer)?; + assert!(!using_tools); + let (headers_final, input_length_final, Json(generation)) = generate_internal( + Extension(infer), + compute_type, + Json(generate_request), + span, + ) + .await?; + headers = headers_final; + input_length = input_length_final; + (None, Some(generation.generated_text)) + } + ChatChoice::ToolCalls(tool_calls) => (Some(tool_calls), None), + } } else { (None, Some(generation.generated_text)) };