Fix tool call4 (#3094)

* Removing the no_tool content information.

* Removing a lot of NO_TOOL shenanigans.

* Update the tests.
Nicolas Patry 2025-03-12 09:28:47 +01:00 committed by GitHub
parent ed46c2c414
commit 5c5528e362
7 changed files with 526 additions and 1674 deletions
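
In plain terms: the router used to expose a `no_tool` pseudo-tool whose arguments had to carry a `content` string, and both the streaming and non-streaming paths dug that string back out of the generated JSON (the "NO_TOOL shenanigans" above). After this commit, the parser only reports that `no_tool` was chosen, and the server re-runs the request without tools so the model answers in plain text. A minimal, self-contained sketch of the new non-streaming flow; every type and helper below is a hypothetical stand-in for the real ones in `router/src/chat.rs` and `router/src/server.rs`:

```rust
// Stand-in for the new ChatChoice enum in router/src/chat.rs.
enum ChatChoice {
    NoTool,
    ToolCalls(Vec<String>), // String stands in for crate::ToolCall
}

// Stand-in for crate::chat::parse_output: the real one deserializes the
// grammar-constrained JSON with serde; here we only look for the name.
fn parse_output(generated_text: &str) -> ChatChoice {
    if generated_text.contains("\"no_tool\"") {
        ChatChoice::NoTool
    } else {
        ChatChoice::ToolCalls(vec![generated_text.to_string()])
    }
}

// Placeholder for a second generate_internal call issued with
// chat.tools = None and chat.response_format = None.
fn generate_without_tools() -> String {
    "plain-text answer from a second, tool-free generation".to_string()
}

fn answer(first_pass: &str) -> String {
    match parse_output(first_pass) {
        // Old behavior: extract a `content` string from the tool-call JSON.
        // New behavior: retry the whole request without tools.
        ChatChoice::NoTool => generate_without_tools(),
        ChatChoice::ToolCalls(calls) => format!("tool calls: {calls:?}"),
    }
}

fn main() {
    println!("{}", answer(r#"{"function": {"_name": "no_tool"}}"#));
}
```

The retry is also why the fixtures below change: the expected text becomes the model's own free-form answer rather than a canned `content` string, and (presumably to keep the fixtures short) `max_tokens` drops from 100 to 20.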

View File

@@ -5,20 +5,20 @@
"index": 0,
"logprobs": null,
"message": {
"content": "I am a helpful assistant!",
"content": "I'm an artificial intelligence model known as a large language model (LLM) or conversational AI",
"role": "assistant",
"tool_calls": null
}
}
],
"created": 1741263686,
"created": 1741693957,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion",
"system_fingerprint": "3.1.2-dev0-native",
"usage": {
"completion_tokens": 23,
"prompt_tokens": 494,
"total_tokens": 517
"completion_tokens": 12,
"prompt_tokens": 53,
"total_tokens": 65
}
}

View File

@@ -1,24 +1,4 @@
[
{
"choices": [
{
"delta": {
"content": "",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741364571,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
@@ -32,7 +12,7 @@
"logprobs": null
}
],
"created": 1741364571,
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
@@ -43,7 +23,7 @@
"choices": [
{
"delta": {
"content": " am",
"content": "'m",
"role": "assistant",
"tool_calls": null
},
@@ -52,7 +32,127 @@
"logprobs": null
}
],
"created": 1741364571,
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": " an",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": " artificial",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": " intelligence",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": " model",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": " known",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": " as",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
@@ -72,7 +172,7 @@
"logprobs": null
}
],
"created": 1741364571,
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
@@ -83,7 +183,7 @@
"choices": [
{
"delta": {
"content": " helpful",
"content": " large",
"role": "assistant",
"tool_calls": null
},
@@ -92,7 +192,7 @@
"logprobs": null
}
],
"created": 1741364571,
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
@@ -103,7 +203,7 @@
"choices": [
{
"delta": {
"content": " assistant",
"content": " language",
"role": "assistant",
"tool_calls": null
},
@@ -112,7 +212,7 @@
"logprobs": null
}
],
"created": 1741364571,
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
@@ -123,7 +223,7 @@
"choices": [
{
"delta": {
"content": "!",
"content": " model",
"role": "assistant",
"tool_calls": null
},
@@ -132,7 +232,167 @@
"logprobs": null
}
],
"created": 1741364571,
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": " (",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": "LL",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": "M",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": ")",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": " or",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": " convers",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": "ational",
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.1.2-dev0-native",
"usage": null
},
{
"choices": [
{
"delta": {
"content": " AI",
"role": "assistant",
"tool_calls": null
},
"finish_reason": "length",
"index": 0,
"logprobs": null
}
],
"created": 1741694017,
"id": "",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"object": "chat.completion.chunk",

View File

@@ -279,7 +279,7 @@ async def test_flash_llama_grammar_tools_insufficient_information_nostream(
):
client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1")
response = client.chat_completion(
max_tokens=100,
max_tokens=20,
seed=24,
tools=tools,
tool_choice="auto",
@@ -299,7 +299,10 @@ async def test_flash_llama_grammar_tools_insufficient_information_nostream(
content_generated = response.choices[0].message.content
assert response.choices[0].message.tool_calls is None
assert content_generated == "I am a helpful assistant!"
assert (
content_generated
== "I'm an artificial intelligence model known as a large language model (LLM) or conversational AI"
)
assert response == response_snapshot
@@ -310,7 +313,7 @@ async def test_flash_llama_grammar_tools_insufficient_information_stream(
):
client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1")
stream = client.chat_completion(
max_tokens=100,
max_tokens=20,
seed=24,
tools=tools,
tool_choice="auto",
@@ -335,7 +338,10 @@ async def test_flash_llama_grammar_tools_insufficient_information_stream(
assert chunk.choices[0].delta.tool_calls is None
######## This is exactly the same as the non streaming case
assert content_generated == "I am a helpful assistant!"
assert (
content_generated
== "I'm an artificial intelligence model known as a large language model (LLM) or conversational AI"
)
assert chunks == response_snapshot
@@ -346,7 +352,7 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_auto(
):
client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1")
stream = client.chat_completion(
max_tokens=100,
max_tokens=20,
seed=24,
tools=tools,
tool_choice="auto",
@@ -372,7 +378,7 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_auto(
assert (
content_generated
== "There was a wise old octopus named Oracle. He lived in a cozy little cave beneath the waves with his best friend, a curious seahorse named Finley. One day, Finley met a playful dolphin named Daisy, and the three became inseparable. They spent their days exploring the ocean, playing hide-and-seek, and learning about the wonders of the sea from Oracle."
== "Once upon a time, in a vibrant ocean filled with coral reefs and schools of shimmering fish,"
)
assert chunks == response_snapshot

View File

@@ -6,22 +6,6 @@ use crate::{
use serde::Deserialize;
use serde_json::Value;
#[derive(Debug, Deserialize)]
#[serde(rename_all = "snake_case")]
enum _NoTool {
NoTool,
}
#[derive(Debug, Deserialize)]
struct NoToolCall {
_name: _NoTool,
content: String,
}
#[derive(Debug, Deserialize)]
struct NoTool {
function: NoToolCall,
}
#[derive(Debug, Deserialize)]
struct ToolCall {
_name: String,
@@ -34,9 +18,19 @@ struct Call {
function: ToolCall,
}
pub(crate) fn parse_output(
generated_text: &str,
) -> Result<(Option<Vec<crate::ToolCall>>, Option<String>), InferError> {
#[cfg_attr(test, derive(Debug))]
pub(crate) enum ChatEvent {
NoTool,
Events(Vec<CompletionType>),
}
#[cfg_attr(test, derive(Debug))]
pub(crate) enum ChatChoice {
NoTool,
ToolCalls(Vec<crate::ToolCall>),
}
pub(crate) fn parse_output(generated_text: &str) -> Result<ChatChoice, InferError> {
let call: Call = serde_json::from_str(generated_text).map_err(|e| {
InferError::ToolError(format!(
"Failed to parse generated text: {} {:?}",
@@ -48,16 +42,7 @@ pub(crate) fn parse_output(
match &name[..] {
"no_tool" => {
// parse the content message
let content_message = call
.function
.arguments
.get("content")
.and_then(Value::as_str)
.ok_or_else(|| {
InferError::ToolError("No `content` found in generated text".to_string())
})?
.to_string();
Ok((None, Some(content_message)))
Ok(ChatChoice::NoTool)
}
name => {
let tool_calls = vec![crate::ToolCall {
@@ -73,7 +58,7 @@ pub(crate) fn parse_output(
})?,
},
}];
Ok((Some(tool_calls), None))
Ok(ChatChoice::ToolCalls(tool_calls))
}
}
}
@@ -158,10 +143,6 @@ enum StreamState {
Buffering,
/// We detected a tool call here
Tool,
/// During the `content` part of the tool call
NoTool,
/// Finishing frames of the ToolCall
NoToolFinish,
/// This is without tool calling
Content,
}
@@ -202,34 +183,16 @@ impl ChatState {
}
}
pub fn push(&mut self, mut stream_token: StreamResponse) -> Vec<CompletionType> {
pub fn push(&mut self, mut stream_token: StreamResponse) -> ChatEvent {
let mut events = vec![];
let token_text = &stream_token.token.text;
match self.state {
StreamState::Buffering => {
self.text.push_str(token_text);
// We have a special match for `no_tool` in order to capture directly the `content`
// key which should be re-emitted as raw text.
if let Ok(value) = serde_json::from_str::<NoTool>(&format!("{}\"}}}}", self.text)) {
self.state = StreamState::NoTool;
// Modifiy the content of the token to be whatever was captured by the JSON
stream_token.token.text = value.function.content;
let chat_complete = create_event_from_stream_token(
&stream_token,
self.logprobs,
false,
self.fingerprint.clone(),
self.model_id.clone(),
None,
self.id.clone(),
);
events.push(chat_complete);
}
// XXX Caution, here we do not postfix the quote, so that the current output
// Is necessarily finished with quotes for us to be able to parse.
tracing::info!("Current text {:?}", self.text);
let partial = &self.text;
let partial = partial.trim_end_matches(|c: char| c.is_whitespace() || c == ',');
let partial =
partial.trim_end_matches(|c: char| c.is_whitespace() || c == ',' || c == '}');
if let Ok(call) = serde_json::from_str::<Call>(&format!("{}}}}}", partial)) {
// This can be no_tool before the content has been emitted
if call.function._name != "no_tool" {
@@ -246,6 +209,8 @@ impl ChatState {
events.push(chat_complete);
self.state = StreamState::Tool;
} else {
return ChatEvent::NoTool;
}
}
}
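
The buffering state above now also trims trailing `}` before closing the partial JSON, and returns `ChatEvent::NoTool` as soon as the parsed `_name` is `no_tool`, instead of entering the removed `NoTool`/`NoToolFinish` states. A self-contained sketch of that decision, assuming only `serde`/`serde_json` and a pared-down version of the `Call` shape from this file:

```rust
use serde::Deserialize;

#[derive(Deserialize)]
struct ToolCall {
    _name: String,
}

#[derive(Deserialize)]
struct Call {
    function: ToolCall,
}

enum Verdict {
    KeepBuffering,  // the partial JSON does not parse yet
    NoTool,         // caller should restart generation without tools
    Tool(String),   // a real tool call started; emit its name
}

fn classify(buffer: &str) -> Verdict {
    // Trim trailing whitespace, ',' and '}' so that appending "}}" always
    // yields a candidate object, then try to parse it as a Call.
    let partial =
        buffer.trim_end_matches(|c: char| c.is_whitespace() || c == ',' || c == '}');
    match serde_json::from_str::<Call>(&format!("{partial}}}}}")) {
        Ok(call) if call.function._name == "no_tool" => Verdict::NoTool,
        Ok(call) => Verdict::Tool(call.function._name),
        Err(_) => Verdict::KeepBuffering,
    }
}

fn main() {
    assert!(matches!(
        classify(r#"{"function": {"_name": "no_tool","#),
        Verdict::NoTool
    ));
    assert!(matches!(
        classify(r#"{"function": {"_name": "get_current_weather","#),
        Verdict::Tool(_)
    ));
}
```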
@@ -282,50 +247,6 @@ impl ChatState {
events.push(chat_complete);
}
}
// if we skipped sending the buffer we need to avoid sending the following json key and quotes
// We have remainder tokens, ignore everying,
StreamState::NoToolFinish => {}
StreamState::NoTool => {
self.text.push_str(token_text);
if token_text.contains("\"") {
let mut text = self
.text
.trim_end_matches(|c: char| c.is_whitespace() || c == '}');
// Trim once
if text.ends_with("\"") {
// Verify we have actually trimmed something
// The opposite can happen if the model is outputting inline JSON.
text = &text[..text.len() - 1];
if let Ok(_value) =
serde_json::from_str::<NoTool>(&format!("{}\"}}}}", text))
{
let mut text = token_text
.trim_end_matches(|c: char| c.is_whitespace() || c == '}');
// Effectively trim_end_match('"', 1)
// because we do not want to eventually trim finishing escaped quotes
// {{"\"Something\""}}
if text.ends_with("\"") {
text = &text[..text.len() - 1];
}
stream_token.token.text = text.to_string();
self.state = StreamState::NoToolFinish;
}
}
}
// This escaping is usually inline json escaping and we can therefore remove it.
stream_token.token.text = stream_token.token.text.replace("\\", "");
let chat_complete = create_event_from_stream_token(
&stream_token,
self.logprobs,
false,
self.fingerprint.clone(),
self.model_id.clone(),
None,
self.id.clone(),
);
events.push(chat_complete);
}
StreamState::Content => {
let chat_complete = create_event_from_stream_token(
&stream_token,
@@ -373,7 +294,7 @@ impl ChatState {
events.push(chat_complete);
}
}
events
ChatEvent::Events(events)
}
}
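
`push` now returns a `ChatEvent` rather than a bare event vector, so the caller owns the `no_tool` retry. A hypothetical driver loop showing the contract (the real caller, in the `server.rs` diff further down, swaps in a fresh tool-free stream and a fresh `ChatState`):

```rust
// Stand-ins: String plays the role of CompletionType.
enum ChatEvent {
    NoTool,
    Events(Vec<String>),
}

enum Outcome {
    Emitted(Vec<String>),
    RestartWithoutTools, // nothing buffered so far ever reached the client
}

fn drive(per_token_events: Vec<ChatEvent>) -> Outcome {
    let mut out = Vec::new();
    for ev in per_token_events {
        match ev {
            // NoTool is returned while still buffering, before anything has
            // been emitted, so the switch to a tool-free stream is invisible
            // to the client.
            ChatEvent::NoTool => return Outcome::RestartWithoutTools,
            ChatEvent::Events(events) => out.extend(events),
        }
    }
    Outcome::Emitted(out)
}

fn main() {
    let outcome = drive(vec![ChatEvent::Events(vec![]), ChatEvent::NoTool]);
    assert!(matches!(outcome, Outcome::RestartWithoutTools));
}
```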
@@ -385,24 +306,6 @@ mod tests {
use super::*;
fn get_text_content(event: &CompletionType) -> &String {
match event {
CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => {
assert_eq!(choices.len(), 1);
if let ChatCompletionChoice {
delta: ChatCompletionDelta::Chat(TextMessage { content, .. }),
..
} = &choices[0]
{
content
} else {
panic!("Expected plain message");
}
}
_ => panic!("Unexpected chunk"),
}
}
fn get_tool_call_content(event: &CompletionType) -> (Option<&String>, &String) {
match event {
CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => {
@@ -456,24 +359,28 @@ mod tests {
index: 0,
details: None,
});
assert_eq!(events.len(), 1);
match &events[0] {
CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => {
assert_eq!(
choices,
&[ChatCompletionChoice {
index: 0,
delta: ChatCompletionDelta::Chat(TextMessage {
role: "assistant".to_string(),
content: "Hi".to_string(),
tool_call_id: None,
}),
logprobs: None,
finish_reason: None,
}]
);
if let ChatEvent::Events(events) = events {
assert_eq!(events.len(), 1);
match &events[0] {
CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => {
assert_eq!(
choices,
&[ChatCompletionChoice {
index: 0,
delta: ChatCompletionDelta::Chat(TextMessage {
role: "assistant".to_string(),
content: "Hi".to_string(),
tool_call_id: None,
}),
logprobs: None,
finish_reason: None,
}]
);
}
_ => panic!("Unexpected chunk"),
}
_ => panic!("Unexpected chunk"),
} else {
panic!("Expected chat events");
}
}
@@ -507,43 +414,47 @@ mod tests {
finish_reason: FinishReason::Length,
}),
});
assert_eq!(events.len(), 2);
match &events[0] {
CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => {
assert_eq!(
choices,
&[ChatCompletionChoice {
index: 0,
delta: ChatCompletionDelta::Chat(TextMessage {
role: "assistant".to_string(),
content: "Hi".to_string(),
tool_call_id: None,
}),
logprobs: None,
// HAS A FINISH REASON
finish_reason: Some("length".to_string()),
}]
);
if let ChatEvent::Events(events) = events {
assert_eq!(events.len(), 2);
match &events[0] {
CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => {
assert_eq!(
choices,
&[ChatCompletionChoice {
index: 0,
delta: ChatCompletionDelta::Chat(TextMessage {
role: "assistant".to_string(),
content: "Hi".to_string(),
tool_call_id: None,
}),
logprobs: None,
// HAS A FINISH REASON
finish_reason: Some("length".to_string()),
}]
);
}
_ => panic!("Unexpected chunk"),
}
_ => panic!("Unexpected chunk"),
}
match &events[1] {
CompletionType::ChatCompletionChunk(ChatCompletionChunk { usage, .. }) => {
assert_eq!(
*usage,
Some(Usage {
prompt_tokens: 2,
completion_tokens: 10,
total_tokens: 12,
})
);
match &events[1] {
CompletionType::ChatCompletionChunk(ChatCompletionChunk { usage, .. }) => {
assert_eq!(
*usage,
Some(Usage {
prompt_tokens: 2,
completion_tokens: 10,
total_tokens: 12,
})
);
}
_ => panic!("Unexpected chunk"),
}
_ => panic!("Unexpected chunk"),
} else {
panic!("Expected chat events");
}
}
#[test]
fn test_chat_stream_tool_no_tool() {
fn test_chat_stream_tool_no_tool_simple() {
let mut chat_state = ChatState::new(
true,
StreamOptions {
@@ -597,217 +508,21 @@ mod tests {
.collect();
// Initial ignored output
for token in &tokens[..14] {
for token in &tokens[..10] {
let events = chat_state.push(token.clone());
assert_eq!(events.len(), 0);
}
// No tool output
let mut output = String::new();
for token in &tokens[14..14 + 7] {
let events = chat_state.push(token.clone());
assert_eq!(events.len(), 1);
let content = get_text_content(&events[0]);
output.push_str(content);
}
assert_eq!(output, "I am a helpful assistant!");
// No tool finish
for token in &tokens[14 + 7..] {
let events = chat_state.push(token.clone());
assert_eq!(events.len(), 0);
}
}
#[test]
fn test_chat_stream_tool_no_tool_many_quotes() {
let mut chat_state = ChatState::new(
true,
StreamOptions {
include_usage: true,
},
"fingerprint".to_string(),
"model_id".to_string(),
false,
"0".to_string(),
);
let tokens = vec![
"{\"".to_string(),
"function".to_string(),
"\":".to_string(),
" {\"".to_string(),
"_".to_string(),
"name".to_string(),
"\":".to_string(),
" \"".to_string(),
"no".to_string(),
"_tool".to_string(),
"\",".to_string(),
" \"".to_string(),
"content".to_string(),
"\":".to_string(),
" \"".to_string(), // Token 14
"I".to_string(), // Event 1
" am".to_string(), // Event 2
" a".to_string(), // Event 3
" helpful".to_string(), // Event 4
" assistant".to_string(), // Event 5
"!\\\"\"".to_string(), // Extra inside the string quote that would get removed
"}".to_string(),
"}".to_string(),
];
// Initial ignored output
for text in &tokens[..14] {
let events = chat_state.push(StreamResponse {
generated_text: None,
token: Token {
id: 42,
text: text.to_string(),
logprob: 0.0,
special: false,
},
top_tokens: vec![],
index: 0,
details: None,
});
assert_eq!(events.len(), 0);
}
// No tool output
let mut output = String::new();
for text in &tokens[14..14 + 7] {
let events = chat_state.push(StreamResponse {
generated_text: None,
token: Token {
id: 42,
text: text.to_string(),
logprob: 0.0,
special: false,
},
top_tokens: vec![],
index: 0,
details: None,
});
assert_eq!(events.len(), 1);
match &events[0] {
CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => {
assert_eq!(choices.len(), 1);
if let ChatCompletionChoice {
delta: ChatCompletionDelta::Chat(TextMessage { content, .. }),
..
} = &choices[0]
{
output.push_str(content);
} else {
panic!("Expected plain message");
}
}
_ => panic!("Unexpected chunk"),
if let ChatEvent::Events(events) = events {
assert_eq!(events.len(), 0, "{events:?}");
} else {
panic!("Expected chat events");
}
}
assert_eq!(output, "I am a helpful assistant!\"");
// No tool finish
for text in &tokens[14 + 7..] {
let events = chat_state.push(StreamResponse {
generated_text: None,
token: Token {
id: 42,
text: text.to_string(),
logprob: 0.0,
special: false,
},
top_tokens: vec![],
index: 0,
details: None,
});
assert_eq!(events.len(), 0);
}
}
#[test]
fn test_chat_stream_tool_no_tool_inline_json() {
let mut chat_state = ChatState::new(
true,
StreamOptions {
include_usage: true,
},
"fingerprint".to_string(),
"model_id".to_string(),
false,
"0".to_string(),
);
let tokens = vec![
"{\"".to_string(),
"function".to_string(),
"\":".to_string(),
" {\"".to_string(),
"_".to_string(),
"name".to_string(),
"\":".to_string(),
" \"".to_string(),
"no".to_string(),
"_tool".to_string(),
"\",".to_string(),
" \"".to_string(),
"content".to_string(),
"\":".to_string(),
" \"".to_string(), // Token 14
"{\\\"".to_string(), // Event 1
"a".to_string(), // Event 1
"\\\":".to_string(), // Event 1
"2".to_string(), // Event 2
",\\".to_string(), // Event 2
"\"".to_string(), // Event 2
"b".to_string(), // Event 3
"\\\": ".to_string(), // Event 4
"1".to_string(), // Event 5
"}".to_string(), // Event 5
"\"}".to_string(), // Extra inside the string quote that would get removed
"}".to_string(),
];
let tokens: Vec<_> = tokens
.into_iter()
.map(|text| StreamResponse {
generated_text: None,
token: Token {
id: 42,
text: text.to_string(),
logprob: 0.0,
special: false,
},
top_tokens: vec![],
index: 0,
details: None,
})
.collect();
// Initial ignored output
for token in &tokens[..14] {
let events = chat_state.push(token.clone());
assert_eq!(events.len(), 0);
}
// No tool output
let mut output = String::new();
for token in &tokens[14..14 + 12] {
let events = chat_state.push(token.clone());
assert_eq!(events.len(), 1, "Current text is {output:?}");
let content = get_text_content(&events[0]);
output.push_str(content);
}
assert_eq!(output, "{\"a\":2,\"b\": 1}");
// No tool finish
for token in &tokens[14 + 12..] {
let events = chat_state.push(token.clone());
assert_eq!(events.len(), 0, "Extra events {events:?}");
let events = chat_state.push(tokens[10].clone());
if let ChatEvent::NoTool = events {
assert!(true);
} else {
panic!("Expected chat events");
}
}
@@ -859,26 +574,21 @@ mod tests {
.collect();
// Initial ignored output
for token in &tokens[..13] {
for token in &tokens[..10] {
let events = chat_state.push(token.clone());
assert_eq!(events.len(), 0);
if let ChatEvent::Events(events) = events {
assert_eq!(events.len(), 0, "{events:?}");
} else {
panic!("Expected chat events");
}
}
// No tool output
let mut output = String::new();
for token in &tokens[13..13 + 2] {
let events = chat_state.push(token.clone());
assert_eq!(events.len(), 1, "Current text is {output:?}");
let content = get_text_content(&events[0]);
output.push_str(content);
}
assert_eq!(output, "");
// No tool finish
for token in &tokens[13 + 2..] {
let events = chat_state.push(token.clone());
assert_eq!(events.len(), 0, "Extra events {events:?}");
let events = chat_state.push(tokens[10].clone());
if let ChatEvent::NoTool = events {
assert!(true);
} else {
panic!("Expected chat events");
}
}
@@ -946,7 +656,11 @@ mod tests {
// Initial ignored output
for token in &tokens[..11] {
let events = chat_state.push(token.clone());
assert_eq!(events.len(), 0, "{events:?}");
if let ChatEvent::Events(events) = events {
assert_eq!(events.len(), 0, "{events:?}");
} else {
panic!("Expected chat events");
}
}
// No tool output
@@ -954,13 +668,17 @@ mod tests {
let mut output_name = String::new();
for token in &tokens[11..11 + 17] {
let events = chat_state.push(token.clone());
assert_eq!(events.len(), 1);
let (name, arguments) = get_tool_call_content(&events[0]);
if let Some(name) = name {
assert_eq!(name, "get_current_weather");
output_name.push_str(&name);
if let ChatEvent::Events(events) = events {
assert_eq!(events.len(), 1);
let (name, arguments) = get_tool_call_content(&events[0]);
if let Some(name) = name {
assert_eq!(name, "get_current_weather");
output_name.push_str(&name);
}
output.push_str(arguments);
} else {
panic!("Expected chat events");
}
output.push_str(arguments);
}
assert_eq!(output_name, "get_current_weather");
@@ -972,7 +690,11 @@ mod tests {
// No tool finish
for token in &tokens[11 + 17..] {
let events = chat_state.push(token.clone());
assert_eq!(events.len(), 0);
if let ChatEvent::Events(events) = events {
assert_eq!(events.len(), 0, "{events:?}");
} else {
panic!("Expected chat events");
}
}
}
}

View File

@@ -40,13 +40,13 @@ impl ToolGrammar {
),
arguments: json!({
"type": "object",
"properties": {
"content": {
"type": "string",
"description": "The response content",
}
},
"required": ["content"]
// "properties": {
// "content": {
// "type": "string",
// "description": "The response content",
// }
// },
// "required": ["content"]
}),
},
}))
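
Since the `content` field is no longer read back out of the arguments, the `no_tool` schema can be a bare object; the old constraint is simply left commented out above. For illustration, the two schemas side by side (a sketch using `serde_json::json!`, matching the values in the diff):

```rust
use serde_json::json;

fn main() {
    // Old schema: arguments had to be an object with a required
    // `content` string that the router then extracted.
    let old = json!({
        "type": "object",
        "properties": {
            "content": { "type": "string", "description": "The response content" }
        },
        "required": ["content"]
    });
    // New schema: any object is accepted; the text is never read from here
    // because choosing no_tool now triggers a tool-free retry instead.
    let new = json!({ "type": "object" });
    println!("{old}\n{new}");
}
```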

View File

@@ -1,4 +1,4 @@
use crate::chat::ChatState;
use crate::chat::{ChatChoice, ChatEvent, ChatState};
/// HTTP Server logic
use crate::config::Config;
use crate::infer::{Backend, Infer, InferError, InferResponse, InferStreamResponse};
@@ -1151,7 +1151,7 @@ pub(crate) async fn chat_completions(
Extension(infer): Extension<Infer>,
Extension(compute_type): Extension<ComputeType>,
Extension(info): Extension<Info>,
Json(chat): Json<ChatRequest>,
Json(mut chat): Json<ChatRequest>,
) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
let span = tracing::Span::current();
metrics::counter!("tgi_request_count").increment(1);
@@ -1166,7 +1166,7 @@ pub(crate) async fn chat_completions(
tracing::debug!("Got chat_template {:?}", infer.chat_template);
let id = chat.next_tool_call_id();
let (generate_request, using_tools): (GenerateRequest, bool) =
chat.try_into_generate(&infer)?;
chat.clone().try_into_generate(&infer)?;
span.record("parameters", format!("{:?}", generate_request.parameters));
let logprobs = logprobs.unwrap_or_default();
@@ -1178,21 +1178,41 @@ pub(crate) async fn chat_completions(
let system_fingerprint = format!("{}-{}", info.version, info.docker_label.unwrap_or("native"));
// switch on stream
if stream {
let (headers, response_stream) =
generate_stream_internal(infer, compute_type, Json(generate_request), span).await;
let (headers, response_stream) = generate_stream_internal(
infer.clone(),
compute_type.clone(),
Json(generate_request),
span.clone(),
)
.await;
let response_stream = async_stream::stream! {
let mut response_stream = Box::pin(response_stream);
let mut state = ChatState::new(using_tools, stream_options, system_fingerprint, model_id, logprobs, id);
let mut state = ChatState::new(using_tools, stream_options.clone(), system_fingerprint.clone(), model_id.clone(), logprobs, id.clone());
while let Some(result) = response_stream.next().await {
match result{
Ok(stream_token) => {
let events = state.push(stream_token);
for chat_complete in events{
yield Ok(Event::default().json_data(chat_complete).unwrap_or_else(|e| {
tracing::error!("Failed to serialize ChatCompletionChunk: {:?}", e);
Event::default()
}));
match events{
ChatEvent::NoTool => {
chat.tools = None;
chat.response_format = None;
let (generate_request, using_tools): (GenerateRequest, bool) =
chat.clone().try_into_generate(&infer).unwrap();
assert!(!using_tools);
let (_headers, response_stream2) =
generate_stream_internal(infer.clone(), compute_type.clone(), Json(generate_request), span.clone()).await;
state = ChatState::new(using_tools, stream_options.clone(), system_fingerprint.clone(), model_id.clone(), logprobs, id.clone());
response_stream = Box::pin(response_stream2);
}
ChatEvent::Events(events) => {
for chat_complete in events{
yield Ok(Event::default().json_data(chat_complete).unwrap_or_else(|e| {
tracing::error!("Failed to serialize ChatCompletionChunk: {:?}", e);
Event::default()
}));
}
}
}
}
Err(err) => yield Ok(err.into_openai_event())
@@ -1204,8 +1224,13 @@ pub(crate) async fn chat_completions(
let sse = Sse::new(response_stream).keep_alive(KeepAlive::default());
Ok((headers, sse).into_response())
} else {
let (headers, input_length, Json(generation)) =
generate_internal(Extension(infer), compute_type, Json(generate_request), span).await?;
let (mut headers, mut input_length, Json(generation)) = generate_internal(
Extension(infer.clone()),
compute_type.clone(),
Json(generate_request),
span.clone(),
)
.await?;
let current_time = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
@@ -1213,7 +1238,26 @@ pub(crate) async fn chat_completions(
.as_secs();
let (tool_calls, output) = if using_tools {
crate::chat::parse_output(&generation.generated_text)?
match crate::chat::parse_output(&generation.generated_text)? {
ChatChoice::NoTool => {
chat.tools = None;
chat.response_format = None;
let (generate_request, using_tools): (GenerateRequest, bool) =
chat.clone().try_into_generate(&infer)?;
assert!(!using_tools);
let (headers_final, input_length_final, Json(generation)) = generate_internal(
Extension(infer),
compute_type,
Json(generate_request),
span,
)
.await?;
headers = headers_final;
input_length = input_length_final;
(None, Some(generation.generated_text))
}
ChatChoice::ToolCalls(tool_calls) => (Some(tool_calls), None),
}
} else {
(None, Some(generation.generated_text))
};
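
Note that on the `NoTool` branch above, the headers and `input_length` from the second, tool-free generation replace the originals, so usage accounting reflects the retried request; this is consistent with the much smaller `prompt_tokens` (53 vs. 494) in the updated snapshot at the top of this commit.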