From 8e92942a18f51b3670c5285baef1885526b64da0 Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Wed, 5 Mar 2025 22:32:31 +0100
Subject: [PATCH] Making `tool_calls` a vector. (#3075)

* Making `tool_calls` a vector.

* Update doc.

* Fixing the nix overlay with updated version.

* Add openai dependency.

* Updating the old tests.

* Trying to reduce the logs in the case of errors (the `container_log` test fixture is now module-scoped).

* Less spammy logs too (the per-allocation `tracing::debug!` calls in `backends/v3/src/queue.rs` are commented out).
---
 backends/v3/src/queue.rs                      |   4 +-
 clients/python/text_generation/types.py       |   2 +-
 docs/openapi.json                             |   5 +-
 integration-tests/conftest.py                 |  14 +-
 .../test_flash_llama_grammar_tools.json       |   4 +-
 .../test_flash_llama_grammar_tools_auto.json  |   4 +-
 ...test_flash_llama_grammar_tools_choice.json |  20 +-
 ...rammar_tools_insufficient_information.json |  10 +-
 ...tools_insufficient_information_stream.json |   4 +-
 ...test_flash_llama_grammar_tools_openai.json | 992 ++++++++++++++++++
 ...ma_grammar_tools_sea_creatures_stream.json |   4 +-
 ..._sea_creatures_stream_function_object.json |  24 +-
 ...ammar_tools_sea_creatures_stream_none.json |   4 +-
 ...r_tools_sea_creatures_stream_required.json |  24 +-
 ...test_flash_llama_grammar_tools_stream.json |  24 +-
 .../test_flash_llama_tool_reply_response.json |   4 +-
 integration-tests/models/test_tools_llama.py  |  45 +-
 integration-tests/pyproject.toml              |   1 +
 integration-tests/requirements.txt            |  38 +-
 nix/overlay.nix                               |   4 +-
 router/src/lib.rs                             |   6 +-
 21 files changed, 1158 insertions(+), 79 deletions(-)
 create mode 100644 integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json

diff --git a/backends/v3/src/queue.rs b/backends/v3/src/queue.rs
index 249eebf7..d3bf4b9c 100644
--- a/backends/v3/src/queue.rs
+++ b/backends/v3/src/queue.rs
@@ -311,7 +311,7 @@ impl State {
                         + entry.request.stopping_parameters.max_new_tokens
                         + self.speculate
                         - 1;
-                    tracing::debug!("Allocating {tokens} with {input_ids:?}");
+                    // tracing::debug!("Allocating {tokens} with {input_ids:?}");
 
                     let block_allocation = match block_allocator.allocate(tokens, input_ids).await {
                         None => {
@@ -322,7 +322,7 @@ impl State {
                             break 'entry_loop;
                         }
                         Some(mut block_allocation) => {
-                            tracing::debug!("Allocation: {block_allocation:?}");
+                            // tracing::debug!("Allocation: {block_allocation:?}");
                             max_blocks = max(max_blocks, block_allocation.blocks.len() as u32);
 
                             if block_allocation.prefix_len == entry.request.input_length {
diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py
index 1085075e..6f51c153 100644
--- a/clients/python/text_generation/types.py
+++ b/clients/python/text_generation/types.py
@@ -67,7 +67,7 @@ class ChoiceDeltaToolCall(BaseModel):
 class ChoiceDelta(BaseModel):
     role: str
     content: Optional[str] = None
-    tool_calls: Optional[ChoiceDeltaToolCall] = None
+    tool_calls: Optional[List[ChoiceDeltaToolCall]] = None
 
 
 class Choice(BaseModel):
diff --git a/docs/openapi.json b/docs/openapi.json
index e16ca7f9..e1ce234e 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -2302,7 +2302,10 @@
             "example": "assistant"
           },
           "tool_calls": {
-            "$ref": "#/components/schemas/DeltaToolCall"
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/DeltaToolCall"
+            }
           }
         }
       },
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
index 0ffcd162..01250ce2 100644
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -98,7 +98,7 @@ def pytest_collection_modifyitems(config, items):
             selector(item)
 
 
-@pytest.fixture(autouse=True)
+@pytest.fixture(autouse=True, scope="module")
 def container_log(request: SubRequest):
     error_log = request.getfixturevalue("error_log")
     assert error_log is not None
@@ -269,7 +269,17 @@ class ResponseComparator(JSONSnapshotExtension):
         def eq_chat_complete_chunk(
             response: ChatCompletionChunk, other: ChatCompletionChunk
         ) -> bool:
-            return response.choices[0].delta.content == other.choices[0].delta.content
+            if response.choices[0].delta.content is not None:
+                return (
+                    response.choices[0].delta.content == other.choices[0].delta.content
+                )
+            elif response.choices[0].delta.tool_calls is not None:
+                return (
+                    response.choices[0].delta.tool_calls
+                    == other.choices[0].delta.tool_calls
+                )
+            else:
+                raise RuntimeError(f"Invalid empty chat chunk {response} vs {other}")
 
         def eq_response(response: Response, other: Response) -> bool:
             return response.generated_text == other.generated_text and eq_details(
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json
index 33e223ba..7445099f 100644
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json
@@ -26,11 +26,11 @@
       "usage": null
     }
   ],
-  "created": 1732293383,
+  "created": 1741195536,
   "id": "",
   "model": "meta-llama/Llama-3.1-8B-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "2.4.1-dev0-native",
+  "system_fingerprint": "3.1.2-dev0-native",
   "usage": {
     "completion_tokens": 30,
     "prompt_tokens": 615,
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json
index 92ffbbc1..99018f96 100644
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json
@@ -26,11 +26,11 @@
       "usage": null
     }
   ],
-  "created": 1732293384,
+  "created": 1741195538,
   "id": "",
   "model": "meta-llama/Llama-3.1-8B-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "2.4.1-dev0-native",
+  "system_fingerprint": "3.1.2-dev0-native",
   "usage": {
     "completion_tokens": 30,
     "prompt_tokens": 615,
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json
index 603c90af..a80a6a23 100644
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json
@@ -1,7 +1,7 @@
 {
   "choices": [
     {
-      "finish_reason": "eos_token",
+      "finish_reason": "stop",
       "index": 0,
       "logprobs": null,
       "message": {
@@ -13,12 +13,12 @@
             "function": {
               "arguments": {
                 "format": "celsius",
-                "location": "New York, NY"
+                "location": "Brooklyn, New York"
               },
               "description": null,
               "name": "get_current_weather"
             },
-            "id": 0,
+            "id": "0",
             "type": "function"
           }
         ]
@@ -26,14 +26,14 @@
       "usage": null
     }
   ],
-  "created": 1712852394,
+  "created": 1741195540,
   "id": "",
-  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-  "object": "text_completion",
-  "system_fingerprint": "2.0.1-native",
+  "model": "meta-llama/Llama-3.1-8B-Instruct",
+  "object": "chat.completion",
+  "system_fingerprint": "3.1.2-dev0-native",
   "usage": {
-    "completion_tokens": 48,
-    "prompt_tokens": 320,
-    "total_tokens": 368
+    "completion_tokens": 30,
+    "prompt_tokens": 326,
+    "total_tokens": 356
   }
 }
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json
index 3ed893fa..9cfea791 100644
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json
@@ -13,14 +13,14 @@
       "usage": null
     }
   ],
-  "created": 1728497062,
+  "created": 1741195542,
   "id": "",
   "model": "meta-llama/Llama-3.1-8B-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "2.4.2-dev0-native",
+  "system_fingerprint": "3.1.2-dev0-native",
   "usage": {
-    "completion_tokens": 23,
-    "prompt_tokens": 604,
-    "total_tokens": 627
+    "completion_tokens": 22,
+    "prompt_tokens": 608,
+    "total_tokens": 630
   }
 }
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json
index b134004a..34615f8e 100644
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json
@@ -11,10 +11,10 @@
       "logprobs": null
     }
   ],
-  "created": 1728497531,
+  "created": 1741195542,
   "id": "",
   "model": "meta-llama/Llama-3.1-8B-Instruct",
   "object": "chat.completion.chunk",
-  "system_fingerprint": "2.4.2-dev0-native",
+  "system_fingerprint": "3.1.2-dev0-native",
   "usage": null
 }
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json
new file mode 100644
index 00000000..e6d78924
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json
@@ -0,0 +1,992 @@
+[
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "{\"",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "function",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "\":",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": " {\"",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "_",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "name",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "\":",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": " \"",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "get",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "_current",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "_weather",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "\",",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": " \"",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "location",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "\":",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": " \"",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "Bro",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "oklyn",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": ",",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": " New",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": " York",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "\",",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": " \"",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "format",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195536,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "\":",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195537,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": " \"",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195537,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "c",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195537,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "elsius",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195537,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "\"}}",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195537,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": null,
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": [
+            {
+              "function": {
+                "arguments": "<|eot_id|>",
+                "name": null
+              },
+              "id": "",
+              "index": 0,
+              "type": "function"
+            }
+          ]
+        },
+        "finish_reason": "stop",
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741195537,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json
index 1362b472..11644190 100644
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json
@@ -11,10 +11,10 @@
       "logprobs": null
     }
   ],
-  "created": 1728497461,
+  "created": 1741195545,
   "id": "",
   "model": "meta-llama/Llama-3.1-8B-Instruct",
   "object": "chat.completion.chunk",
-  "system_fingerprint": "2.4.2-dev0-native",
+  "system_fingerprint": "3.1.2-dev0-native",
   "usage": null
 }
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json
index bb8d61c8..713e7a56 100644
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json
@@ -3,25 +3,27 @@
     {
       "delta": {
         "role": "assistant",
-        "tool_calls": {
-          "function": {
-            "arguments": "<|eot_id|>",
-            "name": null
-          },
-          "id": "",
-          "index": 0,
-          "type": "function"
-        }
+        "tool_calls": [
+          {
+            "function": {
+              "arguments": "<|eot_id|>",
+              "name": null
+            },
+            "id": "",
+            "index": 0,
+            "type": "function"
+          }
+        ]
       },
       "finish_reason": "stop",
       "index": 0,
       "logprobs": null
     }
   ],
-  "created": 1732293254,
+  "created": 1741195554,
   "id": "",
   "model": "meta-llama/Llama-3.1-8B-Instruct",
   "object": "chat.completion.chunk",
-  "system_fingerprint": "2.4.1-dev0-native",
+  "system_fingerprint": "3.1.2-dev0-native",
   "usage": null
 }
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json
index 2ccab4a9..bde28149 100644
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json
@@ -11,10 +11,10 @@
       "logprobs": null
     }
   ],
-  "created": 1729262528,
+  "created": 1741195551,
   "id": "",
   "model": "meta-llama/Llama-3.1-8B-Instruct",
   "object": "chat.completion.chunk",
-  "system_fingerprint": "2.3.2-dev0-native",
+  "system_fingerprint": "3.1.2-dev0-native",
   "usage": null
 }
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json
index dbced5b8..7896607a 100644
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json
@@ -4,25 +4,27 @@
       "delta": {
         "content": null,
         "role": "assistant",
-        "tool_calls": {
-          "function": {
-            "arguments": "<|eot_id|>",
-            "name": null
-          },
-          "id": "",
-          "index": 0,
-          "type": "function"
-        }
+        "tool_calls": [
+          {
+            "function": {
+              "arguments": "<|eot_id|>",
+              "name": null
+            },
+            "id": "",
+            "index": 0,
+            "type": "function"
+          }
+        ]
       },
       "finish_reason": "stop",
       "index": 0,
       "logprobs": null
     }
   ],
-  "created": 1732293246,
+  "created": 1741195548,
   "id": "",
   "model": "meta-llama/Llama-3.1-8B-Instruct",
   "object": "chat.completion.chunk",
-  "system_fingerprint": "2.4.1-dev0-native",
+  "system_fingerprint": "3.1.2-dev0-native",
   "usage": null
 }
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json
index 27d2f9ca..92d27f61 100644
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json
@@ -4,25 +4,27 @@
       "delta": {
         "content": null,
         "role": "assistant",
-        "tool_calls": {
-          "function": {
-            "arguments": "<|eot_id|>",
-            "name": null
-          },
-          "id": "",
-          "index": 0,
-          "type": "function"
-        }
+        "tool_calls": [
+          {
+            "function": {
+              "arguments": "<|eot_id|>",
+              "name": null
+            },
+            "id": "",
+            "index": 0,
+            "type": "function"
+          }
+        ]
       },
       "finish_reason": "stop",
       "index": 0,
       "logprobs": null
     }
   ],
-  "created": 1732293235,
+  "created": 1741195541,
   "id": "",
   "model": "meta-llama/Llama-3.1-8B-Instruct",
   "object": "chat.completion.chunk",
-  "system_fingerprint": "2.4.1-dev0-native",
+  "system_fingerprint": "3.1.2-dev0-native",
   "usage": null
 }
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json
index 4f10aa3b..33a3bb43 100644
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json
@@ -13,11 +13,11 @@
       "usage": null
     }
   ],
-  "created": 1739932427,
+  "created": 1741195556,
   "id": "",
   "model": "meta-llama/Llama-3.1-8B-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "3.1.1-dev0-native",
+  "system_fingerprint": "3.1.2-dev0-native",
   "usage": {
     "completion_tokens": 79,
     "prompt_tokens": 103,
diff --git a/integration-tests/models/test_tools_llama.py b/integration-tests/models/test_tools_llama.py
index b8a90cff..7fd6cadd 100644
--- a/integration-tests/models/test_tools_llama.py
+++ b/integration-tests/models/test_tools_llama.py
@@ -1,6 +1,7 @@
 import pytest
 import requests
 import json
+from openai import OpenAI
 
 
 @pytest.fixture(scope="module")
@@ -108,6 +109,38 @@ async def test_flash_llama_grammar_tools(flash_llama_grammar_tools, response_sna
     assert response == response_snapshot
 
 
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_grammar_tools_openai(
+    flash_llama_grammar_tools, response_snapshot
+):
+    client = OpenAI(api_key="xx", base_url=f"{flash_llama_grammar_tools.base_url}/v1")
+    stream = client.chat.completions.create(
+        model="tgi",
+        max_tokens=100,
+        seed=1,
+        tools=tools,
+        stream=True,
+        temperature=0.0,
+        messages=[
+            {
+                "role": "system",
+                "content": "Youre a helpful assistant! Answer the users question best you can.",
+            },
+            {
+                "role": "user",
+                "content": "What is the weather like in Brooklyn, New York?",
+            },
+        ],
+    )
+
+    chunks = []
+    for chunk in stream:
+        chunks.append(chunk)
+
+    assert chunks == response_snapshot
+
+
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_grammar_tools_auto(
@@ -213,7 +246,9 @@ async def test_flash_llama_grammar_tools_stream(
     last_response = None
     async for response in responses:
         count += 1
-        tool_calls_generated += response.choices[0].delta.tool_calls.function.arguments
+        tool_calls_generated += (
+            response.choices[0].delta.tool_calls[0].function.arguments
+        )
         last_response = response
         assert response.choices[0].delta.content is None
 
@@ -360,7 +395,9 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_required(
     async for response in responses:
         count += 1
         assert response.choices[0].delta.content is None
-        tool_calls_generated += response.choices[0].delta.tool_calls.function.arguments
+        tool_calls_generated += (
+            response.choices[0].delta.tool_calls[0].function.arguments
+        )
         last_response = response
 
     assert count == 29
@@ -458,8 +495,8 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_function_object(
                     break
                 response = json.loads(line)
                 tool_calls_generated += response["choices"][0]["delta"]["tool_calls"][
-                    "function"
-                ]["arguments"]
+                    0
+                ]["function"]["arguments"]
                 last_response = response
 
     assert count == 39
diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml
index 1838995e..37003440 100644
--- a/integration-tests/pyproject.toml
+++ b/integration-tests/pyproject.toml
@@ -13,6 +13,7 @@ dependencies = [
     "pytest-asyncio>=0.23.1",
     "docker>=7",
     "numpy>=2.0",
+    "openai>=1.65",
 ]
 
 [tool.isort]
diff --git a/integration-tests/requirements.txt b/integration-tests/requirements.txt
index c5a91825..d419d4b3 100644
--- a/integration-tests/requirements.txt
+++ b/integration-tests/requirements.txt
@@ -1,5 +1,5 @@
 # This file was autogenerated by uv via the following command:
-#    uv pip compile pyproject.toml --output-file requirements.txt
+#    uv pip compile pyproject.toml -o requirements.txt
 aiohappyeyeballs==2.4.6
     # via aiohttp
 aiohttp==3.11.12
@@ -8,12 +8,21 @@ aiosignal==1.3.2
     # via aiohttp
 annotated-types==0.7.0
     # via pydantic
+anyio==4.8.0
+    # via
+    #   httpx
+    #   openai
 attrs==25.1.0
     # via aiohttp
 certifi==2025.1.31
-    # via requests
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
 charset-normalizer==3.4.1
     # via requests
+distro==1.9.0
+    # via openai
 docker==7.1.0
     # via text-generation-integration-tests (pyproject.toml)
 filelock==3.17.0
@@ -24,20 +33,32 @@ frozenlist==1.5.0
     #   aiosignal
 fsspec==2025.2.0
     # via huggingface-hub
+h11==0.14.0
+    # via httpcore
+httpcore==1.0.7
+    # via httpx
+httpx==0.28.1
+    # via openai
 huggingface-hub==0.29.0
     # via text-generation
 idna==3.10
     # via
+    #   anyio
+    #   httpx
     #   requests
     #   yarl
 iniconfig==2.0.0
     # via pytest
+jiter==0.8.2
+    # via openai
 multidict==6.1.0
     # via
     #   aiohttp
     #   yarl
 numpy==2.2.3
     # via text-generation-integration-tests (pyproject.toml)
+openai==1.65.3
+    # via text-generation-integration-tests (pyproject.toml)
 packaging==24.2
     # via
     #   huggingface-hub
@@ -51,6 +72,7 @@ propcache==0.2.1
 pydantic==2.10.6
     # via
     #   text-generation-integration-tests (pyproject.toml)
+    #   openai
     #   text-generation
 pydantic-core==2.27.2
     # via pydantic
@@ -67,15 +89,23 @@ requests==2.32.3
     # via
     #   docker
     #   huggingface-hub
+sniffio==1.3.1
+    # via
+    #   anyio
+    #   openai
 syrupy==4.8.1
     # via text-generation-integration-tests (pyproject.toml)
 text-generation==0.7.0
     # via text-generation-integration-tests (pyproject.toml)
 tqdm==4.67.1
-    # via huggingface-hub
-typing-extensions==4.12.2
     # via
     #   huggingface-hub
+    #   openai
+typing-extensions==4.12.2
+    # via
+    #   anyio
+    #   huggingface-hub
+    #   openai
     #   pydantic
     #   pydantic-core
 urllib3==2.3.0
diff --git a/nix/overlay.nix b/nix/overlay.nix
index d9047819..63398f07 100644
--- a/nix/overlay.nix
+++ b/nix/overlay.nix
@@ -18,8 +18,8 @@ final: prev: {
             src = final.fetchFromGitHub {
               owner = "huggingface";
               repo = "transformers";
-              rev = "8d73a38606bc342b370afe1f42718b4828d95aaa";
-              hash = "sha256-MxroG6CWqrcmRS+eFt7Ej87TDOInN15aRPBUcaycKTI=";
+              rev = "v4.49.0";
+              hash = "sha256-drq7RWoRaRejiQjCUHIYuzaKa9rA4eQZI2do74scp1c=";
             };
           }
         );
diff --git a/router/src/lib.rs b/router/src/lib.rs
index 637e6c56..60f1f73a 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -730,7 +730,7 @@ pub(crate) struct ChatCompletionChoice {
 pub struct ToolCallDelta {
     #[schema(example = "assistant")]
     role: String,
-    tool_calls: DeltaToolCall,
+    tool_calls: Vec<DeltaToolCall>,
 }
 
 #[derive(Clone, Debug, Serialize, ToSchema)]
@@ -774,7 +774,7 @@ impl ChatCompletionChunk {
             }),
             (None, Some(tool_calls)) => ChatCompletionDelta::Tool(ToolCallDelta {
                 role: "assistant".to_string(),
-                tool_calls: DeltaToolCall {
+                tool_calls: vec![DeltaToolCall {
                     index: 0,
                     id: String::new(),
                     r#type: "function".to_string(),
@@ -782,7 +782,7 @@ impl ChatCompletionChunk {
                         name: None,
                         arguments: tool_calls[0].to_string(),
                     },
-                },
+                }],
             }),
             (None, None) => ChatCompletionDelta::Chat(TextMessage {
                 role: "assistant".to_string(),