From 818c8db29a0fa60b0d88f6274c7105cda2192fde Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 6 Mar 2025 16:24:11 +0100 Subject: [PATCH] change ChatCompletionChunk to align with "OpenAI Chat Completions streaming API" Moving after tool_calls2 Signed-off-by: Wang, Yi A add in Buffering.. Signed-off-by: Wang, Yi A fix: handle usage outside of stream state and add tests Simplifying everything quite a bit. Remove the unused model_dump. Clippy. Clippy ? Ruff. Uppgrade the flake for latest transformers. Upgrade after rebase. Remove potential footgun. Fix completion test. --- flake.lock | 127 +++- integration-tests/conftest.py | 8 +- .../test_chat_hfhub_nousage.json | 62 ++ .../test_chat_hfhub_usage.json | 75 +++ .../test_chat_openai_nousage.json | 71 +++ .../test_chat_openai_usage.json | 87 +++ ...t_flash_llama_completion_many_prompts.json | 18 +- ..._llama_completion_many_prompts_stream.json | 602 ------------------ ..._flash_llama_completion_single_prompt.json | 6 +- .../models/test_chat_stream_options.py | 16 + .../models/test_completion_prompts.py | 140 +++- router/src/lib.rs | 3 +- router/src/server.rs | 74 ++- 13 files changed, 581 insertions(+), 708 deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_nousage.json create mode 100644 integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_usage.json create mode 100644 integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_nousage.json create mode 100644 integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_usage.json delete mode 100644 integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json create mode 100644 integration-tests/models/test_chat_stream_options.py diff --git a/flake.lock b/flake.lock index b6cf7e53..719cdeea 100644 --- a/flake.lock +++ b/flake.lock @@ -2,10 +2,16 @@ "nodes": { "cachix": { "inputs": { - "devenv": ["crate2nix"], - "flake-compat": ["crate2nix"], + "devenv": [ + "crate2nix" + ], + "flake-compat": [ + "crate2nix" + ], "nixpkgs": "nixpkgs", - "pre-commit-hooks": ["crate2nix"] + "pre-commit-hooks": [ + "crate2nix" + ] }, "locked": { "lastModified": 1709700175, @@ -24,10 +30,19 @@ }, "cachix_2": { "inputs": { - "devenv": ["crate2nix", "crate2nix_stable"], - "flake-compat": ["crate2nix", "crate2nix_stable"], + "devenv": [ + "crate2nix", + "crate2nix_stable" + ], + "flake-compat": [ + "crate2nix", + "crate2nix_stable" + ], "nixpkgs": "nixpkgs_2", - "pre-commit-hooks": ["crate2nix", "crate2nix_stable"] + "pre-commit-hooks": [ + "crate2nix", + "crate2nix_stable" + ] }, "locked": { "lastModified": 1716549461, @@ -46,8 +61,16 @@ }, "cachix_3": { "inputs": { - "devenv": ["crate2nix", "crate2nix_stable", "crate2nix_stable"], - "flake-compat": ["crate2nix", "crate2nix_stable", "crate2nix_stable"], + "devenv": [ + "crate2nix", + "crate2nix_stable", + "crate2nix_stable" + ], + "flake-compat": [ + "crate2nix", + "crate2nix_stable", + "crate2nix_stable" + ], "nixpkgs": "nixpkgs_3", "pre-commit-hooks": [ "crate2nix", @@ -78,15 +101,18 @@ "flake-compat": "flake-compat_3", "flake-parts": "flake-parts_3", "nix-test-runner": "nix-test-runner_3", - "nixpkgs": ["tgi-nix", "nixpkgs"], + "nixpkgs": [ + "tgi-nix", + "nixpkgs" + ], "pre-commit-hooks": "pre-commit-hooks_3" }, "locked": { - "lastModified": 1734429562, - "narHash": "sha256-V2XNs3Ir8WXNHdocfzkR/fu0FzkZ9uTDJkVecxJrGmQ=", + "lastModified": 1739473963, + "narHash": 
"sha256-ItAhpjNUzEWd/cgZVyW/jvoGbCec4TK29e1Mnmn1oJE=", "owner": "nix-community", "repo": "crate2nix", - "rev": "8537c2d7cb623679aaeff62c4c4c43a91566ab09", + "rev": "be31feae9a82c225c0fd1bdf978565dc452a483a", "type": "github" }, "original": { @@ -193,7 +219,11 @@ "devshell_2": { "inputs": { "flake-utils": "flake-utils_3", - "nixpkgs": ["crate2nix", "crate2nix_stable", "nixpkgs"] + "nixpkgs": [ + "crate2nix", + "crate2nix_stable", + "nixpkgs" + ] }, "locked": { "lastModified": 1717408969, @@ -212,7 +242,10 @@ "devshell_3": { "inputs": { "flake-utils": "flake-utils_4", - "nixpkgs": ["crate2nix", "nixpkgs"] + "nixpkgs": [ + "crate2nix", + "nixpkgs" + ] }, "locked": { "lastModified": 1711099426, @@ -310,7 +343,11 @@ }, "flake-parts_2": { "inputs": { - "nixpkgs-lib": ["crate2nix", "crate2nix_stable", "nixpkgs"] + "nixpkgs-lib": [ + "crate2nix", + "crate2nix_stable", + "nixpkgs" + ] }, "locked": { "lastModified": 1719745305, @@ -328,7 +365,10 @@ }, "flake-parts_3": { "inputs": { - "nixpkgs-lib": ["crate2nix", "nixpkgs"] + "nixpkgs-lib": [ + "crate2nix", + "nixpkgs" + ] }, "locked": { "lastModified": 1712014858, @@ -519,7 +559,11 @@ }, "gitignore_3": { "inputs": { - "nixpkgs": ["crate2nix", "pre-commit-hooks", "nixpkgs"] + "nixpkgs": [ + "crate2nix", + "pre-commit-hooks", + "nixpkgs" + ] }, "locked": { "lastModified": 1709087332, @@ -726,10 +770,22 @@ }, "pre-commit-hooks_2": { "inputs": { - "flake-compat": ["crate2nix", "crate2nix_stable", "flake-compat"], + "flake-compat": [ + "crate2nix", + "crate2nix_stable", + "flake-compat" + ], "gitignore": "gitignore_2", - "nixpkgs": ["crate2nix", "crate2nix_stable", "nixpkgs"], - "nixpkgs-stable": ["crate2nix", "crate2nix_stable", "nixpkgs"] + "nixpkgs": [ + "crate2nix", + "crate2nix_stable", + "nixpkgs" + ], + "nixpkgs-stable": [ + "crate2nix", + "crate2nix_stable", + "nixpkgs" + ] }, "locked": { "lastModified": 1719259945, @@ -747,11 +803,20 @@ }, "pre-commit-hooks_3": { "inputs": { - "flake-compat": ["crate2nix", "flake-compat"], + "flake-compat": [ + "crate2nix", + "flake-compat" + ], "flake-utils": "flake-utils_5", "gitignore": "gitignore_3", - "nixpkgs": ["crate2nix", "nixpkgs"], - "nixpkgs-stable": ["crate2nix", "nixpkgs"] + "nixpkgs": [ + "crate2nix", + "nixpkgs" + ], + "nixpkgs-stable": [ + "crate2nix", + "nixpkgs" + ] }, "locked": { "lastModified": 1712055707, @@ -772,21 +837,27 @@ "crate2nix": "crate2nix", "flake-utils": "flake-utils_6", "nix-filter": "nix-filter", - "nixpkgs": ["tgi-nix", "nixpkgs"], + "nixpkgs": [ + "tgi-nix", + "nixpkgs" + ], "rust-overlay": "rust-overlay", "tgi-nix": "tgi-nix" } }, "rust-overlay": { "inputs": { - "nixpkgs": ["tgi-nix", "nixpkgs"] + "nixpkgs": [ + "tgi-nix", + "nixpkgs" + ] }, "locked": { - "lastModified": 1738549608, - "narHash": "sha256-GdyT9QEUSx5k/n8kILuNy83vxxdyUfJ8jL5mMpQZWfw=", + "lastModified": 1741141853, + "narHash": "sha256-FauVtC+FbOgkKpGVuQTNxSqrvgbmVc7hFkjn/DacwMo=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "35c6f8c4352f995ecd53896200769f80a3e8f22d", + "rev": "02edad1f19d6dec824e0812e4cdc0aa7930ff8ae", "type": "github" }, "original": { diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index 6490f833..e7e64072 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -39,7 +39,13 @@ from typing import Dict, List, Optional from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError from docker.errors import NotFound from syrupy.extensions.json import JSONSnapshotExtension - +from 
huggingface_hub.inference._generated.types.chat_completion import ( + ChatCompletionStreamOutput, + ChatCompletionOutput, +) +from openai.types.chat.chat_completion_chunk import ( + ChatCompletionChunk as OAIChatCompletionChunk, +) from text_generation import AsyncClient from text_generation.types import ( BestOfSequence, diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_nousage.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_nousage.json new file mode 100644 index 00000000..a05b685e --- /dev/null +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_nousage.json @@ -0,0 +1,62 @@ +[ + { + "choices": [ + { + "delta": { + "content": "OK", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265520, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "!", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265520, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1741265520, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + } +] diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_usage.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_usage.json new file mode 100644 index 00000000..d2c969b2 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_usage.json @@ -0,0 +1,75 @@ +[ + { + "choices": [ + { + "delta": { + "content": "OK", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741266005, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "!", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741266005, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1741266005, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [], + "created": 1741266005, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": { + "completion_tokens": 3, + "prompt_tokens": 39, + "total_tokens": 42 + } + } +] diff --git 
a/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_nousage.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_nousage.json new file mode 100644 index 00000000..6c362059 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_nousage.json @@ -0,0 +1,71 @@ +[ + { + "choices": [ + { + "delta": { + "content": "OK", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265134, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "!", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265134, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1741265134, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + } +] diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_usage.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_usage.json new file mode 100644 index 00000000..feb32567 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_usage.json @@ -0,0 +1,87 @@ +[ + { + "choices": [ + { + "delta": { + "content": "OK", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265133, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "!", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265133, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1741265133, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [], + "created": 1741265133, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": { + "completion_tokens": 3, + 
"completion_tokens_details": null, + "prompt_tokens": 39, + "prompt_tokens_details": null, + "total_tokens": 42 + } + } +] diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json index 25b8120d..5bef4172 100644 --- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json @@ -1,17 +1,17 @@ { "choices": [ - { - "finish_reason": "length", - "index": 0, - "logprobs": null, - "text": " A Beginner’s Guide\nDeep learning is a subset" - }, { "finish_reason": "length", "index": 1, "logprobs": null, "text": " This is a question that has puzzled many people for" }, + { + "finish_reason": "length", + "index": 0, + "logprobs": null, + "text": " A Beginner’s Guide\nDeep learning is a subset" + }, { "finish_reason": "length", "index": 3, @@ -25,11 +25,11 @@ "text": " Paris\nWhat is the capital of France?\nThe" } ], - "created": 1725877154, + "created": 1741264813, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": { "completion_tokens": 40, "prompt_tokens": 22, diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json deleted file mode 100644 index dd22ceae..00000000 --- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json +++ /dev/null @@ -1,602 +0,0 @@ -[ - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " A" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " This" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": " Paris" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "us" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " Beginner" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " is" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - 
"system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": "\n" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "cul" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": "’s" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " a" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": "What" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "as" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " Guide" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " question" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": " is" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "_minus" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": "\n" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " that" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": " the" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": 
"text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "cul" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": "Deep" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " has" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": " capital" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "as" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " learning" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " puzzled" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": " of" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "(s" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " is" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " many" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": " France" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": "):\n" - } - ], - "created": 1725883643, - "id": "", - "model": 
"meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 0, - "logprobs": null, - "text": " a" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 1, - "logprobs": null, - "text": " people" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 2, - "logprobs": null, - "text": "?\n" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "", - "index": 3, - "logprobs": null, - "text": " " - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "length", - "index": 0, - "logprobs": null, - "text": " subset" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "length", - "index": 1, - "logprobs": null, - "text": " for" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "length", - "index": 2, - "logprobs": null, - "text": "The" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - }, - { - "choices": [ - { - "finish_reason": "length", - "index": 3, - "logprobs": null, - "text": " \"\"\"\n" - } - ], - "created": 1725883643, - "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" - } -] diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json index 7ad56271..1cb8c103 100644 --- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json @@ -7,11 +7,11 @@ "text": " A Beginner’s Guide\nDeep learning is a subset" } ], - "created": 1725876621, + "created": 1741264812, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": { "completion_tokens": 10, "prompt_tokens": 6, diff --git a/integration-tests/models/test_chat_stream_options.py b/integration-tests/models/test_chat_stream_options.py new file mode 100644 index 00000000..41f4f741 --- /dev/null +++ b/integration-tests/models/test_chat_stream_options.py @@ -0,0 +1,16 @@ +import pytest + + 
+@pytest.fixture(scope="module") +def chat_handle(launcher): + with launcher( + "meta-llama/Meta-Llama-3.1-8B-Instruct", + ) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def chat_client(chat_handle): + await chat_handle.health(300) + return chat_handle.client + diff --git a/integration-tests/models/test_completion_prompts.py b/integration-tests/models/test_completion_prompts.py index 27988ef9..de04d85b 100644 --- a/integration-tests/models/test_completion_prompts.py +++ b/integration-tests/models/test_completion_prompts.py @@ -2,8 +2,8 @@ import pytest import requests import json from aiohttp import ClientSession +from openai import OpenAI from huggingface_hub import InferenceClient - from text_generation.types import Completion @@ -158,47 +158,30 @@ def test_flash_llama_completion_many_prompts(flash_llama_completion, response_sn async def test_flash_llama_completion_many_prompts_stream( flash_llama_completion, response_snapshot ): - request = { - "model": "tgi", - "prompt": [ + client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1") + stream = client.completion( + model="tgi", + prompt=[ "What is Deep Learning?", "Is water wet?", "What is the capital of France?", "def mai", ], - "max_tokens": 10, - "seed": 0, - "temperature": 0.0, - "stream": True, - } + max_tokens=10, + seed=0, + temperature=0.0, + stream=True, + ) - url = f"{flash_llama_completion.base_url}/v1/completions" - - chunks = [] strings = [""] * 4 - async with ClientSession(headers=flash_llama_completion.headers) as session: - async with session.post(url, json=request) as response: - # iterate over the stream - async for chunk in response.content.iter_any(): - # remove "data:" - chunk = chunk.decode().split("\n\n") - # remove "data:" if present - chunk = [c.replace("data:", "") for c in chunk] - # remove empty strings - chunk = [c for c in chunk if c] - # remove completion marking chunk - chunk = [c for c in chunk if c != " [DONE]"] - # parse json - chunk = [json.loads(c) for c in chunk] + chunks = [] + for chunk in stream: + chunks.append(chunk) + assert "choices" in chunk + index = chunk.choices[0].index + assert 0 <= index <= 4 + strings[index] += chunk.choices[0].text - for c in chunk: - chunks.append(Completion(**c)) - assert "choices" in c - index = c["choices"][0]["index"] - assert 0 <= index <= 4 - strings[index] += c["choices"][0]["text"] - - assert response.status == 200 assert list(strings) == [ " A Beginner’s Guide\nDeep learning is a subset", " This is a question that has puzzled many people for", @@ -206,3 +189,92 @@ async def test_flash_llama_completion_many_prompts_stream( 'usculas_minusculas(s):\n """\n', ] assert chunks == response_snapshot + + +@pytest.mark.release +async def test_chat_openai_usage(flash_llama_completion, response_snapshot): + client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1") + + stream = client.chat.completions.create( + model="tgi", + messages=[{"role": "user", "content": "Say 'OK!'"}], + stream=True, + max_tokens=10, + seed=42, + stream_options={"include_usage": True}, + ) + + chunks = [] + for chunk in stream: + chunks.append(chunk) + for chunk in chunks[:-1]: + assert chunk.usage is None + for chunk in chunks[-1:]: + assert chunk.usage is not None + + assert chunks == response_snapshot + + +@pytest.mark.release +async def test_chat_openai_nousage(flash_llama_completion, response_snapshot): + client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1") + + stream = client.chat.completions.create( 
+ model="tgi", + messages=[{"role": "user", "content": "Say 'OK!'"}], + stream=True, + max_tokens=10, + seed=42, + stream_options={"include_usage": False}, + ) + + chunks = [] + for chunk in stream: + assert chunk.usage is None + chunks.append(chunk) + + assert chunks == response_snapshot + + +@pytest.mark.release +async def test_chat_hfhub_usage(flash_llama_completion, response_snapshot): + client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1") + stream = client.chat_completion( + model="tgi", + messages=[{"role": "user", "content": "Say 'OK!'"}], + stream=True, + max_tokens=10, + seed=42, + stream_options={"include_usage": True}, + ) + + chunks = [] + for chunk in stream: + chunks.append(chunk) + + for chunk in chunks[:-1]: + assert chunk.usage is None + for chunk in chunks[-1:]: + assert chunk.usage is not None + + assert chunks == response_snapshot + + +@pytest.mark.release +async def test_chat_hfhub_nousage(flash_llama_completion, response_snapshot): + client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1") + stream = client.chat_completion( + model="tgi", + messages=[{"role": "user", "content": "Say 'OK!'"}], + stream=True, + max_tokens=10, + seed=42, + stream_options={"include_usage": False}, + ) + + chunks = [] + for chunk in stream: + assert chunk.usage is None + chunks.append(chunk) + + assert chunks == response_snapshot diff --git a/router/src/lib.rs b/router/src/lib.rs index a7923c4c..08c31b64 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -764,7 +764,6 @@ impl ChatCompletionChunk { created: u64, logprobs: Option, finish_reason: Option, - usage: Option, ) -> Self { let delta = match (delta, tool_calls) { (Some(delta), _) => ChatCompletionDelta::Chat(TextMessage { @@ -801,7 +800,7 @@ impl ChatCompletionChunk { logprobs, finish_reason, }], - usage, + usage: None, } } } diff --git a/router/src/server.rs b/router/src/server.rs index 6e55d2bc..9f312316 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1124,7 +1124,6 @@ enum StreamState { fn create_event_from_stream_token( stream_token: &StreamResponse, logprobs: bool, - stream_options: Option, inner_using_tools: bool, system_fingerprint: String, model_id: String, @@ -1151,30 +1150,10 @@ fn create_event_from_stream_token( (content, None) }; - - let (usage, finish_reason) = match &stream_token.details { - Some(details) => { - let usage = if stream_options - .as_ref() - .map(|s| s.include_usage) - .unwrap_or(false) - { - let completion_tokens = details.generated_tokens; - let prompt_tokens = details.input_length; - let total_tokens = prompt_tokens + completion_tokens; - Some(Usage { - completion_tokens, - prompt_tokens, - total_tokens, - }) - } else { - None - }; - (usage, Some(details.finish_reason.format(true))) - } - None => (None, None), - }; - + let finish_reason = stream_token + .details + .as_ref() + .map(|details| details.finish_reason.format(true)); let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk::new( model_id.clone(), system_fingerprint.clone(), @@ -1183,7 +1162,6 @@ fn create_event_from_stream_token( current_time, logprobs, finish_reason, - usage, )); event.json_data(chat_complete).unwrap_or_else(|e| { @@ -1287,6 +1265,17 @@ pub(crate) async fn chat_completions( match result{ Ok(stream_token) => { let token_text = &stream_token.token.text.clone(); + let usage = stream_token.details.as_ref().map(|details| { + let completion_tokens = details.generated_tokens; + let prompt_tokens = details.input_length; + let total_tokens = 
prompt_tokens + completion_tokens; + + Usage { + completion_tokens, + prompt_tokens, + total_tokens, + } + }); match state { StreamState::Buffering => { json_buffer.push_str(&token_text.replace(" ", "")); @@ -1307,7 +1296,6 @@ pub(crate) async fn chat_completions( let event = create_event_from_stream_token( stream_token, logprobs, - stream_options.clone(), response_as_tool, system_fingerprint.clone(), model_id.clone(), @@ -1347,7 +1335,6 @@ pub(crate) async fn chat_completions( current_time, None, None, - None, )); yield Ok(event.json_data(chat_complete).unwrap_or_else(|e| { InferError::StreamSerializationError(e.to_string()).into() @@ -1369,7 +1356,6 @@ pub(crate) async fn chat_completions( let event = create_event_from_stream_token( &stream_token, logprobs, - stream_options.clone(), response_as_tool, system_fingerprint.clone(), model_id.clone(), @@ -1378,6 +1364,36 @@ pub(crate) async fn chat_completions( yield Ok::(event); } } + + let should_send_usage = usage.is_some() + && stream_options + .as_ref() + .is_some_and(|opts| opts.include_usage); + + if should_send_usage { + let usage_data = usage.unwrap(); + let current_time = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_else(|_| std::time::Duration::from_secs(0)) + .as_secs(); + + let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk { + id: String::new(), + created: current_time, + model: model_id.clone(), + system_fingerprint: system_fingerprint.clone(), + choices: vec![], + usage: Some(Usage { + prompt_tokens: usage_data.prompt_tokens, + completion_tokens: usage_data.completion_tokens, + total_tokens: usage_data.total_tokens, + }), + }); + + yield Ok(Event::default() + .json_data(chat_complete) + .unwrap_or_else(|e| InferError::StreamSerializationError(e.to_string()).into())); + } } Err(err) => yield Ok(err.into_openai_event()) }
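
Taken together, the router changes above mean every content-bearing chunk now serializes "usage": null (even the one carrying finish_reason), and the aggregated counts travel in a single trailing chunk with an empty "choices" array, emitted only when the client opted in via stream_options.include_usage — exactly what the new snapshot fixtures record. A minimal client-side sketch of that contract, modeled on the test_chat_openai_usage test in this patch; the base_url and api_key below are placeholders for a locally running TGI instance, not values defined by the patch:

from openai import OpenAI

# Placeholders: point these at a running TGI server with the /v1 routes.
client = OpenAI(api_key="-", base_url="http://localhost:8080/v1")

stream = client.chat.completions.create(
    model="tgi",
    messages=[{"role": "user", "content": "Say 'OK!'"}],
    max_tokens=10,
    seed=42,
    stream=True,
    stream_options={"include_usage": True},
)

last = None
for chunk in stream:
    if chunk.choices:
        # Content-bearing chunks always report usage as null now,
        # including the final one that carries finish_reason == "stop".
        assert chunk.usage is None
        print(chunk.choices[0].delta.content or "", end="")
    last = chunk

# With include_usage set, the stream ends with one extra chunk whose
# choices list is empty and whose usage holds the aggregated counts.
assert last is not None and not last.choices and last.usage is not None
print()
print(last.usage)  # per the snapshot: prompt_tokens=39, completion_tokens=3, total_tokens=42

Against the "Say 'OK!'" prompt this prints the streamed tokens ("OK", "!") followed by the usage totals from the snapshot; with include_usage unset or false, the should_send_usage guard in chat_completions means the trailing empty-choices chunk is never sent, matching the nousage snapshots.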