change ChatCompletionChunk to align with "OpenAI Chat Completions streaming API"

Moving after tool_calls2 Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> add in Buffering. Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> fix: handle usage outside of stream state and add tests Simplifying everything quite a bit. Remove the unused model_dump. Clippy. Clippy ? Ruff. Upgrade the flake for latest transformers. Upgrade after rebase. Remove potential footgun. Fix completion test.
2025-09-09 19:34:53 +00:00 · 2025-03-06 16:24:11 +01:00 · 2025-03-06 16:24:11 +01:00 · 818c8db29a
commit 818c8db29a
parent 622908deab
13 changed files with 581 additions and 708 deletions
--- a/flake.lock
+++ b/flake.lock
@ -2,10 +2,16 @@
  "nodes": {
    "cachix": {
      "inputs": {
-        "devenv": ["crate2nix"],
-        "flake-compat": ["crate2nix"],
+        "devenv": [
+          "crate2nix"
+        ],
+        "flake-compat": [
+          "crate2nix"
+        ],
        "nixpkgs": "nixpkgs",
-        "pre-commit-hooks": ["crate2nix"]
+        "pre-commit-hooks": [
+          "crate2nix"
+        ]
      },
      "locked": {
        "lastModified": 1709700175,
@ -24,10 +30,19 @@
    },
    "cachix_2": {
      "inputs": {
-        "devenv": ["crate2nix", "crate2nix_stable"],
-        "flake-compat": ["crate2nix", "crate2nix_stable"],
+        "devenv": [
+          "crate2nix",
+          "crate2nix_stable"
+        ],
+        "flake-compat": [
+          "crate2nix",
+          "crate2nix_stable"
+        ],
        "nixpkgs": "nixpkgs_2",
-        "pre-commit-hooks": ["crate2nix", "crate2nix_stable"]
+        "pre-commit-hooks": [
+          "crate2nix",
+          "crate2nix_stable"
+        ]
      },
      "locked": {
        "lastModified": 1716549461,
@ -46,8 +61,16 @@
    },
    "cachix_3": {
      "inputs": {
-        "devenv": ["crate2nix", "crate2nix_stable", "crate2nix_stable"],
-        "flake-compat": ["crate2nix", "crate2nix_stable", "crate2nix_stable"],
+        "devenv": [
+          "crate2nix",
+          "crate2nix_stable",
+          "crate2nix_stable"
+        ],
+        "flake-compat": [
+          "crate2nix",
+          "crate2nix_stable",
+          "crate2nix_stable"
+        ],
        "nixpkgs": "nixpkgs_3",
        "pre-commit-hooks": [
          "crate2nix",
@ -78,15 +101,18 @@
        "flake-compat": "flake-compat_3",
        "flake-parts": "flake-parts_3",
        "nix-test-runner": "nix-test-runner_3",
-        "nixpkgs": ["tgi-nix", "nixpkgs"],
+        "nixpkgs": [
+          "tgi-nix",
+          "nixpkgs"
+        ],
        "pre-commit-hooks": "pre-commit-hooks_3"
      },
      "locked": {
-        "lastModified": 1734429562,
-        "narHash": "sha256-V2XNs3Ir8WXNHdocfzkR/fu0FzkZ9uTDJkVecxJrGmQ=",
+        "lastModified": 1739473963,
+        "narHash": "sha256-ItAhpjNUzEWd/cgZVyW/jvoGbCec4TK29e1Mnmn1oJE=",
        "owner": "nix-community",
        "repo": "crate2nix",
-        "rev": "8537c2d7cb623679aaeff62c4c4c43a91566ab09",
+        "rev": "be31feae9a82c225c0fd1bdf978565dc452a483a",
        "type": "github"
      },
      "original": {
@ -193,7 +219,11 @@
    "devshell_2": {
      "inputs": {
        "flake-utils": "flake-utils_3",
-        "nixpkgs": ["crate2nix", "crate2nix_stable", "nixpkgs"]
+        "nixpkgs": [
+          "crate2nix",
+          "crate2nix_stable",
+          "nixpkgs"
+        ]
      },
      "locked": {
        "lastModified": 1717408969,
@ -212,7 +242,10 @@
    "devshell_3": {
      "inputs": {
        "flake-utils": "flake-utils_4",
-        "nixpkgs": ["crate2nix", "nixpkgs"]
+        "nixpkgs": [
+          "crate2nix",
+          "nixpkgs"
+        ]
      },
      "locked": {
        "lastModified": 1711099426,
@ -310,7 +343,11 @@
    },
    "flake-parts_2": {
      "inputs": {
-        "nixpkgs-lib": ["crate2nix", "crate2nix_stable", "nixpkgs"]
+        "nixpkgs-lib": [
+          "crate2nix",
+          "crate2nix_stable",
+          "nixpkgs"
+        ]
      },
      "locked": {
        "lastModified": 1719745305,
@ -328,7 +365,10 @@
    },
    "flake-parts_3": {
      "inputs": {
-        "nixpkgs-lib": ["crate2nix", "nixpkgs"]
+        "nixpkgs-lib": [
+          "crate2nix",
+          "nixpkgs"
+        ]
      },
      "locked": {
        "lastModified": 1712014858,
@ -519,7 +559,11 @@
    },
    "gitignore_3": {
      "inputs": {
-        "nixpkgs": ["crate2nix", "pre-commit-hooks", "nixpkgs"]
+        "nixpkgs": [
+          "crate2nix",
+          "pre-commit-hooks",
+          "nixpkgs"
+        ]
      },
      "locked": {
        "lastModified": 1709087332,
@ -726,10 +770,22 @@
    },
    "pre-commit-hooks_2": {
      "inputs": {
-        "flake-compat": ["crate2nix", "crate2nix_stable", "flake-compat"],
+        "flake-compat": [
+          "crate2nix",
+          "crate2nix_stable",
+          "flake-compat"
+        ],
        "gitignore": "gitignore_2",
-        "nixpkgs": ["crate2nix", "crate2nix_stable", "nixpkgs"],
-        "nixpkgs-stable": ["crate2nix", "crate2nix_stable", "nixpkgs"]
+        "nixpkgs": [
+          "crate2nix",
+          "crate2nix_stable",
+          "nixpkgs"
+        ],
+        "nixpkgs-stable": [
+          "crate2nix",
+          "crate2nix_stable",
+          "nixpkgs"
+        ]
      },
      "locked": {
        "lastModified": 1719259945,
@ -747,11 +803,20 @@
    },
    "pre-commit-hooks_3": {
      "inputs": {
-        "flake-compat": ["crate2nix", "flake-compat"],
+        "flake-compat": [
+          "crate2nix",
+          "flake-compat"
+        ],
        "flake-utils": "flake-utils_5",
        "gitignore": "gitignore_3",
-        "nixpkgs": ["crate2nix", "nixpkgs"],
-        "nixpkgs-stable": ["crate2nix", "nixpkgs"]
+        "nixpkgs": [
+          "crate2nix",
+          "nixpkgs"
+        ],
+        "nixpkgs-stable": [
+          "crate2nix",
+          "nixpkgs"
+        ]
      },
      "locked": {
        "lastModified": 1712055707,
@ -772,21 +837,27 @@
        "crate2nix": "crate2nix",
        "flake-utils": "flake-utils_6",
        "nix-filter": "nix-filter",
-        "nixpkgs": ["tgi-nix", "nixpkgs"],
+        "nixpkgs": [
+          "tgi-nix",
+          "nixpkgs"
+        ],
        "rust-overlay": "rust-overlay",
        "tgi-nix": "tgi-nix"
      }
    },
    "rust-overlay": {
      "inputs": {
-        "nixpkgs": ["tgi-nix", "nixpkgs"]
+        "nixpkgs": [
+          "tgi-nix",
+          "nixpkgs"
+        ]
      },
      "locked": {
-        "lastModified": 1738549608,
-        "narHash": "sha256-GdyT9QEUSx5k/n8kILuNy83vxxdyUfJ8jL5mMpQZWfw=",
+        "lastModified": 1741141853,
+        "narHash": "sha256-FauVtC+FbOgkKpGVuQTNxSqrvgbmVc7hFkjn/DacwMo=",
        "owner": "oxalica",
        "repo": "rust-overlay",
-        "rev": "35c6f8c4352f995ecd53896200769f80a3e8f22d",
+        "rev": "02edad1f19d6dec824e0812e4cdc0aa7930ff8ae",
        "type": "github"
      },
      "original": {
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@ -39,7 +39,13 @@ from typing import Dict, List, Optional
 from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
 from docker.errors import NotFound
 from syrupy.extensions.json import JSONSnapshotExtension
-
+from huggingface_hub.inference._generated.types.chat_completion import (
+    ChatCompletionStreamOutput,
+    ChatCompletionOutput,
+)
+from openai.types.chat.chat_completion_chunk import (
+    ChatCompletionChunk as OAIChatCompletionChunk,
+)
 from text_generation import AsyncClient
 from text_generation.types import (
    BestOfSequence,
--- a/integration-tests/models/snapshots/test_completion_prompts/test_chat_hfhub_nousage.json
+++ b/integration-tests/models/snapshots/test_completion_prompts/test_chat_hfhub_nousage.json
@ -0,0 +1,62 @@
+[
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "OK",
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741265520,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "!",
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741265520,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "",
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": "stop",
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741265520,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  }
+]
--- a/integration-tests/models/snapshots/test_completion_prompts/test_chat_hfhub_usage.json
+++ b/integration-tests/models/snapshots/test_completion_prompts/test_chat_hfhub_usage.json
@ -0,0 +1,75 @@
+[
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "OK",
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741266005,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "!",
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741266005,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "",
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": "stop",
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741266005,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [],
+    "created": 1741266005,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": {
+      "completion_tokens": 3,
+      "prompt_tokens": 39,
+      "total_tokens": 42
+    }
+  }
+]
--- a/integration-tests/models/snapshots/test_completion_prompts/test_chat_openai_nousage.json
+++ b/integration-tests/models/snapshots/test_completion_prompts/test_chat_openai_nousage.json
@ -0,0 +1,71 @@
+[
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "OK",
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741265134,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "!",
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741265134,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "",
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": "stop",
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741265134,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  }
+]
--- a/integration-tests/models/snapshots/test_completion_prompts/test_chat_openai_usage.json
+++ b/integration-tests/models/snapshots/test_completion_prompts/test_chat_openai_usage.json
@ -0,0 +1,87 @@
+[
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "OK",
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741265133,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "!",
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741265133,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "",
+          "function_call": null,
+          "refusal": null,
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": "stop",
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1741265133,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [],
+    "created": 1741265133,
+    "id": "",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "service_tier": null,
+    "system_fingerprint": "3.1.2-dev0-native",
+    "usage": {
+      "completion_tokens": 3,
+      "completion_tokens_details": null,
+      "prompt_tokens": 39,
+      "prompt_tokens_details": null,
+      "total_tokens": 42
+    }
+  }
+]
--- a/integration-tests/models/snapshots/test_completion_prompts/test_flash_llama_completion_many_prompts.json
+++ b/integration-tests/models/snapshots/test_completion_prompts/test_flash_llama_completion_many_prompts.json
@ -1,17 +1,17 @@
 {
  "choices": [
-    {
-      "finish_reason": "length",
-      "index": 0,
-      "logprobs": null,
-      "text": " A Beginner’s Guide\nDeep learning is a subset"
-    },
    {
      "finish_reason": "length",
      "index": 1,
      "logprobs": null,
      "text": " This is a question that has puzzled many people for"
    },
+    {
+      "finish_reason": "length",
+      "index": 0,
+      "logprobs": null,
+      "text": " A Beginner’s Guide\nDeep learning is a subset"
+    },
    {
      "finish_reason": "length",
      "index": 3,
@ -25,11 +25,11 @@
      "text": " Paris\nWhat is the capital of France?\nThe"
    }
  ],
-  "created": 1725877154,
+  "created": 1741264813,
  "id": "",
-  "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+  "model": "meta-llama/Llama-3.1-8B-Instruct",
  "object": "text_completion",
-  "system_fingerprint": "2.2.1-dev0-native",
+  "system_fingerprint": "3.1.2-dev0-native",
  "usage": {
    "completion_tokens": 40,
    "prompt_tokens": 22,
--- a/integration-tests/models/snapshots/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json
+++ b/integration-tests/models/snapshots/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json
@ -1,602 +0,0 @@
-[
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 0,
-        "logprobs": null,
-        "text": " A"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 1,
-        "logprobs": null,
-        "text": " This"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 2,
-        "logprobs": null,
-        "text": " Paris"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 3,
-        "logprobs": null,
-        "text": "us"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 0,
-        "logprobs": null,
-        "text": " Beginner"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 1,
-        "logprobs": null,
-        "text": " is"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 2,
-        "logprobs": null,
-        "text": "\n"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 3,
-        "logprobs": null,
-        "text": "cul"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 0,
-        "logprobs": null,
-        "text": "’s"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 1,
-        "logprobs": null,
-        "text": " a"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 2,
-        "logprobs": null,
-        "text": "What"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 3,
-        "logprobs": null,
-        "text": "as"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 0,
-        "logprobs": null,
-        "text": " Guide"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 1,
-        "logprobs": null,
-        "text": " question"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 2,
-        "logprobs": null,
-        "text": " is"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 3,
-        "logprobs": null,
-        "text": "_minus"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 0,
-        "logprobs": null,
-        "text": "\n"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 1,
-        "logprobs": null,
-        "text": " that"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 2,
-        "logprobs": null,
-        "text": " the"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 3,
-        "logprobs": null,
-        "text": "cul"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 0,
-        "logprobs": null,
-        "text": "Deep"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 1,
-        "logprobs": null,
-        "text": " has"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 2,
-        "logprobs": null,
-        "text": " capital"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 3,
-        "logprobs": null,
-        "text": "as"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 0,
-        "logprobs": null,
-        "text": " learning"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 1,
-        "logprobs": null,
-        "text": " puzzled"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 2,
-        "logprobs": null,
-        "text": " of"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 3,
-        "logprobs": null,
-        "text": "(s"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 0,
-        "logprobs": null,
-        "text": " is"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 1,
-        "logprobs": null,
-        "text": " many"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 2,
-        "logprobs": null,
-        "text": " France"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 3,
-        "logprobs": null,
-        "text": "):\n"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 0,
-        "logprobs": null,
-        "text": " a"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 1,
-        "logprobs": null,
-        "text": " people"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 2,
-        "logprobs": null,
-        "text": "?\n"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "",
-        "index": 3,
-        "logprobs": null,
-        "text": "   "
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "length",
-        "index": 0,
-        "logprobs": null,
-        "text": " subset"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "length",
-        "index": 1,
-        "logprobs": null,
-        "text": " for"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "length",
-        "index": 2,
-        "logprobs": null,
-        "text": "The"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  },
-  {
-    "choices": [
-      {
-        "finish_reason": "length",
-        "index": 3,
-        "logprobs": null,
-        "text": " \"\"\"\n"
-      }
-    ],
-    "created": 1725883643,
-    "id": "",
-    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-    "object": "text_completion",
-    "system_fingerprint": "2.2.1-dev0-native"
-  }
-]
--- a/integration-tests/models/snapshots/test_completion_prompts/test_flash_llama_completion_single_prompt.json
+++ b/integration-tests/models/snapshots/test_completion_prompts/test_flash_llama_completion_single_prompt.json
@ -7,11 +7,11 @@
      "text": " A Beginner’s Guide\nDeep learning is a subset"
    }
  ],
-  "created": 1725876621,
+  "created": 1741264812,
  "id": "",
-  "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+  "model": "meta-llama/Llama-3.1-8B-Instruct",
  "object": "text_completion",
-  "system_fingerprint": "2.2.1-dev0-native",
+  "system_fingerprint": "3.1.2-dev0-native",
  "usage": {
    "completion_tokens": 10,
    "prompt_tokens": 6,
--- a/integration-tests/models/test_chat_stream_options.py
+++ b/integration-tests/models/test_chat_stream_options.py
@ -0,0 +1,16 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def chat_handle(launcher):
+    with launcher(
+        "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def chat_client(chat_handle):
+    await chat_handle.health(300)
+    return chat_handle.client
+
--- a/integration-tests/models/test_completion_prompts.py
+++ b/integration-tests/models/test_completion_prompts.py
@ -2,8 +2,8 @@ import pytest
 import requests
 import json
 from aiohttp import ClientSession
+from openai import OpenAI
 from huggingface_hub import InferenceClient
-
 from text_generation.types import Completion


@ -158,47 +158,30 @@ def test_flash_llama_completion_many_prompts(flash_llama_completion, response_sn
 async def test_flash_llama_completion_many_prompts_stream(
    flash_llama_completion, response_snapshot
 ):
-    request = {
-        "model": "tgi",
-        "prompt": [
+    client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1")
+    stream = client.completion(
+        model="tgi",
+        prompt=[
            "What is Deep Learning?",
            "Is water wet?",
            "What is the capital of France?",
            "def mai",
        ],
-        "max_tokens": 10,
-        "seed": 0,
-        "temperature": 0.0,
-        "stream": True,
-    }
+        max_tokens=10,
+        seed=0,
+        temperature=0.0,
+        stream=True,
+    )

-    url = f"{flash_llama_completion.base_url}/v1/completions"
-
-    chunks = []
    strings = [""] * 4
-    async with ClientSession(headers=flash_llama_completion.headers) as session:
-        async with session.post(url, json=request) as response:
-            # iterate over the stream
-            async for chunk in response.content.iter_any():
-                # remove "data:"
-                chunk = chunk.decode().split("\n\n")
-                # remove "data:" if present
-                chunk = [c.replace("data:", "") for c in chunk]
-                # remove empty strings
-                chunk = [c for c in chunk if c]
-                # remove completion marking chunk
-                chunk = [c for c in chunk if c != " [DONE]"]
-                # parse json
-                chunk = [json.loads(c) for c in chunk]
+    chunks = []
+    for chunk in stream:
+        chunks.append(chunk)
+        assert "choices" in chunk
+        index = chunk.choices[0].index
+        assert 0 <= index <= 4
+        strings[index] += chunk.choices[0].text

-                for c in chunk:
-                    chunks.append(Completion(**c))
-                    assert "choices" in c
-                    index = c["choices"][0]["index"]
-                    assert 0 <= index <= 4
-                    strings[index] += c["choices"][0]["text"]
-
-    assert response.status == 200
    assert list(strings) == [
        " A Beginner’s Guide\nDeep learning is a subset",
        " This is a question that has puzzled many people for",
@ -206,3 +189,92 @@ async def test_flash_llama_completion_many_prompts_stream(
        'usculas_minusculas(s):\n    """\n',
    ]
    assert chunks == response_snapshot
+
+
+@pytest.mark.release
+async def test_chat_openai_usage(flash_llama_completion, response_snapshot):
+    """OpenAI client, include_usage=True: only the final chunk may carry usage."""
+    endpoint = f"{flash_llama_completion.base_url}/v1"
+    openai_client = OpenAI(api_key="xx", base_url=endpoint)
+    events = openai_client.chat.completions.create(
+        model="tgi",
+        messages=[{"role": "user", "content": "Say 'OK!'"}],
+        stream=True,
+        max_tokens=10,
+        seed=42,
+        stream_options={"include_usage": True},
+    )
+
+    # Drain the stream first so the final chunk can be told apart from the rest.
+    chunks = list(events)
+    for earlier in chunks[:-1]:
+        assert earlier.usage is None
+    for final in chunks[-1:]:
+        assert final.usage is not None
+
+    assert chunks == response_snapshot
+
+
+@pytest.mark.release
+async def test_chat_openai_nousage(flash_llama_completion, response_snapshot):
+    """OpenAI client, include_usage=False: no streamed chunk may carry usage."""
+    endpoint = f"{flash_llama_completion.base_url}/v1"
+    openai_client = OpenAI(api_key="xx", base_url=endpoint)
+    events = openai_client.chat.completions.create(
+        model="tgi",
+        messages=[{"role": "user", "content": "Say 'OK!'"}],
+        stream=True,
+        max_tokens=10,
+        seed=42,
+        stream_options={"include_usage": False},
+    )
+
+    received = []
+    for event in events:
+        assert event.usage is None
+        received.append(event)
+    assert received == response_snapshot
+
+
+@pytest.mark.release
+async def test_chat_hfhub_usage(flash_llama_completion, response_snapshot):
+    """InferenceClient, include_usage=True: only the final chunk may carry usage."""
+    endpoint = f"{flash_llama_completion.base_url}/v1"
+    hub_client = InferenceClient(base_url=endpoint)
+    events = hub_client.chat_completion(
+        model="tgi",
+        messages=[{"role": "user", "content": "Say 'OK!'"}],
+        stream=True,
+        max_tokens=10,
+        seed=42,
+        stream_options={"include_usage": True},
+    )
+
+    # Drain the stream first so the final chunk can be told apart from the rest.
+    chunks = list(events)
+    for earlier in chunks[:-1]:
+        assert earlier.usage is None
+    for final in chunks[-1:]:
+        assert final.usage is not None
+
+    assert chunks == response_snapshot
+
+
+@pytest.mark.release
+async def test_chat_hfhub_nousage(flash_llama_completion, response_snapshot):
+    """InferenceClient, include_usage=False: no streamed chunk may carry usage."""
+    hub = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1")
+    events = hub.chat_completion(
+        model="tgi",
+        messages=[{"role": "user", "content": "Say 'OK!'"}],
+        stream=True,
+        max_tokens=10,
+        seed=42,
+        stream_options={"include_usage": False},
+    )
+
+    received = []
+    for event in events:
+        assert event.usage is None
+        received.append(event)
+    assert received == response_snapshot
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@ -764,7 +764,6 @@ impl ChatCompletionChunk {
        created: u64,
        logprobs: Option<ChatCompletionLogprobs>,
        finish_reason: Option<String>,
-        usage: Option<Usage>,
    ) -> Self {
        let delta = match (delta, tool_calls) {
            (Some(delta), _) => ChatCompletionDelta::Chat(TextMessage {
@ -801,7 +800,7 @@ impl ChatCompletionChunk {
                logprobs,
                finish_reason,
            }],
-            usage,
+            usage: None,
        }
    }
 }
--- a/router/src/server.rs
+++ b/router/src/server.rs
@ -1124,7 +1124,6 @@ enum StreamState {
 fn create_event_from_stream_token(
    stream_token: &StreamResponse,
    logprobs: bool,
-    stream_options: Option<StreamOptions>,
    inner_using_tools: bool,
    system_fingerprint: String,
    model_id: String,
@ -1151,30 +1150,10 @@ fn create_event_from_stream_token(

        (content, None)
    };
-
-    let (usage, finish_reason) = match &stream_token.details {
-        Some(details) => {
-            let usage = if stream_options
-                .as_ref()
-                .map(|s| s.include_usage)
-                .unwrap_or(false)
-            {
-                let completion_tokens = details.generated_tokens;
-                let prompt_tokens = details.input_length;
-                let total_tokens = prompt_tokens + completion_tokens;
-                Some(Usage {
-                    completion_tokens,
-                    prompt_tokens,
-                    total_tokens,
-                })
-            } else {
-                None
-            };
-            (usage, Some(details.finish_reason.format(true)))
-        }
-        None => (None, None),
-    };
-
+    let finish_reason = stream_token
+        .details
+        .as_ref()
+        .map(|details| details.finish_reason.format(true));
    let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk::new(
        model_id.clone(),
        system_fingerprint.clone(),
@ -1183,7 +1162,6 @@ fn create_event_from_stream_token(
        current_time,
        logprobs,
        finish_reason,
-        usage,
    ));

    event.json_data(chat_complete).unwrap_or_else(|e| {
@ -1287,6 +1265,17 @@ pub(crate) async fn chat_completions(
                match result{
                Ok(stream_token) => {
                    let token_text = &stream_token.token.text.clone();
+                    let usage = stream_token.details.as_ref().map(|details| {
+                        let completion_tokens = details.generated_tokens;
+                        let prompt_tokens = details.input_length;
+                        let total_tokens = prompt_tokens + completion_tokens;
+
+                        Usage {
+                            completion_tokens,
+                            prompt_tokens,
+                            total_tokens,
+                        }
+                    });
                    match state {
                        StreamState::Buffering => {
                            json_buffer.push_str(&token_text.replace(" ", ""));
@ -1307,7 +1296,6 @@ pub(crate) async fn chat_completions(
                                        let event = create_event_from_stream_token(
                                            stream_token,
                                            logprobs,
-                                            stream_options.clone(),
                                            response_as_tool,
                                            system_fingerprint.clone(),
                                            model_id.clone(),
@ -1347,7 +1335,6 @@ pub(crate) async fn chat_completions(
                                        current_time,
                                        None,
                                        None,
-                                        None,
                                    ));
                                yield Ok(event.json_data(chat_complete).unwrap_or_else(|e| {
                                    InferError::StreamSerializationError(e.to_string()).into()
@ -1369,7 +1356,6 @@ pub(crate) async fn chat_completions(
                            let event = create_event_from_stream_token(
                                &stream_token,
                                logprobs,
-                                stream_options.clone(),
                                response_as_tool,
                                system_fingerprint.clone(),
                                model_id.clone(),
@ -1378,6 +1364,36 @@ pub(crate) async fn chat_completions(
                            yield Ok::<Event, Infallible>(event);
                        }
                    }
+
+                    let should_send_usage = usage.is_some()
+                        && stream_options
+                            .as_ref()
+                            .is_some_and(|opts| opts.include_usage);
+
+                    if should_send_usage {
+                        let usage_data = usage.unwrap();
+                        let current_time = std::time::SystemTime::now()
+                            .duration_since(std::time::UNIX_EPOCH)
+                            .unwrap_or_else(|_| std::time::Duration::from_secs(0))
+                            .as_secs();
+
+                        let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk {
+                            id: String::new(),
+                            created: current_time,
+                            model: model_id.clone(),
+                            system_fingerprint: system_fingerprint.clone(),
+                            choices: vec![],
+                            usage: Some(Usage {
+                                prompt_tokens: usage_data.prompt_tokens,
+                                completion_tokens: usage_data.completion_tokens,
+                                total_tokens: usage_data.total_tokens,
+                            }),
+                        });
+
+                        yield Ok(Event::default()
+                            .json_data(chat_complete)
+                            .unwrap_or_else(|e| InferError::StreamSerializationError(e.to_string()).into()));
+                    }
                }
                Err(err) => yield Ok(err.into_openai_event())
                }