Only send usage in streamed chat completions when the client asks for it (`stream_options.include_usage`).

2025-09-12 04:44:52 +00:00 · 2024-09-18 12:56:59 +02:00 · 2024-09-18 12:56:59 +02:00 · df287fe758
commit df287fe758
parent 4716bd51ad
3 changed files with 69 additions and 12 deletions
--- a/integration-tests/models/test_completion_prompts.py
+++ b/integration-tests/models/test_completion_prompts.py
@@ -68,7 +68,7 @@ async def test_flash_llama_completion_stream_usage(
    }
    string = ""
    chunks = []
-    is_final = False
+    had_usage = False
    async with ClientSession(headers=flash_llama_completion.headers) as session:
        async with session.post(url, json=request) as response:
            # iterate over the stream
@@ -93,18 +93,68 @@ async def test_flash_llama_completion_stream_usage(
                        string += c["choices"][0]["delta"]["content"]

                        has_usage = c["usage"] is not None
-                        assert not is_final
+                        assert not had_usage
                        if has_usage:
-                            is_final = True
+                            had_usage = True
                    else:
                        raise RuntimeError("Expected different payload")
-    assert is_final
+    assert had_usage
    assert (
        string
        == "**Deep Learning: An Overview**\n=====================================\n\n"
    )
    assert chunks == response_snapshot

+    request = {
+        "model": "tgi",
+        "messages": [
+            {
+                "role": "user",
+                "content": "What is Deep Learning?",
+            }
+        ],
+        "max_tokens": 10,
+        "temperature": 0.0,
+        "stream": True,
+    }
+    string = ""
+    chunks = []
+    had_usage = False
+    async with ClientSession(headers=flash_llama_completion.headers) as session:
+        async with session.post(url, json=request) as response:
+            # iterate over the stream
+            async for chunk in response.content.iter_any():
+                # decode the bytes and split on blank lines
+                chunk = chunk.decode().split("\n\n")
+                # remove "data:" if present
+                chunk = [c.replace("data:", "") for c in chunk]
+                # remove empty strings
+                chunk = [c for c in chunk if c]
+                # remove completion marking chunk
+                chunk = [c for c in chunk if c != " [DONE]"]
+                # parse json
+                chunk = [json.loads(c) for c in chunk]
+
+                for c in chunk:
+                    chunks.append(ChatCompletionChunk(**c))
+                    assert "choices" in c
+                    if len(c["choices"]) == 1:
+                        index = c["choices"][0]["index"]
+                        assert index == 0
+                        string += c["choices"][0]["delta"]["content"]
+
+                        has_usage = c["usage"] is not None
+                        assert not had_usage
+                        if has_usage:
+                            had_usage = True
+                    else:
+                        raise RuntimeError("Expected different payload")
+    assert not had_usage
+    assert (
+        string
+        == "**Deep Learning: An Overview**\n=====================================\n\n"
+    )
+

@pytest.mark.release
 def test_flash_llama_completion_many_prompts(flash_llama_completion, response_snapshot):
--- a/nix/client.nix
+++ b/nix/client.nix
@@ -6,7 +6,7 @@
 }:

 buildPythonPackage {
-  name = "text-generation-x";
+  name = "text-generation";

  src = ../clients/python;

--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -1175,6 +1175,7 @@ async fn chat_completions(
        seed,
        stop,
        stream,
+        stream_options,
        tools,
        tool_choice,
        tool_prompt,
@@ -1267,17 +1268,23 @@ async fn chat_completions(

            let (usage, finish_reason) = match stream_token.details {
                Some(details) => {
-                    let completion_tokens = details.generated_tokens;
-                    let prompt_tokens = details.input_length;
-                    let total_tokens = prompt_tokens + completion_tokens;
-                    (
+                    let usage = if stream_options
+                        .as_ref()
+                        .map(|s| s.include_usage)
+                        .unwrap_or(false)
+                    {
+                        let completion_tokens = details.generated_tokens;
+                        let prompt_tokens = details.input_length;
+                        let total_tokens = prompt_tokens + completion_tokens;
                        Some(Usage {
                            completion_tokens,
                            prompt_tokens,
                            total_tokens,
-                        }),
-                        Some(details.finish_reason.format(true)),
-                    )
+                        })
+                    } else {
+                        None
+                    };
+                    (usage, Some(details.finish_reason.format(true)))
                }
                None => (None, None),
            };