Compare commits

1 Commit
main...v3.3.5

Author        SHA1        Message                Date
Alvaro Moran  8d029d2fc3  chore: release v3.3.5  2025-09-02 16:58:41 +02:00
16 changed files with 31 additions and 47 deletions

@@ -175,6 +175,13 @@ jobs:
           registry: docker.io
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      - name: Login to Azure Container Registry
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.AZURE_DOCKER_USERNAME }}
+          password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
+          registry: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io
       # If pull request
       - name: Extract metadata (tags, labels) for Docker
         if: ${{ github.event_name == 'pull_request' }}
@@ -196,6 +203,7 @@ jobs:
           images: |
             registry.internal.huggingface.tech/api-inference/community/text-generation-inference
             ghcr.io/huggingface/text-generation-inference
+            db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
           tags: |
             type=semver,pattern={{version}}${{ env.LABEL_EXTENSION }}
             type=semver,pattern={{major}}.{{minor}}${{ env.LABEL_EXTENSION }}
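
The images and tags lists above combine multiplicatively: docker/metadata-action produces every tag pattern for every listed registry, so adding the Azure registry is all it takes to publish release images there as well. A toy Python sketch of that expansion (an illustration of the config's effect, not the action's actual implementation), assuming a release tag of v3.3.5 and an empty LABEL_EXTENSION:

# Toy expansion of the metadata-action config above (assumptions: git tag
# v3.3.5, empty LABEL_EXTENSION). Not the real action, just its tag arithmetic.
version = "3.3.5"  # metadata-action strips the leading "v" from the git tag
images = [
    "registry.internal.huggingface.tech/api-inference/community/text-generation-inference",
    "ghcr.io/huggingface/text-generation-inference",
    "db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference",
]
major, minor, _ = version.split(".")
for image in images:
    for tag in (version, f"{major}.{minor}"):  # {{version}}, {{major}}.{{minor}}
        print(f"{image}:{tag}")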

Cargo.lock (generated; 16 lines changed)

@@ -4650,7 +4650,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-backends-trtllm"
-version = "3.3.5-dev0"
+version = "3.3.5"
 dependencies = [
  "async-trait",
  "clap 4.5.32",
@@ -4671,7 +4671,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-benchmark"
-version = "3.3.5-dev0"
+version = "3.3.5"
 dependencies = [
  "average",
  "clap 4.5.32",
@@ -4691,7 +4691,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-client"
-version = "3.3.5-dev0"
+version = "3.3.5"
 dependencies = [
  "async-trait",
  "base64 0.22.1",
@@ -4709,7 +4709,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-launcher"
-version = "3.3.5-dev0"
+version = "3.3.5"
 dependencies = [
  "clap 4.5.32",
  "ctrlc",
@@ -4730,7 +4730,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router"
-version = "3.3.5-dev0"
+version = "3.3.5"
 dependencies = [
  "anyhow",
  "async-stream",
@@ -4782,7 +4782,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router-llamacpp"
-version = "3.3.5-dev0"
+version = "3.3.5"
 dependencies = [
  "async-trait",
  "bindgen 0.71.1",
@@ -4800,7 +4800,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router-v2"
-version = "3.3.5-dev0"
+version = "3.3.5"
 dependencies = [
  "async-stream",
  "async-trait",
@@ -4849,7 +4849,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router-v3"
-version = "3.3.5-dev0"
+version = "3.3.5"
 dependencies = [
  "async-stream",
  "async-trait",

@@ -21,7 +21,7 @@ default-members = [
 resolver = "2"
 
 [workspace.package]
-version = "3.3.5-dev0"
+version = "3.3.5"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
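
This single [workspace.package] bump drives all of the Cargo.lock changes above, since the member crates inherit the workspace version. A hedged sanity-check sketch for such a release commit (the file path and the crate-name prefix are assumptions, not part of this diff):

# Hedged release check: every text-generation-* crate recorded in Cargo.lock
# should carry the new workspace version. Assumes it runs at the repo root.
import re

EXPECTED = "3.3.5"
lock = open("Cargo.lock").read()
# Cargo.lock places each package's version line directly after its name line.
pairs = re.findall(r'name = "(text-generation-[^"]+)"\nversion = "([^"]+)"', lock)
for name, version in pairs:
    assert version == EXPECTED, f"{name} is at {version}, expected {EXPECTED}"
print(f"{len(pairs)} workspace crates at {EXPECTED}")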

@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "3.3.5-dev0"
+    "version": "3.3.5"
   },
   "paths": {
     "/": {

@@ -83,7 +83,7 @@ docker run \
     -e "HF_TOKEN=$HF_TOKEN" \
     -v "$HOME/models:/app/models" \
     tgi-llamacpp \
-    --n-gpu-layers 99 \
+    --n-gpu-layers 99
     --model-id "Qwen/Qwen2.5-3B-Instruct"
 ```

@@ -27,14 +27,14 @@ For example, a system can generate 100 tokens per second. If the system generate
 <div class="block dark:hidden">
 	<iframe
-		src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=light"
+		src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=light"
 		width="850"
 		height="350"
 	></iframe>
 </div>
 <div class="hidden dark:block">
 	<iframe
-		src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=dark"
+		src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=dark"
 		width="850"
 		height="350"
 	></iframe>

@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 42,
     "prompt_tokens": 277,

@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 62,
     "prompt_tokens": 277,

@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 67,
     "prompt_tokens": 277,

@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 72,
     "prompt_tokens": 275,

@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 80,
     "prompt_tokens": 279,

@@ -14,7 +14,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 35,
     "prompt_tokens": 32,

@@ -14,7 +14,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 44,
     "prompt_tokens": 37,

@@ -18,7 +18,7 @@
   "id": "",
   "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 10,
     "prompt_tokens": 45,
@@ -44,7 +44,7 @@
   "id": "",
   "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 10,
     "prompt_tokens": 45,

@@ -17,7 +17,7 @@
   "id": "",
   "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 10,
     "prompt_tokens": 45,

@@ -1,7 +1,6 @@
 from typing import Optional
 from contextvars import ContextVar
 from contextlib import contextmanager
-import math
 
 import flashinfer
 import torch
@@ -21,20 +20,6 @@ decode_state: ContextVar[flashinfer.BatchDecodeWithPagedKVCacheWrapper] = Contex
 
 workspace: Optional[torch.Tensor] = None
 
-
-def unpad_2d_mask(
-    attention_mask: torch.Tensor, seq_lengths: torch.Tensor
-) -> torch.Tensor:
-    # Like torch unpad_sequence, but for 2D masks.
-    unpadded_tensors = []
-    for i, length in enumerate(seq_lengths):
-        unpadded_matrix = attention_mask[i, :length, :length]
-        unpadded_tensors.append(unpadded_matrix.flatten())
-    packed_tensor = torch.cat(unpadded_tensors)
-    return packed_tensor
-
-
 def get_workspace(device):
     """Get shared flashinfer workspace."""
     global workspace
@@ -98,15 +83,6 @@ def use_prefill_with_paged_kv_state(
         last_page_len += 1
 
     token = prefill_with_paged_kv_state.set(state)
 
-    # Attention masks are padded, unpad.
-    if custom_mask is not None:
-        bs = input_lengths.shape[0]
-        seq_len = math.isqrt(custom_mask.numel() // bs)
-        custom_mask = unpad_2d_mask(
-            custom_mask.reshape(bs, seq_len, seq_len), input_lengths
-        )
-
     try:
         state.plan(
             qo_indptr=cu_seqlens,
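
For reference, the helper removed above packed a batch of padded square attention masks into the flat layout used for flashinfer custom masks: for each sequence, only its length-by-length block survives, and the blocks are concatenated. A self-contained demo of that behavior on toy data (the helper body is copied from the removed code; the toy tensors are illustrative):

import torch

def unpad_2d_mask(
    attention_mask: torch.Tensor, seq_lengths: torch.Tensor
) -> torch.Tensor:
    # Like torch unpad_sequence, but for 2D masks (body as removed above).
    unpadded_tensors = []
    for i, length in enumerate(seq_lengths):
        unpadded_tensors.append(attention_mask[i, :length, :length].flatten())
    return torch.cat(unpadded_tensors)

masks = torch.zeros(2, 4, 4, dtype=torch.bool)  # batch of 2, padded to 4x4
masks[0, :2, :2] = True  # sequence 0 really has length 2
masks[1, :3, :3] = True  # sequence 1 really has length 3
packed = unpad_2d_mask(masks, torch.tensor([2, 3]))
print(packed.shape)  # torch.Size([13]) == 2*2 + 3*3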