From 0f791622883238924da013464522eff05794195a Mon Sep 17 00:00:00 2001 From: Alvaro Moran <6949769+tengomucho@users.noreply.github.com> Date: Tue, 2 Sep 2025 15:35:42 +0200 Subject: [PATCH] chore: prepare version 3.3.5 (#3314) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: prepare version 3.3.5 * black * neuron: black * Update hf-xet in uv lockfile * Attempt to fix API doc check failure Add `error_type` where missing. * Pin redocly version * Sync redocly with Nix for now --------- Co-authored-by: Daniël de Kok --- .github/workflows/autodocs.yaml | 2 +- Cargo.lock | 16 ++--- Cargo.toml | 2 +- README.md | 6 +- .../docker_commands/docker_commands.md | 10 +-- backends/neuron/tests/server/test_prefill.py | 1 + docs/openapi.json | 62 ++++++++++++------- docs/source/backends/gaudi.mdx | 6 +- docs/source/backends/neuron.md | 2 +- .../basic_tutorials/gated_model_access.md | 2 +- docs/source/conceptual/quantization.md | 6 +- docs/source/installation_amd.md | 2 +- docs/source/installation_intel.md | 4 +- docs/source/installation_nvidia.md | 2 +- docs/source/quicktour.md | 4 +- docs/source/reference/api_reference.md | 2 +- ...est_flash_gemma3_image_base64_rgb_jpg.json | 2 +- ...est_flash_gemma3_image_base64_rgb_png.json | 2 +- .../test_flash_gemma3_image_base64_rgba.json | 2 +- .../test_flash_gemma3_image_cow.json | 2 +- .../test_flash_gemma3_image_cow_dog.json | 2 +- .../test_json_schema_basic.json | 2 +- .../test_json_schema_complex.json | 2 +- .../test_mllama/test_mllama_load.json | 4 +- .../test_mllama/test_mllama_simpl.json | 2 +- .../test_grammar_response_format_llama.py | 16 ++++- router/src/server.rs | 40 ++++++------ .../models/custom_modeling/idefics2.py | 2 +- .../models/custom_modeling/idefics3.py | 2 +- .../models/custom_modeling/idefics_config.py | 2 +- .../custom_modeling/idefics_modeling.py | 2 +- .../models/custom_modeling/idefics_vision.py | 2 +- .../models/custom_modeling/llava_next.py | 2 +- .../models/custom_modeling/neox_modeling.py | 2 +- .../models/custom_modeling/t5_modeling.py | 2 +- .../text_generation_server/utils/segments.py | 2 +- server/uv.lock | 32 +++++----- 37 files changed, 143 insertions(+), 112 deletions(-) diff --git a/.github/workflows/autodocs.yaml b/.github/workflows/autodocs.yaml index a768f263..4e799939 100644 --- a/.github/workflows/autodocs.yaml +++ b/.github/workflows/autodocs.yaml @@ -41,5 +41,5 @@ jobs: - name: Check that documentation is up-to-date run: | - npm install -g @redocly/cli + npm install -g @redocly/cli@1.34.2 python update_doc.py --check diff --git a/Cargo.lock b/Cargo.lock index 7e172ed5..165cb590 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4650,7 +4650,7 @@ dependencies = [ [[package]] name = "text-generation-backends-trtllm" -version = "3.3.4-dev0" +version = "3.3.5-dev0" dependencies = [ "async-trait", "clap 4.5.32", @@ -4671,7 +4671,7 @@ dependencies = [ [[package]] name = "text-generation-benchmark" -version = "3.3.4-dev0" +version = "3.3.5-dev0" dependencies = [ "average", "clap 4.5.32", @@ -4691,7 +4691,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "3.3.4-dev0" +version = "3.3.5-dev0" dependencies = [ "async-trait", "base64 0.22.1", @@ -4709,7 +4709,7 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "3.3.4-dev0" +version = "3.3.5-dev0" dependencies = [ "clap 4.5.32", "ctrlc", @@ -4730,7 +4730,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "3.3.4-dev0" +version = "3.3.5-dev0" dependencies = [
"anyhow", "async-stream", @@ -4782,7 +4782,7 @@ dependencies = [ [[package]] name = "text-generation-router-llamacpp" -version = "3.3.4-dev0" +version = "3.3.5-dev0" dependencies = [ "async-trait", "bindgen 0.71.1", @@ -4800,7 +4800,7 @@ dependencies = [ [[package]] name = "text-generation-router-v2" -version = "3.3.4-dev0" +version = "3.3.5-dev0" dependencies = [ "async-stream", "async-trait", @@ -4849,7 +4849,7 @@ dependencies = [ [[package]] name = "text-generation-router-v3" -version = "3.3.4-dev0" +version = "3.3.5-dev0" dependencies = [ "async-stream", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index 065046bc..a32d8e7f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ default-members = [ resolver = "2" [workspace.package] -version = "3.3.4-dev0" +version = "3.3.5-dev0" edition = "2021" authors = ["Olivier Dehaene"] homepage = "https://github.com/huggingface/text-generation-inference" diff --git a/README.md b/README.md index f9a45bc2..0890d9c6 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta volume=$PWD/data docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model ``` And then you can make requests like @@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \ **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar. -**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4-rocm --model-id $model` instead of the command above. +**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5-rocm --model-id $model` instead of the command above. 
To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli): ``` @@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading token= docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model ``` ### A note on Shared Memory (shm) diff --git a/backends/gaudi/examples/docker_commands/docker_commands.md b/backends/gaudi/examples/docker_commands/docker_commands.md index 22b9d34b..ccacfbdb 100644 --- a/backends/gaudi/examples/docker_commands/docker_commands.md +++ b/backends/gaudi/examples/docker_commands/docker_commands.md @@ -19,7 +19,7 @@ docker run -p 8080:80 \ --ipc=host \ -v $volume:/data \ -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \ --model-id $model \ --max-input-tokens 1024 --max-total-tokens 2048 \ --max-batch-prefill-tokens 2048 --max-batch-size 32 \ @@ -39,7 +39,7 @@ docker run -p 8080:80 \ --ipc=host \ -v $volume:/data \ -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -58,7 +58,7 @@ docker run -p 8080:80 \ --cap-add=sys_nice \ --ipc=host \ -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \ --model-id $model \ --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ --max-total-tokens 8192 --max-batch-size 4 @@ -81,7 +81,7 @@ docker run -p 8080:80 \ --ipc=host \ -v $volume:/data \ -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \ --model-id $model \ --kv-cache-dtype fp8_e4m3fn \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -102,7 +102,7 @@ docker run -p 8080:80 \ --ipc=host \ -v $volume:/data \ -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \ --model-id $model \ --kv-cache-dtype fp8_e4m3fn \ --sharded true --num-shard 8 \ diff --git a/backends/neuron/tests/server/test_prefill.py b/backends/neuron/tests/server/test_prefill.py index 796e4817..1061fbc4 100644 --- a/backends/neuron/tests/server/test_prefill.py +++ b/backends/neuron/tests/server/test_prefill.py @@ -56,6 +56,7 @@ def _test_prefill(config_name, generator, batch_size, do_sample): assert tokens.ids[0] == expectations[0] assert tokens.texts[0] == expectations[1] + def test_prefill_truncate(neuron_model_config): config_name = neuron_model_config["name"] neuron_model_path = neuron_model_config["neuron_model_path"] diff --git a/docs/openapi.json b/docs/openapi.json index 63572257..6225f5e7 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -10,7 +10,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, - "version": "3.3.4-dev0" + "version": "3.3.5-dev0" }, "paths": { "/": { @@ -57,7 +57,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Input validation error" + "error": "Input validation error", + "error_type": "validation" } } } @@ -70,7 +71,8 @@ "$ref": 
"#/components/schemas/ErrorResponse" }, "example": { - "error": "Request failed during generation" + "error": "Request failed during generation", + "error_type": "generation" } } } @@ -83,7 +85,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Model is overloaded" + "error": "Model is overloaded", + "error_type": "overloaded" } } } @@ -96,7 +99,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Incomplete generation" + "error": "Incomplete generation", + "error_type": "incomplete_generation" } } } @@ -181,7 +185,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Input validation error" + "error": "Input validation error", + "error_type": "validation" } } } @@ -194,7 +199,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Request failed during generation" + "error": "Request failed during generation", + "error_type": "generation" } } } @@ -207,7 +213,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Model is overloaded" + "error": "Model is overloaded", + "error_type": "overloaded" } } } @@ -220,7 +227,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Incomplete generation" + "error": "Incomplete generation", + "error_type": "incomplete_generation" } } } @@ -264,7 +272,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Input validation error" + "error": "Input validation error", + "error_type": "validation" } } } @@ -277,7 +286,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Request failed during generation" + "error": "Request failed during generation", + "error_type": "generation" } } } @@ -290,7 +300,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Model is overloaded" + "error": "Model is overloaded", + "error_type": "overloaded" } } } @@ -303,7 +314,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Incomplete generation" + "error": "Incomplete generation", + "error_type": "incomplete_generation" } } } @@ -558,7 +570,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Input validation error" + "error": "Input validation error", + "error_type": "validation" } } } @@ -571,7 +584,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Request failed during generation" + "error": "Request failed during generation", + "error_type": "generation" } } } @@ -584,7 +598,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Model is overloaded" + "error": "Model is overloaded", + "error_type": "overloaded" } } } @@ -597,7 +612,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Incomplete generation" + "error": "Incomplete generation", + "error_type": "incomplete_generation" } } } @@ -646,7 +662,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Input validation error" + "error": "Input validation error", + "error_type": "validation" } } } @@ -659,7 +676,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Request failed during generation" + "error": "Request failed during generation", + "error_type": "generation" } } } @@ -672,7 +690,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Model is overloaded" + "error": "Model is overloaded", + "error_type": "overloaded" } } } @@ -685,7 +704,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": 
{ - "error": "Incomplete generation" + "error": "Incomplete generation", + "error_type": "incomplete_generation" } } } diff --git a/docs/source/backends/gaudi.mdx b/docs/source/backends/gaudi.mdx index 702a9b80..07d34a82 100644 --- a/docs/source/backends/gaudi.mdx +++ b/docs/source/backends/gaudi.mdx @@ -20,7 +20,7 @@ hf_token=YOUR_HF_ACCESS_TOKEN docker run --runtime=habana --cap-add=sys_nice --ipc=host \ -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \ --model-id $model ``` @@ -52,7 +52,7 @@ hf_token=YOUR_ACCESS_TOKEN docker run --runtime=habana --cap-add=sys_nice --ipc=host \ -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \ --model-id $model ``` @@ -106,7 +106,7 @@ docker run -p 8080:80 \ --cap-add=sys_nice \ --ipc=host \ -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \ --model-id $model \ --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ --max-total-tokens 8192 --max-batch-size 4 diff --git a/docs/source/backends/neuron.md b/docs/source/backends/neuron.md index 6ba57502..17d720db 100644 --- a/docs/source/backends/neuron.md +++ b/docs/source/backends/neuron.md @@ -31,7 +31,7 @@ deployment instructions in the model card: The service is launched simply by running the text-generation-inference container with two sets of parameters: ``` -docker run ghcr.io/huggingface/text-generation-inference:3.3.4-neuron +docker run ghcr.io/huggingface/text-generation-inference:3.3.5-neuron ``` - system parameters are used to map ports, volumes and devices between the host and the service, diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md index cf164c32..d42bac7a 100644 --- a/docs/source/basic_tutorials/gated_model_access.md +++ b/docs/source/basic_tutorials/gated_model_access.md @@ -19,6 +19,6 @@ docker run --gpus all \ --shm-size 1g \ -e HF_TOKEN=$token \ -p 8080:80 \ - -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4 \ + -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 \ --model-id $model ``` diff --git a/docs/source/conceptual/quantization.md b/docs/source/conceptual/quantization.md index d15e0089..ad6483e2 100644 --- a/docs/source/conceptual/quantization.md +++ b/docs/source/conceptual/quantization.md @@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below ๐Ÿ‘‡ ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model --quantize bitsandbytes +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model --quantize bitsandbytes ``` 4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load. 
@@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model --quantize bitsandbytes-nf4 +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model --quantize bitsandbytes-nf4 ``` You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes). @@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$ TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model --quantize gptq +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model --quantize gptq ``` Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI. diff --git a/docs/source/installation_amd.md b/docs/source/installation_amd.md index 423a9956..df4abb3b 100644 --- a/docs/source/installation_amd.md +++ b/docs/source/installation_amd.md @@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --device=/dev/kfd --device=/dev/dri --group-add video \ --ipc=host --shm-size 256g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4-rocm \ + ghcr.io/huggingface/text-generation-inference:3.3.5-rocm \ --model-id $model ``` diff --git a/docs/source/installation_intel.md b/docs/source/installation_intel.md index 0b03e3c3..60b0bcc0 100644 --- a/docs/source/installation_intel.md +++ b/docs/source/installation_intel.md @@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4-intel-xpu \ + ghcr.io/huggingface/text-generation-inference:3.3.5-intel-xpu \ --model-id $model --cuda-graphs 0 ``` @@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4-intel-cpu \ + ghcr.io/huggingface/text-generation-inference:3.3.5-intel-cpu \ --model-id $model --cuda-graphs 0 ``` diff --git a/docs/source/installation_nvidia.md b/docs/source/installation_nvidia.md index 507e9c70..37cb841c 100644 --- a/docs/source/installation_nvidia.md +++ b/docs/source/installation_nvidia.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4 \ + ghcr.io/huggingface/text-generation-inference:3.3.5 \ --model-id $model ``` diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index e66e5808..bd8495c5 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4 \ + ghcr.io/huggingface/text-generation-inference:3.3.5 \ --model-id $model ``` @@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \ To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more. ```bash -docker run ghcr.io/huggingface/text-generation-inference:3.3.4 --help +docker run ghcr.io/huggingface/text-generation-inference:3.3.5 --help ``` diff --git a/docs/source/reference/api_reference.md b/docs/source/reference/api_reference.md index b900887e..7d21eca7 100644 --- a/docs/source/reference/api_reference.md +++ b/docs/source/reference/api_reference.md @@ -163,7 +163,7 @@ hub = { # create Hugging Face Model Class huggingface_model = HuggingFaceModel( - image_uri=get_huggingface_llm_image_uri("huggingface",version="3.3.4"), + image_uri=get_huggingface_llm_image_uri("huggingface",version="3.3.5"), env=hub, role=role, ) diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json index 436ec29d..b9803da8 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json @@ -17,7 +17,7 @@ "id": "", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 42, "prompt_tokens": 277, diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json index 68783c27..a91f01f7 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json @@ -17,7 +17,7 @@ "id": "", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 62, "prompt_tokens": 277, diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json index 76a34128..d8104c9a 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json @@ -17,7 +17,7 @@ "id": 
"", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 67, "prompt_tokens": 277, diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json index be774054..0a712cc7 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json @@ -17,7 +17,7 @@ "id": "", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 72, "prompt_tokens": 275, diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json index cd79c363..6d4ee727 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json @@ -17,7 +17,7 @@ "id": "", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 80, "prompt_tokens": 279, diff --git a/integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_basic.json b/integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_basic.json index 05129fe0..3310bdcd 100644 --- a/integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_basic.json +++ b/integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_basic.json @@ -14,7 +14,7 @@ "id": "", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 35, "prompt_tokens": 32, diff --git a/integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_complex.json b/integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_complex.json index 6c548214..e627b2be 100644 --- a/integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_complex.json +++ b/integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_complex.json @@ -14,7 +14,7 @@ "id": "", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 44, "prompt_tokens": 37, diff --git a/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json b/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json index 7b992b4f..58f5ada8 100644 --- a/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json +++ b/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json @@ -18,7 +18,7 @@ "id": "", "model": "unsloth/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 10, "prompt_tokens": 45, @@ -44,7 +44,7 @@ "id": "", "model": 
"unsloth/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 10, "prompt_tokens": 45, diff --git a/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json b/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json index ccf53120..6830b36b 100644 --- a/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json +++ b/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json @@ -17,7 +17,7 @@ "id": "", "model": "unsloth/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 10, "prompt_tokens": 45, diff --git a/integration-tests/models/test_grammar_response_format_llama.py b/integration-tests/models/test_grammar_response_format_llama.py index 8a905e64..424dcaaf 100644 --- a/integration-tests/models/test_grammar_response_format_llama.py +++ b/integration-tests/models/test_grammar_response_format_llama.py @@ -43,7 +43,10 @@ async def test_grammar_response_format_llama_json(llama_grammar, response_snapsh ], "seed": 42, "max_tokens": 500, - "response_format": {"type": "json_object", "value": Weather.model_json_schema()}, + "response_format": { + "type": "json_object", + "value": Weather.model_json_schema(), + }, } # send the request response = requests.post( @@ -75,7 +78,11 @@ async def test_grammar_response_format_llama_json(llama_grammar, response_snapsh json_payload["response_format"] = { "type": "json_schema", - "value": {"name": "weather", "strict": True, "schema": Weather.model_json_schema()}, + "value": { + "name": "weather", + "strict": True, + "schema": Weather.model_json_schema(), + }, } response = requests.post( f"{llama_grammar.base_url}/v1/chat/completions", @@ -119,7 +126,10 @@ async def test_grammar_response_format_llama_error_if_tools_not_installed( "seed": 42, "max_tokens": 500, "tools": [], - "response_format": {"type": "json_object", "value": Weather.model_json_schema()}, + "response_format": { + "type": "json_object", + "value": Weather.model_json_schema(), + }, }, ) diff --git a/router/src/server.rs b/router/src/server.rs index 5fbe0403..97a0cea2 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -113,13 +113,13 @@ content( ("text/event-stream" = StreamResponse), )), (status = 424, description = "Generation Error", body = ErrorResponse, -example = json ! ({"error": "Request failed during generation"})), +example = json ! ({"error": "Request failed during generation", "error_type": "generation"})), (status = 429, description = "Model is overloaded", body = ErrorResponse, -example = json ! ({"error": "Model is overloaded"})), +example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), (status = 422, description = "Input validation error", body = ErrorResponse, -example = json ! ({"error": "Input validation error"})), +example = json ! ({"error": "Input validation error", "error_type": "validation"})), (status = 500, description = "Incomplete generation", body = ErrorResponse, -example = json ! ({"error": "Incomplete generation"})), +example = json ! 
({"error": "Incomplete generation", "error_type": "incomplete_generation"})), ) )] #[instrument(skip(infer, req))] @@ -249,13 +249,13 @@ request_body = GenerateRequest, responses( (status = 200, description = "Generated Text", body = GenerateResponse), (status = 424, description = "Generation Error", body = ErrorResponse, -example = json ! ({"error": "Request failed during generation"})), +example = json ! ({"error": "Request failed during generation", "error_type": "generation"})), (status = 429, description = "Model is overloaded", body = ErrorResponse, -example = json ! ({"error": "Model is overloaded"})), +example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), (status = 422, description = "Input validation error", body = ErrorResponse, -example = json ! ({"error": "Input validation error"})), +example = json ! ({"error": "Input validation error", "error_type": "validation"})), (status = 500, description = "Incomplete generation", body = ErrorResponse, -example = json ! ({"error": "Incomplete generation"})), +example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})), ) )] #[instrument( @@ -448,16 +448,16 @@ responses( (status = 200, description = "Generated Text", body = StreamResponse, content_type = "text/event-stream"), (status = 424, description = "Generation Error", body = ErrorResponse, -example = json ! ({"error": "Request failed during generation"}), +example = json ! ({"error": "Request failed during generation", "error_type": "generation"}), content_type = "text/event-stream"), (status = 429, description = "Model is overloaded", body = ErrorResponse, -example = json ! ({"error": "Model is overloaded"}), +example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"}), content_type = "text/event-stream"), (status = 422, description = "Input validation error", body = ErrorResponse, -example = json ! ({"error": "Input validation error"}), +example = json ! ({"error": "Input validation error", "error_type": "validation"}), content_type = "text/event-stream"), (status = 500, description = "Incomplete generation", body = ErrorResponse, -example = json ! ({"error": "Incomplete generation"}), +example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"}), content_type = "text/event-stream"), ) )] @@ -691,13 +691,13 @@ content( ("text/event-stream" = Chunk), )), (status = 424, description = "Generation Error", body = ErrorResponse, -example = json ! ({"error": "Request failed during generation"})), +example = json ! ({"error": "Request failed during generation", "error_type": "generation"})), (status = 429, description = "Model is overloaded", body = ErrorResponse, -example = json ! ({"error": "Model is overloaded"})), +example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), (status = 422, description = "Input validation error", body = ErrorResponse, -example = json ! ({"error": "Input validation error"})), +example = json ! ({"error": "Input validation error", "error_type": "validation"})), (status = 500, description = "Incomplete generation", body = ErrorResponse, -example = json ! ({"error": "Incomplete generation"})), +example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})), ) )] #[instrument( @@ -1144,13 +1144,13 @@ content( ("text/event-stream" = ChatCompletionChunk), )), (status = 424, description = "Generation Error", body = ErrorResponse, -example = json ! 
({"error": "Request failed during generation"})), +example = json ! ({"error": "Request failed during generation", "error_type": "generation"})), (status = 429, description = "Model is overloaded", body = ErrorResponse, -example = json ! ({"error": "Model is overloaded"})), +example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), (status = 422, description = "Input validation error", body = ErrorResponse, -example = json ! ({"error": "Input validation error"})), +example = json ! ({"error": "Input validation error", "error_type": "validation"})), (status = 500, description = "Incomplete generation", body = ErrorResponse, -example = json ! ({"error": "Incomplete generation"})), +example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})), ) )] #[instrument( diff --git a/server/text_generation_server/models/custom_modeling/idefics2.py b/server/text_generation_server/models/custom_modeling/idefics2.py index 5c0d2fcc..c891f4c8 100644 --- a/server/text_generation_server/models/custom_modeling/idefics2.py +++ b/server/text_generation_server/models/custom_modeling/idefics2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Idefics2 model.""" +"""PyTorch Idefics2 model.""" from typing import List, Optional, Tuple diff --git a/server/text_generation_server/models/custom_modeling/idefics3.py b/server/text_generation_server/models/custom_modeling/idefics3.py index 6d303c2c..216b1eac 100644 --- a/server/text_generation_server/models/custom_modeling/idefics3.py +++ b/server/text_generation_server/models/custom_modeling/idefics3.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Idefics3 model.""" +"""PyTorch Idefics3 model.""" from typing import List, Optional, Tuple diff --git a/server/text_generation_server/models/custom_modeling/idefics_config.py b/server/text_generation_server/models/custom_modeling/idefics_config.py index a5565819..6ce2054e 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_config.py +++ b/server/text_generation_server/models/custom_modeling/idefics_config.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Idefics model configuration""" +"""Idefics model configuration""" import copy from transformers import PretrainedConfig diff --git a/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/server/text_generation_server/models/custom_modeling/idefics_modeling.py index 9fc9bca6..3104f742 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_modeling.py +++ b/server/text_generation_server/models/custom_modeling/idefics_modeling.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch Idefics model.""" +"""PyTorch Idefics model.""" from typing import List, Optional, Tuple, Union import torch diff --git a/server/text_generation_server/models/custom_modeling/idefics_vision.py b/server/text_generation_server/models/custom_modeling/idefics_vision.py index dd8f76bc..7d2051e0 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_vision.py +++ b/server/text_generation_server/models/custom_modeling/idefics_vision.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object""" +"""PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object""" from dataclasses import dataclass diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py index 56a9565b..decee125 100644 --- a/server/text_generation_server/models/custom_modeling/llava_next.py +++ b/server/text_generation_server/models/custom_modeling/llava_next.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Llava-NeXT model.""" +"""PyTorch Llava-NeXT model.""" from typing import List, Optional, Tuple diff --git a/server/text_generation_server/models/custom_modeling/neox_modeling.py b/server/text_generation_server/models/custom_modeling/neox_modeling.py index 06731a6f..8554b632 100644 --- a/server/text_generation_server/models/custom_modeling/neox_modeling.py +++ b/server/text_generation_server/models/custom_modeling/neox_modeling.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch GPTNeoX model.""" +"""PyTorch GPTNeoX model.""" from typing import Optional, Tuple, Union diff --git a/server/text_generation_server/models/custom_modeling/t5_modeling.py b/server/text_generation_server/models/custom_modeling/t5_modeling.py index e6666acd..0dce0f9e 100644 --- a/server/text_generation_server/models/custom_modeling/t5_modeling.py +++ b/server/text_generation_server/models/custom_modeling/t5_modeling.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch T5 model.""" +"""PyTorch T5 model.""" import copy import math diff --git a/server/text_generation_server/utils/segments.py b/server/text_generation_server/utils/segments.py index fd8be563..3d880fec 100644 --- a/server/text_generation_server/utils/segments.py +++ b/server/text_generation_server/utils/segments.py @@ -9,7 +9,7 @@ import numpy as np def find_segments( - adapter_indices: Union[torch.Tensor, List[int]] + adapter_indices: Union[torch.Tensor, List[int]], ) -> Tuple[List[int], List[int]]: if isinstance(adapter_indices, torch.Tensor): adapter_indices = adapter_indices.cpu().numpy() diff --git a/server/uv.lock b/server/uv.lock index 7e6f194a..b7864685 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -720,17 +720,17 @@ wheels = [ [[package]] name = "hf-xet" -version = "1.0.0" +version = "1.1.9" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/64/46/db229dddc55121478105940b610fef1b466c414da02be9d4daa5602a2527/hf_xet-1.0.0.tar.gz", hash = "sha256:5e0ca891ce599fd753e7ffbdc182207d952a93e6252eeb92118475d6866bb093", size = 257192 } +sdist = { url = "https://files.pythonhosted.org/packages/23/0f/5b60fc28ee7f8cc17a5114a584fd6b86e11c3e0a6e142a7f97a161e9640a/hf_xet-1.1.9.tar.gz", hash = "sha256:c99073ce404462e909f1d5839b2d14a3827b8fe75ed8aed551ba6609c026c803", size = 484242 } wheels = [ - { url = "https://files.pythonhosted.org/packages/e7/0a/c16f8766fa3cd520292b1a765e9b50b8390bce4c2ed7657db9534551f5ed/hf_xet-1.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:6106304f92bbce7c9b8509f6f735f2e8ce95e4dc32af8876e874c48b15ca1903", size = 5001841 }, - { url = "https://files.pythonhosted.org/packages/e3/9f/cca55edd85d03fc98c743bcc093965740a7440e909779c558039d6838f03/hf_xet-1.0.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:4d0bc7a3e6c1d21fcbb48e8726e3b19a2460e95971375e55e9a5f73ec7079a86", size = 4805318 }, - { url = "https://files.pythonhosted.org/packages/d1/0b/28bda7ac9d699dcfb96f628aa135ddca3f0f77e9716351aab2b83966f957/hf_xet-1.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23dee64f114ea9a272ff71a6a755e025d7a075a6a0dbf6da0990fe9211831ccf", size = 53504907 }, - { url = "https://files.pythonhosted.org/packages/cb/04/ef1f7249a813841d193cbab2ef4d1d7d67c66c61d21d45223a72fdc5c88e/hf_xet-1.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d5f160550508ed87783d1eca4288602a713d1c45ec517d937acb9d93120f0cab", size = 52410434 }, - { url = "https://files.pythonhosted.org/packages/81/b3/e7abec2619ecd9d1c743adfe79fa69cf84530f530969daf3dc804efef65b/hf_xet-1.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:5ebd79db87df0b9d3607e7c9a6bb0662c10e36992f522f66b1d2a7fe93f53f27", size = 53465113 }, - { url = "https://files.pythonhosted.org/packages/df/82/b51f3b6e5c6f33e91220c37b17760229704c58e79ab0fcfd0fd3b55803d3/hf_xet-1.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:8e6d2625971b4affad634835db82d5392f38de874205a9573e0dd3f0f9cb136f", size = 53461632 }, - { url = "https://files.pythonhosted.org/packages/95/d2/32defba26d995f7acdc4fe3e5911473b25aff5b75c5a2532786435a709e8/hf_xet-1.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:b446964bd75eb7f6b4d983c47241b2023eadfad1f56137ed00e1ca6fc278faec", size = 4121808 }, + { url = "https://files.pythonhosted.org/packages/de/12/56e1abb9a44cdef59a411fe8a8673313195711b5ecce27880eb9c8fa90bd/hf_xet-1.1.9-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:a3b6215f88638dd7a6ff82cb4e738dcbf3d863bf667997c093a3c990337d1160", size = 2762553 }, + { url = 
"https://files.pythonhosted.org/packages/3a/e6/2d0d16890c5f21b862f5df3146519c182e7f0ae49b4b4bf2bd8a40d0b05e/hf_xet-1.1.9-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:9b486de7a64a66f9a172f4b3e0dfe79c9f0a93257c501296a2521a13495a698a", size = 2623216 }, + { url = "https://files.pythonhosted.org/packages/81/42/7e6955cf0621e87491a1fb8cad755d5c2517803cea174229b0ec00ff0166/hf_xet-1.1.9-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4c5a840c2c4e6ec875ed13703a60e3523bc7f48031dfd750923b2a4d1a5fc3c", size = 3186789 }, + { url = "https://files.pythonhosted.org/packages/df/8b/759233bce05457f5f7ec062d63bbfd2d0c740b816279eaaa54be92aa452a/hf_xet-1.1.9-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:96a6139c9e44dad1c52c52520db0fffe948f6bce487cfb9d69c125f254bb3790", size = 3088747 }, + { url = "https://files.pythonhosted.org/packages/6c/3c/28cc4db153a7601a996985bcb564f7b8f5b9e1a706c7537aad4b4809f358/hf_xet-1.1.9-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ad1022e9a998e784c97b2173965d07fe33ee26e4594770b7785a8cc8f922cd95", size = 3251429 }, + { url = "https://files.pythonhosted.org/packages/84/17/7caf27a1d101bfcb05be85850d4aa0a265b2e1acc2d4d52a48026ef1d299/hf_xet-1.1.9-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:86754c2d6d5afb11b0a435e6e18911a4199262fe77553f8c50d75e21242193ea", size = 3354643 }, + { url = "https://files.pythonhosted.org/packages/cd/50/0c39c9eed3411deadcc98749a6699d871b822473f55fe472fad7c01ec588/hf_xet-1.1.9-cp37-abi3-win_amd64.whl", hash = "sha256:5aad3933de6b725d61d51034e04174ed1dce7a57c63d530df0014dea15a40127", size = 2804797 }, ] [[package]] @@ -2708,7 +2708,7 @@ requires-dist = [ { name = "opentelemetry-api", specifier = ">=1.27.0" }, { name = "opentelemetry-exporter-otlp", specifier = ">=1.27.0" }, { name = "opentelemetry-instrumentation-grpc", specifier = ">=0.50b0" }, - { name = "outlines", marker = "extra == 'outlines'", specifier = ">=0.1.13" }, + { name = "outlines", marker = "extra == 'outlines'", specifier = ">=0.1.13,<1.0" }, { name = "peft", marker = "extra == 'peft'", specifier = ">=0.14.0" }, { name = "pillow", specifier = ">=11.1.0" }, { name = "prometheus-client", specifier = ">=0.21.0" }, @@ -2872,22 +2872,22 @@ dependencies = [ { name = "typing-extensions", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:b1f0cdd0720ad60536deb5baa427b782fd920dd4fcf72e244d32974caafa3b9e" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ac1849553ee673dfafb44c610c60cb60a2890f0e117f43599a526cf777eb8b8c" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-win_amd64.whl", hash = "sha256:c52c4b869742f00b12cb34521d1381be6119fa46244791704b00cc4a3cb06850" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:47c895bcab508769d129d717a4b916b10225ae3855723aeec8dff8efe5346207" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:c4bbc0b4be60319ba1cefc90be9557b317f0b3c261eeceb96ca6e0343eec56bf" }, { url = 
"https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-win_amd64.whl", hash = "sha256:bf88f647d76d79da9556ca55df49e45aff1d66c12797886364343179dd09a36c" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6bba7dca5d9a729f1e8e9befb98055498e551efaf5ed034824c168b560afc1ac" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:7c0f08d1c44a02abad389373dddfce75904b969a410be2f4e5109483dd3dc0ce" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-win_amd64.whl", hash = "sha256:1704e5dd66c9221e4e8b6ae2d80cbf54e129571e643f5fa9ca78cc6d2096403a" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:633f35e8b1b1f640ef5f8a98dbd84f19b548222ce7ba8f017fe47ce6badc106a" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:d2f69f909da5dc52113ec66a851d62079f3d52c83184cf64beebdf12ca2f705c" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-win_amd64.whl", hash = "sha256:58c749f52ddc9098155c77d6c74153bb13d8978fd6e1063b5d7b41d4644f5af5" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fa05ac6ebed4777de7a5eff398c1f17b697c02422516748ce66a8151873e5a0e" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:78e13c26c38ae92d6841cf9ce760d7e9d52bca3e3183de371812e84274b054dc" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313t-win_amd64.whl", hash = "sha256:3559e98be824c2b12ab807319cd61c6174d73a524c9961317de8e8a44133c5c5" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp39-cp39-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:2f155388b1200e08f3e901bb3487ff93ca6d63cde87c29b97bb6762a8f63b373" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:f446f97b20cb070747b103fb640df941b88cb68c8d3b01538287d05d56a7e874" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp39-cp39-win_amd64.whl", hash = "sha256:8614a167d6a163273fb130f586802f3243479862b53ee2843941c10cc5761da6" }, ]