chore: prepare version 3.3.5 (#3314)

* chore: prepare version 3.3.5

* black

* neuron: black

* Update hf-xet in uv lockfile

* Attempt to fix API doc check failure

Add `error_type` where missing.
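
For reference, the documented error payloads now look like this (illustrative example, mirroring the examples added to the OpenAPI spec and router annotations below; the schema is `ErrorResponse`):

```json
{
  "error": "Input validation error",
  "error_type": "validation"
}
```

The other documented variants use `error_type` values `generation`, `overloaded` and `incomplete_generation`.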

* Pin redocly version

* Sync redocly with Nix for now

---------

Co-authored-by: Daniël de Kok <me@danieldk.eu>
Alvaro Moran 2025-09-02 15:35:42 +02:00 committed by GitHub
parent 06d9d88b95
commit 0f79162288
37 changed files with 143 additions and 112 deletions


@@ -41,5 +41,5 @@ jobs:
       - name: Check that documentation is up-to-date
         run: |
-          npm install -g @redocly/cli
+          npm install -g @redocly/cli@1.34.2
           python update_doc.py --check

Cargo.lock (generated)

@@ -4650,7 +4650,7 @@ dependencies = [
 [[package]]
 name = "text-generation-backends-trtllm"
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 dependencies = [
  "async-trait",
  "clap 4.5.32",
@@ -4671,7 +4671,7 @@ dependencies = [
 [[package]]
 name = "text-generation-benchmark"
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 dependencies = [
  "average",
  "clap 4.5.32",
@@ -4691,7 +4691,7 @@ dependencies = [
 [[package]]
 name = "text-generation-client"
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 dependencies = [
  "async-trait",
  "base64 0.22.1",
@@ -4709,7 +4709,7 @@ dependencies = [
 [[package]]
 name = "text-generation-launcher"
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 dependencies = [
  "clap 4.5.32",
  "ctrlc",
@@ -4730,7 +4730,7 @@ dependencies = [
 [[package]]
 name = "text-generation-router"
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 dependencies = [
  "anyhow",
  "async-stream",
@@ -4782,7 +4782,7 @@ dependencies = [
 [[package]]
 name = "text-generation-router-llamacpp"
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 dependencies = [
  "async-trait",
  "bindgen 0.71.1",
@@ -4800,7 +4800,7 @@ dependencies = [
 [[package]]
 name = "text-generation-router-v2"
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 dependencies = [
  "async-stream",
  "async-trait",
@@ -4849,7 +4849,7 @@ dependencies = [
 [[package]]
 name = "text-generation-router-v3"
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 dependencies = [
  "async-stream",
  "async-trait",


@@ -21,7 +21,7 @@ default-members = [
 resolver = "2"
 [workspace.package]
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"


@@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta
 volume=$PWD/data
 docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model
+    ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model
 ```
 And then you can make requests like
@@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \
 **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
-**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4-rocm --model-id $model` instead of the command above.
+**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5-rocm --model-id $model` instead of the command above.
 To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
 ```
@@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
 token=<your cli READ token>
 docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model
+    ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model
 ```
 ### A note on Shared Memory (shm)


@@ -19,7 +19,7 @@ docker run -p 8080:80 \
    --ipc=host \
    -v $volume:/data \
    -e HF_TOKEN=$hf_token \
-   ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
+   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model \
    --max-input-tokens 1024 --max-total-tokens 2048 \
    --max-batch-prefill-tokens 2048 --max-batch-size 32 \
@@ -39,7 +39,7 @@ docker run -p 8080:80 \
    --ipc=host \
    -v $volume:/data \
    -e HF_TOKEN=$hf_token \
-   ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
+   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model \
    --sharded true --num-shard 8 \
    --max-input-tokens 1024 --max-total-tokens 2048 \
@@ -58,7 +58,7 @@ docker run -p 8080:80 \
    --cap-add=sys_nice \
    --ipc=host \
    -v $volume:/data \
-   ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
+   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model \
    --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
    --max-total-tokens 8192 --max-batch-size 4
@@ -81,7 +81,7 @@ docker run -p 8080:80 \
    --ipc=host \
    -v $volume:/data \
    -e HF_TOKEN=$hf_token \
-   ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
+   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model \
    --kv-cache-dtype fp8_e4m3fn \
    --max-input-tokens 1024 --max-total-tokens 2048 \
@@ -102,7 +102,7 @@ docker run -p 8080:80 \
    --ipc=host \
    -v $volume:/data \
    -e HF_TOKEN=$hf_token \
-   ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
+   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model \
    --kv-cache-dtype fp8_e4m3fn \
    --sharded true --num-shard 8 \


@@ -56,6 +56,7 @@ def _test_prefill(config_name, generator, batch_size, do_sample):
     assert tokens.ids[0] == expectations[0]
     assert tokens.texts[0] == expectations[1]
+
 def test_prefill_truncate(neuron_model_config):
     config_name = neuron_model_config["name"]
     neuron_model_path = neuron_model_config["neuron_model_path"]


@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "3.3.4-dev0"
+    "version": "3.3.5-dev0"
   },
   "paths": {
     "/": {
@@ -57,7 +57,8 @@
             "$ref": "#/components/schemas/ErrorResponse"
           },
           "example": {
-            "error": "Input validation error"
+            "error": "Input validation error",
+            "error_type": "validation"
           }
         }
       }
@@ -70,7 +71,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Request failed during generation"
+           "error": "Request failed during generation",
+           "error_type": "generation"
          }
        }
      }
@@ -83,7 +85,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Model is overloaded"
+           "error": "Model is overloaded",
+           "error_type": "overloaded"
          }
        }
      }
@@ -96,7 +99,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Incomplete generation"
+           "error": "Incomplete generation",
+           "error_type": "incomplete_generation"
          }
        }
      }
@@ -181,7 +185,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Input validation error"
+           "error": "Input validation error",
+           "error_type": "validation"
          }
        }
      }
@@ -194,7 +199,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Request failed during generation"
+           "error": "Request failed during generation",
+           "error_type": "generation"
          }
        }
      }
@@ -207,7 +213,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Model is overloaded"
+           "error": "Model is overloaded",
+           "error_type": "overloaded"
          }
        }
      }
@@ -220,7 +227,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Incomplete generation"
+           "error": "Incomplete generation",
+           "error_type": "incomplete_generation"
          }
        }
      }
@@ -264,7 +272,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Input validation error"
+           "error": "Input validation error",
+           "error_type": "validation"
          }
        }
      }
@@ -277,7 +286,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Request failed during generation"
+           "error": "Request failed during generation",
+           "error_type": "generation"
          }
        }
      }
@@ -290,7 +300,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Model is overloaded"
+           "error": "Model is overloaded",
+           "error_type": "overloaded"
          }
        }
      }
@@ -303,7 +314,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Incomplete generation"
+           "error": "Incomplete generation",
+           "error_type": "incomplete_generation"
          }
        }
      }
@@ -558,7 +570,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Input validation error"
+           "error": "Input validation error",
+           "error_type": "validation"
          }
        }
      }
@@ -571,7 +584,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Request failed during generation"
+           "error": "Request failed during generation",
+           "error_type": "generation"
          }
        }
      }
@@ -584,7 +598,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Model is overloaded"
+           "error": "Model is overloaded",
+           "error_type": "overloaded"
          }
        }
      }
@@ -597,7 +612,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Incomplete generation"
+           "error": "Incomplete generation",
+           "error_type": "incomplete_generation"
          }
        }
      }
@@ -646,7 +662,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Input validation error"
+           "error": "Input validation error",
+           "error_type": "validation"
          }
        }
      }
@@ -659,7 +676,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Request failed during generation"
+           "error": "Request failed during generation",
+           "error_type": "generation"
          }
        }
      }
@@ -672,7 +690,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Model is overloaded"
+           "error": "Model is overloaded",
+           "error_type": "overloaded"
          }
        }
      }
@@ -685,7 +704,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Incomplete generation"
+           "error": "Incomplete generation",
+           "error_type": "incomplete_generation"
          }
        }
      }


@@ -20,7 +20,7 @@ hf_token=YOUR_HF_ACCESS_TOKEN
 docker run --runtime=habana --cap-add=sys_nice --ipc=host \
    -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
-   ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
+   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model
 ```
@@ -52,7 +52,7 @@ hf_token=YOUR_ACCESS_TOKEN
 docker run --runtime=habana --cap-add=sys_nice --ipc=host \
    -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
-   ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
+   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model
    <text-generation-inference-launcher-arguments>
 ```
@@ -106,7 +106,7 @@ docker run -p 8080:80 \
    --cap-add=sys_nice \
    --ipc=host \
    -v $volume:/data \
-   ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
+   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model \
    --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
    --max-total-tokens 8192 --max-batch-size 4


@@ -31,7 +31,7 @@ deployment instructions in the model card:
 The service is launched simply by running the text-generation-inference container with two sets of parameters:
 ```
-docker run <system_parameters> ghcr.io/huggingface/text-generation-inference:3.3.4-neuron <service_parameters>
+docker run <system_parameters> ghcr.io/huggingface/text-generation-inference:3.3.5-neuron <service_parameters>
 ```
 - system parameters are used to map ports, volumes and devices between the host and the service,


@@ -19,6 +19,6 @@ docker run --gpus all \
     --shm-size 1g \
     -e HF_TOKEN=$token \
     -p 8080:80 \
-    -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4 \
+    -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 \
     --model-id $model
 ```


@@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models.
 In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇
 ```bash
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model --quantize bitsandbytes
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model --quantize bitsandbytes
 ```
 4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
@@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
 In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇
 ```bash
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model --quantize bitsandbytes-nf4
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model --quantize bitsandbytes-nf4
 ```
 You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
@@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$
 TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇
 ```bash
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model --quantize gptq
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model --quantize gptq
 ```
 Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.


@@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
 docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
     --device=/dev/kfd --device=/dev/dri --group-add video \
     --ipc=host --shm-size 256g --net host -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.3.4-rocm \
+    ghcr.io/huggingface/text-generation-inference:3.3.5-rocm \
     --model-id $model
 ```


@@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
 docker run --rm --privileged --cap-add=sys_nice \
     --device=/dev/dri \
     --ipc=host --shm-size 1g --net host -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.3.4-intel-xpu \
+    ghcr.io/huggingface/text-generation-inference:3.3.5-intel-xpu \
     --model-id $model --cuda-graphs 0
 ```
@@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
 docker run --rm --privileged --cap-add=sys_nice \
     --device=/dev/dri \
     --ipc=host --shm-size 1g --net host -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.3.4-intel-cpu \
+    ghcr.io/huggingface/text-generation-inference:3.3.5-intel-cpu \
     --model-id $model --cuda-graphs 0
 ```


@@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.3.4 \
+    ghcr.io/huggingface/text-generation-inference:3.3.5 \
     --model-id $model
 ```


@@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.3.4 \
+    ghcr.io/huggingface/text-generation-inference:3.3.5 \
     --model-id $model
 ```
@@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \
 To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.
 ```bash
-docker run ghcr.io/huggingface/text-generation-inference:3.3.4 --help
+docker run ghcr.io/huggingface/text-generation-inference:3.3.5 --help
 ```
 </Tip>


@@ -163,7 +163,7 @@ hub = {
 # create Hugging Face Model Class
 huggingface_model = HuggingFaceModel(
-    image_uri=get_huggingface_llm_image_uri("huggingface",version="3.3.4"),
+    image_uri=get_huggingface_llm_image_uri("huggingface",version="3.3.5"),
     env=hub,
     role=role,
 )


@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 42,
     "prompt_tokens": 277,


@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 62,
     "prompt_tokens": 277,


@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 67,
     "prompt_tokens": 277,


@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 72,
     "prompt_tokens": 275,


@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 80,
     "prompt_tokens": 279,


@@ -14,7 +14,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 35,
     "prompt_tokens": 32,


@@ -14,7 +14,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 44,
     "prompt_tokens": 37,


@@ -18,7 +18,7 @@
   "id": "",
   "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 10,
     "prompt_tokens": 45,
@@ -44,7 +44,7 @@
   "id": "",
   "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 10,
     "prompt_tokens": 45,


@@ -17,7 +17,7 @@
   "id": "",
   "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 10,
     "prompt_tokens": 45,


@@ -43,7 +43,10 @@ async def test_grammar_response_format_llama_json(llama_grammar, response_snapsh
         ],
         "seed": 42,
         "max_tokens": 500,
-        "response_format": {"type": "json_object", "value": Weather.model_json_schema()},
+        "response_format": {
+            "type": "json_object",
+            "value": Weather.model_json_schema(),
+        },
     }
     # send the request
     response = requests.post(
@@ -75,7 +78,11 @@ async def test_grammar_response_format_llama_json(llama_grammar, response_snapsh
     json_payload["response_format"] = {
         "type": "json_schema",
-        "value": {"name": "weather", "strict": True, "schema": Weather.model_json_schema()},
+        "value": {
+            "name": "weather",
+            "strict": True,
+            "schema": Weather.model_json_schema(),
+        },
     }
     response = requests.post(
         f"{llama_grammar.base_url}/v1/chat/completions",
@@ -119,7 +126,10 @@ async def test_grammar_response_format_llama_error_if_tools_not_installed(
             "seed": 42,
             "max_tokens": 500,
             "tools": [],
-            "response_format": {"type": "json_object", "value": Weather.model_json_schema()},
+            "response_format": {
+                "type": "json_object",
+                "value": Weather.model_json_schema(),
+            },
         },
     )


@@ -113,13 +113,13 @@ content(
 ("text/event-stream" = StreamResponse),
 )),
 (status = 424, description = "Generation Error", body = ErrorResponse,
-example = json ! ({"error": "Request failed during generation"})),
+example = json ! ({"error": "Request failed during generation", "error_type": "generation"})),
 (status = 429, description = "Model is overloaded", body = ErrorResponse,
-example = json ! ({"error": "Model is overloaded"})),
+example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})),
 (status = 422, description = "Input validation error", body = ErrorResponse,
-example = json ! ({"error": "Input validation error"})),
+example = json ! ({"error": "Input validation error", "error_type": "validation"})),
 (status = 500, description = "Incomplete generation", body = ErrorResponse,
-example = json ! ({"error": "Incomplete generation"})),
+example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})),
 )
 )]
 #[instrument(skip(infer, req))]
@@ -249,13 +249,13 @@ request_body = GenerateRequest,
 responses(
 (status = 200, description = "Generated Text", body = GenerateResponse),
 (status = 424, description = "Generation Error", body = ErrorResponse,
-example = json ! ({"error": "Request failed during generation"})),
+example = json ! ({"error": "Request failed during generation", "error_type": "generation"})),
 (status = 429, description = "Model is overloaded", body = ErrorResponse,
-example = json ! ({"error": "Model is overloaded"})),
+example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})),
 (status = 422, description = "Input validation error", body = ErrorResponse,
-example = json ! ({"error": "Input validation error"})),
+example = json ! ({"error": "Input validation error", "error_type": "validation"})),
 (status = 500, description = "Incomplete generation", body = ErrorResponse,
-example = json ! ({"error": "Incomplete generation"})),
+example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})),
 )
 )]
 #[instrument(
@@ -448,16 +448,16 @@ responses(
 (status = 200, description = "Generated Text", body = StreamResponse,
 content_type = "text/event-stream"),
 (status = 424, description = "Generation Error", body = ErrorResponse,
-example = json ! ({"error": "Request failed during generation"}),
+example = json ! ({"error": "Request failed during generation", "error_type": "generation"}),
 content_type = "text/event-stream"),
 (status = 429, description = "Model is overloaded", body = ErrorResponse,
-example = json ! ({"error": "Model is overloaded"}),
+example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"}),
 content_type = "text/event-stream"),
 (status = 422, description = "Input validation error", body = ErrorResponse,
-example = json ! ({"error": "Input validation error"}),
+example = json ! ({"error": "Input validation error", "error_type": "validation"}),
 content_type = "text/event-stream"),
 (status = 500, description = "Incomplete generation", body = ErrorResponse,
-example = json ! ({"error": "Incomplete generation"}),
+example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"}),
 content_type = "text/event-stream"),
 )
 )]
@@ -691,13 +691,13 @@ content(
 ("text/event-stream" = Chunk),
 )),
 (status = 424, description = "Generation Error", body = ErrorResponse,
-example = json ! ({"error": "Request failed during generation"})),
+example = json ! ({"error": "Request failed during generation", "error_type": "generation"})),
 (status = 429, description = "Model is overloaded", body = ErrorResponse,
-example = json ! ({"error": "Model is overloaded"})),
+example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})),
 (status = 422, description = "Input validation error", body = ErrorResponse,
-example = json ! ({"error": "Input validation error"})),
+example = json ! ({"error": "Input validation error", "error_type": "validation"})),
 (status = 500, description = "Incomplete generation", body = ErrorResponse,
-example = json ! ({"error": "Incomplete generation"})),
+example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})),
 )
 )]
 #[instrument(
@@ -1144,13 +1144,13 @@ content(
 ("text/event-stream" = ChatCompletionChunk),
 )),
 (status = 424, description = "Generation Error", body = ErrorResponse,
-example = json ! ({"error": "Request failed during generation"})),
+example = json ! ({"error": "Request failed during generation", "error_type": "generation"})),
 (status = 429, description = "Model is overloaded", body = ErrorResponse,
-example = json ! ({"error": "Model is overloaded"})),
+example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})),
 (status = 422, description = "Input validation error", body = ErrorResponse,
-example = json ! ({"error": "Input validation error"})),
+example = json ! ({"error": "Input validation error", "error_type": "validation"})),
 (status = 500, description = "Incomplete generation", body = ErrorResponse,
-example = json ! ({"error": "Incomplete generation"})),
+example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})),
 )
 )]
 #[instrument(


@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Idefics2 model."""
+"""PyTorch Idefics2 model."""
 from typing import List, Optional, Tuple


@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Idefics3 model."""
+"""PyTorch Idefics3 model."""
 from typing import List, Optional, Tuple


@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Idefics model configuration"""
+"""Idefics model configuration"""
 import copy
 from transformers import PretrainedConfig


@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Idefics model."""
+"""PyTorch Idefics model."""
 from typing import List, Optional, Tuple, Union
 import torch


@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""
+"""PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""
 from dataclasses import dataclass


@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Llava-NeXT model."""
+"""PyTorch Llava-NeXT model."""
 from typing import List, Optional, Tuple


@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch GPTNeoX model."""
+"""PyTorch GPTNeoX model."""
 from typing import Optional, Tuple, Union


@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch T5 model."""
+"""PyTorch T5 model."""
 import copy
 import math


@@ -9,7 +9,7 @@ import numpy as np
 def find_segments(
-    adapter_indices: Union[torch.Tensor, List[int]]
+    adapter_indices: Union[torch.Tensor, List[int]],
 ) -> Tuple[List[int], List[int]]:
     if isinstance(adapter_indices, torch.Tensor):
         adapter_indices = adapter_indices.cpu().numpy()


@@ -720,17 +720,17 @@ wheels = [
 [[package]]
 name = "hf-xet"
-version = "1.0.0"
+version = "1.1.9"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/64/46/db229dddc55121478105940b610fef1b466c414da02be9d4daa5602a2527/hf_xet-1.0.0.tar.gz", hash = "sha256:5e0ca891ce599fd753e7ffbdc182207d952a93e6252eeb92118475d6866bb093", size = 257192 }
+sdist = { url = "https://files.pythonhosted.org/packages/23/0f/5b60fc28ee7f8cc17a5114a584fd6b86e11c3e0a6e142a7f97a161e9640a/hf_xet-1.1.9.tar.gz", hash = "sha256:c99073ce404462e909f1d5839b2d14a3827b8fe75ed8aed551ba6609c026c803", size = 484242 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/e7/0a/c16f8766fa3cd520292b1a765e9b50b8390bce4c2ed7657db9534551f5ed/hf_xet-1.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:6106304f92bbce7c9b8509f6f735f2e8ce95e4dc32af8876e874c48b15ca1903", size = 5001841 },
+    { url = "https://files.pythonhosted.org/packages/de/12/56e1abb9a44cdef59a411fe8a8673313195711b5ecce27880eb9c8fa90bd/hf_xet-1.1.9-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:a3b6215f88638dd7a6ff82cb4e738dcbf3d863bf667997c093a3c990337d1160", size = 2762553 },
-    { url = "https://files.pythonhosted.org/packages/e3/9f/cca55edd85d03fc98c743bcc093965740a7440e909779c558039d6838f03/hf_xet-1.0.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:4d0bc7a3e6c1d21fcbb48e8726e3b19a2460e95971375e55e9a5f73ec7079a86", size = 4805318 },
+    { url = "https://files.pythonhosted.org/packages/3a/e6/2d0d16890c5f21b862f5df3146519c182e7f0ae49b4b4bf2bd8a40d0b05e/hf_xet-1.1.9-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:9b486de7a64a66f9a172f4b3e0dfe79c9f0a93257c501296a2521a13495a698a", size = 2623216 },
-    { url = "https://files.pythonhosted.org/packages/d1/0b/28bda7ac9d699dcfb96f628aa135ddca3f0f77e9716351aab2b83966f957/hf_xet-1.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23dee64f114ea9a272ff71a6a755e025d7a075a6a0dbf6da0990fe9211831ccf", size = 53504907 },
+    { url = "https://files.pythonhosted.org/packages/81/42/7e6955cf0621e87491a1fb8cad755d5c2517803cea174229b0ec00ff0166/hf_xet-1.1.9-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4c5a840c2c4e6ec875ed13703a60e3523bc7f48031dfd750923b2a4d1a5fc3c", size = 3186789 },
-    { url = "https://files.pythonhosted.org/packages/cb/04/ef1f7249a813841d193cbab2ef4d1d7d67c66c61d21d45223a72fdc5c88e/hf_xet-1.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d5f160550508ed87783d1eca4288602a713d1c45ec517d937acb9d93120f0cab", size = 52410434 },
+    { url = "https://files.pythonhosted.org/packages/df/8b/759233bce05457f5f7ec062d63bbfd2d0c740b816279eaaa54be92aa452a/hf_xet-1.1.9-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:96a6139c9e44dad1c52c52520db0fffe948f6bce487cfb9d69c125f254bb3790", size = 3088747 },
-    { url = "https://files.pythonhosted.org/packages/81/b3/e7abec2619ecd9d1c743adfe79fa69cf84530f530969daf3dc804efef65b/hf_xet-1.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:5ebd79db87df0b9d3607e7c9a6bb0662c10e36992f522f66b1d2a7fe93f53f27", size = 53465113 },
+    { url = "https://files.pythonhosted.org/packages/6c/3c/28cc4db153a7601a996985bcb564f7b8f5b9e1a706c7537aad4b4809f358/hf_xet-1.1.9-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ad1022e9a998e784c97b2173965d07fe33ee26e4594770b7785a8cc8f922cd95", size = 3251429 },
-    { url = "https://files.pythonhosted.org/packages/df/82/b51f3b6e5c6f33e91220c37b17760229704c58e79ab0fcfd0fd3b55803d3/hf_xet-1.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:8e6d2625971b4affad634835db82d5392f38de874205a9573e0dd3f0f9cb136f", size = 53461632 },
+    { url = "https://files.pythonhosted.org/packages/84/17/7caf27a1d101bfcb05be85850d4aa0a265b2e1acc2d4d52a48026ef1d299/hf_xet-1.1.9-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:86754c2d6d5afb11b0a435e6e18911a4199262fe77553f8c50d75e21242193ea", size = 3354643 },
-    { url = "https://files.pythonhosted.org/packages/95/d2/32defba26d995f7acdc4fe3e5911473b25aff5b75c5a2532786435a709e8/hf_xet-1.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:b446964bd75eb7f6b4d983c47241b2023eadfad1f56137ed00e1ca6fc278faec", size = 4121808 },
+    { url = "https://files.pythonhosted.org/packages/cd/50/0c39c9eed3411deadcc98749a6699d871b822473f55fe472fad7c01ec588/hf_xet-1.1.9-cp37-abi3-win_amd64.whl", hash = "sha256:5aad3933de6b725d61d51034e04174ed1dce7a57c63d530df0014dea15a40127", size = 2804797 },
 ]
 [[package]]
@@ -2708,7 +2708,7 @@ requires-dist = [
     { name = "opentelemetry-api", specifier = ">=1.27.0" },
     { name = "opentelemetry-exporter-otlp", specifier = ">=1.27.0" },
     { name = "opentelemetry-instrumentation-grpc", specifier = ">=0.50b0" },
-    { name = "outlines", marker = "extra == 'outlines'", specifier = ">=0.1.13" },
+    { name = "outlines", marker = "extra == 'outlines'", specifier = ">=0.1.13,<1.0" },
     { name = "peft", marker = "extra == 'peft'", specifier = ">=0.14.0" },
     { name = "pillow", specifier = ">=11.1.0" },
     { name = "prometheus-client", specifier = ">=0.21.0" },
@@ -2872,22 +2872,22 @@ dependencies = [
     { name = "typing-extensions", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
 ]
 wheels = [
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:b1f0cdd0720ad60536deb5baa427b782fd920dd4fcf72e244d32974caafa3b9e" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ac1849553ee673dfafb44c610c60cb60a2890f0e117f43599a526cf777eb8b8c" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-win_amd64.whl", hash = "sha256:c52c4b869742f00b12cb34521d1381be6119fa46244791704b00cc4a3cb06850" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:47c895bcab508769d129d717a4b916b10225ae3855723aeec8dff8efe5346207" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:c4bbc0b4be60319ba1cefc90be9557b317f0b3c261eeceb96ca6e0343eec56bf" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-win_amd64.whl", hash = "sha256:bf88f647d76d79da9556ca55df49e45aff1d66c12797886364343179dd09a36c" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6bba7dca5d9a729f1e8e9befb98055498e551efaf5ed034824c168b560afc1ac" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:7c0f08d1c44a02abad389373dddfce75904b969a410be2f4e5109483dd3dc0ce" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-win_amd64.whl", hash = "sha256:1704e5dd66c9221e4e8b6ae2d80cbf54e129571e643f5fa9ca78cc6d2096403a" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:633f35e8b1b1f640ef5f8a98dbd84f19b548222ce7ba8f017fe47ce6badc106a" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:d2f69f909da5dc52113ec66a851d62079f3d52c83184cf64beebdf12ca2f705c" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-win_amd64.whl", hash = "sha256:58c749f52ddc9098155c77d6c74153bb13d8978fd6e1063b5d7b41d4644f5af5" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fa05ac6ebed4777de7a5eff398c1f17b697c02422516748ce66a8151873e5a0e" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:78e13c26c38ae92d6841cf9ce760d7e9d52bca3e3183de371812e84274b054dc" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313t-win_amd64.whl", hash = "sha256:3559e98be824c2b12ab807319cd61c6174d73a524c9961317de8e8a44133c5c5" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp39-cp39-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:2f155388b1200e08f3e901bb3487ff93ca6d63cde87c29b97bb6762a8f63b373" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:f446f97b20cb070747b103fb640df941b88cb68c8d3b01538287d05d56a7e874" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp39-cp39-win_amd64.whl", hash = "sha256:8614a167d6a163273fb130f586802f3243479862b53ee2843941c10cc5761da6" },
 ]