chore: prepare version 3.3.5 (#3314)

* chore: prepare version 3.3.5

* black

* neuron: black

* Update hf-xet in uv lockfile

* Attempt to fix API doc check failure

Add `error_type` where missing.
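
For reference, the documented error payloads now look like this (illustrative example, mirroring the examples added to the OpenAPI spec and router annotations below; the schema is `ErrorResponse`):

```json
{
  "error": "Input validation error",
  "error_type": "validation"
}
```

The other documented variants use `error_type` values `generation`, `overloaded` and `incomplete_generation`.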

* Pin redocly version

* Sync redocly with Nix for now

---------

Co-authored-by: Daniël de Kok <me@danieldk.eu>
Alvaro Moran 2025-09-02 15:35:42 +02:00 committed by GitHub
parent 06d9d88b95
commit 0f79162288
37 changed files with 143 additions and 112 deletions


@@ -41,5 +41,5 @@ jobs:
       - name: Check that documentation is up-to-date
         run: |
-          npm install -g @redocly/cli
+          npm install -g @redocly/cli@1.34.2
           python update_doc.py --check

Cargo.lock (generated)

@@ -4650,7 +4650,7 @@ dependencies = [
 [[package]]
 name = "text-generation-backends-trtllm"
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 dependencies = [
  "async-trait",
  "clap 4.5.32",
@@ -4671,7 +4671,7 @@ dependencies = [
 [[package]]
 name = "text-generation-benchmark"
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 dependencies = [
  "average",
  "clap 4.5.32",
@@ -4691,7 +4691,7 @@ dependencies = [
 [[package]]
 name = "text-generation-client"
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 dependencies = [
  "async-trait",
  "base64 0.22.1",
@@ -4709,7 +4709,7 @@ dependencies = [
 [[package]]
 name = "text-generation-launcher"
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 dependencies = [
  "clap 4.5.32",
  "ctrlc",
@@ -4730,7 +4730,7 @@ dependencies = [
 [[package]]
 name = "text-generation-router"
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 dependencies = [
  "anyhow",
  "async-stream",
@@ -4782,7 +4782,7 @@ dependencies = [
 [[package]]
 name = "text-generation-router-llamacpp"
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 dependencies = [
  "async-trait",
  "bindgen 0.71.1",
@@ -4800,7 +4800,7 @@ dependencies = [
 [[package]]
 name = "text-generation-router-v2"
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 dependencies = [
  "async-stream",
  "async-trait",
@@ -4849,7 +4849,7 @@ dependencies = [
 [[package]]
 name = "text-generation-router-v3"
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 dependencies = [
  "async-stream",
  "async-trait",


@@ -21,7 +21,7 @@ default-members = [
 resolver = "2"
 [workspace.package]
-version = "3.3.4-dev0"
+version = "3.3.5-dev0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"


@@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta
 volume=$PWD/data
 docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model
+    ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model
 ```
 And then you can make requests like
@@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \
 **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
-**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4-rocm --model-id $model` instead of the command above.
+**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5-rocm --model-id $model` instead of the command above.
 To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
 ```
@@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
 token=<your cli READ token>
 docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model
+    ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model
 ```
 ### A note on Shared Memory (shm)


@@ -19,7 +19,7 @@ docker run -p 8080:80 \
    --ipc=host \
    -v $volume:/data \
    -e HF_TOKEN=$hf_token \
-   ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
+   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model \
    --max-input-tokens 1024 --max-total-tokens 2048 \
    --max-batch-prefill-tokens 2048 --max-batch-size 32 \
@@ -39,7 +39,7 @@ docker run -p 8080:80 \
    --ipc=host \
    -v $volume:/data \
    -e HF_TOKEN=$hf_token \
-   ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
+   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model \
    --sharded true --num-shard 8 \
    --max-input-tokens 1024 --max-total-tokens 2048 \
@@ -58,7 +58,7 @@ docker run -p 8080:80 \
    --cap-add=sys_nice \
    --ipc=host \
    -v $volume:/data \
-   ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
+   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model \
    --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
    --max-total-tokens 8192 --max-batch-size 4
@@ -81,7 +81,7 @@ docker run -p 8080:80 \
    --ipc=host \
    -v $volume:/data \
    -e HF_TOKEN=$hf_token \
-   ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
+   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model \
    --kv-cache-dtype fp8_e4m3fn \
    --max-input-tokens 1024 --max-total-tokens 2048 \
@@ -102,7 +102,7 @@ docker run -p 8080:80 \
    --ipc=host \
    -v $volume:/data \
    -e HF_TOKEN=$hf_token \
-   ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
+   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model \
    --kv-cache-dtype fp8_e4m3fn \
    --sharded true --num-shard 8 \


@@ -56,6 +56,7 @@ def _test_prefill(config_name, generator, batch_size, do_sample):
     assert tokens.ids[0] == expectations[0]
     assert tokens.texts[0] == expectations[1]
+
 def test_prefill_truncate(neuron_model_config):
     config_name = neuron_model_config["name"]
     neuron_model_path = neuron_model_config["neuron_model_path"]


@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "3.3.4-dev0"
+    "version": "3.3.5-dev0"
   },
   "paths": {
     "/": {
@@ -57,7 +57,8 @@
             "$ref": "#/components/schemas/ErrorResponse"
           },
           "example": {
-            "error": "Input validation error"
+            "error": "Input validation error",
+            "error_type": "validation"
           }
         }
       }
@@ -70,7 +71,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Request failed during generation"
+           "error": "Request failed during generation",
+           "error_type": "generation"
          }
        }
      }
@@ -83,7 +85,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Model is overloaded"
+           "error": "Model is overloaded",
+           "error_type": "overloaded"
          }
        }
      }
@@ -96,7 +99,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Incomplete generation"
+           "error": "Incomplete generation",
+           "error_type": "incomplete_generation"
          }
        }
      }
@@ -181,7 +185,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Input validation error"
+           "error": "Input validation error",
+           "error_type": "validation"
          }
        }
      }
@@ -194,7 +199,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Request failed during generation"
+           "error": "Request failed during generation",
+           "error_type": "generation"
          }
        }
      }
@@ -207,7 +213,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Model is overloaded"
+           "error": "Model is overloaded",
+           "error_type": "overloaded"
          }
        }
      }
@@ -220,7 +227,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Incomplete generation"
+           "error": "Incomplete generation",
+           "error_type": "incomplete_generation"
          }
        }
      }
@@ -264,7 +272,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Input validation error"
+           "error": "Input validation error",
+           "error_type": "validation"
          }
        }
      }
@@ -277,7 +286,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Request failed during generation"
+           "error": "Request failed during generation",
+           "error_type": "generation"
          }
        }
      }
@@ -290,7 +300,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Model is overloaded"
+           "error": "Model is overloaded",
+           "error_type": "overloaded"
          }
        }
      }
@@ -303,7 +314,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Incomplete generation"
+           "error": "Incomplete generation",
+           "error_type": "incomplete_generation"
          }
        }
      }
@@ -558,7 +570,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Input validation error"
+           "error": "Input validation error",
+           "error_type": "validation"
          }
        }
      }
@@ -571,7 +584,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Request failed during generation"
+           "error": "Request failed during generation",
+           "error_type": "generation"
          }
        }
      }
@@ -584,7 +598,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Model is overloaded"
+           "error": "Model is overloaded",
+           "error_type": "overloaded"
          }
        }
      }
@@ -597,7 +612,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Incomplete generation"
+           "error": "Incomplete generation",
+           "error_type": "incomplete_generation"
          }
        }
      }
@@ -646,7 +662,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Input validation error"
+           "error": "Input validation error",
+           "error_type": "validation"
          }
        }
      }
@@ -659,7 +676,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Request failed during generation"
+           "error": "Request failed during generation",
+           "error_type": "generation"
          }
        }
      }
@@ -672,7 +690,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Model is overloaded"
+           "error": "Model is overloaded",
+           "error_type": "overloaded"
          }
        }
      }
@@ -685,7 +704,8 @@
            "$ref": "#/components/schemas/ErrorResponse"
          },
          "example": {
-           "error": "Incomplete generation"
+           "error": "Incomplete generation",
+           "error_type": "incomplete_generation"
          }
        }
      }


@@ -20,7 +20,7 @@ hf_token=YOUR_HF_ACCESS_TOKEN
 docker run --runtime=habana --cap-add=sys_nice --ipc=host \
    -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
-   ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
+   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model
 ```
@@ -52,7 +52,7 @@ hf_token=YOUR_ACCESS_TOKEN
 docker run --runtime=habana --cap-add=sys_nice --ipc=host \
    -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
-   ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
+   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model
    <text-generation-inference-launcher-arguments>
 ```
@@ -106,7 +106,7 @@ docker run -p 8080:80 \
    --cap-add=sys_nice \
    --ipc=host \
    -v $volume:/data \
-   ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
+   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model \
    --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
    --max-total-tokens 8192 --max-batch-size 4


@@ -31,7 +31,7 @@ deployment instructions in the model card:
 The service is launched simply by running the text-generation-inference container with two sets of parameters:
 ```
-docker run <system_parameters> ghcr.io/huggingface/text-generation-inference:3.3.4-neuron <service_parameters>
+docker run <system_parameters> ghcr.io/huggingface/text-generation-inference:3.3.5-neuron <service_parameters>
 ```
 - system parameters are used to map ports, volumes and devices between the host and the service,


@@ -19,6 +19,6 @@ docker run --gpus all \
     --shm-size 1g \
     -e HF_TOKEN=$token \
     -p 8080:80 \
-    -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4 \
+    -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 \
     --model-id $model
 ```


@@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models.
 In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇
 ```bash
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model --quantize bitsandbytes
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model --quantize bitsandbytes
 ```
 4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
@@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
 In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇
 ```bash
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model --quantize bitsandbytes-nf4
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model --quantize bitsandbytes-nf4
 ```
 You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
@@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$
 TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇
 ```bash
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model --quantize gptq
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model --quantize gptq
 ```
 Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.


@@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
 docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
     --device=/dev/kfd --device=/dev/dri --group-add video \
     --ipc=host --shm-size 256g --net host -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.3.4-rocm \
+    ghcr.io/huggingface/text-generation-inference:3.3.5-rocm \
     --model-id $model
 ```


@@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
 docker run --rm --privileged --cap-add=sys_nice \
     --device=/dev/dri \
     --ipc=host --shm-size 1g --net host -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.3.4-intel-xpu \
+    ghcr.io/huggingface/text-generation-inference:3.3.5-intel-xpu \
     --model-id $model --cuda-graphs 0
 ```
@@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
 docker run --rm --privileged --cap-add=sys_nice \
     --device=/dev/dri \
     --ipc=host --shm-size 1g --net host -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.3.4-intel-cpu \
+    ghcr.io/huggingface/text-generation-inference:3.3.5-intel-cpu \
     --model-id $model --cuda-graphs 0
 ```


@@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.3.4 \
+    ghcr.io/huggingface/text-generation-inference:3.3.5 \
     --model-id $model
 ```


@@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.3.4 \
+    ghcr.io/huggingface/text-generation-inference:3.3.5 \
     --model-id $model
 ```
@@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \
 To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.
 ```bash
-docker run ghcr.io/huggingface/text-generation-inference:3.3.4 --help
+docker run ghcr.io/huggingface/text-generation-inference:3.3.5 --help
 ```
 </Tip>


@@ -163,7 +163,7 @@ hub = {
 # create Hugging Face Model Class
 huggingface_model = HuggingFaceModel(
-    image_uri=get_huggingface_llm_image_uri("huggingface",version="3.3.4"),
+    image_uri=get_huggingface_llm_image_uri("huggingface",version="3.3.5"),
     env=hub,
     role=role,
 )


@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 42,
     "prompt_tokens": 277,


@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 62,
     "prompt_tokens": 277,


@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 67,
     "prompt_tokens": 277,


@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 72,
     "prompt_tokens": 275,


@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 80,
     "prompt_tokens": 279,


@@ -14,7 +14,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 35,
     "prompt_tokens": 32,


@@ -14,7 +14,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 44,
     "prompt_tokens": 37,


@@ -18,7 +18,7 @@
   "id": "",
   "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 10,
     "prompt_tokens": 45,
@@ -44,7 +44,7 @@
   "id": "",
   "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 10,
     "prompt_tokens": 45,


@@ -17,7 +17,7 @@
   "id": "",
   "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.4-dev0-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 10,
     "prompt_tokens": 45,


@@ -43,7 +43,10 @@ async def test_grammar_response_format_llama_json(llama_grammar, response_snapsh
         ],
         "seed": 42,
         "max_tokens": 500,
-        "response_format": {"type": "json_object", "value": Weather.model_json_schema()},
+        "response_format": {
+            "type": "json_object",
+            "value": Weather.model_json_schema(),
+        },
     }
     # send the request
     response = requests.post(
@@ -75,7 +78,11 @@ async def test_grammar_response_format_llama_json(llama_grammar, response_snapsh
     json_payload["response_format"] = {
         "type": "json_schema",
-        "value": {"name": "weather", "strict": True, "schema": Weather.model_json_schema()},
+        "value": {
+            "name": "weather",
+            "strict": True,
+            "schema": Weather.model_json_schema(),
+        },
     }
     response = requests.post(
         f"{llama_grammar.base_url}/v1/chat/completions",
@@ -119,7 +126,10 @@ async def test_grammar_response_format_llama_error_if_tools_not_installed(
             "seed": 42,
             "max_tokens": 500,
             "tools": [],
-            "response_format": {"type": "json_object", "value": Weather.model_json_schema()},
+            "response_format": {
+                "type": "json_object",
+                "value": Weather.model_json_schema(),
+            },
         },
     )


@@ -113,13 +113,13 @@ content(
 ("text/event-stream" = StreamResponse),
 )),
 (status = 424, description = "Generation Error", body = ErrorResponse,
-example = json ! ({"error": "Request failed during generation"})),
+example = json ! ({"error": "Request failed during generation", "error_type": "generation"})),
 (status = 429, description = "Model is overloaded", body = ErrorResponse,
-example = json ! ({"error": "Model is overloaded"})),
+example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})),
 (status = 422, description = "Input validation error", body = ErrorResponse,
-example = json ! ({"error": "Input validation error"})),
+example = json ! ({"error": "Input validation error", "error_type": "validation"})),
 (status = 500, description = "Incomplete generation", body = ErrorResponse,
-example = json ! ({"error": "Incomplete generation"})),
+example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})),
 )
 )]
 #[instrument(skip(infer, req))]
@@ -249,13 +249,13 @@ request_body = GenerateRequest,
 responses(
 (status = 200, description = "Generated Text", body = GenerateResponse),
 (status = 424, description = "Generation Error", body = ErrorResponse,
-example = json ! ({"error": "Request failed during generation"})),
+example = json ! ({"error": "Request failed during generation", "error_type": "generation"})),
 (status = 429, description = "Model is overloaded", body = ErrorResponse,
-example = json ! ({"error": "Model is overloaded"})),
+example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})),
 (status = 422, description = "Input validation error", body = ErrorResponse,
-example = json ! ({"error": "Input validation error"})),
+example = json ! ({"error": "Input validation error", "error_type": "validation"})),
 (status = 500, description = "Incomplete generation", body = ErrorResponse,
-example = json ! ({"error": "Incomplete generation"})),
+example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})),
 )
 )]
 #[instrument(
@@ -448,16 +448,16 @@ responses(
 (status = 200, description = "Generated Text", body = StreamResponse,
 content_type = "text/event-stream"),
 (status = 424, description = "Generation Error", body = ErrorResponse,
-example = json ! ({"error": "Request failed during generation"}),
+example = json ! ({"error": "Request failed during generation", "error_type": "generation"}),
 content_type = "text/event-stream"),
 (status = 429, description = "Model is overloaded", body = ErrorResponse,
-example = json ! ({"error": "Model is overloaded"}),
+example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"}),
 content_type = "text/event-stream"),
 (status = 422, description = "Input validation error", body = ErrorResponse,
-example = json ! ({"error": "Input validation error"}),
+example = json ! ({"error": "Input validation error", "error_type": "validation"}),
 content_type = "text/event-stream"),
 (status = 500, description = "Incomplete generation", body = ErrorResponse,
-example = json ! ({"error": "Incomplete generation"}),
+example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"}),
 content_type = "text/event-stream"),
 )
 )]
@@ -691,13 +691,13 @@ content(
 ("text/event-stream" = Chunk),
 )),
 (status = 424, description = "Generation Error", body = ErrorResponse,
-example = json ! ({"error": "Request failed during generation"})),
+example = json ! ({"error": "Request failed during generation", "error_type": "generation"})),
 (status = 429, description = "Model is overloaded", body = ErrorResponse,
-example = json ! ({"error": "Model is overloaded"})),
+example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})),
 (status = 422, description = "Input validation error", body = ErrorResponse,
-example = json ! ({"error": "Input validation error"})),
+example = json ! ({"error": "Input validation error", "error_type": "validation"})),
 (status = 500, description = "Incomplete generation", body = ErrorResponse,
-example = json ! ({"error": "Incomplete generation"})),
+example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})),
 )
 )]
 #[instrument(
@@ -1144,13 +1144,13 @@ content(
 ("text/event-stream" = ChatCompletionChunk),
 )),
 (status = 424, description = "Generation Error", body = ErrorResponse,
-example = json ! ({"error": "Request failed during generation"})),
+example = json ! ({"error": "Request failed during generation", "error_type": "generation"})),
 (status = 429, description = "Model is overloaded", body = ErrorResponse,
-example = json ! ({"error": "Model is overloaded"})),
+example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})),
 (status = 422, description = "Input validation error", body = ErrorResponse,
-example = json ! ({"error": "Input validation error"})),
+example = json ! ({"error": "Input validation error", "error_type": "validation"})),
 (status = 500, description = "Incomplete generation", body = ErrorResponse,
-example = json ! ({"error": "Incomplete generation"})),
+example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})),
 )
 )]
 #[instrument(


@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Idefics2 model."""
+"""PyTorch Idefics2 model."""
 from typing import List, Optional, Tuple


@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Idefics3 model."""
+"""PyTorch Idefics3 model."""
 from typing import List, Optional, Tuple


@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Idefics model configuration"""
+"""Idefics model configuration"""
 import copy
 from transformers import PretrainedConfig


@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Idefics model."""
+"""PyTorch Idefics model."""
 from typing import List, Optional, Tuple, Union
 import torch


@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""
+"""PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""
 from dataclasses import dataclass


@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch Llava-NeXT model."""
+"""PyTorch Llava-NeXT model."""
 from typing import List, Optional, Tuple


@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch GPTNeoX model."""
+"""PyTorch GPTNeoX model."""
 from typing import Optional, Tuple, Union


@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch T5 model."""
+"""PyTorch T5 model."""
 import copy
 import math


@@ -9,7 +9,7 @@ import numpy as np
 def find_segments(
-    adapter_indices: Union[torch.Tensor, List[int]]
+    adapter_indices: Union[torch.Tensor, List[int]],
 ) -> Tuple[List[int], List[int]]:
     if isinstance(adapter_indices, torch.Tensor):
         adapter_indices = adapter_indices.cpu().numpy()


@@ -720,17 +720,17 @@ wheels = [
 [[package]]
 name = "hf-xet"
-version = "1.0.0"
+version = "1.1.9"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/64/46/db229dddc55121478105940b610fef1b466c414da02be9d4daa5602a2527/hf_xet-1.0.0.tar.gz", hash = "sha256:5e0ca891ce599fd753e7ffbdc182207d952a93e6252eeb92118475d6866bb093", size = 257192 }
+sdist = { url = "https://files.pythonhosted.org/packages/23/0f/5b60fc28ee7f8cc17a5114a584fd6b86e11c3e0a6e142a7f97a161e9640a/hf_xet-1.1.9.tar.gz", hash = "sha256:c99073ce404462e909f1d5839b2d14a3827b8fe75ed8aed551ba6609c026c803", size = 484242 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/e7/0a/c16f8766fa3cd520292b1a765e9b50b8390bce4c2ed7657db9534551f5ed/hf_xet-1.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:6106304f92bbce7c9b8509f6f735f2e8ce95e4dc32af8876e874c48b15ca1903", size = 5001841 },
+    { url = "https://files.pythonhosted.org/packages/de/12/56e1abb9a44cdef59a411fe8a8673313195711b5ecce27880eb9c8fa90bd/hf_xet-1.1.9-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:a3b6215f88638dd7a6ff82cb4e738dcbf3d863bf667997c093a3c990337d1160", size = 2762553 },
-    { url = "https://files.pythonhosted.org/packages/e3/9f/cca55edd85d03fc98c743bcc093965740a7440e909779c558039d6838f03/hf_xet-1.0.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:4d0bc7a3e6c1d21fcbb48e8726e3b19a2460e95971375e55e9a5f73ec7079a86", size = 4805318 },
+    { url = "https://files.pythonhosted.org/packages/3a/e6/2d0d16890c5f21b862f5df3146519c182e7f0ae49b4b4bf2bd8a40d0b05e/hf_xet-1.1.9-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:9b486de7a64a66f9a172f4b3e0dfe79c9f0a93257c501296a2521a13495a698a", size = 2623216 },
-    { url = "https://files.pythonhosted.org/packages/d1/0b/28bda7ac9d699dcfb96f628aa135ddca3f0f77e9716351aab2b83966f957/hf_xet-1.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23dee64f114ea9a272ff71a6a755e025d7a075a6a0dbf6da0990fe9211831ccf", size = 53504907 },
+    { url = "https://files.pythonhosted.org/packages/81/42/7e6955cf0621e87491a1fb8cad755d5c2517803cea174229b0ec00ff0166/hf_xet-1.1.9-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4c5a840c2c4e6ec875ed13703a60e3523bc7f48031dfd750923b2a4d1a5fc3c", size = 3186789 },
-    { url = "https://files.pythonhosted.org/packages/cb/04/ef1f7249a813841d193cbab2ef4d1d7d67c66c61d21d45223a72fdc5c88e/hf_xet-1.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d5f160550508ed87783d1eca4288602a713d1c45ec517d937acb9d93120f0cab", size = 52410434 },
+    { url = "https://files.pythonhosted.org/packages/df/8b/759233bce05457f5f7ec062d63bbfd2d0c740b816279eaaa54be92aa452a/hf_xet-1.1.9-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:96a6139c9e44dad1c52c52520db0fffe948f6bce487cfb9d69c125f254bb3790", size = 3088747 },
-    { url = "https://files.pythonhosted.org/packages/81/b3/e7abec2619ecd9d1c743adfe79fa69cf84530f530969daf3dc804efef65b/hf_xet-1.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:5ebd79db87df0b9d3607e7c9a6bb0662c10e36992f522f66b1d2a7fe93f53f27", size = 53465113 },
+    { url = "https://files.pythonhosted.org/packages/6c/3c/28cc4db153a7601a996985bcb564f7b8f5b9e1a706c7537aad4b4809f358/hf_xet-1.1.9-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ad1022e9a998e784c97b2173965d07fe33ee26e4594770b7785a8cc8f922cd95", size = 3251429 },
-    { url = "https://files.pythonhosted.org/packages/df/82/b51f3b6e5c6f33e91220c37b17760229704c58e79ab0fcfd0fd3b55803d3/hf_xet-1.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:8e6d2625971b4affad634835db82d5392f38de874205a9573e0dd3f0f9cb136f", size = 53461632 },
+    { url = "https://files.pythonhosted.org/packages/84/17/7caf27a1d101bfcb05be85850d4aa0a265b2e1acc2d4d52a48026ef1d299/hf_xet-1.1.9-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:86754c2d6d5afb11b0a435e6e18911a4199262fe77553f8c50d75e21242193ea", size = 3354643 },
-    { url = "https://files.pythonhosted.org/packages/95/d2/32defba26d995f7acdc4fe3e5911473b25aff5b75c5a2532786435a709e8/hf_xet-1.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:b446964bd75eb7f6b4d983c47241b2023eadfad1f56137ed00e1ca6fc278faec", size = 4121808 },
+    { url = "https://files.pythonhosted.org/packages/cd/50/0c39c9eed3411deadcc98749a6699d871b822473f55fe472fad7c01ec588/hf_xet-1.1.9-cp37-abi3-win_amd64.whl", hash = "sha256:5aad3933de6b725d61d51034e04174ed1dce7a57c63d530df0014dea15a40127", size = 2804797 },
 ]
 [[package]]
@@ -2708,7 +2708,7 @@ requires-dist = [
     { name = "opentelemetry-api", specifier = ">=1.27.0" },
     { name = "opentelemetry-exporter-otlp", specifier = ">=1.27.0" },
     { name = "opentelemetry-instrumentation-grpc", specifier = ">=0.50b0" },
-    { name = "outlines", marker = "extra == 'outlines'", specifier = ">=0.1.13" },
+    { name = "outlines", marker = "extra == 'outlines'", specifier = ">=0.1.13,<1.0" },
     { name = "peft", marker = "extra == 'peft'", specifier = ">=0.14.0" },
     { name = "pillow", specifier = ">=11.1.0" },
     { name = "prometheus-client", specifier = ">=0.21.0" },
@@ -2872,22 +2872,22 @@ dependencies = [
     { name = "typing-extensions", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
 ]
 wheels = [
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:b1f0cdd0720ad60536deb5baa427b782fd920dd4fcf72e244d32974caafa3b9e" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ac1849553ee673dfafb44c610c60cb60a2890f0e117f43599a526cf777eb8b8c" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-win_amd64.whl", hash = "sha256:c52c4b869742f00b12cb34521d1381be6119fa46244791704b00cc4a3cb06850" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:47c895bcab508769d129d717a4b916b10225ae3855723aeec8dff8efe5346207" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:c4bbc0b4be60319ba1cefc90be9557b317f0b3c261eeceb96ca6e0343eec56bf" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-win_amd64.whl", hash = "sha256:bf88f647d76d79da9556ca55df49e45aff1d66c12797886364343179dd09a36c" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6bba7dca5d9a729f1e8e9befb98055498e551efaf5ed034824c168b560afc1ac" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:7c0f08d1c44a02abad389373dddfce75904b969a410be2f4e5109483dd3dc0ce" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-win_amd64.whl", hash = "sha256:1704e5dd66c9221e4e8b6ae2d80cbf54e129571e643f5fa9ca78cc6d2096403a" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:633f35e8b1b1f640ef5f8a98dbd84f19b548222ce7ba8f017fe47ce6badc106a" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:d2f69f909da5dc52113ec66a851d62079f3d52c83184cf64beebdf12ca2f705c" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-win_amd64.whl", hash = "sha256:58c749f52ddc9098155c77d6c74153bb13d8978fd6e1063b5d7b41d4644f5af5" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fa05ac6ebed4777de7a5eff398c1f17b697c02422516748ce66a8151873e5a0e" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:78e13c26c38ae92d6841cf9ce760d7e9d52bca3e3183de371812e84274b054dc" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313t-win_amd64.whl", hash = "sha256:3559e98be824c2b12ab807319cd61c6174d73a524c9961317de8e8a44133c5c5" },
-    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp39-cp39-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:2f155388b1200e08f3e901bb3487ff93ca6d63cde87c29b97bb6762a8f63b373" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:f446f97b20cb070747b103fb640df941b88cb68c8d3b01538287d05d56a7e874" },
     { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp39-cp39-win_amd64.whl", hash = "sha256:8614a167d6a163273fb130f586802f3243479862b53ee2843941c10cc5761da6" },
 ]