From 0f791622883238924da013464522eff05794195a Mon Sep 17 00:00:00 2001 From: Alvaro Moran <6949769+tengomucho@users.noreply.github.com> Date: Tue, 2 Sep 2025 15:35:42 +0200 Subject: [PATCH] chore: prepare version 3.3.5 (#3314) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: prepare version 3.3.5 * black * neuron: black * Update hf-xet in uv lockfile * Attempt to fix API doc check failure Add `error_type` where missing. * Pin redocly version * Sync redocly with Nix for now --------- Co-authored-by: Daniël de Kok --- .github/workflows/autodocs.yaml | 2 +- Cargo.lock | 16 ++--- Cargo.toml | 2 +- README.md | 6 +- .../docker_commands/docker_commands.md | 10 +-- backends/neuron/tests/server/test_prefill.py | 1 + docs/openapi.json | 62 ++++++++++++------- docs/source/backends/gaudi.mdx | 6 +- docs/source/backends/neuron.md | 2 +- .../basic_tutorials/gated_model_access.md | 2 +- docs/source/conceptual/quantization.md | 6 +- docs/source/installation_amd.md | 2 +- docs/source/installation_intel.md | 4 +- docs/source/installation_nvidia.md | 2 +- docs/source/quicktour.md | 4 +- docs/source/reference/api_reference.md | 2 +- ...est_flash_gemma3_image_base64_rgb_jpg.json | 2 +- ...est_flash_gemma3_image_base64_rgb_png.json | 2 +- .../test_flash_gemma3_image_base64_rgba.json | 2 +- .../test_flash_gemma3_image_cow.json | 2 +- .../test_flash_gemma3_image_cow_dog.json | 2 +- .../test_json_schema_basic.json | 2 +- .../test_json_schema_complex.json | 2 +- .../test_mllama/test_mllama_load.json | 4 +- .../test_mllama/test_mllama_simpl.json | 2 +- .../test_grammar_response_format_llama.py | 16 ++++- router/src/server.rs | 40 ++++++------ .../models/custom_modeling/idefics2.py | 2 +- .../models/custom_modeling/idefics3.py | 2 +- .../models/custom_modeling/idefics_config.py | 2 +- .../custom_modeling/idefics_modeling.py | 2 +- .../models/custom_modeling/idefics_vision.py | 2 +- .../models/custom_modeling/llava_next.py | 2 +- .../models/custom_modeling/neox_modeling.py | 2 +- .../models/custom_modeling/t5_modeling.py | 2 +- .../text_generation_server/utils/segments.py | 2 +- server/uv.lock | 32 +++++----- 37 files changed, 143 insertions(+), 112 deletions(-) diff --git a/.github/workflows/autodocs.yaml b/.github/workflows/autodocs.yaml index a768f263..4e799939 100644 --- a/.github/workflows/autodocs.yaml +++ b/.github/workflows/autodocs.yaml @@ -41,5 +41,5 @@ jobs: - name: Check that documentation is up-to-date run: | - npm install -g @redocly/cli + npm install -g @redocly/cli@1.34.2 python update_doc.py --check diff --git a/Cargo.lock b/Cargo.lock index 7e172ed5..165cb590 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4650,7 +4650,7 @@ dependencies = [ [[package]] name = "text-generation-backends-trtllm" -version = "3.3.4-dev0" +version = "3.3.5-dev0" dependencies = [ "async-trait", "clap 4.5.32", @@ -4671,7 +4671,7 @@ dependencies = [ [[package]] name = "text-generation-benchmark" -version = "3.3.4-dev0" +version = "3.3.5-dev0" dependencies = [ "average", "clap 4.5.32", @@ -4691,7 +4691,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "3.3.4-dev0" +version = "3.3.5-dev0" dependencies = [ "async-trait", "base64 0.22.1", @@ -4709,7 +4709,7 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "3.3.4-dev0" +version = "3.3.5-dev0" dependencies = [ "clap 4.5.32", "ctrlc", @@ -4730,7 +4730,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "3.3.4-dev0" +version = "3.3.5-dev0" dependencies = [
"anyhow", "async-stream", @@ -4782,7 +4782,7 @@ dependencies = [ [[package]] name = "text-generation-router-llamacpp" -version = "3.3.4-dev0" +version = "3.3.5-dev0" dependencies = [ "async-trait", "bindgen 0.71.1", @@ -4800,7 +4800,7 @@ dependencies = [ [[package]] name = "text-generation-router-v2" -version = "3.3.4-dev0" +version = "3.3.5-dev0" dependencies = [ "async-stream", "async-trait", @@ -4849,7 +4849,7 @@ dependencies = [ [[package]] name = "text-generation-router-v3" -version = "3.3.4-dev0" +version = "3.3.5-dev0" dependencies = [ "async-stream", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index 065046bc..a32d8e7f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ default-members = [ resolver = "2" [workspace.package] -version = "3.3.4-dev0" +version = "3.3.5-dev0" edition = "2021" authors = ["Olivier Dehaene"] homepage = "https://github.com/huggingface/text-generation-inference" diff --git a/README.md b/README.md index f9a45bc2..0890d9c6 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta volume=$PWD/data docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model ``` And then you can make requests like @@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \ **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar. -**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4-rocm --model-id $model` instead of the command above. +**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5-rocm --model-id $model` instead of the command above. 
To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli): ``` @@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading token= docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model ``` ### A note on Shared Memory (shm) diff --git a/backends/gaudi/examples/docker_commands/docker_commands.md b/backends/gaudi/examples/docker_commands/docker_commands.md index 22b9d34b..ccacfbdb 100644 --- a/backends/gaudi/examples/docker_commands/docker_commands.md +++ b/backends/gaudi/examples/docker_commands/docker_commands.md @@ -19,7 +19,7 @@ docker run -p 8080:80 \ --ipc=host \ -v $volume:/data \ -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \ --model-id $model \ --max-input-tokens 1024 --max-total-tokens 2048 \ --max-batch-prefill-tokens 2048 --max-batch-size 32 \ @@ -39,7 +39,7 @@ docker run -p 8080:80 \ --ipc=host \ -v $volume:/data \ -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -58,7 +58,7 @@ docker run -p 8080:80 \ --cap-add=sys_nice \ --ipc=host \ -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \ --model-id $model \ --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ --max-total-tokens 8192 --max-batch-size 4 @@ -81,7 +81,7 @@ docker run -p 8080:80 \ --ipc=host \ -v $volume:/data \ -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \ --model-id $model \ --kv-cache-dtype fp8_e4m3fn \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -102,7 +102,7 @@ docker run -p 8080:80 \ --ipc=host \ -v $volume:/data \ -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \ --model-id $model \ --kv-cache-dtype fp8_e4m3fn \ --sharded true --num-shard 8 \ diff --git a/backends/neuron/tests/server/test_prefill.py b/backends/neuron/tests/server/test_prefill.py index 796e4817..1061fbc4 100644 --- a/backends/neuron/tests/server/test_prefill.py +++ b/backends/neuron/tests/server/test_prefill.py @@ -56,6 +56,7 @@ def _test_prefill(config_name, generator, batch_size, do_sample): assert tokens.ids[0] == expectations[0] assert tokens.texts[0] == expectations[1] + def test_prefill_truncate(neuron_model_config): config_name = neuron_model_config["name"] neuron_model_path = neuron_model_config["neuron_model_path"] diff --git a/docs/openapi.json b/docs/openapi.json index 63572257..6225f5e7 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -10,7 +10,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, - "version": "3.3.4-dev0" + "version": "3.3.5-dev0" }, "paths": { "/": { @@ -57,7 +57,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Input validation error" + "error": "Input validation error", + "error_type": "validation" } } } @@ -70,7 +71,8 @@ "$ref": 
"#/components/schemas/ErrorResponse" }, "example": { - "error": "Request failed during generation" + "error": "Request failed during generation", + "error_type": "generation" } } } @@ -83,7 +85,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Model is overloaded" + "error": "Model is overloaded", + "error_type": "overloaded" } } } @@ -96,7 +99,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Incomplete generation" + "error": "Incomplete generation", + "error_type": "incomplete_generation" } } } @@ -181,7 +185,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Input validation error" + "error": "Input validation error", + "error_type": "validation" } } } @@ -194,7 +199,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Request failed during generation" + "error": "Request failed during generation", + "error_type": "generation" } } } @@ -207,7 +213,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Model is overloaded" + "error": "Model is overloaded", + "error_type": "overloaded" } } } @@ -220,7 +227,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Incomplete generation" + "error": "Incomplete generation", + "error_type": "incomplete_generation" } } } @@ -264,7 +272,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Input validation error" + "error": "Input validation error", + "error_type": "validation" } } } @@ -277,7 +286,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Request failed during generation" + "error": "Request failed during generation", + "error_type": "generation" } } } @@ -290,7 +300,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Model is overloaded" + "error": "Model is overloaded", + "error_type": "overloaded" } } } @@ -303,7 +314,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Incomplete generation" + "error": "Incomplete generation", + "error_type": "incomplete_generation" } } } @@ -558,7 +570,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Input validation error" + "error": "Input validation error", + "error_type": "validation" } } } @@ -571,7 +584,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Request failed during generation" + "error": "Request failed during generation", + "error_type": "generation" } } } @@ -584,7 +598,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Model is overloaded" + "error": "Model is overloaded", + "error_type": "overloaded" } } } @@ -597,7 +612,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Incomplete generation" + "error": "Incomplete generation", + "error_type": "incomplete_generation" } } } @@ -646,7 +662,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Input validation error" + "error": "Input validation error", + "error_type": "validation" } } } @@ -659,7 +676,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Request failed during generation" + "error": "Request failed during generation", + "error_type": "generation" } } } @@ -672,7 +690,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": { - "error": "Model is overloaded" + "error": "Model is overloaded", + "error_type": "overloaded" } } } @@ -685,7 +704,8 @@ "$ref": "#/components/schemas/ErrorResponse" }, "example": 
{ - "error": "Incomplete generation" + "error": "Incomplete generation", + "error_type": "incomplete_generation" } } } diff --git a/docs/source/backends/gaudi.mdx b/docs/source/backends/gaudi.mdx index 702a9b80..07d34a82 100644 --- a/docs/source/backends/gaudi.mdx +++ b/docs/source/backends/gaudi.mdx @@ -20,7 +20,7 @@ hf_token=YOUR_HF_ACCESS_TOKEN docker run --runtime=habana --cap-add=sys_nice --ipc=host \ -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \ --model-id $model ``` @@ -52,7 +52,7 @@ hf_token=YOUR_ACCESS_TOKEN docker run --runtime=habana --cap-add=sys_nice --ipc=host \ -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \ --model-id $model ``` @@ -106,7 +106,7 @@ docker run -p 8080:80 \ --cap-add=sys_nice \ --ipc=host \ -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \ --model-id $model \ --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ --max-total-tokens 8192 --max-batch-size 4 diff --git a/docs/source/backends/neuron.md b/docs/source/backends/neuron.md index 6ba57502..17d720db 100644 --- a/docs/source/backends/neuron.md +++ b/docs/source/backends/neuron.md @@ -31,7 +31,7 @@ deployment instructions in the model card: The service is launched simply by running the text-generation-inference container with two sets of parameters: ``` -docker run ghcr.io/huggingface/text-generation-inference:3.3.4-neuron +docker run ghcr.io/huggingface/text-generation-inference:3.3.5-neuron ``` - system parameters are used to map ports, volumes and devices between the host and the service, diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md index cf164c32..d42bac7a 100644 --- a/docs/source/basic_tutorials/gated_model_access.md +++ b/docs/source/basic_tutorials/gated_model_access.md @@ -19,6 +19,6 @@ docker run --gpus all \ --shm-size 1g \ -e HF_TOKEN=$token \ -p 8080:80 \ - -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4 \ + -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 \ --model-id $model ``` diff --git a/docs/source/conceptual/quantization.md b/docs/source/conceptual/quantization.md index d15e0089..ad6483e2 100644 --- a/docs/source/conceptual/quantization.md +++ b/docs/source/conceptual/quantization.md @@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below ๐Ÿ‘‡ ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model --quantize bitsandbytes +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model --quantize bitsandbytes ``` 4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load. 
@@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model --quantize bitsandbytes-nf4 +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model --quantize bitsandbytes-nf4 ``` You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes). @@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$ TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model --quantize gptq +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model --quantize gptq ``` Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI. diff --git a/docs/source/installation_amd.md b/docs/source/installation_amd.md index 423a9956..df4abb3b 100644 --- a/docs/source/installation_amd.md +++ b/docs/source/installation_amd.md @@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --device=/dev/kfd --device=/dev/dri --group-add video \ --ipc=host --shm-size 256g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4-rocm \ + ghcr.io/huggingface/text-generation-inference:3.3.5-rocm \ --model-id $model ``` diff --git a/docs/source/installation_intel.md b/docs/source/installation_intel.md index 0b03e3c3..60b0bcc0 100644 --- a/docs/source/installation_intel.md +++ b/docs/source/installation_intel.md @@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4-intel-xpu \ + ghcr.io/huggingface/text-generation-inference:3.3.5-intel-xpu \ --model-id $model --cuda-graphs 0 ``` @@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4-intel-cpu \ + ghcr.io/huggingface/text-generation-inference:3.3.5-intel-cpu \ --model-id $model --cuda-graphs 0 ``` diff --git a/docs/source/installation_nvidia.md b/docs/source/installation_nvidia.md index 507e9c70..37cb841c 100644 --- a/docs/source/installation_nvidia.md +++ b/docs/source/installation_nvidia.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4 \ + ghcr.io/huggingface/text-generation-inference:3.3.5 \ --model-id $model ``` diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index e66e5808..bd8495c5 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.3.4 \ + ghcr.io/huggingface/text-generation-inference:3.3.5 \ --model-id $model ``` @@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \ To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more. ```bash -docker run ghcr.io/huggingface/text-generation-inference:3.3.4 --help +docker run ghcr.io/huggingface/text-generation-inference:3.3.5 --help ``` diff --git a/docs/source/reference/api_reference.md b/docs/source/reference/api_reference.md index b900887e..7d21eca7 100644 --- a/docs/source/reference/api_reference.md +++ b/docs/source/reference/api_reference.md @@ -163,7 +163,7 @@ hub = { # create Hugging Face Model Class huggingface_model = HuggingFaceModel( - image_uri=get_huggingface_llm_image_uri("huggingface",version="3.3.4"), + image_uri=get_huggingface_llm_image_uri("huggingface",version="3.3.5"), env=hub, role=role, ) diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json index 436ec29d..b9803da8 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json @@ -17,7 +17,7 @@ "id": "", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 42, "prompt_tokens": 277, diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json index 68783c27..a91f01f7 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json @@ -17,7 +17,7 @@ "id": "", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 62, "prompt_tokens": 277, diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json index 76a34128..d8104c9a 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json @@ -17,7 +17,7 @@ "id": 
"", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 67, "prompt_tokens": 277, diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json index be774054..0a712cc7 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json @@ -17,7 +17,7 @@ "id": "", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 72, "prompt_tokens": 275, diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json index cd79c363..6d4ee727 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json @@ -17,7 +17,7 @@ "id": "", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 80, "prompt_tokens": 279, diff --git a/integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_basic.json b/integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_basic.json index 05129fe0..3310bdcd 100644 --- a/integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_basic.json +++ b/integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_basic.json @@ -14,7 +14,7 @@ "id": "", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 35, "prompt_tokens": 32, diff --git a/integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_complex.json b/integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_complex.json index 6c548214..e627b2be 100644 --- a/integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_complex.json +++ b/integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_complex.json @@ -14,7 +14,7 @@ "id": "", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 44, "prompt_tokens": 37, diff --git a/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json b/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json index 7b992b4f..58f5ada8 100644 --- a/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json +++ b/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json @@ -18,7 +18,7 @@ "id": "", "model": "unsloth/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 10, "prompt_tokens": 45, @@ -44,7 +44,7 @@ "id": "", "model": 
"unsloth/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 10, "prompt_tokens": 45, diff --git a/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json b/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json index ccf53120..6830b36b 100644 --- a/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json +++ b/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json @@ -17,7 +17,7 @@ "id": "", "model": "unsloth/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", - "system_fingerprint": "3.3.4-dev0-native", + "system_fingerprint": "3.3.5-dev0-native", "usage": { "completion_tokens": 10, "prompt_tokens": 45, diff --git a/integration-tests/models/test_grammar_response_format_llama.py b/integration-tests/models/test_grammar_response_format_llama.py index 8a905e64..424dcaaf 100644 --- a/integration-tests/models/test_grammar_response_format_llama.py +++ b/integration-tests/models/test_grammar_response_format_llama.py @@ -43,7 +43,10 @@ async def test_grammar_response_format_llama_json(llama_grammar, response_snapsh ], "seed": 42, "max_tokens": 500, - "response_format": {"type": "json_object", "value": Weather.model_json_schema()}, + "response_format": { + "type": "json_object", + "value": Weather.model_json_schema(), + }, } # send the request response = requests.post( @@ -75,7 +78,11 @@ async def test_grammar_response_format_llama_json(llama_grammar, response_snapsh json_payload["response_format"] = { "type": "json_schema", - "value": {"name": "weather", "strict": True, "schema": Weather.model_json_schema()}, + "value": { + "name": "weather", + "strict": True, + "schema": Weather.model_json_schema(), + }, } response = requests.post( f"{llama_grammar.base_url}/v1/chat/completions", @@ -119,7 +126,10 @@ async def test_grammar_response_format_llama_error_if_tools_not_installed( "seed": 42, "max_tokens": 500, "tools": [], - "response_format": {"type": "json_object", "value": Weather.model_json_schema()}, + "response_format": { + "type": "json_object", + "value": Weather.model_json_schema(), + }, }, ) diff --git a/router/src/server.rs b/router/src/server.rs index 5fbe0403..97a0cea2 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -113,13 +113,13 @@ content( ("text/event-stream" = StreamResponse), )), (status = 424, description = "Generation Error", body = ErrorResponse, -example = json ! ({"error": "Request failed during generation"})), +example = json ! ({"error": "Request failed during generation", "error_type": "generation"})), (status = 429, description = "Model is overloaded", body = ErrorResponse, -example = json ! ({"error": "Model is overloaded"})), +example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), (status = 422, description = "Input validation error", body = ErrorResponse, -example = json ! ({"error": "Input validation error"})), +example = json ! ({"error": "Input validation error", "error_type": "validation"})), (status = 500, description = "Incomplete generation", body = ErrorResponse, -example = json ! ({"error": "Incomplete generation"})), +example = json ! 
({"error": "Incomplete generation", "error_type": "incomplete_generation"})), ) )] #[instrument(skip(infer, req))] @@ -249,13 +249,13 @@ request_body = GenerateRequest, responses( (status = 200, description = "Generated Text", body = GenerateResponse), (status = 424, description = "Generation Error", body = ErrorResponse, -example = json ! ({"error": "Request failed during generation"})), +example = json ! ({"error": "Request failed during generation", "error_type": "generation"})), (status = 429, description = "Model is overloaded", body = ErrorResponse, -example = json ! ({"error": "Model is overloaded"})), +example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), (status = 422, description = "Input validation error", body = ErrorResponse, -example = json ! ({"error": "Input validation error"})), +example = json ! ({"error": "Input validation error", "error_type": "validation"})), (status = 500, description = "Incomplete generation", body = ErrorResponse, -example = json ! ({"error": "Incomplete generation"})), +example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})), ) )] #[instrument( @@ -448,16 +448,16 @@ responses( (status = 200, description = "Generated Text", body = StreamResponse, content_type = "text/event-stream"), (status = 424, description = "Generation Error", body = ErrorResponse, -example = json ! ({"error": "Request failed during generation"}), +example = json ! ({"error": "Request failed during generation", "error_type": "generation"}), content_type = "text/event-stream"), (status = 429, description = "Model is overloaded", body = ErrorResponse, -example = json ! ({"error": "Model is overloaded"}), +example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"}), content_type = "text/event-stream"), (status = 422, description = "Input validation error", body = ErrorResponse, -example = json ! ({"error": "Input validation error"}), +example = json ! ({"error": "Input validation error", "error_type": "validation"}), content_type = "text/event-stream"), (status = 500, description = "Incomplete generation", body = ErrorResponse, -example = json ! ({"error": "Incomplete generation"}), +example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"}), content_type = "text/event-stream"), ) )] @@ -691,13 +691,13 @@ content( ("text/event-stream" = Chunk), )), (status = 424, description = "Generation Error", body = ErrorResponse, -example = json ! ({"error": "Request failed during generation"})), +example = json ! ({"error": "Request failed during generation", "error_type": "generation"})), (status = 429, description = "Model is overloaded", body = ErrorResponse, -example = json ! ({"error": "Model is overloaded"})), +example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), (status = 422, description = "Input validation error", body = ErrorResponse, -example = json ! ({"error": "Input validation error"})), +example = json ! ({"error": "Input validation error", "error_type": "validation"})), (status = 500, description = "Incomplete generation", body = ErrorResponse, -example = json ! ({"error": "Incomplete generation"})), +example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})), ) )] #[instrument( @@ -1144,13 +1144,13 @@ content( ("text/event-stream" = ChatCompletionChunk), )), (status = 424, description = "Generation Error", body = ErrorResponse, -example = json ! 
({"error": "Request failed during generation"})), +example = json ! ({"error": "Request failed during generation", "error_type": "generation"})), (status = 429, description = "Model is overloaded", body = ErrorResponse, -example = json ! ({"error": "Model is overloaded"})), +example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), (status = 422, description = "Input validation error", body = ErrorResponse, -example = json ! ({"error": "Input validation error"})), +example = json ! ({"error": "Input validation error", "error_type": "validation"})), (status = 500, description = "Incomplete generation", body = ErrorResponse, -example = json ! ({"error": "Incomplete generation"})), +example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})), ) )] #[instrument( diff --git a/server/text_generation_server/models/custom_modeling/idefics2.py b/server/text_generation_server/models/custom_modeling/idefics2.py index 5c0d2fcc..c891f4c8 100644 --- a/server/text_generation_server/models/custom_modeling/idefics2.py +++ b/server/text_generation_server/models/custom_modeling/idefics2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Idefics2 model.""" +"""PyTorch Idefics2 model.""" from typing import List, Optional, Tuple diff --git a/server/text_generation_server/models/custom_modeling/idefics3.py b/server/text_generation_server/models/custom_modeling/idefics3.py index 6d303c2c..216b1eac 100644 --- a/server/text_generation_server/models/custom_modeling/idefics3.py +++ b/server/text_generation_server/models/custom_modeling/idefics3.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Idefics3 model.""" +"""PyTorch Idefics3 model.""" from typing import List, Optional, Tuple diff --git a/server/text_generation_server/models/custom_modeling/idefics_config.py b/server/text_generation_server/models/custom_modeling/idefics_config.py index a5565819..6ce2054e 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_config.py +++ b/server/text_generation_server/models/custom_modeling/idefics_config.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Idefics model configuration""" +"""Idefics model configuration""" import copy from transformers import PretrainedConfig diff --git a/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/server/text_generation_server/models/custom_modeling/idefics_modeling.py index 9fc9bca6..3104f742 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_modeling.py +++ b/server/text_generation_server/models/custom_modeling/idefics_modeling.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch Idefics model.""" +"""PyTorch Idefics model.""" from typing import List, Optional, Tuple, Union import torch diff --git a/server/text_generation_server/models/custom_modeling/idefics_vision.py b/server/text_generation_server/models/custom_modeling/idefics_vision.py index dd8f76bc..7d2051e0 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_vision.py +++ b/server/text_generation_server/models/custom_modeling/idefics_vision.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object""" +"""PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object""" from dataclasses import dataclass diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py index 56a9565b..decee125 100644 --- a/server/text_generation_server/models/custom_modeling/llava_next.py +++ b/server/text_generation_server/models/custom_modeling/llava_next.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Llava-NeXT model.""" +"""PyTorch Llava-NeXT model.""" from typing import List, Optional, Tuple diff --git a/server/text_generation_server/models/custom_modeling/neox_modeling.py b/server/text_generation_server/models/custom_modeling/neox_modeling.py index 06731a6f..8554b632 100644 --- a/server/text_generation_server/models/custom_modeling/neox_modeling.py +++ b/server/text_generation_server/models/custom_modeling/neox_modeling.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch GPTNeoX model.""" +"""PyTorch GPTNeoX model.""" from typing import Optional, Tuple, Union diff --git a/server/text_generation_server/models/custom_modeling/t5_modeling.py b/server/text_generation_server/models/custom_modeling/t5_modeling.py index e6666acd..0dce0f9e 100644 --- a/server/text_generation_server/models/custom_modeling/t5_modeling.py +++ b/server/text_generation_server/models/custom_modeling/t5_modeling.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch T5 model.""" +"""PyTorch T5 model.""" import copy import math diff --git a/server/text_generation_server/utils/segments.py b/server/text_generation_server/utils/segments.py index fd8be563..3d880fec 100644 --- a/server/text_generation_server/utils/segments.py +++ b/server/text_generation_server/utils/segments.py @@ -9,7 +9,7 @@ import numpy as np def find_segments( - adapter_indices: Union[torch.Tensor, List[int]] + adapter_indices: Union[torch.Tensor, List[int]], ) -> Tuple[List[int], List[int]]: if isinstance(adapter_indices, torch.Tensor): adapter_indices = adapter_indices.cpu().numpy() diff --git a/server/uv.lock b/server/uv.lock index 7e6f194a..b7864685 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -720,17 +720,17 @@ wheels = [ [[package]] name = "hf-xet" -version = "1.0.0" +version = "1.1.9" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/64/46/db229dddc55121478105940b610fef1b466c414da02be9d4daa5602a2527/hf_xet-1.0.0.tar.gz", hash = "sha256:5e0ca891ce599fd753e7ffbdc182207d952a93e6252eeb92118475d6866bb093", size = 257192 } +sdist = { url = "https://files.pythonhosted.org/packages/23/0f/5b60fc28ee7f8cc17a5114a584fd6b86e11c3e0a6e142a7f97a161e9640a/hf_xet-1.1.9.tar.gz", hash = "sha256:c99073ce404462e909f1d5839b2d14a3827b8fe75ed8aed551ba6609c026c803", size = 484242 } wheels = [ - { url = "https://files.pythonhosted.org/packages/e7/0a/c16f8766fa3cd520292b1a765e9b50b8390bce4c2ed7657db9534551f5ed/hf_xet-1.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:6106304f92bbce7c9b8509f6f735f2e8ce95e4dc32af8876e874c48b15ca1903", size = 5001841 }, - { url = "https://files.pythonhosted.org/packages/e3/9f/cca55edd85d03fc98c743bcc093965740a7440e909779c558039d6838f03/hf_xet-1.0.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:4d0bc7a3e6c1d21fcbb48e8726e3b19a2460e95971375e55e9a5f73ec7079a86", size = 4805318 }, - { url = "https://files.pythonhosted.org/packages/d1/0b/28bda7ac9d699dcfb96f628aa135ddca3f0f77e9716351aab2b83966f957/hf_xet-1.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23dee64f114ea9a272ff71a6a755e025d7a075a6a0dbf6da0990fe9211831ccf", size = 53504907 }, - { url = "https://files.pythonhosted.org/packages/cb/04/ef1f7249a813841d193cbab2ef4d1d7d67c66c61d21d45223a72fdc5c88e/hf_xet-1.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d5f160550508ed87783d1eca4288602a713d1c45ec517d937acb9d93120f0cab", size = 52410434 }, - { url = "https://files.pythonhosted.org/packages/81/b3/e7abec2619ecd9d1c743adfe79fa69cf84530f530969daf3dc804efef65b/hf_xet-1.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:5ebd79db87df0b9d3607e7c9a6bb0662c10e36992f522f66b1d2a7fe93f53f27", size = 53465113 }, - { url = "https://files.pythonhosted.org/packages/df/82/b51f3b6e5c6f33e91220c37b17760229704c58e79ab0fcfd0fd3b55803d3/hf_xet-1.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:8e6d2625971b4affad634835db82d5392f38de874205a9573e0dd3f0f9cb136f", size = 53461632 }, - { url = "https://files.pythonhosted.org/packages/95/d2/32defba26d995f7acdc4fe3e5911473b25aff5b75c5a2532786435a709e8/hf_xet-1.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:b446964bd75eb7f6b4d983c47241b2023eadfad1f56137ed00e1ca6fc278faec", size = 4121808 }, + { url = "https://files.pythonhosted.org/packages/de/12/56e1abb9a44cdef59a411fe8a8673313195711b5ecce27880eb9c8fa90bd/hf_xet-1.1.9-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:a3b6215f88638dd7a6ff82cb4e738dcbf3d863bf667997c093a3c990337d1160", size = 2762553 }, + { url = 
"https://files.pythonhosted.org/packages/3a/e6/2d0d16890c5f21b862f5df3146519c182e7f0ae49b4b4bf2bd8a40d0b05e/hf_xet-1.1.9-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:9b486de7a64a66f9a172f4b3e0dfe79c9f0a93257c501296a2521a13495a698a", size = 2623216 }, + { url = "https://files.pythonhosted.org/packages/81/42/7e6955cf0621e87491a1fb8cad755d5c2517803cea174229b0ec00ff0166/hf_xet-1.1.9-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4c5a840c2c4e6ec875ed13703a60e3523bc7f48031dfd750923b2a4d1a5fc3c", size = 3186789 }, + { url = "https://files.pythonhosted.org/packages/df/8b/759233bce05457f5f7ec062d63bbfd2d0c740b816279eaaa54be92aa452a/hf_xet-1.1.9-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:96a6139c9e44dad1c52c52520db0fffe948f6bce487cfb9d69c125f254bb3790", size = 3088747 }, + { url = "https://files.pythonhosted.org/packages/6c/3c/28cc4db153a7601a996985bcb564f7b8f5b9e1a706c7537aad4b4809f358/hf_xet-1.1.9-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ad1022e9a998e784c97b2173965d07fe33ee26e4594770b7785a8cc8f922cd95", size = 3251429 }, + { url = "https://files.pythonhosted.org/packages/84/17/7caf27a1d101bfcb05be85850d4aa0a265b2e1acc2d4d52a48026ef1d299/hf_xet-1.1.9-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:86754c2d6d5afb11b0a435e6e18911a4199262fe77553f8c50d75e21242193ea", size = 3354643 }, + { url = "https://files.pythonhosted.org/packages/cd/50/0c39c9eed3411deadcc98749a6699d871b822473f55fe472fad7c01ec588/hf_xet-1.1.9-cp37-abi3-win_amd64.whl", hash = "sha256:5aad3933de6b725d61d51034e04174ed1dce7a57c63d530df0014dea15a40127", size = 2804797 }, ] [[package]] @@ -2708,7 +2708,7 @@ requires-dist = [ { name = "opentelemetry-api", specifier = ">=1.27.0" }, { name = "opentelemetry-exporter-otlp", specifier = ">=1.27.0" }, { name = "opentelemetry-instrumentation-grpc", specifier = ">=0.50b0" }, - { name = "outlines", marker = "extra == 'outlines'", specifier = ">=0.1.13" }, + { name = "outlines", marker = "extra == 'outlines'", specifier = ">=0.1.13,<1.0" }, { name = "peft", marker = "extra == 'peft'", specifier = ">=0.14.0" }, { name = "pillow", specifier = ">=11.1.0" }, { name = "prometheus-client", specifier = ">=0.21.0" }, @@ -2872,22 +2872,22 @@ dependencies = [ { name = "typing-extensions", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:b1f0cdd0720ad60536deb5baa427b782fd920dd4fcf72e244d32974caafa3b9e" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ac1849553ee673dfafb44c610c60cb60a2890f0e117f43599a526cf777eb8b8c" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-win_amd64.whl", hash = "sha256:c52c4b869742f00b12cb34521d1381be6119fa46244791704b00cc4a3cb06850" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:47c895bcab508769d129d717a4b916b10225ae3855723aeec8dff8efe5346207" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:c4bbc0b4be60319ba1cefc90be9557b317f0b3c261eeceb96ca6e0343eec56bf" }, { url = 
"https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-win_amd64.whl", hash = "sha256:bf88f647d76d79da9556ca55df49e45aff1d66c12797886364343179dd09a36c" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6bba7dca5d9a729f1e8e9befb98055498e551efaf5ed034824c168b560afc1ac" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:7c0f08d1c44a02abad389373dddfce75904b969a410be2f4e5109483dd3dc0ce" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-win_amd64.whl", hash = "sha256:1704e5dd66c9221e4e8b6ae2d80cbf54e129571e643f5fa9ca78cc6d2096403a" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:633f35e8b1b1f640ef5f8a98dbd84f19b548222ce7ba8f017fe47ce6badc106a" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:d2f69f909da5dc52113ec66a851d62079f3d52c83184cf64beebdf12ca2f705c" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-win_amd64.whl", hash = "sha256:58c749f52ddc9098155c77d6c74153bb13d8978fd6e1063b5d7b41d4644f5af5" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fa05ac6ebed4777de7a5eff398c1f17b697c02422516748ce66a8151873e5a0e" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:78e13c26c38ae92d6841cf9ce760d7e9d52bca3e3183de371812e84274b054dc" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313t-win_amd64.whl", hash = "sha256:3559e98be824c2b12ab807319cd61c6174d73a524c9961317de8e8a44133c5c5" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp39-cp39-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:2f155388b1200e08f3e901bb3487ff93ca6d63cde87c29b97bb6762a8f63b373" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:f446f97b20cb070747b103fb640df941b88cb68c8d3b01538287d05d56a7e874" }, { url = "https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp39-cp39-win_amd64.whl", hash = "sha256:8614a167d6a163273fb130f586802f3243479862b53ee2843941c10cc5761da6" }, ]