Merge branch 'main' into patch_rust

commit bbc68748b7
Nicolas Patry, 2025-03-04 16:50:57 +01:00 (committed by GitHub)
23 changed files with 244 additions and 68 deletions

Cargo.lock (generated)

@@ -1007,9 +1007,9 @@ dependencies = [
[[package]]
name = "cxx"
version = "1.0.140"
version = "1.0.141"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc49567e08c72902f4cbc7242ee8d874ec9cbe97fbabf77b4e0e1f447513e13a"
checksum = "8bc580dceb395cae0efdde0a88f034cfd8a276897e40c693a7b87bed17971d33"
dependencies = [
"cc",
"cxxbridge-cmd",
@@ -1021,9 +1021,9 @@ dependencies = [
[[package]]
name = "cxx-build"
version = "1.0.140"
version = "1.0.141"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe46b5309c99e9775e7a338c98e4097455f52db5b684fd793ca22848fde6e371"
checksum = "49d8c1baedad72a7efda12ad8d7ad687b3e7221dfb304a12443fd69e9de8bb30"
dependencies = [
"cc",
"codespan-reporting",
@@ -1035,9 +1035,9 @@ dependencies = [
[[package]]
name = "cxxbridge-cmd"
version = "1.0.140"
version = "1.0.141"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4315c4ce8d23c26d87f2f83698725fd5718d8e6ace4a9093da2664d23294d372"
checksum = "e43afb0e3b2ef293492a31ecd796af902112460d53e5f923f7804f348a769f9c"
dependencies = [
"clap 4.5.30",
"codespan-reporting",
@@ -1048,15 +1048,15 @@ dependencies = [
[[package]]
name = "cxxbridge-flags"
version = "1.0.140"
version = "1.0.141"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f55d69deb3a92f610a60ecc524a72c7374b6dc822f8fb7bb4e5d9473f10530c4"
checksum = "0257ad2096a2474fe877e9e055ab69603851c3d6b394efcc7e0443899c2492ce"
[[package]]
name = "cxxbridge-macro"
version = "1.0.140"
version = "1.0.141"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bee7a1d9b5091462002c2b8de2a4ed0f0fde011d503cc272633f66075bd5141"
checksum = "b46cbd7358a46b760609f1cb5093683328e58ca50e594a308716f5403fdc03e5"
dependencies = [
"proc-macro2",
"quote",
@@ -1660,30 +1660,27 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b780635574b3d92f036890d8373433d6f9fc7abb320ee42a5c25897fc8ed732"
dependencies = [
"dirs",
"futures",
"indicatif",
"log",
"native-tls",
"num_cpus",
"rand 0.8.5",
"reqwest 0.11.27",
"serde",
"serde_json",
"thiserror 1.0.69",
"tokio",
"ureq",
]
[[package]]
name = "hf-hub"
version = "0.4.1"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "112fa2f6ad4ab815b9e1b938b4b1e437032d055e2f92ed10fd6ab2e62d02c6b6"
checksum = "cc03dcb0b0a83ae3f3363ec811014ae669f083e4e499c66602f447c4828737a1"
dependencies = [
"dirs",
"futures",
"http 1.2.0",
"indicatif",
"libc",
"log",
"native-tls",
"num_cpus",
@@ -1694,6 +1691,7 @@ dependencies = [
"thiserror 2.0.11",
"tokio",
"ureq",
"windows-sys 0.59.0",
]
[[package]]
@@ -4619,7 +4617,7 @@ dependencies = [
[[package]]
name = "text-generation-backends-trtllm"
version = "3.1.1-dev0"
version = "3.1.2-dev0"
dependencies = [
"async-trait",
"clap 4.5.30",
@@ -4627,7 +4625,7 @@ dependencies = [
"cxx",
"cxx-build",
"hashbrown 0.15.2",
"hf-hub 0.3.2",
"hf-hub 0.4.2",
"pkg-config",
"pyo3",
"text-generation-router",
@@ -4640,12 +4638,12 @@ dependencies = [
[[package]]
name = "text-generation-benchmark"
version = "3.1.1-dev0"
version = "3.1.2-dev0"
dependencies = [
"average",
"clap 4.5.30",
"float-ord",
"hf-hub 0.3.2",
"hf-hub 0.4.2",
"ratatui",
"serde",
"serde_json",
@@ -4660,7 +4658,7 @@ dependencies = [
[[package]]
name = "text-generation-client"
version = "3.1.1-dev0"
version = "3.1.2-dev0"
dependencies = [
"async-trait",
"base64 0.22.1",
@@ -4678,12 +4676,12 @@ dependencies = [
[[package]]
name = "text-generation-launcher"
version = "3.1.1-dev0"
version = "3.1.2-dev0"
dependencies = [
"clap 4.5.30",
"ctrlc",
"float_eq",
"hf-hub 0.4.1",
"hf-hub 0.4.2",
"nix 0.28.0",
"once_cell",
"pyo3",
@@ -4699,7 +4697,7 @@ dependencies = [
[[package]]
name = "text-generation-router"
version = "3.1.1-dev0"
version = "3.1.2-dev0"
dependencies = [
"anyhow",
"async-stream",
@@ -4712,7 +4710,7 @@ dependencies = [
"csv",
"futures",
"futures-util",
"hf-hub 0.3.2",
"hf-hub 0.4.2",
"image",
"init-tracing-opentelemetry",
"itertools 0.10.5",
@@ -4751,7 +4749,7 @@ dependencies = [
[[package]]
name = "text-generation-router-llamacpp"
version = "3.1.1-dev0"
version = "3.1.2-dev0"
dependencies = [
"async-trait",
"bindgen 0.71.1",
@@ -4768,7 +4766,7 @@ dependencies = [
[[package]]
name = "text-generation-router-v2"
version = "3.1.1-dev0"
version = "3.1.2-dev0"
dependencies = [
"async-stream",
"async-trait",
@@ -4779,7 +4777,7 @@ dependencies = [
"futures",
"futures-util",
"grpc-metadata",
"hf-hub 0.3.2",
"hf-hub 0.4.2",
"image",
"init-tracing-opentelemetry",
"jsonschema",
@@ -4817,7 +4815,7 @@ dependencies = [
[[package]]
name = "text-generation-router-v3"
version = "3.1.1-dev0"
version = "3.1.2-dev0"
dependencies = [
"async-stream",
"async-trait",
@@ -4829,7 +4827,7 @@ dependencies = [
"futures",
"futures-util",
"grpc-metadata",
"hf-hub 0.3.2",
"hf-hub 0.4.2",
"image",
"init-tracing-opentelemetry",
"itertools 0.13.0",
@@ -4847,6 +4845,7 @@ dependencies = [
"rand 0.8.5",
"regex",
"reqwest 0.11.27",
"rustc-hash 2.1.1",
"serde",
"serde_json",
"slotmap",


@@ -21,7 +21,7 @@ default-members = [
resolver = "2"
[workspace.package]
version = "3.1.1-dev0"
version = "3.1.2-dev0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"
@@ -29,7 +29,7 @@ homepage = "https://github.com/huggingface/text-generation-inference"
[workspace.dependencies]
base64 = "0.22.0"
tokenizers = { version = "0.20.0", features = ["http"] }
hf-hub = { version = "0.3.1", features = ["tokio"] }
hf-hub = { version = "0.4.1", features = ["tokio"] }
metrics = { version = "0.23.0" }
metrics-exporter-prometheus = { version = "0.15.1", features = [] }
minijinja = { version = "2.2.0", features = ["json"] }


@@ -18,7 +18,8 @@ RUN apt-get update -y \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.85.0 --profile minimal -y
COPY rust-toolchain.toml rust-toolchain.toml
RUN curl -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain none
ENV PATH="/root/.cargo/bin:${PATH}"
RUN cargo install cargo-chef --locked


@@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta
volume=$PWD/data
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model
ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model
```
And then you can make requests like
@@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \
**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0-rocm --model-id $model` instead of the command above.
**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1-rocm --model-id $model` instead of the command above.
To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
```
@@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
token=<your cli READ token>
docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model
ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model
```
### A note on Shared Memory (shm)


@@ -22,7 +22,7 @@ homepage = "https://github.com/huggingface/text-generation-inference"
[workspace.dependencies]
base64 = "0.22.0"
tokenizers = { version = "0.20.0", features = ["http"] }
hf-hub = { version = "0.3.1", features = ["tokio"] }
hf-hub = { version = "0.4.2", features = ["tokio"] }
metrics = { version = "0.23.0" }
metrics-exporter-prometheus = { version = "0.15.1", features = [] }
minijinja = { version = "2.2.0", features = ["json"] }


@@ -86,6 +86,10 @@ async fn get_tokenizer(tokenizer_name: &str, revision: Option<&str>) -> Option<T
builder = builder.with_cache_dir(cache_dir.into());
}
if let Ok(origin) = std::env::var("HF_HUB_USER_AGENT_ORIGIN") {
builder = builder.with_user_agent("origin", origin.as_str());
}
builder
};
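Several files in this commit add the same `HF_HUB_USER_AGENT_ORIGIN` wiring to their hf-hub client setup. Below is a minimal, self-contained sketch of the pattern, assuming hf-hub 0.4's sync `ApiBuilder`; the function name and the `config.json` lookup are illustrative, not taken from the diff:

```rust
use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};

fn hub_config_path(
    model_id: String,
    revision: Option<String>,
) -> Result<std::path::PathBuf, Box<dyn std::error::Error>> {
    // The HF_TOKEN environment variable takes precedence over the on-disk token.
    let mut builder = if let Ok(token) = std::env::var("HF_TOKEN") {
        ApiBuilder::new().with_token(Some(token))
    } else {
        ApiBuilder::new()
    };
    // Optional origin tag forwarded in the client's user agent.
    if let Ok(origin) = std::env::var("HF_HUB_USER_AGENT_ORIGIN") {
        builder = builder.with_user_agent("origin", origin.as_str());
    }
    let api = builder.build()?;
    let repo = match revision {
        Some(revision) => api.repo(Repo::with_revision(model_id, RepoType::Model, revision)),
        None => api.repo(Repo::new(model_id, RepoType::Model)),
    };
    // Illustrative download; the real call sites in this diff fetch different files.
    Ok(repo.get("config.json")?)
}
```

The builder calls (`with_token`, `with_user_agent`, `build`) mirror the ones visible in the router and launcher hunks below.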


@@ -71,6 +71,7 @@ prost-build = "0.12.1"
[dev-dependencies]
criterion = "0.3"
itertools = "0.13"
rustc-hash = "2"
[features]
default = ["ngrok"]


@@ -283,7 +283,7 @@ impl RadixTrie {
}
/// Find worker.
fn find_(&mut self, mut node_id: NodeId, key: &[u32], blocks: &mut Vec<u32>) -> NodeId {
fn find_(&mut self, node_id: NodeId, key: &[u32], blocks: &mut Vec<u32>) -> NodeId {
let node = &self.nodes[node_id];
if key.len() >= self.block_size {
@@ -295,9 +295,13 @@ impl RadixTrie {
assert_eq!(shared_prefix_len % self.block_size, 0);
blocks.extend(&child.blocks[..shared_prefix_len / self.block_size]);
// A node represents the prefix of its children. So, only
// recurse when there is a full prefix match.
let key = &key[shared_prefix_len..];
if !key.is_empty() {
node_id = self.find_(child_id, key, blocks);
if !key.is_empty() && shared_prefix_len == child.key.len() {
return self.find_(child_id, key, blocks);
} else {
return child_id;
}
}
}
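The hunk above makes `find_` descend only when a child's key is matched in full; on a partial match the lookup now stops at that child instead of recursing. A minimal sketch of that rule on a simplified trie (the types and fields here are illustrative, not the actual `RadixTrie`):

```rust
use std::collections::HashMap;

struct Node {
    key: Vec<u32>,                 // token span this node represents
    children: HashMap<u32, usize>, // first token of a child's key -> node index
}

struct Trie {
    nodes: Vec<Node>, // nodes[0] is the root with an empty key
}

impl Trie {
    // Walk down from `node_id`, consuming `key`, and return the deepest node
    // reached: either a fully matched descendant or a partially matched child.
    fn find(&self, node_id: usize, key: &[u32]) -> usize {
        let node = &self.nodes[node_id];
        if let Some(&child_id) = key.first().and_then(|t| node.children.get(t)) {
            let child = &self.nodes[child_id];
            let shared = child.key.iter().zip(key).take_while(|(a, b)| a == b).count();
            let rest = &key[shared..];
            // Only recurse when the child's key matched in full; a partial
            // match means the lookup ends at this child (the fix in this hunk).
            if !rest.is_empty() && shared == child.key.len() {
                return self.find(child_id, rest);
            }
            return child_id;
        }
        node_id
    }
}
```

With this rule, looking up `[0, 1, 3, 4, 5]` against a stored `[0, 1, 2]` stops at the partially matched child, which is what the new `partial_match_does_not_recurse` test below exercises.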
@@ -631,6 +635,12 @@ fn shared_prefix(left: &[u32], right: &[u32], block_size: usize) -> usize {
mod tests {
use std::sync::Arc;
use rand::{
distributions::Uniform, prelude::Distribution, rngs::SmallRng, seq::SliceRandom,
SeedableRng,
};
use rustc_hash::FxHashSet;
use super::*;
#[test]
@@ -873,4 +883,159 @@ mod tests {
// Clear out the whole trie.
assert_eq!(trie.evict(10), vec![1, 2, 3, 0, 1]);
}
#[test]
fn full_match_returns_correct_node() {
let mut trie = RadixTrie::new(1);
trie.insert(&[0, 1, 2], &[0, 1, 2]).unwrap();
let node_id = trie.find(&[0, 1, 2], &mut vec![]);
// At this point, there are only two nodes: the root and the node
// with tokens 0, 1, 2. Looking up the exact prefix must return
// the non-root node.
assert_ne!(node_id, trie.root);
}
#[test]
fn partial_match_does_not_recurse() {
let mut trie = RadixTrie::new(1);
trie.insert(&[0, 1, 2], &[0, 1, 2]).unwrap();
trie.insert(&[0, 1, 2, 3, 4, 5], &[0, 1, 2, 3, 4, 5])
.unwrap();
let mut blocks = Vec::new();
let node_id = trie.find(&[0, 1, 3, 4, 5], &mut blocks);
assert_eq!(blocks, vec![0, 1]);
assert_eq!(node_id, trie.find(&[0, 1], &mut blocks))
}
struct AllocationWithInfo {
allocation: BlockAllocation,
// We are doing a lot of set operations and `FxBuildHasher` is
// much faster for a set of integers.
blockset: FxHashSet<u32>,
non_prefix_blocks: FxHashSet<u32>,
}
#[test]
fn invariants_hold_on_many_operations_remove_all() {
invariants_hold_on_many_insertions(true);
}
#[test]
fn invariants_hold_on_many_operations_remove_subset() {
invariants_hold_on_many_insertions(false);
}
fn invariants_hold_on_many_insertions(remove_all: bool) {
// Small vocabulary sizes lead to violations more quickly due to
// prefix sharing, etc.
const VOCAB_SIZE: u32 = 2;
const DATA_LEN: usize = 1_000;
const MAX_PREFILL_LEN: usize = 8;
const MAX_DECODE_LEN: usize = 8;
let vocab_range = Uniform::new(0, VOCAB_SIZE);
let data_range = Uniform::new(0, DATA_LEN);
let prefill_len_range = Uniform::new(0, MAX_PREFILL_LEN);
let decode_len_range = Uniform::new(0, MAX_DECODE_LEN);
let mut rng = SmallRng::seed_from_u64(64);
let data = (0..DATA_LEN)
.map(|_| vocab_range.sample(&mut rng))
.collect::<Vec<_>>();
let mut allocator = RadixAllocator::new(1, 100, None);
let mut allocations = Vec::new();
for i in 0..100_000 {
// Allocate until all blocks are used.
'allocation: loop {
// Pick a random offset into the shared data so that prefixes are shared between allocations.
let prefill_offset = data_range.sample(&mut rng);
let prefill_len = prefill_len_range.sample(&mut rng);
let decode_len = decode_len_range.sample(&mut rng);
let prefill =
data[prefill_offset..data.len().min(prefill_offset + prefill_len)].to_vec();
let allocation = match allocator
.allocate((prefill.len() + decode_len) as u32, Some(Arc::new(prefill)))
{
Some(allocation) => allocation,
None => break 'allocation,
};
let non_prefix_blocks = allocation.blocks[allocation.prefix_len as usize..]
.iter()
.copied()
.collect::<FxHashSet<_>>();
let blockset = allocation.blocks.iter().copied().collect::<FxHashSet<_>>();
// No duplicate blocks in an allocation.
assert_eq!(
allocation.blocks.len(),
blockset.len(),
"Duplicate blocks in allocation"
);
allocations.push(AllocationWithInfo {
allocation,
blockset,
non_prefix_blocks,
});
}
// Check invariants. Skip first iteration, since there is no prefix sharing yet.
if i > 1 {
check_allocation_invariants(&allocations);
}
// Remove 20% of the allocations, randomly.
if remove_all {
allocations.into_iter().for_each(|allocation| {
allocator.free(
allocation.allocation.blocks.clone(),
allocation.allocation.allocation_id,
)
});
allocations = Vec::new();
} else {
allocations.shuffle(&mut rng);
let remove_index = (allocations.len() as f64 * 0.8) as usize;
for allocation in allocations.drain(remove_index..) {
allocator.free(
allocation.allocation.blocks.clone(),
allocation.allocation.allocation_id,
);
}
}
}
}
fn check_allocation_invariants(allocations: &[AllocationWithInfo]) {
for i in 0..allocations.len() {
let allocation = &allocations[i];
// 0 is used for health checks, must not be used.
assert!(
!allocation.blockset.contains(&0),
"Block 0 must not be allocated"
);
// No duplicate blocks in an allocation.
assert_eq!(
allocation.allocation.blocks.len(),
allocation.blockset.len(),
"Duplicate blocks in allocation"
);
for other_allocation in &allocations[i + 1..] {
assert!(
other_allocation
.non_prefix_blocks
.is_disjoint(&allocation.non_prefix_blocks),
"Allocations share non-prefix blocks"
)
}
}
}
}


@@ -10,7 +10,7 @@
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
"version": "3.1.1-dev0"
"version": "3.1.2-dev0"
},
"paths": {
"/": {


@@ -31,7 +31,7 @@ deployment instructions in the model card:
The service is launched simply by running the text-generation-inference container with two sets of parameters:
```
docker run <system_parameters> ghcr.io/huggingface/text-generation-inference:3.1.0-neuron <service_parameters>
docker run <system_parameters> ghcr.io/huggingface/text-generation-inference:3.1.1-neuron <service_parameters>
```
- system parameters are used to map ports, volumes and devices between the host and the service,


@@ -19,6 +19,6 @@ docker run --gpus all \
--shm-size 1g \
-e HF_TOKEN=$token \
-p 8080:80 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 \
--model-id $model
```


@@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models.
In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model --quantize bitsandbytes
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize bitsandbytes
```
4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
@@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model --quantize bitsandbytes-nf4
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize bitsandbytes-nf4
```
You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
@@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$
TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model --quantize gptq
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize gptq
```
Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.


@@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
--device=/dev/kfd --device=/dev/dri --group-add video \
--ipc=host --shm-size 256g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.1.0-rocm \
ghcr.io/huggingface/text-generation-inference:3.1.1-rocm \
--model-id $model
```


@@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm --privileged --cap-add=sys_nice \
--device=/dev/dri \
--ipc=host --shm-size 1g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.1.0-intel-xpu \
ghcr.io/huggingface/text-generation-inference:3.1.1-intel-xpu \
--model-id $model --cuda-graphs 0
```
@@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm --privileged --cap-add=sys_nice \
--device=/dev/dri \
--ipc=host --shm-size 1g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.1.0-intel-cpu \
ghcr.io/huggingface/text-generation-inference:3.1.1-intel-cpu \
--model-id $model --cuda-graphs 0
```


@@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.1.0 \
ghcr.io/huggingface/text-generation-inference:3.1.1 \
--model-id $model
```


@@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.1.0 \
ghcr.io/huggingface/text-generation-inference:3.1.1 \
--model-id $model
```
@@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \
To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.
```bash
docker run ghcr.io/huggingface/text-generation-inference:3.1.0 --help
docker run ghcr.io/huggingface/text-generation-inference:3.1.1 --help
```
</Tip>


@@ -163,7 +163,7 @@ hub = {
# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
image_uri=get_huggingface_llm_image_uri("huggingface",version="3.1.0"),
image_uri=get_huggingface_llm_image_uri("huggingface",version="3.1.1"),
env=hub,
role=role,
)


@@ -9,7 +9,7 @@ homepage.workspace = true
[dependencies]
clap = { version = "4.4.5", features = ["derive", "env"] }
ctrlc = { version = "3.4.1", features = ["termination"] }
hf-hub = "0.4.1"
hf-hub = "0.4.2"
nix = { version = "0.28.0", features = ["signal"] }
once_cell = "1.19.0"
pyo3 = { workspace = true }


@@ -1,8 +1,5 @@
use clap::{Parser, ValueEnum};
use hf_hub::{
api::sync::{Api, ApiBuilder},
Repo, RepoType,
};
use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};
use nix::sys::signal::{self, Signal};
use nix::unistd::Pid;
use serde::Deserialize;
@@ -100,12 +97,16 @@ fn get_config(
let filename = if !path.exists() {
// Assume it's a hub id
let api = if let Ok(token) = std::env::var("HF_TOKEN") {
let mut builder = if let Ok(token) = std::env::var("HF_TOKEN") {
// env variable has precedence over on file token.
ApiBuilder::new().with_token(Some(token)).build()?
ApiBuilder::new().with_token(Some(token))
} else {
Api::new()?
ApiBuilder::new()
};
if let Ok(origin) = env::var("HF_HUB_USER_AGENT_ORIGIN") {
builder = builder.with_user_agent("origin", origin.as_str());
}
let api = builder.build()?;
let repo = if let Some(ref revision) = revision {
api.repo(Repo::with_revision(
model_id,


@@ -1719,6 +1719,10 @@ pub async fn run(
builder = builder.with_cache_dir(cache_dir.into());
}
if let Ok(origin) = std::env::var("HF_HUB_USER_AGENT_ORIGIN") {
builder = builder.with_user_agent("origin", origin.as_str());
}
builder
};


@@ -31,7 +31,8 @@ dependencies = [
"sentencepiece>=0.2.0",
"tokenizers>=0.20.3",
"typer>=0.15.1",
"transformers>=4.48.0"
"transformers>=4.48.0",
"huggingface-hub>=0.29.0",
]
[build-system]


@@ -720,7 +720,7 @@ wheels = [
[[package]]
name = "huggingface-hub"
version = "0.28.1"
version = "0.29.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "filelock" },
@@ -731,9 +731,9 @@ dependencies = [
{ name = "tqdm" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/e7/ce/a734204aaae6c35a22f9956ebcd8d8708ae5b842e15d6f42bd6f49e634a4/huggingface_hub-0.28.1.tar.gz", hash = "sha256:893471090c98e3b6efbdfdacafe4052b20b84d59866fb6f54c33d9af18c303ae", size = 387074 }
sdist = { url = "https://files.pythonhosted.org/packages/22/37/797d6476f13e5ef6af5fc48a5d641d32b39c37e166ccf40c3714c5854a85/huggingface_hub-0.29.1.tar.gz", hash = "sha256:9524eae42077b8ff4fc459ceb7a514eca1c1232b775276b009709fe2a084f250", size = 389776 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ea/da/6c2bea5327b640920267d3bf2c9fc114cfbd0a5de234d81cda80cc9e33c8/huggingface_hub-0.28.1-py3-none-any.whl", hash = "sha256:aa6b9a3ffdae939b72c464dbb0d7f99f56e649b55c3d52406f49e0a5a620c0a7", size = 464068 },
{ url = "https://files.pythonhosted.org/packages/ae/05/75b90de9093de0aadafc868bb2fa7c57651fd8f45384adf39bd77f63980d/huggingface_hub-0.29.1-py3-none-any.whl", hash = "sha256:352f69caf16566c7b6de84b54a822f6238e17ddd8ae3da4f8f2272aea5b198d5", size = 468049 },
]
[[package]]
@@ -2563,6 +2563,7 @@ dependencies = [
{ name = "grpcio-status" },
{ name = "hf-kernels" },
{ name = "hf-transfer" },
{ name = "huggingface-hub" },
{ name = "loguru" },
{ name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
{ name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
@@ -2627,6 +2628,7 @@ requires-dist = [
{ name = "grpcio-tools", marker = "extra == 'gen'", specifier = ">=1.69.0" },
{ name = "hf-kernels", specifier = ">=0.1.5" },
{ name = "hf-transfer", specifier = ">=0.1.8" },
{ name = "huggingface-hub", specifier = ">=0.29.0" },
{ name = "loguru", specifier = ">=0.7.3" },
{ name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" },
{ name = "numpy", specifier = ">=1.26,<3" },


@@ -142,14 +142,12 @@ def check_openapi(check: bool):
with open(tmp_filename, "w") as f:
json.dump(new_openapi_data, f, indent=2)
f.write("\n")
if check:
diff = subprocess.run(
[
"diff",
# allow for trailing whitespace since it's not significant
# and the precommit hook will remove it
"--ignore-trailing-space",
tmp_filename,
filename,
],