diff --git a/Cargo.lock b/Cargo.lock index 4603f77d..0c28b285 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1007,9 +1007,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.140" +version = "1.0.141" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc49567e08c72902f4cbc7242ee8d874ec9cbe97fbabf77b4e0e1f447513e13a" +checksum = "8bc580dceb395cae0efdde0a88f034cfd8a276897e40c693a7b87bed17971d33" dependencies = [ "cc", "cxxbridge-cmd", @@ -1021,9 +1021,9 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.140" +version = "1.0.141" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe46b5309c99e9775e7a338c98e4097455f52db5b684fd793ca22848fde6e371" +checksum = "49d8c1baedad72a7efda12ad8d7ad687b3e7221dfb304a12443fd69e9de8bb30" dependencies = [ "cc", "codespan-reporting", @@ -1035,9 +1035,9 @@ dependencies = [ [[package]] name = "cxxbridge-cmd" -version = "1.0.140" +version = "1.0.141" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4315c4ce8d23c26d87f2f83698725fd5718d8e6ace4a9093da2664d23294d372" +checksum = "e43afb0e3b2ef293492a31ecd796af902112460d53e5f923f7804f348a769f9c" dependencies = [ "clap 4.5.30", "codespan-reporting", @@ -1048,15 +1048,15 @@ dependencies = [ [[package]] name = "cxxbridge-flags" -version = "1.0.140" +version = "1.0.141" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f55d69deb3a92f610a60ecc524a72c7374b6dc822f8fb7bb4e5d9473f10530c4" +checksum = "0257ad2096a2474fe877e9e055ab69603851c3d6b394efcc7e0443899c2492ce" [[package]] name = "cxxbridge-macro" -version = "1.0.140" +version = "1.0.141" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bee7a1d9b5091462002c2b8de2a4ed0f0fde011d503cc272633f66075bd5141" +checksum = "b46cbd7358a46b760609f1cb5093683328e58ca50e594a308716f5403fdc03e5" dependencies = [ "proc-macro2", "quote", @@ -1660,30 +1660,27 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b780635574b3d92f036890d8373433d6f9fc7abb320ee42a5c25897fc8ed732" dependencies = [ "dirs", - "futures", "indicatif", "log", "native-tls", - "num_cpus", "rand 0.8.5", - "reqwest 0.11.27", "serde", "serde_json", "thiserror 1.0.69", - "tokio", "ureq", ] [[package]] name = "hf-hub" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "112fa2f6ad4ab815b9e1b938b4b1e437032d055e2f92ed10fd6ab2e62d02c6b6" +checksum = "cc03dcb0b0a83ae3f3363ec811014ae669f083e4e499c66602f447c4828737a1" dependencies = [ "dirs", "futures", "http 1.2.0", "indicatif", + "libc", "log", "native-tls", "num_cpus", @@ -1694,6 +1691,7 @@ dependencies = [ "thiserror 2.0.11", "tokio", "ureq", + "windows-sys 0.59.0", ] [[package]] @@ -4619,7 +4617,7 @@ dependencies = [ [[package]] name = "text-generation-backends-trtllm" -version = "3.1.1-dev0" +version = "3.1.2-dev0" dependencies = [ "async-trait", "clap 4.5.30", @@ -4627,7 +4625,7 @@ dependencies = [ "cxx", "cxx-build", "hashbrown 0.15.2", - "hf-hub 0.3.2", + "hf-hub 0.4.2", "pkg-config", "pyo3", "text-generation-router", @@ -4640,12 +4638,12 @@ dependencies = [ [[package]] name = "text-generation-benchmark" -version = "3.1.1-dev0" +version = "3.1.2-dev0" dependencies = [ "average", "clap 4.5.30", "float-ord", - "hf-hub 0.3.2", + "hf-hub 0.4.2", "ratatui", "serde", "serde_json", @@ -4660,7 +4658,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "3.1.1-dev0" +version = "3.1.2-dev0" dependencies = [ 
"async-trait", "base64 0.22.1", @@ -4678,12 +4676,12 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "3.1.1-dev0" +version = "3.1.2-dev0" dependencies = [ "clap 4.5.30", "ctrlc", "float_eq", - "hf-hub 0.4.1", + "hf-hub 0.4.2", "nix 0.28.0", "once_cell", "pyo3", @@ -4699,7 +4697,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "3.1.1-dev0" +version = "3.1.2-dev0" dependencies = [ "anyhow", "async-stream", @@ -4712,7 +4710,7 @@ dependencies = [ "csv", "futures", "futures-util", - "hf-hub 0.3.2", + "hf-hub 0.4.2", "image", "init-tracing-opentelemetry", "itertools 0.10.5", @@ -4751,7 +4749,7 @@ dependencies = [ [[package]] name = "text-generation-router-llamacpp" -version = "3.1.1-dev0" +version = "3.1.2-dev0" dependencies = [ "async-trait", "bindgen 0.71.1", @@ -4768,7 +4766,7 @@ dependencies = [ [[package]] name = "text-generation-router-v2" -version = "3.1.1-dev0" +version = "3.1.2-dev0" dependencies = [ "async-stream", "async-trait", @@ -4779,7 +4777,7 @@ dependencies = [ "futures", "futures-util", "grpc-metadata", - "hf-hub 0.3.2", + "hf-hub 0.4.2", "image", "init-tracing-opentelemetry", "jsonschema", @@ -4817,7 +4815,7 @@ dependencies = [ [[package]] name = "text-generation-router-v3" -version = "3.1.1-dev0" +version = "3.1.2-dev0" dependencies = [ "async-stream", "async-trait", @@ -4829,7 +4827,7 @@ dependencies = [ "futures", "futures-util", "grpc-metadata", - "hf-hub 0.3.2", + "hf-hub 0.4.2", "image", "init-tracing-opentelemetry", "itertools 0.13.0", @@ -4847,6 +4845,7 @@ dependencies = [ "rand 0.8.5", "regex", "reqwest 0.11.27", + "rustc-hash 2.1.1", "serde", "serde_json", "slotmap", diff --git a/Cargo.toml b/Cargo.toml index df7f2a73..4e3ad010 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ default-members = [ resolver = "2" [workspace.package] -version = "3.1.1-dev0" +version = "3.1.2-dev0" edition = "2021" authors = ["Olivier Dehaene"] homepage = "https://github.com/huggingface/text-generation-inference" @@ -29,7 +29,7 @@ homepage = "https://github.com/huggingface/text-generation-inference" [workspace.dependencies] base64 = "0.22.0" tokenizers = { version = "0.20.0", features = ["http"] } -hf-hub = { version = "0.3.1", features = ["tokio"] } +hf-hub = { version = "0.4.1", features = ["tokio"] } metrics = { version = "0.23.0" } metrics-exporter-prometheus = { version = "0.15.1", features = [] } minijinja = { version = "2.2.0", features = ["json"] } diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 5a22fab3..272407c7 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -18,7 +18,8 @@ RUN apt-get update -y \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.85.0 --profile minimal -y +COPY rust-toolchain.toml rust-toolchain.toml +RUN curl -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain none ENV PATH="/root/.cargo/bin:${PATH}" RUN cargo install cargo-chef --locked diff --git a/README.md b/README.md index b344892e..a24ed5f1 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta volume=$PWD/data docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model ``` And then you can make requests like @@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \ **Note:** To use NVIDIA GPUs, you need to install 
the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar. -**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0-rocm --model-id $model` instead of the command above. +**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1-rocm --model-id $model` instead of the command above. To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli): ``` @@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading token= docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model ``` ### A note on Shared Memory (shm) diff --git a/backends/neuron/Cargo.toml b/backends/neuron/Cargo.toml index 3b237eda..72f92e69 100644 --- a/backends/neuron/Cargo.toml +++ b/backends/neuron/Cargo.toml @@ -22,7 +22,7 @@ homepage = "https://github.com/huggingface/text-generation-inference" [workspace.dependencies] base64 = "0.22.0" tokenizers = { version = "0.20.0", features = ["http"] } -hf-hub = { version = "0.3.1", features = ["tokio"] } +hf-hub = { version = "0.4.2", features = ["tokio"] } metrics = { version = "0.23.0" } metrics-exporter-prometheus = { version = "0.15.1", features = [] } minijinja = { version = "2.2.0", features = ["json"] } diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index cef225be..9d4bf8f2 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -86,6 +86,10 @@ async fn get_tokenizer(tokenizer_name: &str, revision: Option<&str>) -> Option<Tokenizer> { diff --git a/backends/v3/src/radix.rs b/backends/v3/src/radix.rs --- a/backends/v3/src/radix.rs +++ b/backends/v3/src/radix.rs - fn find_(&mut self, mut node_id: NodeId, key: &[u32], blocks: &mut Vec<u32>) -> NodeId { + fn find_(&mut self, node_id: NodeId, key: &[u32], blocks: &mut Vec<u32>) -> NodeId { let node = &self.nodes[node_id]; if key.len() >= self.block_size { @@ -295,9 +295,13 @@ impl RadixTrie { assert_eq!(shared_prefix_len % self.block_size, 0); blocks.extend(&child.blocks[..shared_prefix_len / self.block_size]); + // A node represents the prefix of its children. So, only + // recurse when there is a full prefix match.
let key = &key[shared_prefix_len..]; - if !key.is_empty() { - node_id = self.find_(child_id, key, blocks); + if !key.is_empty() && shared_prefix_len == child.key.len() { + return self.find_(child_id, key, blocks); + } else { + return child_id; + } } } @@ -631,6 +635,12 @@ fn shared_prefix(left: &[u32], right: &[u32], block_size: usize) -> usize { mod tests { use std::sync::Arc; + use rand::{ + distributions::Uniform, prelude::Distribution, rngs::SmallRng, seq::SliceRandom, + SeedableRng, + }; + use rustc_hash::FxHashSet; + use super::*; #[test] @@ -873,4 +883,159 @@ mod tests { // Clear out the whole trie. assert_eq!(trie.evict(10), vec![1, 2, 3, 0, 1]); } + + #[test] + fn full_match_returns_correct_node() { + let mut trie = RadixTrie::new(1); + trie.insert(&[0, 1, 2], &[0, 1, 2]).unwrap(); + let node_id = trie.find(&[0, 1, 2], &mut vec![]); + // At this point, there are only two nodes: the root and the node + // with tokens 0, 1, 2. Looking up the exact prefix must return + // the non-root node. + assert_ne!(node_id, trie.root); + } + + #[test] + fn partial_match_does_not_recurse() { + let mut trie = RadixTrie::new(1); + trie.insert(&[0, 1, 2], &[0, 1, 2]).unwrap(); + trie.insert(&[0, 1, 2, 3, 4, 5], &[0, 1, 2, 3, 4, 5]) + .unwrap(); + let mut blocks = Vec::new(); + let node_id = trie.find(&[0, 1, 3, 4, 5], &mut blocks); + assert_eq!(blocks, vec![0, 1]); + assert_eq!(node_id, trie.find(&[0, 1], &mut blocks)) + } + + struct AllocationWithInfo { + allocation: BlockAllocation, + // We are doing a lot of set operations and `FxBuildHasher` is + // much faster for a set of integers. + blockset: FxHashSet<u32>, + non_prefix_blocks: FxHashSet<u32>, + } + + #[test] + fn invariants_hold_on_many_operations_remove_all() { + invariants_hold_on_many_insertions(true); + } + + #[test] + fn invariants_hold_on_many_operations_remove_subset() { + invariants_hold_on_many_insertions(false); + } + + fn invariants_hold_on_many_insertions(remove_all: bool) { + // Small vocabulary sizes lead to violations more quickly due to + // prefix sharing, etc. + const VOCAB_SIZE: u32 = 2; + const DATA_LEN: usize = 1_000; + + const MAX_PREFILL_LEN: usize = 8; + const MAX_DECODE_LEN: usize = 8; + + let vocab_range = Uniform::new(0, VOCAB_SIZE); + let data_range = Uniform::new(0, DATA_LEN); + let prefill_len_range = Uniform::new(0, MAX_PREFILL_LEN); + let decode_len_range = Uniform::new(0, MAX_DECODE_LEN); + + let mut rng = SmallRng::seed_from_u64(64); + let data = (0..DATA_LEN) + .map(|_| vocab_range.sample(&mut rng)) + .collect::<Vec<u32>>(); + let mut allocator = RadixAllocator::new(1, 100, None); + + let mut allocations = Vec::new(); + + for i in 0..100_000 { + // Allocate until all blocks are used. + 'allocation: loop { + // Use offset 0 half of the times for prefix sharing. + let prefill_offset = data_range.sample(&mut rng); + let prefill_len = prefill_len_range.sample(&mut rng); + let decode_len = decode_len_range.sample(&mut rng); + + let prefill = + data[prefill_offset..data.len().min(prefill_offset + prefill_len)].to_vec(); + + let allocation = match allocator + .allocate((prefill.len() + decode_len) as u32, Some(Arc::new(prefill))) + { + Some(allocation) => allocation, + None => break 'allocation, + }; + let non_prefix_blocks = allocation.blocks[allocation.prefix_len as usize..] + .iter() + .copied() + .collect::<FxHashSet<_>>(); + let blockset = allocation.blocks.iter().copied().collect::<FxHashSet<_>>(); + + // No duplicate blocks in an allocation.
+ assert_eq!( + allocation.blocks.len(), + blockset.len(), + "Duplicate blocks in allocation" + ); + + allocations.push(AllocationWithInfo { + allocation, + blockset, + non_prefix_blocks, + }); + } + + // Check invariants. Skip first iteration, since there is no prefix sharing yet. + if i > 1 { + check_allocation_invariants(&allocations); + } + + // Remove 20% of the allocations, randomly. + if remove_all { + allocations.into_iter().for_each(|allocation| { + allocator.free( + allocation.allocation.blocks.clone(), + allocation.allocation.allocation_id, + ) + }); + allocations = Vec::new(); + } else { + allocations.shuffle(&mut rng); + let remove_index = (allocations.len() as f64 * 0.8) as usize; + for allocation in allocations.drain(remove_index..) { + allocator.free( + allocation.allocation.blocks.clone(), + allocation.allocation.allocation_id, + ); + } + } + } + } + + fn check_allocation_invariants(allocations: &[AllocationWithInfo]) { + for i in 0..allocations.len() { + let allocation = &allocations[i]; + + // 0 is used for health checks, must not be used. + assert!( + !allocation.blockset.contains(&0), + "Block 0 must not be allocated" + ); + + // No duplicate blocks in an allocation. + assert_eq!( + allocation.allocation.blocks.len(), + allocation.blockset.len(), + "Duplicate blocks in allocation" + ); + + for other_allocation in &allocations[i + 1..] { + assert!( + other_allocation + .non_prefix_blocks + .is_disjoint(&allocation.non_prefix_blocks), + "Allocations share non-prefix blocks" + ) + } + } + } } diff --git a/docs/openapi.json b/docs/openapi.json index 9de76e47..e16ca7f9 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -10,7 +10,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, - "version": "3.1.1-dev0" + "version": "3.1.2-dev0" }, "paths": { "/": { diff --git a/docs/source/backends/neuron.md b/docs/source/backends/neuron.md index 8e80701f..e7ba8873 100644 --- a/docs/source/backends/neuron.md +++ b/docs/source/backends/neuron.md @@ -31,7 +31,7 @@ deployment instructions in the model card: The service is launched simply by running the text-generation-inference container with two sets of parameters: ``` -docker run ghcr.io/huggingface/text-generation-inference:3.1.0-neuron +docker run ghcr.io/huggingface/text-generation-inference:3.1.1-neuron ``` - system parameters are used to map ports, volumes and devices between the host and the service, diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md index 949aab56..2cbd7e06 100644 --- a/docs/source/basic_tutorials/gated_model_access.md +++ b/docs/source/basic_tutorials/gated_model_access.md @@ -19,6 +19,6 @@ docker run --gpus all \ --shm-size 1g \ -e HF_TOKEN=$token \ -p 8080:80 \ - -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 \ + -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 \ --model-id $model ``` diff --git a/docs/source/conceptual/quantization.md b/docs/source/conceptual/quantization.md index 449cc79b..77a64a88 100644 --- a/docs/source/conceptual/quantization.md +++ b/docs/source/conceptual/quantization.md @@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. 
In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model --quantize bitsandbytes +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize bitsandbytes ``` 4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load. @@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model --quantize bitsandbytes-nf4 +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize bitsandbytes-nf4 ``` You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes). @@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$ TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model --quantize gptq +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize gptq ``` Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI. 
diff --git a/docs/source/installation_amd.md b/docs/source/installation_amd.md index 100bc2a9..20ef26a8 100644 --- a/docs/source/installation_amd.md +++ b/docs/source/installation_amd.md @@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --device=/dev/kfd --device=/dev/dri --group-add video \ --ipc=host --shm-size 256g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.0-rocm \ + ghcr.io/huggingface/text-generation-inference:3.1.1-rocm \ --model-id $model ``` diff --git a/docs/source/installation_intel.md b/docs/source/installation_intel.md index b2279bb4..a0bf11d1 100644 --- a/docs/source/installation_intel.md +++ b/docs/source/installation_intel.md @@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.0-intel-xpu \ + ghcr.io/huggingface/text-generation-inference:3.1.1-intel-xpu \ --model-id $model --cuda-graphs 0 ``` @@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.0-intel-cpu \ + ghcr.io/huggingface/text-generation-inference:3.1.1-intel-cpu \ --model-id $model --cuda-graphs 0 ``` diff --git a/docs/source/installation_nvidia.md b/docs/source/installation_nvidia.md index 8c4bdaee..3b20c7e1 100644 --- a/docs/source/installation_nvidia.md +++ b/docs/source/installation_nvidia.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.0 \ + ghcr.io/huggingface/text-generation-inference:3.1.1 \ --model-id $model ``` diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index bd4956a0..be905102 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.0 \ + ghcr.io/huggingface/text-generation-inference:3.1.1 \ --model-id $model ``` @@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \ To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more. 
```bash -docker run ghcr.io/huggingface/text-generation-inference:3.1.0 --help +docker run ghcr.io/huggingface/text-generation-inference:3.1.1 --help ``` diff --git a/docs/source/reference/api_reference.md b/docs/source/reference/api_reference.md index ee34d587..bc4029e4 100644 --- a/docs/source/reference/api_reference.md +++ b/docs/source/reference/api_reference.md @@ -163,7 +163,7 @@ hub = { # create Hugging Face Model Class huggingface_model = HuggingFaceModel( - image_uri=get_huggingface_llm_image_uri("huggingface",version="3.1.0"), + image_uri=get_huggingface_llm_image_uri("huggingface",version="3.1.1"), env=hub, role=role, ) diff --git a/launcher/Cargo.toml b/launcher/Cargo.toml index fdbb5994..2d2571ce 100644 --- a/launcher/Cargo.toml +++ b/launcher/Cargo.toml @@ -9,7 +9,7 @@ homepage.workspace = true [dependencies] clap = { version = "4.4.5", features = ["derive", "env"] } ctrlc = { version = "3.4.1", features = ["termination"] } -hf-hub = "0.4.1" +hf-hub = "0.4.2" nix = { version = "0.28.0", features = ["signal"] } once_cell = "1.19.0" pyo3 = { workspace = true } diff --git a/launcher/src/main.rs b/launcher/src/main.rs index d5d1ba83..cd4b2231 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -1,8 +1,5 @@ use clap::{Parser, ValueEnum}; -use hf_hub::{ - api::sync::{Api, ApiBuilder}, - Repo, RepoType, -}; +use hf_hub::{api::sync::ApiBuilder, Repo, RepoType}; use nix::sys::signal::{self, Signal}; use nix::unistd::Pid; use serde::Deserialize; @@ -100,12 +97,16 @@ fn get_config( let filename = if !path.exists() { // Assume it's a hub id - let api = if let Ok(token) = std::env::var("HF_TOKEN") { + let mut builder = if let Ok(token) = std::env::var("HF_TOKEN") { // env variable has precedence over on file token. - ApiBuilder::new().with_token(Some(token)).build()? + ApiBuilder::new().with_token(Some(token)) } else { - Api::new()? 
+ ApiBuilder::new() }; + if let Ok(origin) = env::var("HF_HUB_USER_AGENT_ORIGIN") { + builder = builder.with_user_agent("origin", origin.as_str()); + } + let api = builder.build()?; let repo = if let Some(ref revision) = revision { api.repo(Repo::with_revision( model_id, diff --git a/router/src/server.rs b/router/src/server.rs index e9aa4612..c566cf98 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1719,6 +1719,10 @@ pub async fn run( builder = builder.with_cache_dir(cache_dir.into()); } + if let Ok(origin) = std::env::var("HF_HUB_USER_AGENT_ORIGIN") { + builder = builder.with_user_agent("origin", origin.as_str()); + } + builder }; diff --git a/server/pyproject.toml b/server/pyproject.toml index bda9df1b..7fd690cc 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -31,7 +31,8 @@ dependencies = [ "sentencepiece>=0.2.0", "tokenizers>=0.20.3", "typer>=0.15.1", - "transformers>=4.48.0" + "transformers>=4.48.0", + "huggingface-hub>=0.29.0", ] [build-system] diff --git a/server/uv.lock b/server/uv.lock index bbb2c9d8..fecccecf 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -720,7 +720,7 @@ wheels = [ [[package]] name = "huggingface-hub" -version = "0.28.1" +version = "0.29.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -731,9 +731,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e7/ce/a734204aaae6c35a22f9956ebcd8d8708ae5b842e15d6f42bd6f49e634a4/huggingface_hub-0.28.1.tar.gz", hash = "sha256:893471090c98e3b6efbdfdacafe4052b20b84d59866fb6f54c33d9af18c303ae", size = 387074 } +sdist = { url = "https://files.pythonhosted.org/packages/22/37/797d6476f13e5ef6af5fc48a5d641d32b39c37e166ccf40c3714c5854a85/huggingface_hub-0.29.1.tar.gz", hash = "sha256:9524eae42077b8ff4fc459ceb7a514eca1c1232b775276b009709fe2a084f250", size = 389776 } wheels = [ - { url = "https://files.pythonhosted.org/packages/ea/da/6c2bea5327b640920267d3bf2c9fc114cfbd0a5de234d81cda80cc9e33c8/huggingface_hub-0.28.1-py3-none-any.whl", hash = "sha256:aa6b9a3ffdae939b72c464dbb0d7f99f56e649b55c3d52406f49e0a5a620c0a7", size = 464068 }, + { url = "https://files.pythonhosted.org/packages/ae/05/75b90de9093de0aadafc868bb2fa7c57651fd8f45384adf39bd77f63980d/huggingface_hub-0.29.1-py3-none-any.whl", hash = "sha256:352f69caf16566c7b6de84b54a822f6238e17ddd8ae3da4f8f2272aea5b198d5", size = 468049 }, ] [[package]] @@ -2563,6 +2563,7 @@ dependencies = [ { name = "grpcio-status" }, { name = "hf-kernels" }, { name = "hf-transfer" }, + { name = "huggingface-hub" }, { name = "loguru" }, { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, @@ -2627,6 +2628,7 @@ requires-dist = [ { name = "grpcio-tools", marker = "extra == 'gen'", specifier = ">=1.69.0" }, { name = "hf-kernels", specifier = ">=0.1.5" }, { name = "hf-transfer", specifier = ">=0.1.8" }, + { name = "huggingface-hub", specifier = ">=0.29.0" }, { name = "loguru", specifier = ">=0.7.3" }, { name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" }, { name = "numpy", specifier = ">=1.26,<3" }, diff --git a/update_doc.py b/update_doc.py index 28430b1a..e570b5ad 100644 --- a/update_doc.py +++ b/update_doc.py @@ -142,14 +142,12 @@ def check_openapi(check: bool): with open(tmp_filename, "w") as f: 
json.dump(new_openapi_data, f, indent=2) + f.write("\n") if check: diff = subprocess.run( [ "diff", - # allow for trailing whitespace since it's not significant - # and the precommit hook will remove it - "--ignore-trailing-space", tmp_filename, filename, ],