Merge branch 'main' into patch_rust

commit bbc68748b7
Nicolas Patry, 2025-03-04 16:50:57 +01:00 (committed by GitHub)
23 changed files with 244 additions and 68 deletions

Cargo.lock (generated)

@@ -1007,9 +1007,9 @@ dependencies = [
[[package]]
name = "cxx"
version = "1.0.140"
version = "1.0.141"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc49567e08c72902f4cbc7242ee8d874ec9cbe97fbabf77b4e0e1f447513e13a"
checksum = "8bc580dceb395cae0efdde0a88f034cfd8a276897e40c693a7b87bed17971d33"
dependencies = [
"cc",
"cxxbridge-cmd",
@@ -1021,9 +1021,9 @@ dependencies = [
[[package]]
name = "cxx-build"
version = "1.0.140"
version = "1.0.141"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe46b5309c99e9775e7a338c98e4097455f52db5b684fd793ca22848fde6e371"
checksum = "49d8c1baedad72a7efda12ad8d7ad687b3e7221dfb304a12443fd69e9de8bb30"
dependencies = [
"cc",
"codespan-reporting",
@@ -1035,9 +1035,9 @@ dependencies = [
[[package]]
name = "cxxbridge-cmd"
version = "1.0.140"
version = "1.0.141"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4315c4ce8d23c26d87f2f83698725fd5718d8e6ace4a9093da2664d23294d372"
checksum = "e43afb0e3b2ef293492a31ecd796af902112460d53e5f923f7804f348a769f9c"
dependencies = [
"clap 4.5.30",
"codespan-reporting",
@@ -1048,15 +1048,15 @@ dependencies = [
[[package]]
name = "cxxbridge-flags"
version = "1.0.140"
version = "1.0.141"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f55d69deb3a92f610a60ecc524a72c7374b6dc822f8fb7bb4e5d9473f10530c4"
checksum = "0257ad2096a2474fe877e9e055ab69603851c3d6b394efcc7e0443899c2492ce"
[[package]]
name = "cxxbridge-macro"
version = "1.0.140"
version = "1.0.141"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bee7a1d9b5091462002c2b8de2a4ed0f0fde011d503cc272633f66075bd5141"
checksum = "b46cbd7358a46b760609f1cb5093683328e58ca50e594a308716f5403fdc03e5"
dependencies = [
"proc-macro2",
"quote",
@@ -1660,30 +1660,27 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b780635574b3d92f036890d8373433d6f9fc7abb320ee42a5c25897fc8ed732"
dependencies = [
"dirs",
"futures",
"indicatif",
"log",
"native-tls",
"num_cpus",
"rand 0.8.5",
"reqwest 0.11.27",
"serde",
"serde_json",
"thiserror 1.0.69",
"tokio",
"ureq",
]
[[package]]
name = "hf-hub"
version = "0.4.1"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "112fa2f6ad4ab815b9e1b938b4b1e437032d055e2f92ed10fd6ab2e62d02c6b6"
checksum = "cc03dcb0b0a83ae3f3363ec811014ae669f083e4e499c66602f447c4828737a1"
dependencies = [
"dirs",
"futures",
"http 1.2.0",
"indicatif",
"libc",
"log",
"native-tls",
"num_cpus",
@@ -1694,6 +1691,7 @@ dependencies = [
"thiserror 2.0.11",
"tokio",
"ureq",
"windows-sys 0.59.0",
]
[[package]]
@@ -4619,7 +4617,7 @@ dependencies = [
[[package]]
name = "text-generation-backends-trtllm"
version = "3.1.1-dev0"
version = "3.1.2-dev0"
dependencies = [
"async-trait",
"clap 4.5.30",
@@ -4627,7 +4625,7 @@ dependencies = [
"cxx",
"cxx-build",
"hashbrown 0.15.2",
"hf-hub 0.3.2",
"hf-hub 0.4.2",
"pkg-config",
"pyo3",
"text-generation-router",
@@ -4640,12 +4638,12 @@ dependencies = [
[[package]]
name = "text-generation-benchmark"
version = "3.1.1-dev0"
version = "3.1.2-dev0"
dependencies = [
"average",
"clap 4.5.30",
"float-ord",
"hf-hub 0.3.2",
"hf-hub 0.4.2",
"ratatui",
"serde",
"serde_json",
@@ -4660,7 +4658,7 @@ dependencies = [
[[package]]
name = "text-generation-client"
version = "3.1.1-dev0"
version = "3.1.2-dev0"
dependencies = [
"async-trait",
"base64 0.22.1",
@@ -4678,12 +4676,12 @@ dependencies = [
[[package]]
name = "text-generation-launcher"
version = "3.1.1-dev0"
version = "3.1.2-dev0"
dependencies = [
"clap 4.5.30",
"ctrlc",
"float_eq",
"hf-hub 0.4.1",
"hf-hub 0.4.2",
"nix 0.28.0",
"once_cell",
"pyo3",
@@ -4699,7 +4697,7 @@ dependencies = [
[[package]]
name = "text-generation-router"
version = "3.1.1-dev0"
version = "3.1.2-dev0"
dependencies = [
"anyhow",
"async-stream",
@@ -4712,7 +4710,7 @@ dependencies = [
"csv",
"futures",
"futures-util",
"hf-hub 0.3.2",
"hf-hub 0.4.2",
"image",
"init-tracing-opentelemetry",
"itertools 0.10.5",
@@ -4751,7 +4749,7 @@ dependencies = [
[[package]]
name = "text-generation-router-llamacpp"
version = "3.1.1-dev0"
version = "3.1.2-dev0"
dependencies = [
"async-trait",
"bindgen 0.71.1",
@@ -4768,7 +4766,7 @@ dependencies = [
[[package]]
name = "text-generation-router-v2"
version = "3.1.1-dev0"
version = "3.1.2-dev0"
dependencies = [
"async-stream",
"async-trait",
@@ -4779,7 +4777,7 @@ dependencies = [
"futures",
"futures-util",
"grpc-metadata",
"hf-hub 0.3.2",
"hf-hub 0.4.2",
"image",
"init-tracing-opentelemetry",
"jsonschema",
@@ -4817,7 +4815,7 @@ dependencies = [
[[package]]
name = "text-generation-router-v3"
version = "3.1.1-dev0"
version = "3.1.2-dev0"
dependencies = [
"async-stream",
"async-trait",
@@ -4829,7 +4827,7 @@ dependencies = [
"futures",
"futures-util",
"grpc-metadata",
"hf-hub 0.3.2",
"hf-hub 0.4.2",
"image",
"init-tracing-opentelemetry",
"itertools 0.13.0",
@@ -4847,6 +4845,7 @@ dependencies = [
"rand 0.8.5",
"regex",
"reqwest 0.11.27",
"rustc-hash 2.1.1",
"serde",
"serde_json",
"slotmap",


@@ -21,7 +21,7 @@ default-members = [
resolver = "2"
[workspace.package]
version = "3.1.1-dev0"
version = "3.1.2-dev0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"
@@ -29,7 +29,7 @@ homepage = "https://github.com/huggingface/text-generation-inference"
[workspace.dependencies]
base64 = "0.22.0"
tokenizers = { version = "0.20.0", features = ["http"] }
hf-hub = { version = "0.3.1", features = ["tokio"] }
hf-hub = { version = "0.4.1", features = ["tokio"] }
metrics = { version = "0.23.0" }
metrics-exporter-prometheus = { version = "0.15.1", features = [] }
minijinja = { version = "2.2.0", features = ["json"] }


@@ -18,7 +18,8 @@ RUN apt-get update -y \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.85.0 --profile minimal -y
COPY rust-toolchain.toml rust-toolchain.toml
RUN curl -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain none
ENV PATH="/root/.cargo/bin:${PATH}"
RUN cargo install cargo-chef --locked


@@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta
volume=$PWD/data
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model
ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model
```
And then you can make requests like
@@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \
**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0-rocm --model-id $model` instead of the command above.
**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1-rocm --model-id $model` instead of the command above.
To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
```
@@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
token=<your cli READ token>
docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model
ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model
```
### A note on Shared Memory (shm)


@@ -22,7 +22,7 @@ homepage = "https://github.com/huggingface/text-generation-inference"
[workspace.dependencies]
base64 = "0.22.0"
tokenizers = { version = "0.20.0", features = ["http"] }
hf-hub = { version = "0.3.1", features = ["tokio"] }
hf-hub = { version = "0.4.2", features = ["tokio"] }
metrics = { version = "0.23.0" }
metrics-exporter-prometheus = { version = "0.15.1", features = [] }
minijinja = { version = "2.2.0", features = ["json"] }


@@ -86,6 +86,10 @@ async fn get_tokenizer(tokenizer_name: &str, revision: Option<&str>) -> Option<T
builder = builder.with_cache_dir(cache_dir.into());
}
if let Ok(origin) = std::env::var("HF_HUB_USER_AGENT_ORIGIN") {
builder = builder.with_user_agent("origin", origin.as_str());
}
builder
};
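Several files in this commit add the same `HF_HUB_USER_AGENT_ORIGIN` wiring to their hf-hub client setup. Below is a minimal, self-contained sketch of the pattern, assuming hf-hub 0.4's sync `ApiBuilder`; the function name and the `config.json` lookup are illustrative, not taken from the diff:

```rust
use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};

fn hub_config_path(
    model_id: String,
    revision: Option<String>,
) -> Result<std::path::PathBuf, Box<dyn std::error::Error>> {
    // The HF_TOKEN environment variable takes precedence over the on-disk token.
    let mut builder = if let Ok(token) = std::env::var("HF_TOKEN") {
        ApiBuilder::new().with_token(Some(token))
    } else {
        ApiBuilder::new()
    };
    // Optional origin tag forwarded in the client's user agent.
    if let Ok(origin) = std::env::var("HF_HUB_USER_AGENT_ORIGIN") {
        builder = builder.with_user_agent("origin", origin.as_str());
    }
    let api = builder.build()?;
    let repo = match revision {
        Some(revision) => api.repo(Repo::with_revision(model_id, RepoType::Model, revision)),
        None => api.repo(Repo::new(model_id, RepoType::Model)),
    };
    // Illustrative download; the real call sites in this diff fetch different files.
    Ok(repo.get("config.json")?)
}
```

The builder calls (`with_token`, `with_user_agent`, `build`) mirror the ones visible in the router and launcher hunks below.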


@@ -71,6 +71,7 @@ prost-build = "0.12.1"
[dev-dependencies]
criterion = "0.3"
itertools = "0.13"
rustc-hash = "2"
[features]
default = ["ngrok"]


@@ -283,7 +283,7 @@ impl RadixTrie {
}
/// Find worker.
fn find_(&mut self, mut node_id: NodeId, key: &[u32], blocks: &mut Vec<u32>) -> NodeId {
fn find_(&mut self, node_id: NodeId, key: &[u32], blocks: &mut Vec<u32>) -> NodeId {
let node = &self.nodes[node_id];
if key.len() >= self.block_size {
@@ -295,9 +295,13 @@ impl RadixTrie {
assert_eq!(shared_prefix_len % self.block_size, 0);
blocks.extend(&child.blocks[..shared_prefix_len / self.block_size]);
// A node represents the prefix of its children. So, only
// recurse when there is a full prefix match.
let key = &key[shared_prefix_len..];
if !key.is_empty() {
node_id = self.find_(child_id, key, blocks);
if !key.is_empty() && shared_prefix_len == child.key.len() {
return self.find_(child_id, key, blocks);
} else {
return child_id;
}
}
}
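The hunk above makes `find_` descend only when a child's key is matched in full; on a partial match the lookup now stops at that child instead of recursing. A minimal sketch of that rule on a simplified trie (the types and fields here are illustrative, not the actual `RadixTrie`):

```rust
use std::collections::HashMap;

struct Node {
    key: Vec<u32>,                 // token span this node represents
    children: HashMap<u32, usize>, // first token of a child's key -> node index
}

struct Trie {
    nodes: Vec<Node>, // nodes[0] is the root with an empty key
}

impl Trie {
    // Walk down from `node_id`, consuming `key`, and return the deepest node
    // reached: either a fully matched descendant or a partially matched child.
    fn find(&self, node_id: usize, key: &[u32]) -> usize {
        let node = &self.nodes[node_id];
        if let Some(&child_id) = key.first().and_then(|t| node.children.get(t)) {
            let child = &self.nodes[child_id];
            let shared = child.key.iter().zip(key).take_while(|(a, b)| a == b).count();
            let rest = &key[shared..];
            // Only recurse when the child's key matched in full; a partial
            // match means the lookup ends at this child (the fix in this hunk).
            if !rest.is_empty() && shared == child.key.len() {
                return self.find(child_id, rest);
            }
            return child_id;
        }
        node_id
    }
}
```

With this rule, looking up `[0, 1, 3, 4, 5]` against a stored `[0, 1, 2]` stops at the partially matched child, which is what the new `partial_match_does_not_recurse` test below exercises.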
@@ -631,6 +635,12 @@ fn shared_prefix(left: &[u32], right: &[u32], block_size: usize) -> usize {
mod tests {
use std::sync::Arc;
use rand::{
distributions::Uniform, prelude::Distribution, rngs::SmallRng, seq::SliceRandom,
SeedableRng,
};
use rustc_hash::FxHashSet;
use super::*;
#[test]
@@ -873,4 +883,159 @@ mod tests {
// Clear out the whole trie.
assert_eq!(trie.evict(10), vec![1, 2, 3, 0, 1]);
}
#[test]
fn full_match_returns_correct_node() {
let mut trie = RadixTrie::new(1);
trie.insert(&[0, 1, 2], &[0, 1, 2]).unwrap();
let node_id = trie.find(&[0, 1, 2], &mut vec![]);
// At this point, there are only two nodes: the root and the node
// with tokens 0, 1, 2. Looking up the exact prefix must return
// the non-root node.
assert_ne!(node_id, trie.root);
}
#[test]
fn partial_match_does_not_recurse() {
let mut trie = RadixTrie::new(1);
trie.insert(&[0, 1, 2], &[0, 1, 2]).unwrap();
trie.insert(&[0, 1, 2, 3, 4, 5], &[0, 1, 2, 3, 4, 5])
.unwrap();
let mut blocks = Vec::new();
let node_id = trie.find(&[0, 1, 3, 4, 5], &mut blocks);
assert_eq!(blocks, vec![0, 1]);
assert_eq!(node_id, trie.find(&[0, 1], &mut blocks))
}
struct AllocationWithInfo {
allocation: BlockAllocation,
// We are doing a lot of set operations and `FxBuildHasher` is
// much faster for a set of integers.
blockset: FxHashSet<u32>,
non_prefix_blocks: FxHashSet<u32>,
}
#[test]
fn invariants_hold_on_many_operations_remove_all() {
invariants_hold_on_many_insertions(true);
}
#[test]
fn invariants_hold_on_many_operations_remove_subset() {
invariants_hold_on_many_insertions(false);
}
fn invariants_hold_on_many_insertions(remove_all: bool) {
// Small vocabulary sizes lead to violations more quickly due to
// prefix sharing, etc.
const VOCAB_SIZE: u32 = 2;
const DATA_LEN: usize = 1_000;
const MAX_PREFILL_LEN: usize = 8;
const MAX_DECODE_LEN: usize = 8;
let vocab_range = Uniform::new(0, VOCAB_SIZE);
let data_range = Uniform::new(0, DATA_LEN);
let prefill_len_range = Uniform::new(0, MAX_PREFILL_LEN);
let decode_len_range = Uniform::new(0, MAX_DECODE_LEN);
let mut rng = SmallRng::seed_from_u64(64);
let data = (0..DATA_LEN)
.map(|_| vocab_range.sample(&mut rng))
.collect::<Vec<_>>();
let mut allocator = RadixAllocator::new(1, 100, None);
let mut allocations = Vec::new();
for i in 0..100_000 {
// Allocate until all blocks are used.
'allocation: loop {
// Pick a random offset into the shared data so that prefixes are shared between allocations.
let prefill_offset = data_range.sample(&mut rng);
let prefill_len = prefill_len_range.sample(&mut rng);
let decode_len = decode_len_range.sample(&mut rng);
let prefill =
data[prefill_offset..data.len().min(prefill_offset + prefill_len)].to_vec();
let allocation = match allocator
.allocate((prefill.len() + decode_len) as u32, Some(Arc::new(prefill)))
{
Some(allocation) => allocation,
None => break 'allocation,
};
let non_prefix_blocks = allocation.blocks[allocation.prefix_len as usize..]
.iter()
.copied()
.collect::<FxHashSet<_>>();
let blockset = allocation.blocks.iter().copied().collect::<FxHashSet<_>>();
// No duplicate blocks in an allocation.
assert_eq!(
allocation.blocks.len(),
blockset.len(),
"Duplicate blocks in allocation"
);
allocations.push(AllocationWithInfo {
allocation,
blockset,
non_prefix_blocks,
});
}
// Check invariants. Skip first iteration, since there is no prefix sharing yet.
if i > 1 {
check_allocation_invariants(&allocations);
}
// Remove 20% of the allocations, randomly.
if remove_all {
allocations.into_iter().for_each(|allocation| {
allocator.free(
allocation.allocation.blocks.clone(),
allocation.allocation.allocation_id,
)
});
allocations = Vec::new();
} else {
allocations.shuffle(&mut rng);
let remove_index = (allocations.len() as f64 * 0.8) as usize;
for allocation in allocations.drain(remove_index..) {
allocator.free(
allocation.allocation.blocks.clone(),
allocation.allocation.allocation_id,
);
}
}
}
}
fn check_allocation_invariants(allocations: &[AllocationWithInfo]) {
for i in 0..allocations.len() {
let allocation = &allocations[i];
// 0 is used for health checks, must not be used.
assert!(
!allocation.blockset.contains(&0),
"Block 0 must not be allocated"
);
// No duplicate blocks in an allocation.
assert_eq!(
allocation.allocation.blocks.len(),
allocation.blockset.len(),
"Duplicate blocks in allocation"
);
for other_allocation in &allocations[i + 1..] {
assert!(
other_allocation
.non_prefix_blocks
.is_disjoint(&allocation.non_prefix_blocks),
"Allocations share non-prefix blocks"
)
}
}
}
}


@@ -10,7 +10,7 @@
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
"version": "3.1.1-dev0"
"version": "3.1.2-dev0"
},
"paths": {
"/": {


@@ -31,7 +31,7 @@ deployment instructions in the model card:
The service is launched simply by running the text-generation-inference container with two sets of parameters:
```
docker run <system_parameters> ghcr.io/huggingface/text-generation-inference:3.1.0-neuron <service_parameters>
docker run <system_parameters> ghcr.io/huggingface/text-generation-inference:3.1.1-neuron <service_parameters>
```
- system parameters are used to map ports, volumes and devices between the host and the service,


@@ -19,6 +19,6 @@ docker run --gpus all \
--shm-size 1g \
-e HF_TOKEN=$token \
-p 8080:80 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 \
--model-id $model
```


@@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models.
In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model --quantize bitsandbytes
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize bitsandbytes
```
4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
@@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model --quantize bitsandbytes-nf4
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize bitsandbytes-nf4
```
You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
@@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$
TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model --quantize gptq
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize gptq
```
Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.


@@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
--device=/dev/kfd --device=/dev/dri --group-add video \
--ipc=host --shm-size 256g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.1.0-rocm \
ghcr.io/huggingface/text-generation-inference:3.1.1-rocm \
--model-id $model
```


@@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm --privileged --cap-add=sys_nice \
--device=/dev/dri \
--ipc=host --shm-size 1g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.1.0-intel-xpu \
ghcr.io/huggingface/text-generation-inference:3.1.1-intel-xpu \
--model-id $model --cuda-graphs 0
```
@@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm --privileged --cap-add=sys_nice \
--device=/dev/dri \
--ipc=host --shm-size 1g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.1.0-intel-cpu \
ghcr.io/huggingface/text-generation-inference:3.1.1-intel-cpu \
--model-id $model --cuda-graphs 0
```


@@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.1.0 \
ghcr.io/huggingface/text-generation-inference:3.1.1 \
--model-id $model
```


@@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.1.0 \
ghcr.io/huggingface/text-generation-inference:3.1.1 \
--model-id $model
```
@@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \
To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.
```bash
docker run ghcr.io/huggingface/text-generation-inference:3.1.0 --help
docker run ghcr.io/huggingface/text-generation-inference:3.1.1 --help
```
</Tip>


@@ -163,7 +163,7 @@ hub = {
# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
image_uri=get_huggingface_llm_image_uri("huggingface",version="3.1.0"),
image_uri=get_huggingface_llm_image_uri("huggingface",version="3.1.1"),
env=hub,
role=role,
)


@@ -9,7 +9,7 @@ homepage.workspace = true
[dependencies]
clap = { version = "4.4.5", features = ["derive", "env"] }
ctrlc = { version = "3.4.1", features = ["termination"] }
hf-hub = "0.4.1"
hf-hub = "0.4.2"
nix = { version = "0.28.0", features = ["signal"] }
once_cell = "1.19.0"
pyo3 = { workspace = true }


@@ -1,8 +1,5 @@
use clap::{Parser, ValueEnum};
use hf_hub::{
api::sync::{Api, ApiBuilder},
Repo, RepoType,
};
use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};
use nix::sys::signal::{self, Signal};
use nix::unistd::Pid;
use serde::Deserialize;
@@ -100,12 +97,16 @@ fn get_config(
let filename = if !path.exists() {
// Assume it's a hub id
let api = if let Ok(token) = std::env::var("HF_TOKEN") {
let mut builder = if let Ok(token) = std::env::var("HF_TOKEN") {
// env variable has precedence over on file token.
ApiBuilder::new().with_token(Some(token)).build()?
ApiBuilder::new().with_token(Some(token))
} else {
Api::new()?
ApiBuilder::new()
};
if let Ok(origin) = env::var("HF_HUB_USER_AGENT_ORIGIN") {
builder = builder.with_user_agent("origin", origin.as_str());
}
let api = builder.build()?;
let repo = if let Some(ref revision) = revision {
api.repo(Repo::with_revision(
model_id,


@@ -1719,6 +1719,10 @@ pub async fn run(
builder = builder.with_cache_dir(cache_dir.into());
}
if let Ok(origin) = std::env::var("HF_HUB_USER_AGENT_ORIGIN") {
builder = builder.with_user_agent("origin", origin.as_str());
}
builder
};


@@ -31,7 +31,8 @@ dependencies = [
"sentencepiece>=0.2.0",
"tokenizers>=0.20.3",
"typer>=0.15.1",
"transformers>=4.48.0"
"transformers>=4.48.0",
"huggingface-hub>=0.29.0",
]
[build-system]


@@ -720,7 +720,7 @@ wheels = [
[[package]]
name = "huggingface-hub"
version = "0.28.1"
version = "0.29.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "filelock" },
@@ -731,9 +731,9 @@ dependencies = [
{ name = "tqdm" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/e7/ce/a734204aaae6c35a22f9956ebcd8d8708ae5b842e15d6f42bd6f49e634a4/huggingface_hub-0.28.1.tar.gz", hash = "sha256:893471090c98e3b6efbdfdacafe4052b20b84d59866fb6f54c33d9af18c303ae", size = 387074 }
sdist = { url = "https://files.pythonhosted.org/packages/22/37/797d6476f13e5ef6af5fc48a5d641d32b39c37e166ccf40c3714c5854a85/huggingface_hub-0.29.1.tar.gz", hash = "sha256:9524eae42077b8ff4fc459ceb7a514eca1c1232b775276b009709fe2a084f250", size = 389776 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ea/da/6c2bea5327b640920267d3bf2c9fc114cfbd0a5de234d81cda80cc9e33c8/huggingface_hub-0.28.1-py3-none-any.whl", hash = "sha256:aa6b9a3ffdae939b72c464dbb0d7f99f56e649b55c3d52406f49e0a5a620c0a7", size = 464068 },
{ url = "https://files.pythonhosted.org/packages/ae/05/75b90de9093de0aadafc868bb2fa7c57651fd8f45384adf39bd77f63980d/huggingface_hub-0.29.1-py3-none-any.whl", hash = "sha256:352f69caf16566c7b6de84b54a822f6238e17ddd8ae3da4f8f2272aea5b198d5", size = 468049 },
]
[[package]]
@@ -2563,6 +2563,7 @@ dependencies = [
{ name = "grpcio-status" },
{ name = "hf-kernels" },
{ name = "hf-transfer" },
{ name = "huggingface-hub" },
{ name = "loguru" },
{ name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
{ name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
@@ -2627,6 +2628,7 @@ requires-dist = [
{ name = "grpcio-tools", marker = "extra == 'gen'", specifier = ">=1.69.0" },
{ name = "hf-kernels", specifier = ">=0.1.5" },
{ name = "hf-transfer", specifier = ">=0.1.8" },
{ name = "huggingface-hub", specifier = ">=0.29.0" },
{ name = "loguru", specifier = ">=0.7.3" },
{ name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" },
{ name = "numpy", specifier = ">=1.26,<3" },


@@ -142,14 +142,12 @@ def check_openapi(check: bool):
with open(tmp_filename, "w") as f:
json.dump(new_openapi_data, f, indent=2)
f.write("\n")
if check:
diff = subprocess.run(
[
"diff",
# allow for trailing whitespace since it's not significant
# and the precommit hook will remove it
"--ignore-trailing-space",
tmp_filename,
filename,
],