mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 04:14:52 +00:00

Prepping 1.1.0

This commit is contained in:
parent 8672cad2cb
commit 853f09035c

Cargo.lock (generated): 1255 lines changed. File diff suppressed because it is too large.
@@ -8,7 +8,7 @@ members = [
 ]
 
 [workspace.package]
-version = "1.0.3"
+version = "1.1.0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
@@ -87,7 +87,7 @@ The easiest way of getting started is using the official Docker container:
 model=tiiuae/falcon-7b-instruct
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.3 --model-id $model
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.0 --model-id $model
 ```
 **Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
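For reference, applying that note to the command above gives a CPU-only variant; a sketch assembled from the note's own flags, not a command quoted from the README:

    docker run --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.0 --model-id $model --disable-custom-kernels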
@@ -14,18 +14,19 @@ name = "text-generation-benchmark"
 path = "src/main.rs"
 
 [dependencies]
-average = "0.13"
-clap = { version = "4.1.4", features = ["derive", "env"] }
-crossterm = "0.26"
+average = "0.14"
+clap = { version = "4.4.5", features = ["derive", "env"] }
+crossterm = "0.27"
 float-ord = "0.3.2"
-serde = {version = "1.0.142", features = ["derive"]}
+serde = {version = "1.0.188", features = ["derive"]}
 serde_json = "1.0"
-tabled = "0.12.0"
+tabled = "0.14.0"
 text-generation-client = { path = "../router/client" }
-thiserror = "1.0.38"
-tokenizers = "0.13.3"
-tokio = { version = "1.25.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
-tui = {package = "ratatui", version = "0.20", default-features = false, features = ["crossterm"]}
+thiserror = "1.0.48"
+tokenizers = { version = "0.14.0", features = ["http"] }
+tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync", "macros"] }
+tui = {package = "ratatui", version = "0.23", default-features = false, features = ["crossterm"]}
 tracing = "0.1.37"
-tracing-subscriber = { version = "0.3.16", features = ["json", "env-filter"] }
+tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
+hf-hub = "0.3.1"
@@ -6,7 +6,7 @@ use tokio::sync::mpsc;
 use tui::backend::Backend;
 use tui::layout::{Alignment, Constraint, Direction, Layout};
 use tui::style::{Color, Modifier, Style};
-use tui::text::{Span, Spans};
+use tui::text::{Span, Line};
 use tui::widgets::{
     Axis, BarChart, Block, Borders, Chart, Dataset, Gauge, GraphType, Paragraph, Tabs,
 };
@@ -244,7 +244,7 @@ impl App {
             .batch_size
             .iter()
             .map(|b| {
-                Spans::from(vec![Span::styled(
+                Line::from(vec![Span::styled(
                     format!("Batch: {b}"),
                     Style::default().fg(Color::White),
                 )])
@@ -468,7 +468,7 @@ fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragraph<'a> {
     // Latency p50/p90/p99 texts
     let colors = vec![Color::LightGreen, Color::LightYellow, Color::LightRed];
     for (i, (name, value)) in latency_percentiles.iter().enumerate() {
-        let span = Spans::from(vec![Span::styled(
+        let span = Line::from(vec![Span::styled(
             format!("{name}: {value:.2} ms"),
             Style::default().fg(colors[i]),
         )]);
@@ -483,16 +483,16 @@ fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragraph<'a> {
 }
 
 /// Average/High/Low spans
-fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Spans<'a>> {
+fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Line<'a>> {
     vec![
-        Spans::from(vec![Span::styled(
+        Line::from(vec![Span::styled(
             format!(
                 "Average: {:.2} {unit}",
                 data.iter().sum::<f64>() / data.len() as f64
             ),
             Style::default().fg(Color::LightBlue),
         )]),
-        Spans::from(vec![Span::styled(
+        Line::from(vec![Span::styled(
             format!(
                 "Lowest: {:.2} {unit}",
                 data.iter()
@@ -501,7 +501,7 @@ fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Spans<'a>> {
             ),
             Style::default().fg(Color::Reset),
         )]),
-        Spans::from(vec![Span::styled(
+        Line::from(vec![Span::styled(

     if let Ok(tracer) = tracer {
         layers.push(tracing_opentelemetry::layer().with_tracer(tracer).boxed());
-        axum_tracing_opentelemetry::init_propagator().unwrap();
+        init_tracing_opentelemetry::init_propagator().unwrap();
     };
 }
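The `Spans` → `Line` hunks above all track the same upstream rename: ratatui 0.20's `Spans` type became `Line` in 0.23, with an otherwise identical construction pattern. A minimal sketch of the 0.23 style, importing ratatui directly rather than through this crate's `tui` alias:

    use ratatui::style::{Color, Style};
    use ratatui::text::{Line, Span};

    // Build a one-line styled label the 0.23 way; 0.20 spelled this
    // `Spans::from(...)` with the same vector-of-Spans shape.
    fn batch_label(b: u32) -> Line<'static> {
        Line::from(vec![Span::styled(
            format!("Batch: {b}"),
            Style::default().fg(Color::White),
        )])
    }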
@@ -13,7 +13,7 @@ use axum::response::sse::{Event, KeepAlive, Sse};
 use axum::response::{IntoResponse, Response};
 use axum::routing::{get, post};
 use axum::{http, Json, Router};
-use axum_tracing_opentelemetry::opentelemetry_tracing_layer;
+use axum_tracing_opentelemetry::middleware::OtelAxumLayer;
 use futures::stream::StreamExt;
 use futures::Stream;
 use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
@@ -396,7 +396,7 @@ async fn generate_stream(
                 // StreamResponse
                 let stream_token = StreamResponse {
                     token,
-                    top_tokens: top_tokens,
+                    top_tokens,
                     generated_text: None,
                     details: None,
                 };
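The `top_tokens: top_tokens` → `top_tokens` change here (repeated in the next hunk and in the validation hunk further down) is Rust's struct field init shorthand: when a local variable shares the field's name, naming it once is enough. A toy illustration with hypothetical stand-in types:

    // Hypothetical types, only to show the shorthand.
    struct Chunk {
        token: String,
        top_tokens: Vec<String>,
    }

    fn build(token: String, top_tokens: Vec<String>) -> Chunk {
        // Equivalent to `Chunk { token: token, top_tokens: top_tokens }`.
        Chunk { token, top_tokens }
    }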
@@ -458,7 +458,7 @@ async fn generate_stream(
 
                 let stream_token = StreamResponse {
                     token,
-                    top_tokens: top_tokens,
+                    top_tokens,
                     generated_text: Some(output_text),
                     details
                 };
@@ -695,7 +695,7 @@ pub async fn run(
         .layer(Extension(compat_return_full_text))
         .layer(Extension(infer))
         .layer(Extension(prom_handle.clone()))
-        .layer(opentelemetry_tracing_layer())
+        .layer(OtelAxumLayer::default())
         .layer(cors_layer);
 
     if ngrok {
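The router's import hunk above and this layer hunk belong together: axum-tracing-opentelemetry replaced the free function `opentelemetry_tracing_layer()` with the `OtelAxumLayer` middleware type, and, per the headerless fragment earlier, propagator setup moved to `init_tracing_opentelemetry::init_propagator`. A rough sketch of the new-style wiring, assuming a recent axum and axum-tracing-opentelemetry pairing (versions not pinned in this diff):

    use axum::{routing::get, Router};
    use axum_tracing_opentelemetry::middleware::OtelAxumLayer;

    fn app() -> Router {
        Router::new()
            .route("/health", get(|| async { "ok" }))
            // The tracing layer is now a type with a Default impl
            // rather than a constructor function.
            .layer(OtelAxumLayer::default())
    }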
@@ -792,7 +792,7 @@ async fn shutdown_signal() {
 
 impl From<i32> for FinishReason {
     fn from(finish_reason: i32) -> Self {
-        let finish_reason = text_generation_client::FinishReason::from_i32(finish_reason).unwrap();
+        let finish_reason = text_generation_client::FinishReason::try_from(finish_reason).unwrap();
         match finish_reason {
             text_generation_client::FinishReason::Length => FinishReason::Length,
             text_generation_client::FinishReason::EosToken => FinishReason::EndOfSequenceToken,
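The `from_i32` → `try_from` edit follows prost's move from a generated `from_i32` helper to a standard `TryFrom<i32>` impl on generated enums. A self-contained sketch, with a hand-written enum standing in for the prost-generated `FinishReason`:

    // Stand-in for the generated enum; newer prost derives
    // `TryFrom<i32>` where older versions emitted `from_i32`.
    #[derive(Debug, Clone, Copy, PartialEq)]
    enum FinishReason {
        Length = 0,
        EosToken = 1,
    }

    impl TryFrom<i32> for FinishReason {
        type Error = ();
        fn try_from(value: i32) -> Result<Self, Self::Error> {
            match value {
                0 => Ok(FinishReason::Length),
                1 => Ok(FinishReason::EosToken),
                _ => Err(()),
            }
        }
    }

    fn decode(raw: i32) -> FinishReason {
        // Mirrors the router change: unwrap, trusting the server side.
        FinishReason::try_from(raw).unwrap()
    }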
@@ -276,7 +276,7 @@ impl Validation {
             truncate: truncate.unwrap_or(self.max_input_length) as u32,
             parameters,
             stopping_parameters,
-            top_n_tokens: top_n_tokens,
+            top_n_tokens,
         })
     }
 
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "1.0.3"
+version = "1.1.0"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <olivier@huggingface.co>"]
@@ -9,19 +9,19 @@ certifi==2023.7.22 ; python_version >= "3.9" and python_version < "3.13"
 charset-normalizer==3.2.0 ; python_version >= "3.9" and python_version < "3.13"
 click==8.1.7 ; python_version >= "3.9" and python_version < "3.13"
 colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows")
-datasets==2.14.4 ; python_version >= "3.9" and python_version < "3.13"
+datasets==2.14.5 ; python_version >= "3.9" and python_version < "3.13"
 deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
 dill==0.3.7 ; python_version >= "3.9" and python_version < "3.13"
 einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
-filelock==3.12.3 ; python_version >= "3.9" and python_version < "3.13"
+filelock==3.12.4 ; python_version >= "3.9" and python_version < "3.13"
 frozenlist==1.4.0 ; python_version >= "3.9" and python_version < "3.13"
-fsspec==2023.6.0 ; python_version >= "3.9" and python_version < "3.13"
+fsspec[http]==2023.6.0 ; python_version >= "3.9" and python_version < "3.13"
 googleapis-common-protos==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
 grpc-interceptor==0.15.3 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-reflection==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-status==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
-grpcio==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-reflection==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-status==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
 hf-transfer==0.1.3 ; python_version >= "3.9" and python_version < "3.13"
 huggingface-hub==0.16.4 ; python_version >= "3.9" and python_version < "3.13"
 idna==3.4 ; python_version >= "3.9" and python_version < "3.13"
@@ -32,7 +32,7 @@ mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13"
 multidict==6.0.4 ; python_version >= "3.9" and python_version < "3.13"
 multiprocess==0.70.15 ; python_version >= "3.9" and python_version < "3.13"
 networkx==3.1 ; python_version >= "3.9" and python_version < "3.13"
-numpy==1.25.2 ; python_version >= "3.9" and python_version < "3.13"
+numpy==1.26.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-exporter-otlp-proto-grpc==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-exporter-otlp-proto-http==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
@@ -43,32 +43,32 @@ opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
 packaging==23.1 ; python_version >= "3.9" and python_version < "3.13"
-pandas==2.0.3 ; python_version >= "3.9" and python_version < "3.13"
+pandas==2.1.1 ; python_version >= "3.9" and python_version < "3.13"
 peft==0.4.0 ; python_version >= "3.9" and python_version < "3.13"
-pillow==10.0.0 ; python_version >= "3.9" and python_version < "3.13"
-protobuf==4.24.2 ; python_version >= "3.9" and python_version < "3.13"
+pillow==10.0.1 ; python_version >= "3.9" and python_version < "3.13"
+protobuf==4.24.3 ; python_version >= "3.9" and python_version < "3.13"
 psutil==5.9.5 ; python_version >= "3.9" and python_version < "3.13"
 pyarrow==13.0.0 ; python_version >= "3.9" and python_version < "3.13"
 python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "3.13"
-pytz==2023.3 ; python_version >= "3.9" and python_version < "3.13"
+pytz==2023.3.post1 ; python_version >= "3.9" and python_version < "3.13"
 pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
 regex==2023.8.8 ; python_version >= "3.9" and python_version < "3.13"
 requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13"
 safetensors==0.3.3 ; python_version >= "3.9" and python_version < "3.13"
 scipy==1.11.2 ; python_version >= "3.9" and python_version < "3.13"
 sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
-setuptools==68.1.2 ; python_version >= "3.9" and python_version < "3.13"
+setuptools==68.2.2 ; python_version >= "3.9" and python_version < "3.13"
 six==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
 sympy==1.12 ; python_version >= "3.9" and python_version < "3.13"
 texttable==1.6.7 ; python_version >= "3.9" and python_version < "3.13"
 tokenizers==0.13.3 ; python_version >= "3.9" and python_version < "3.13"
 torch==2.0.1 ; python_version >= "3.9" and python_version < "3.13"
 tqdm==4.66.1 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.32.1 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.33.2 ; python_version >= "3.9" and python_version < "3.13"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
-typing-extensions==4.7.1 ; python_version >= "3.9" and python_version < "3.13"
+typing-extensions==4.8.0 ; python_version >= "3.9" and python_version < "3.13"
 tzdata==2023.3 ; python_version >= "3.9" and python_version < "3.13"
-urllib3==2.0.4 ; python_version >= "3.9" and python_version < "3.13"
+urllib3==2.0.5 ; python_version >= "3.9" and python_version < "3.13"
 win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
 wrapt==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 xxhash==3.3.0 ; python_version >= "3.9" and python_version < "3.13"