mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 04:14:52 +00:00

Prepping 1.1.0

This commit is contained in:
parent 8672cad2cb
commit 853f09035c

Cargo.lock (generated): 1255 lines changed. File diff suppressed because it is too large.
@@ -8,7 +8,7 @@ members = [
 ]
 
 [workspace.package]
-version = "1.0.3"
+version = "1.1.0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
@@ -87,7 +87,7 @@ The easiest way of getting started is using the official Docker container:
 model=tiiuae/falcon-7b-instruct
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.3 --model-id $model
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.0 --model-id $model
 ```
 **Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
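For reference, applying that note to the command above gives a CPU-only variant; a sketch assembled from the note's own flags, not a command quoted from the README:

    docker run --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.0 --model-id $model --disable-custom-kernels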
@@ -14,18 +14,19 @@ name = "text-generation-benchmark"
 path = "src/main.rs"
 
 [dependencies]
-average = "0.13"
-clap = { version = "4.1.4", features = ["derive", "env"] }
-crossterm = "0.26"
+average = "0.14"
+clap = { version = "4.4.5", features = ["derive", "env"] }
+crossterm = "0.27"
 float-ord = "0.3.2"
-serde = {version = "1.0.142", features = ["derive"]}
+serde = {version = "1.0.188", features = ["derive"]}
 serde_json = "1.0"
-tabled = "0.12.0"
+tabled = "0.14.0"
 text-generation-client = { path = "../router/client" }
-thiserror = "1.0.38"
-tokenizers = "0.13.3"
-tokio = { version = "1.25.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
-tui = {package = "ratatui", version = "0.20", default-features = false, features = ["crossterm"]}
+thiserror = "1.0.48"
+tokenizers = { version = "0.14.0", features = ["http"] }
+tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync", "macros"] }
+tui = {package = "ratatui", version = "0.23", default-features = false, features = ["crossterm"]}
 tracing = "0.1.37"
-tracing-subscriber = { version = "0.3.16", features = ["json", "env-filter"] }
+tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
+hf-hub = "0.3.1"
@@ -6,7 +6,7 @@ use tokio::sync::mpsc;
 use tui::backend::Backend;
 use tui::layout::{Alignment, Constraint, Direction, Layout};
 use tui::style::{Color, Modifier, Style};
-use tui::text::{Span, Spans};
+use tui::text::{Span, Line};
 use tui::widgets::{
     Axis, BarChart, Block, Borders, Chart, Dataset, Gauge, GraphType, Paragraph, Tabs,
 };
@@ -244,7 +244,7 @@ impl App {
             .batch_size
             .iter()
             .map(|b| {
-                Spans::from(vec![Span::styled(
+                Line::from(vec![Span::styled(
                     format!("Batch: {b}"),
                     Style::default().fg(Color::White),
                 )])
@@ -468,7 +468,7 @@ fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragraph<'a> {
     // Latency p50/p90/p99 texts
     let colors = vec![Color::LightGreen, Color::LightYellow, Color::LightRed];
     for (i, (name, value)) in latency_percentiles.iter().enumerate() {
-        let span = Spans::from(vec![Span::styled(
+        let span = Line::from(vec![Span::styled(
             format!("{name}: {value:.2} ms"),
             Style::default().fg(colors[i]),
         )]);
@@ -483,16 +483,16 @@ fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragraph<'a> {
 }
 
 /// Average/High/Low spans
-fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Spans<'a>> {
+fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Line<'a>> {
     vec![
-        Spans::from(vec![Span::styled(
+        Line::from(vec![Span::styled(
             format!(
                 "Average: {:.2} {unit}",
                 data.iter().sum::<f64>() / data.len() as f64
             ),
             Style::default().fg(Color::LightBlue),
         )]),
-        Spans::from(vec![Span::styled(
+        Line::from(vec![Span::styled(
             format!(
                 "Lowest: {:.2} {unit}",
                 data.iter()
@@ -501,7 +501,7 @@ fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Spans<'a>> {
             ),
             Style::default().fg(Color::Reset),
         )]),
-        Spans::from(vec![Span::styled(
+        Line::from(vec![Span::styled(

     if let Ok(tracer) = tracer {
         layers.push(tracing_opentelemetry::layer().with_tracer(tracer).boxed());
-        axum_tracing_opentelemetry::init_propagator().unwrap();
+        init_tracing_opentelemetry::init_propagator().unwrap();
     };
 }
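The `Spans` → `Line` hunks above all track the same upstream rename: ratatui 0.20's `Spans` type became `Line` in 0.23, with an otherwise identical construction pattern. A minimal sketch of the 0.23 style, importing ratatui directly rather than through this crate's `tui` alias:

    use ratatui::style::{Color, Style};
    use ratatui::text::{Line, Span};

    // Build a one-line styled label the 0.23 way; 0.20 spelled this
    // `Spans::from(...)` with the same vector-of-Spans shape.
    fn batch_label(b: u32) -> Line<'static> {
        Line::from(vec![Span::styled(
            format!("Batch: {b}"),
            Style::default().fg(Color::White),
        )])
    }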
@@ -13,7 +13,7 @@ use axum::response::sse::{Event, KeepAlive, Sse};
 use axum::response::{IntoResponse, Response};
 use axum::routing::{get, post};
 use axum::{http, Json, Router};
-use axum_tracing_opentelemetry::opentelemetry_tracing_layer;
+use axum_tracing_opentelemetry::middleware::OtelAxumLayer;
 use futures::stream::StreamExt;
 use futures::Stream;
 use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
@@ -396,7 +396,7 @@ async fn generate_stream(
                 // StreamResponse
                 let stream_token = StreamResponse {
                     token,
-                    top_tokens: top_tokens,
+                    top_tokens,
                     generated_text: None,
                     details: None,
                 };
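The `top_tokens: top_tokens` → `top_tokens` change here (repeated in the next hunk and in the validation hunk further down) is Rust's struct field init shorthand: when a local variable shares the field's name, naming it once is enough. A toy illustration with hypothetical stand-in types:

    // Hypothetical types, only to show the shorthand.
    struct Chunk {
        token: String,
        top_tokens: Vec<String>,
    }

    fn build(token: String, top_tokens: Vec<String>) -> Chunk {
        // Equivalent to `Chunk { token: token, top_tokens: top_tokens }`.
        Chunk { token, top_tokens }
    }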
@@ -458,7 +458,7 @@ async fn generate_stream(
 
                 let stream_token = StreamResponse {
                     token,
-                    top_tokens: top_tokens,
+                    top_tokens,
                     generated_text: Some(output_text),
                     details
                 };
@@ -695,7 +695,7 @@ pub async fn run(
         .layer(Extension(compat_return_full_text))
         .layer(Extension(infer))
         .layer(Extension(prom_handle.clone()))
-        .layer(opentelemetry_tracing_layer())
+        .layer(OtelAxumLayer::default())
         .layer(cors_layer);
 
     if ngrok {
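The router's import hunk above and this layer hunk belong together: axum-tracing-opentelemetry replaced the free function `opentelemetry_tracing_layer()` with the `OtelAxumLayer` middleware type, and, per the headerless fragment earlier, propagator setup moved to `init_tracing_opentelemetry::init_propagator`. A rough sketch of the new-style wiring, assuming a recent axum and axum-tracing-opentelemetry pairing (versions not pinned in this diff):

    use axum::{routing::get, Router};
    use axum_tracing_opentelemetry::middleware::OtelAxumLayer;

    fn app() -> Router {
        Router::new()
            .route("/health", get(|| async { "ok" }))
            // The tracing layer is now a type with a Default impl
            // rather than a constructor function.
            .layer(OtelAxumLayer::default())
    }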
@@ -792,7 +792,7 @@ async fn shutdown_signal() {
 
 impl From<i32> for FinishReason {
     fn from(finish_reason: i32) -> Self {
-        let finish_reason = text_generation_client::FinishReason::from_i32(finish_reason).unwrap();
+        let finish_reason = text_generation_client::FinishReason::try_from(finish_reason).unwrap();
         match finish_reason {
             text_generation_client::FinishReason::Length => FinishReason::Length,
             text_generation_client::FinishReason::EosToken => FinishReason::EndOfSequenceToken,
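The `from_i32` → `try_from` edit follows prost's move from a generated `from_i32` helper to a standard `TryFrom<i32>` impl on generated enums. A self-contained sketch, with a hand-written enum standing in for the prost-generated `FinishReason`:

    // Stand-in for the generated enum; newer prost derives
    // `TryFrom<i32>` where older versions emitted `from_i32`.
    #[derive(Debug, Clone, Copy, PartialEq)]
    enum FinishReason {
        Length = 0,
        EosToken = 1,
    }

    impl TryFrom<i32> for FinishReason {
        type Error = ();
        fn try_from(value: i32) -> Result<Self, Self::Error> {
            match value {
                0 => Ok(FinishReason::Length),
                1 => Ok(FinishReason::EosToken),
                _ => Err(()),
            }
        }
    }

    fn decode(raw: i32) -> FinishReason {
        // Mirrors the router change: unwrap, trusting the server side.
        FinishReason::try_from(raw).unwrap()
    }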
@@ -276,7 +276,7 @@ impl Validation {
             truncate: truncate.unwrap_or(self.max_input_length) as u32,
             parameters,
             stopping_parameters,
-            top_n_tokens: top_n_tokens,
+            top_n_tokens,
         })
     }
 
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "1.0.3"
+version = "1.1.0"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <olivier@huggingface.co>"]
@@ -9,19 +9,19 @@ certifi==2023.7.22 ; python_version >= "3.9" and python_version < "3.13"
 charset-normalizer==3.2.0 ; python_version >= "3.9" and python_version < "3.13"
 click==8.1.7 ; python_version >= "3.9" and python_version < "3.13"
 colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows")
-datasets==2.14.4 ; python_version >= "3.9" and python_version < "3.13"
+datasets==2.14.5 ; python_version >= "3.9" and python_version < "3.13"
 deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
 dill==0.3.7 ; python_version >= "3.9" and python_version < "3.13"
 einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
-filelock==3.12.3 ; python_version >= "3.9" and python_version < "3.13"
+filelock==3.12.4 ; python_version >= "3.9" and python_version < "3.13"
 frozenlist==1.4.0 ; python_version >= "3.9" and python_version < "3.13"
-fsspec==2023.6.0 ; python_version >= "3.9" and python_version < "3.13"
+fsspec[http]==2023.6.0 ; python_version >= "3.9" and python_version < "3.13"
 googleapis-common-protos==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
 grpc-interceptor==0.15.3 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-reflection==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-status==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
-grpcio==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-reflection==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-status==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
 hf-transfer==0.1.3 ; python_version >= "3.9" and python_version < "3.13"
 huggingface-hub==0.16.4 ; python_version >= "3.9" and python_version < "3.13"
 idna==3.4 ; python_version >= "3.9" and python_version < "3.13"
@@ -32,7 +32,7 @@ mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13"
 multidict==6.0.4 ; python_version >= "3.9" and python_version < "3.13"
 multiprocess==0.70.15 ; python_version >= "3.9" and python_version < "3.13"
 networkx==3.1 ; python_version >= "3.9" and python_version < "3.13"
-numpy==1.25.2 ; python_version >= "3.9" and python_version < "3.13"
+numpy==1.26.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-exporter-otlp-proto-grpc==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-exporter-otlp-proto-http==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
@@ -43,32 +43,32 @@ opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
 packaging==23.1 ; python_version >= "3.9" and python_version < "3.13"
-pandas==2.0.3 ; python_version >= "3.9" and python_version < "3.13"
+pandas==2.1.1 ; python_version >= "3.9" and python_version < "3.13"
 peft==0.4.0 ; python_version >= "3.9" and python_version < "3.13"
-pillow==10.0.0 ; python_version >= "3.9" and python_version < "3.13"
-protobuf==4.24.2 ; python_version >= "3.9" and python_version < "3.13"
+pillow==10.0.1 ; python_version >= "3.9" and python_version < "3.13"
+protobuf==4.24.3 ; python_version >= "3.9" and python_version < "3.13"
 psutil==5.9.5 ; python_version >= "3.9" and python_version < "3.13"
 pyarrow==13.0.0 ; python_version >= "3.9" and python_version < "3.13"
 python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "3.13"
-pytz==2023.3 ; python_version >= "3.9" and python_version < "3.13"
+pytz==2023.3.post1 ; python_version >= "3.9" and python_version < "3.13"
 pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
 regex==2023.8.8 ; python_version >= "3.9" and python_version < "3.13"
 requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13"
 safetensors==0.3.3 ; python_version >= "3.9" and python_version < "3.13"
 scipy==1.11.2 ; python_version >= "3.9" and python_version < "3.13"
 sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
-setuptools==68.1.2 ; python_version >= "3.9" and python_version < "3.13"
+setuptools==68.2.2 ; python_version >= "3.9" and python_version < "3.13"
 six==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
 sympy==1.12 ; python_version >= "3.9" and python_version < "3.13"
 texttable==1.6.7 ; python_version >= "3.9" and python_version < "3.13"
 tokenizers==0.13.3 ; python_version >= "3.9" and python_version < "3.13"
 torch==2.0.1 ; python_version >= "3.9" and python_version < "3.13"
 tqdm==4.66.1 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.32.1 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.33.2 ; python_version >= "3.9" and python_version < "3.13"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
-typing-extensions==4.7.1 ; python_version >= "3.9" and python_version < "3.13"
+typing-extensions==4.8.0 ; python_version >= "3.9" and python_version < "3.13"
 tzdata==2023.3 ; python_version >= "3.9" and python_version < "3.13"
-urllib3==2.0.4 ; python_version >= "3.9" and python_version < "3.13"
+urllib3==2.0.5 ; python_version >= "3.9" and python_version < "3.13"
 win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
 wrapt==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 xxhash==3.3.0 ; python_version >= "3.9" and python_version < "3.13"