mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-06-19 15:52:08 +00:00
add latency per token
This commit is contained in:
parent
b2d1276c16
commit
163c23f174
Binary file not shown.
Before Width: | Height: | Size: 100 KiB After Width: | Height: | Size: 102 KiB |
2801
benchmark/Cargo.lock
generated
Normal file
2801
benchmark/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
@ -5,6 +5,12 @@ edition = "2021"
|
|||||||
authors = ["Olivier Dehaene"]
|
authors = ["Olivier Dehaene"]
|
||||||
description = "Text Generation Benchmarking tool"
|
description = "Text Generation Benchmarking tool"
|
||||||
|
|
||||||
|
[profile.release]
|
||||||
|
debug = 1
|
||||||
|
incremental = true
|
||||||
|
lto = "off"
|
||||||
|
panic = "abort"
|
||||||
|
|
||||||
[lib]
|
[lib]
|
||||||
path = "src/lib.rs"
|
path = "src/lib.rs"
|
||||||
|
|
||||||
|
3
benchmark/rust-toolchain.toml
Normal file
3
benchmark/rust-toolchain.toml
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
[toolchain]
|
||||||
|
channel = "1.67.0"
|
||||||
|
components = ["rustfmt", "clippy"]
|
@ -78,7 +78,7 @@ impl App {
|
|||||||
| KeyEvent {
|
| KeyEvent {
|
||||||
code: KeyCode::Tab, ..
|
code: KeyCode::Tab, ..
|
||||||
} => {
|
} => {
|
||||||
self.touched_tab=true;
|
self.touched_tab = true;
|
||||||
self.current_tab = (self.current_tab + 1) % self.batch_size.len();
|
self.current_tab = (self.current_tab + 1) % self.batch_size.len();
|
||||||
}
|
}
|
||||||
// Decrease and wrap tab
|
// Decrease and wrap tab
|
||||||
@ -86,7 +86,7 @@ impl App {
|
|||||||
code: KeyCode::Left,
|
code: KeyCode::Left,
|
||||||
..
|
..
|
||||||
} => {
|
} => {
|
||||||
self.touched_tab=true;
|
self.touched_tab = true;
|
||||||
if self.current_tab > 0 {
|
if self.current_tab > 0 {
|
||||||
self.current_tab -= 1;
|
self.current_tab -= 1;
|
||||||
} else {
|
} else {
|
||||||
@ -186,10 +186,10 @@ impl App {
|
|||||||
.direction(Direction::Horizontal)
|
.direction(Direction::Horizontal)
|
||||||
.constraints(
|
.constraints(
|
||||||
[
|
[
|
||||||
Constraint::Percentage(20),
|
Constraint::Percentage(25),
|
||||||
Constraint::Percentage(30),
|
Constraint::Percentage(25),
|
||||||
Constraint::Percentage(20),
|
Constraint::Percentage(25),
|
||||||
Constraint::Percentage(30),
|
Constraint::Percentage(25),
|
||||||
]
|
]
|
||||||
.as_ref(),
|
.as_ref(),
|
||||||
)
|
)
|
||||||
@ -206,6 +206,10 @@ impl App {
|
|||||||
.direction(Direction::Vertical)
|
.direction(Direction::Vertical)
|
||||||
.constraints([Constraint::Length(8), Constraint::Length(5)].as_ref())
|
.constraints([Constraint::Length(8), Constraint::Length(5)].as_ref())
|
||||||
.split(mid[2]);
|
.split(mid[2]);
|
||||||
|
let decode_text_latency = Layout::default()
|
||||||
|
.direction(Direction::Horizontal)
|
||||||
|
.constraints([Constraint::Percentage(50), Constraint::Percentage(50)].as_ref())
|
||||||
|
.split(decode_text[0]);
|
||||||
|
|
||||||
// Bottom row horizontal layout
|
// Bottom row horizontal layout
|
||||||
let bottom = Layout::default()
|
let bottom = Layout::default()
|
||||||
@ -289,13 +293,15 @@ impl App {
|
|||||||
f.render_widget(run_gauge, top[1]);
|
f.render_widget(run_gauge, top[1]);
|
||||||
|
|
||||||
// Prefill text infos
|
// Prefill text infos
|
||||||
let (prefill_latency_statics, prefill_throughput_statics) = text_info(
|
let prefill_latency_block = latency_paragraph(
|
||||||
&mut self.data.prefill_latencies[self.current_tab],
|
&mut self.data.prefill_latencies[self.current_tab],
|
||||||
&self.data.prefill_throughputs[self.current_tab],
|
|
||||||
"Prefill",
|
"Prefill",
|
||||||
);
|
);
|
||||||
f.render_widget(prefill_latency_statics, prefill_text[0]);
|
let prefill_throughput_block =
|
||||||
f.render_widget(prefill_throughput_statics, prefill_text[1]);
|
throughput_paragraph(&self.data.prefill_throughputs[self.current_tab], "Prefill");
|
||||||
|
|
||||||
|
f.render_widget(prefill_latency_block, prefill_text[0]);
|
||||||
|
f.render_widget(prefill_throughput_block, prefill_text[1]);
|
||||||
|
|
||||||
// Prefill latency histogram
|
// Prefill latency histogram
|
||||||
let histo_width = 7;
|
let histo_width = 7;
|
||||||
@ -315,13 +321,19 @@ impl App {
|
|||||||
f.render_widget(prefill_histogram, mid[1]);
|
f.render_widget(prefill_histogram, mid[1]);
|
||||||
|
|
||||||
// Decode text info
|
// Decode text info
|
||||||
let (decode_latency_statics, decode_throughput_statics) = text_info(
|
let decode_latency_block = latency_paragraph(
|
||||||
&mut self.data.decode_latencies[self.current_tab],
|
&mut self.data.decode_latencies[self.current_tab],
|
||||||
&self.data.decode_throughputs[self.current_tab],
|
"Decode Total",
|
||||||
"Decode",
|
|
||||||
);
|
);
|
||||||
f.render_widget(decode_latency_statics, decode_text[0]);
|
let decode_token_latency_block = latency_paragraph(
|
||||||
f.render_widget(decode_throughput_statics, decode_text[1]);
|
&mut self.data.decode_token_latencies[self.current_tab],
|
||||||
|
"Decode Token",
|
||||||
|
);
|
||||||
|
let decode_throughput_block =
|
||||||
|
throughput_paragraph(&self.data.decode_throughputs[self.current_tab], "Decode");
|
||||||
|
f.render_widget(decode_latency_block, decode_text_latency[0]);
|
||||||
|
f.render_widget(decode_token_latency_block, decode_text_latency[1]);
|
||||||
|
f.render_widget(decode_throughput_block, decode_text[1]);
|
||||||
|
|
||||||
// Decode latency histogram
|
// Decode latency histogram
|
||||||
let histo_data =
|
let histo_data =
|
||||||
@ -357,6 +369,7 @@ struct Data {
|
|||||||
prefill_latencies: Vec<Vec<f64>>,
|
prefill_latencies: Vec<Vec<f64>>,
|
||||||
prefill_throughputs: Vec<Vec<f64>>,
|
prefill_throughputs: Vec<Vec<f64>>,
|
||||||
decode_latencies: Vec<Vec<f64>>,
|
decode_latencies: Vec<Vec<f64>>,
|
||||||
|
decode_token_latencies: Vec<Vec<f64>>,
|
||||||
decode_throughputs: Vec<Vec<f64>>,
|
decode_throughputs: Vec<Vec<f64>>,
|
||||||
prefill_batch_latency_throughput: Vec<(f64, f64)>,
|
prefill_batch_latency_throughput: Vec<(f64, f64)>,
|
||||||
decode_batch_latency_throughput: Vec<(f64, f64)>,
|
decode_batch_latency_throughput: Vec<(f64, f64)>,
|
||||||
@ -366,22 +379,21 @@ impl Data {
|
|||||||
fn new(n_run: usize, n_batch: usize) -> Self {
|
fn new(n_run: usize, n_batch: usize) -> Self {
|
||||||
let prefill_latencies: Vec<Vec<f64>> =
|
let prefill_latencies: Vec<Vec<f64>> =
|
||||||
(0..n_batch).map(|_| Vec::with_capacity(n_run)).collect();
|
(0..n_batch).map(|_| Vec::with_capacity(n_run)).collect();
|
||||||
let prefill_throughputs: Vec<Vec<f64>> =
|
let prefill_throughputs: Vec<Vec<f64>> = prefill_latencies.clone();
|
||||||
(0..n_batch).map(|_| Vec::with_capacity(n_run)).collect();
|
|
||||||
|
|
||||||
let decode_latencies: Vec<Vec<f64>> =
|
let decode_latencies: Vec<Vec<f64>> = prefill_latencies.clone();
|
||||||
(0..n_batch).map(|_| Vec::with_capacity(n_run)).collect();
|
let decode_token_latencies: Vec<Vec<f64>> = decode_latencies.clone();
|
||||||
let decode_throughputs: Vec<Vec<f64>> =
|
let decode_throughputs: Vec<Vec<f64>> = prefill_throughputs.clone();
|
||||||
(0..n_batch).map(|_| Vec::with_capacity(n_run)).collect();
|
|
||||||
|
|
||||||
let prefill_batch_latency_throughput: Vec<(f64, f64)> = Vec::with_capacity(n_batch);
|
let prefill_batch_latency_throughput: Vec<(f64, f64)> = Vec::with_capacity(n_batch);
|
||||||
|
let decode_batch_latency_throughput: Vec<(f64, f64)> =
|
||||||
let decode_batch_latency_throughput: Vec<(f64, f64)> = Vec::with_capacity(n_batch);
|
prefill_batch_latency_throughput.clone();
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
prefill_latencies,
|
prefill_latencies,
|
||||||
prefill_throughputs,
|
prefill_throughputs,
|
||||||
decode_latencies,
|
decode_latencies,
|
||||||
|
decode_token_latencies,
|
||||||
decode_throughputs,
|
decode_throughputs,
|
||||||
prefill_batch_latency_throughput,
|
prefill_batch_latency_throughput,
|
||||||
decode_batch_latency_throughput,
|
decode_batch_latency_throughput,
|
||||||
@ -394,10 +406,12 @@ impl Data {
|
|||||||
self.prefill_throughputs[batch_idx].push(prefill.throughput);
|
self.prefill_throughputs[batch_idx].push(prefill.throughput);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn push_decode(&mut self, prefill: Decode, batch_idx: usize) {
|
fn push_decode(&mut self, decode: Decode, batch_idx: usize) {
|
||||||
let latency = prefill.latency.as_millis() as f64;
|
let latency = decode.latency.as_millis() as f64;
|
||||||
|
let token_latency = decode.token_latency.as_millis() as f64;
|
||||||
self.decode_latencies[batch_idx].push(latency);
|
self.decode_latencies[batch_idx].push(latency);
|
||||||
self.decode_throughputs[batch_idx].push(prefill.throughput);
|
self.decode_token_latencies[batch_idx].push(token_latency);
|
||||||
|
self.decode_throughputs[batch_idx].push(decode.throughput);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn end_batch(&mut self, batch_idx: usize) {
|
fn end_batch(&mut self, batch_idx: usize) {
|
||||||
@ -425,12 +439,21 @@ fn progress_gauge(title: &str, label: String, progress: f64, color: Color) -> Ga
|
|||||||
.ratio(progress)
|
.ratio(progress)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Prefill or Decode text infos
|
/// Throughput paragraph
|
||||||
fn text_info<'a>(
|
fn throughput_paragraph<'a>(throughput: &Vec<f64>, name: &'static str) -> Paragraph<'a> {
|
||||||
latency: &mut Vec<f64>,
|
// Throughput average/high/low texts
|
||||||
throughput: &Vec<f64>,
|
let throughput_texts = statis_spans(throughput, "tokens/secs");
|
||||||
name: &'static str,
|
|
||||||
) -> (Paragraph<'a>, Paragraph<'a>) {
|
// Throughput block
|
||||||
|
Paragraph::new(throughput_texts).block(
|
||||||
|
Block::default()
|
||||||
|
.title(Span::raw(format!("{name} Throughput")))
|
||||||
|
.borders(Borders::ALL),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Latency paragraph
|
||||||
|
fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragraph<'a> {
|
||||||
// Latency average/high/low texts
|
// Latency average/high/low texts
|
||||||
let mut latency_texts = statis_spans(latency, "ms");
|
let mut latency_texts = statis_spans(latency, "ms");
|
||||||
|
|
||||||
@ -442,30 +465,17 @@ fn text_info<'a>(
|
|||||||
let colors = vec![Color::LightGreen, Color::LightYellow, Color::LightRed];
|
let colors = vec![Color::LightGreen, Color::LightYellow, Color::LightRed];
|
||||||
for (i, (name, value)) in latency_percentiles.iter().enumerate() {
|
for (i, (name, value)) in latency_percentiles.iter().enumerate() {
|
||||||
let span = Spans::from(vec![Span::styled(
|
let span = Spans::from(vec![Span::styled(
|
||||||
format!("{name}: {value:.4} ms"),
|
format!("{name}: {value:.2} ms"),
|
||||||
Style::default().fg(colors[i]),
|
Style::default().fg(colors[i]),
|
||||||
)]);
|
)]);
|
||||||
latency_texts.push(span);
|
latency_texts.push(span);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Throughput average/high/low texts
|
Paragraph::new(latency_texts).block(
|
||||||
let throughput_texts = statis_spans(throughput, "tokens/secs");
|
|
||||||
|
|
||||||
// Latency Block
|
|
||||||
let latency_statics = Paragraph::new(latency_texts).block(
|
|
||||||
Block::default()
|
Block::default()
|
||||||
.title(Span::raw(format!("{name} Latency")))
|
.title(Span::raw(format!("{name} Latency")))
|
||||||
.borders(Borders::ALL),
|
.borders(Borders::ALL),
|
||||||
);
|
)
|
||||||
|
|
||||||
// Throughput block
|
|
||||||
let throughput_statics = Paragraph::new(throughput_texts).block(
|
|
||||||
Block::default()
|
|
||||||
.title(Span::raw(format!("{name} Throughput")))
|
|
||||||
.borders(Borders::ALL),
|
|
||||||
);
|
|
||||||
|
|
||||||
(latency_statics, throughput_statics)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Average/High/Low spans
|
/// Average/High/Low spans
|
||||||
@ -473,14 +483,14 @@ fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Spans<'a>> {
|
|||||||
vec![
|
vec![
|
||||||
Spans::from(vec![Span::styled(
|
Spans::from(vec![Span::styled(
|
||||||
format!(
|
format!(
|
||||||
"Average: {:.4} {unit}",
|
"Average: {:.2} {unit}",
|
||||||
data.iter().sum::<f64>() / data.len() as f64
|
data.iter().sum::<f64>() / data.len() as f64
|
||||||
),
|
),
|
||||||
Style::default().fg(Color::LightBlue),
|
Style::default().fg(Color::LightBlue),
|
||||||
)]),
|
)]),
|
||||||
Spans::from(vec![Span::styled(
|
Spans::from(vec![Span::styled(
|
||||||
format!(
|
format!(
|
||||||
"Lowest: {:.4} {unit}",
|
"Lowest: {:.2} {unit}",
|
||||||
data.iter()
|
data.iter()
|
||||||
.min_by(|a, b| a.total_cmp(b))
|
.min_by(|a, b| a.total_cmp(b))
|
||||||
.unwrap_or(&std::f64::NAN)
|
.unwrap_or(&std::f64::NAN)
|
||||||
@ -489,7 +499,7 @@ fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Spans<'a>> {
|
|||||||
)]),
|
)]),
|
||||||
Spans::from(vec![Span::styled(
|
Spans::from(vec![Span::styled(
|
||||||
format!(
|
format!(
|
||||||
"Highest: {:.4} {unit}",
|
"Highest: {:.2} {unit}",
|
||||||
data.iter()
|
data.iter()
|
||||||
.max_by(|a, b| a.total_cmp(b))
|
.max_by(|a, b| a.total_cmp(b))
|
||||||
.unwrap_or(&std::f64::NAN)
|
.unwrap_or(&std::f64::NAN)
|
||||||
|
@ -17,6 +17,7 @@ pub(crate) struct Prefill {
|
|||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub(crate) struct Decode {
|
pub(crate) struct Decode {
|
||||||
pub(crate) latency: Duration,
|
pub(crate) latency: Duration,
|
||||||
|
pub(crate) token_latency: Duration,
|
||||||
pub(crate) throughput: f64,
|
pub(crate) throughput: f64,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -180,12 +181,14 @@ async fn decode(batch: Batch, client: &mut ShardedClient) -> Result<Decode, Clie
|
|||||||
|
|
||||||
// Get latency
|
// Get latency
|
||||||
let latency = start_time.elapsed();
|
let latency = start_time.elapsed();
|
||||||
|
let token_latency = latency / decode_length;
|
||||||
|
|
||||||
// Compute throughput from latency, batch size and decode length
|
// Compute throughput from latency, batch size and decode length
|
||||||
let throughput = (batch_size * decode_length) as f64 / latency.as_secs_f64();
|
let throughput = (batch_size * decode_length) as f64 / latency.as_secs_f64();
|
||||||
|
|
||||||
let step = Decode {
|
let step = Decode {
|
||||||
latency,
|
latency,
|
||||||
|
token_latency,
|
||||||
throughput,
|
throughput,
|
||||||
};
|
};
|
||||||
Ok(step)
|
Ok(step)
|
||||||
|
@ -20,11 +20,11 @@ struct Args {
|
|||||||
batch_size: Option<Vec<u32>>,
|
batch_size: Option<Vec<u32>>,
|
||||||
#[clap(default_value = "10", short, long, env)]
|
#[clap(default_value = "10", short, long, env)]
|
||||||
sequence_length: u32,
|
sequence_length: u32,
|
||||||
#[clap(default_value = "8", short,long, env)]
|
#[clap(default_value = "8", short, long, env)]
|
||||||
decode_length: u32,
|
decode_length: u32,
|
||||||
#[clap(default_value = "10", short,long, env)]
|
#[clap(default_value = "10", short, long, env)]
|
||||||
runs: usize,
|
runs: usize,
|
||||||
#[clap(default_value = "1", short,long, env)]
|
#[clap(default_value = "1", short, long, env)]
|
||||||
warmups: usize,
|
warmups: usize,
|
||||||
#[clap(default_value = "/tmp/text-generation-server-0", short, long, env)]
|
#[clap(default_value = "/tmp/text-generation-server-0", short, long, env)]
|
||||||
master_shard_uds_path: String,
|
master_shard_uds_path: String,
|
||||||
|
Loading…
Reference in New Issue
Block a user