add latency per token

This commit is contained in:
OlivierDehaene 2023-03-30 14:49:08 +02:00
parent b2d1276c16
commit 163c23f174
7 changed files with 2877 additions and 54 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 100 KiB

After

Width:  |  Height:  |  Size: 102 KiB

2801
benchmark/Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -5,6 +5,12 @@ edition = "2021"
authors = ["Olivier Dehaene"] authors = ["Olivier Dehaene"]
description = "Text Generation Benchmarking tool" description = "Text Generation Benchmarking tool"
[profile.release]
debug = 1
incremental = true
lto = "off"
panic = "abort"
[lib] [lib]
path = "src/lib.rs" path = "src/lib.rs"

View File

@@ -0,0 +1,3 @@
[toolchain]
channel = "1.67.0"
components = ["rustfmt", "clippy"]

View File

@@ -78,7 +78,7 @@ impl App {
| KeyEvent { | KeyEvent {
code: KeyCode::Tab, .. code: KeyCode::Tab, ..
} => { } => {
self.touched_tab=true; self.touched_tab = true;
self.current_tab = (self.current_tab + 1) % self.batch_size.len(); self.current_tab = (self.current_tab + 1) % self.batch_size.len();
} }
// Decrease and wrap tab // Decrease and wrap tab
@@ -86,7 +86,7 @@ impl App {
code: KeyCode::Left, code: KeyCode::Left,
.. ..
} => { } => {
self.touched_tab=true; self.touched_tab = true;
if self.current_tab > 0 { if self.current_tab > 0 {
self.current_tab -= 1; self.current_tab -= 1;
} else { } else {
@@ -186,10 +186,10 @@ impl App {
.direction(Direction::Horizontal) .direction(Direction::Horizontal)
.constraints( .constraints(
[ [
Constraint::Percentage(20), Constraint::Percentage(25),
Constraint::Percentage(30), Constraint::Percentage(25),
Constraint::Percentage(20), Constraint::Percentage(25),
Constraint::Percentage(30), Constraint::Percentage(25),
] ]
.as_ref(), .as_ref(),
) )
@@ -206,6 +206,10 @@ impl App {
.direction(Direction::Vertical) .direction(Direction::Vertical)
.constraints([Constraint::Length(8), Constraint::Length(5)].as_ref()) .constraints([Constraint::Length(8), Constraint::Length(5)].as_ref())
.split(mid[2]); .split(mid[2]);
let decode_text_latency = Layout::default()
.direction(Direction::Horizontal)
.constraints([Constraint::Percentage(50), Constraint::Percentage(50)].as_ref())
.split(decode_text[0]);
// Bottom row horizontal layout // Bottom row horizontal layout
let bottom = Layout::default() let bottom = Layout::default()
@@ -289,13 +293,15 @@ impl App {
f.render_widget(run_gauge, top[1]); f.render_widget(run_gauge, top[1]);
// Prefill text infos // Prefill text infos
let (prefill_latency_statics, prefill_throughput_statics) = text_info( let prefill_latency_block = latency_paragraph(
&mut self.data.prefill_latencies[self.current_tab], &mut self.data.prefill_latencies[self.current_tab],
&self.data.prefill_throughputs[self.current_tab],
"Prefill", "Prefill",
); );
f.render_widget(prefill_latency_statics, prefill_text[0]); let prefill_throughput_block =
f.render_widget(prefill_throughput_statics, prefill_text[1]); throughput_paragraph(&self.data.prefill_throughputs[self.current_tab], "Prefill");
f.render_widget(prefill_latency_block, prefill_text[0]);
f.render_widget(prefill_throughput_block, prefill_text[1]);
// Prefill latency histogram // Prefill latency histogram
let histo_width = 7; let histo_width = 7;
@@ -315,13 +321,19 @@ impl App {
f.render_widget(prefill_histogram, mid[1]); f.render_widget(prefill_histogram, mid[1]);
// Decode text info // Decode text info
let (decode_latency_statics, decode_throughput_statics) = text_info( let decode_latency_block = latency_paragraph(
&mut self.data.decode_latencies[self.current_tab], &mut self.data.decode_latencies[self.current_tab],
&self.data.decode_throughputs[self.current_tab], "Decode Total",
"Decode",
); );
f.render_widget(decode_latency_statics, decode_text[0]); let decode_token_latency_block = latency_paragraph(
f.render_widget(decode_throughput_statics, decode_text[1]); &mut self.data.decode_token_latencies[self.current_tab],
"Decode Token",
);
let decode_throughput_block =
throughput_paragraph(&self.data.decode_throughputs[self.current_tab], "Decode");
f.render_widget(decode_latency_block, decode_text_latency[0]);
f.render_widget(decode_token_latency_block, decode_text_latency[1]);
f.render_widget(decode_throughput_block, decode_text[1]);
// Decode latency histogram // Decode latency histogram
let histo_data = let histo_data =
@@ -357,6 +369,7 @@ struct Data {
prefill_latencies: Vec<Vec<f64>>, prefill_latencies: Vec<Vec<f64>>,
prefill_throughputs: Vec<Vec<f64>>, prefill_throughputs: Vec<Vec<f64>>,
decode_latencies: Vec<Vec<f64>>, decode_latencies: Vec<Vec<f64>>,
decode_token_latencies: Vec<Vec<f64>>,
decode_throughputs: Vec<Vec<f64>>, decode_throughputs: Vec<Vec<f64>>,
prefill_batch_latency_throughput: Vec<(f64, f64)>, prefill_batch_latency_throughput: Vec<(f64, f64)>,
decode_batch_latency_throughput: Vec<(f64, f64)>, decode_batch_latency_throughput: Vec<(f64, f64)>,
@@ -366,22 +379,21 @@ impl Data {
fn new(n_run: usize, n_batch: usize) -> Self { fn new(n_run: usize, n_batch: usize) -> Self {
let prefill_latencies: Vec<Vec<f64>> = let prefill_latencies: Vec<Vec<f64>> =
(0..n_batch).map(|_| Vec::with_capacity(n_run)).collect(); (0..n_batch).map(|_| Vec::with_capacity(n_run)).collect();
let prefill_throughputs: Vec<Vec<f64>> = let prefill_throughputs: Vec<Vec<f64>> = prefill_latencies.clone();
(0..n_batch).map(|_| Vec::with_capacity(n_run)).collect();
let decode_latencies: Vec<Vec<f64>> = let decode_latencies: Vec<Vec<f64>> = prefill_latencies.clone();
(0..n_batch).map(|_| Vec::with_capacity(n_run)).collect(); let decode_token_latencies: Vec<Vec<f64>> = decode_latencies.clone();
let decode_throughputs: Vec<Vec<f64>> = let decode_throughputs: Vec<Vec<f64>> = prefill_throughputs.clone();
(0..n_batch).map(|_| Vec::with_capacity(n_run)).collect();
let prefill_batch_latency_throughput: Vec<(f64, f64)> = Vec::with_capacity(n_batch); let prefill_batch_latency_throughput: Vec<(f64, f64)> = Vec::with_capacity(n_batch);
let decode_batch_latency_throughput: Vec<(f64, f64)> =
let decode_batch_latency_throughput: Vec<(f64, f64)> = Vec::with_capacity(n_batch); prefill_batch_latency_throughput.clone();
Self { Self {
prefill_latencies, prefill_latencies,
prefill_throughputs, prefill_throughputs,
decode_latencies, decode_latencies,
decode_token_latencies,
decode_throughputs, decode_throughputs,
prefill_batch_latency_throughput, prefill_batch_latency_throughput,
decode_batch_latency_throughput, decode_batch_latency_throughput,
@@ -394,10 +406,12 @@ impl Data {
self.prefill_throughputs[batch_idx].push(prefill.throughput); self.prefill_throughputs[batch_idx].push(prefill.throughput);
} }
fn push_decode(&mut self, prefill: Decode, batch_idx: usize) { fn push_decode(&mut self, decode: Decode, batch_idx: usize) {
let latency = prefill.latency.as_millis() as f64; let latency = decode.latency.as_millis() as f64;
let token_latency = decode.token_latency.as_millis() as f64;
self.decode_latencies[batch_idx].push(latency); self.decode_latencies[batch_idx].push(latency);
self.decode_throughputs[batch_idx].push(prefill.throughput); self.decode_token_latencies[batch_idx].push(token_latency);
self.decode_throughputs[batch_idx].push(decode.throughput);
} }
fn end_batch(&mut self, batch_idx: usize) { fn end_batch(&mut self, batch_idx: usize) {
@@ -425,12 +439,21 @@ fn progress_gauge(title: &str, label: String, progress: f64, color: Color) -> Ga
.ratio(progress) .ratio(progress)
} }
/// Prefill or Decode text infos /// Throughput paragraph
fn text_info<'a>( fn throughput_paragraph<'a>(throughput: &Vec<f64>, name: &'static str) -> Paragraph<'a> {
latency: &mut Vec<f64>, // Throughput average/high/low texts
throughput: &Vec<f64>, let throughput_texts = statis_spans(throughput, "tokens/secs");
name: &'static str,
) -> (Paragraph<'a>, Paragraph<'a>) { // Throughput block
Paragraph::new(throughput_texts).block(
Block::default()
.title(Span::raw(format!("{name} Throughput")))
.borders(Borders::ALL),
)
}
/// Latency paragraph
fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragraph<'a> {
// Latency average/high/low texts // Latency average/high/low texts
let mut latency_texts = statis_spans(latency, "ms"); let mut latency_texts = statis_spans(latency, "ms");
@@ -442,30 +465,17 @@ fn text_info<'a>(
let colors = vec![Color::LightGreen, Color::LightYellow, Color::LightRed]; let colors = vec![Color::LightGreen, Color::LightYellow, Color::LightRed];
for (i, (name, value)) in latency_percentiles.iter().enumerate() { for (i, (name, value)) in latency_percentiles.iter().enumerate() {
let span = Spans::from(vec![Span::styled( let span = Spans::from(vec![Span::styled(
format!("{name}: {value:.4} ms"), format!("{name}: {value:.2} ms"),
Style::default().fg(colors[i]), Style::default().fg(colors[i]),
)]); )]);
latency_texts.push(span); latency_texts.push(span);
} }
// Throughput average/high/low texts Paragraph::new(latency_texts).block(
let throughput_texts = statis_spans(throughput, "tokens/secs");
// Latency Block
let latency_statics = Paragraph::new(latency_texts).block(
Block::default() Block::default()
.title(Span::raw(format!("{name} Latency"))) .title(Span::raw(format!("{name} Latency")))
.borders(Borders::ALL), .borders(Borders::ALL),
); )
// Throughput block
let throughput_statics = Paragraph::new(throughput_texts).block(
Block::default()
.title(Span::raw(format!("{name} Throughput")))
.borders(Borders::ALL),
);
(latency_statics, throughput_statics)
} }
/// Average/High/Low spans /// Average/High/Low spans
@@ -473,14 +483,14 @@ fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Spans<'a>> {
vec![ vec![
Spans::from(vec![Span::styled( Spans::from(vec![Span::styled(
format!( format!(
"Average: {:.4} {unit}", "Average: {:.2} {unit}",
data.iter().sum::<f64>() / data.len() as f64 data.iter().sum::<f64>() / data.len() as f64
), ),
Style::default().fg(Color::LightBlue), Style::default().fg(Color::LightBlue),
)]), )]),
Spans::from(vec![Span::styled( Spans::from(vec![Span::styled(
format!( format!(
"Lowest: {:.4} {unit}", "Lowest: {:.2} {unit}",
data.iter() data.iter()
.min_by(|a, b| a.total_cmp(b)) .min_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN) .unwrap_or(&std::f64::NAN)
@@ -489,7 +499,7 @@ fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Spans<'a>> {
)]), )]),
Spans::from(vec![Span::styled( Spans::from(vec![Span::styled(
format!( format!(
"Highest: {:.4} {unit}", "Highest: {:.2} {unit}",
data.iter() data.iter()
.max_by(|a, b| a.total_cmp(b)) .max_by(|a, b| a.total_cmp(b))
.unwrap_or(&std::f64::NAN) .unwrap_or(&std::f64::NAN)

View File

@@ -17,6 +17,7 @@ pub(crate) struct Prefill {
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub(crate) struct Decode { pub(crate) struct Decode {
pub(crate) latency: Duration, pub(crate) latency: Duration,
pub(crate) token_latency: Duration,
pub(crate) throughput: f64, pub(crate) throughput: f64,
} }
@@ -180,12 +181,14 @@ async fn decode(batch: Batch, client: &mut ShardedClient) -> Result<Decode, Clie
// Get latency // Get latency
let latency = start_time.elapsed(); let latency = start_time.elapsed();
let token_latency = latency / decode_length;
// Compute throughput from latency, batch size and decode length // Compute throughput from latency, batch size and decode length
let throughput = (batch_size * decode_length) as f64 / latency.as_secs_f64(); let throughput = (batch_size * decode_length) as f64 / latency.as_secs_f64();
let step = Decode { let step = Decode {
latency, latency,
token_latency,
throughput, throughput,
}; };
Ok(step) Ok(step)

View File

@@ -20,11 +20,11 @@ struct Args {
batch_size: Option<Vec<u32>>, batch_size: Option<Vec<u32>>,
#[clap(default_value = "10", short, long, env)] #[clap(default_value = "10", short, long, env)]
sequence_length: u32, sequence_length: u32,
#[clap(default_value = "8", short,long, env)] #[clap(default_value = "8", short, long, env)]
decode_length: u32, decode_length: u32,
#[clap(default_value = "10", short,long, env)] #[clap(default_value = "10", short, long, env)]
runs: usize, runs: usize,
#[clap(default_value = "1", short,long, env)] #[clap(default_value = "1", short, long, env)]
warmups: usize, warmups: usize,
#[clap(default_value = "/tmp/text-generation-server-0", short, long, env)] #[clap(default_value = "/tmp/text-generation-server-0", short, long, env)]
master_shard_uds_path: String, master_shard_uds_path: String,