diff --git a/backends/v3/src/queue.rs b/backends/v3/src/queue.rs index 249eebf76..c0ac3e879 100644 --- a/backends/v3/src/queue.rs +++ b/backends/v3/src/queue.rs @@ -138,8 +138,9 @@ async fn queue_task( while let Some(cmd) = receiver.recv().await { match cmd { QueueCommand::Append(entry, span) => { - span.in_scope(|| state.append(*entry)); metrics::gauge!("tgi_queue_size").increment(1.0); + metrics::gauge!("tgi_queue_size_tokens").increment(entry.request.input_length); + span.in_scope(|| state.append(*entry)); } QueueCommand::NextBatch { min_size, @@ -154,7 +155,15 @@ async fn queue_task( .instrument(span) .await; response_sender.send(next_batch).unwrap(); + metrics::gauge!("tgi_queue_size").set(state.entries.len() as f64); + metrics::gauge!("tgi_queue_size_tokens").set( + state + .entries + .iter() + .map(|(_, e)| e.request.input_length as f64) + .sum::<f64>(), + ); } } } diff --git a/router/src/server.rs b/router/src/server.rs index e9aa4612b..293927716 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -67,6 +67,7 @@ use tracing::{info_span, instrument, Instrument}; use utoipa::OpenApi; use utoipa_swagger_ui::SwaggerUi; + fn encoding_to_tokens(encoding: &tokenizers::Encoding, input: &str) -> Vec { let offsets = encoding.get_offsets(); let input_ids = encoding.get_ids(); @@ -2171,6 +2172,11 @@ async fn start( "Current batch size" ); metrics::describe_gauge!("tgi_queue_size", metrics::Unit::Count, "Current queue size"); + metrics::describe_gauge!( + "tgi_queue_size_tokens", + metrics::Unit::Count, + "Current queue size in number of tokens" + ); metrics::describe_gauge!( "tgi_batch_current_max_tokens", metrics::Unit::Count,