From bbdd26e2be9638e53ca246000113507f9a84408f Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Tue, 30 Jul 2024 18:18:39 +0200
Subject: [PATCH] Backporting telemetry.

---
 .pre-commit-config.yaml   |   4 +-
 backends/v3/src/main.rs   |   8 +++
 router/src/server.rs      | 133 +++++++++++++++++++++++++++++++++++++-
 router/src/usage_stats.rs |  30 ++++-----
 4 files changed, 155 insertions(+), 20 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 145e2b77..bb3879b2 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,8 +14,8 @@ repos:
     rev: v1.0
     hooks:
       - id: fmt
-      # - id: cargo-check
-      # - id: clippy
+      - id: cargo-check
+      - id: clippy
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.3.0
     hooks:
diff --git a/backends/v3/src/main.rs b/backends/v3/src/main.rs
index 52e13cfa..ef10514f 100644
--- a/backends/v3/src/main.rs
+++ b/backends/v3/src/main.rs
@@ -68,6 +68,10 @@ struct Args {
     disable_grammar_support: bool,
     #[clap(default_value = "4", long, env)]
     max_client_batch_size: usize,
+    #[clap(long, env, default_value_t)]
+    disable_usage_stats: bool,
+    #[clap(long, env, default_value_t)]
+    disable_crash_reports: bool,
 }
 
 #[derive(Debug, Subcommand)]
@@ -110,6 +114,8 @@ async fn main() -> Result<(), RouterError> {
         ngrok_edge,
         messages_api_enabled,
         disable_grammar_support,
+        disable_usage_stats,
+        disable_crash_reports,
         max_client_batch_size,
     } = args;
 
@@ -182,6 +188,8 @@ async fn main() -> Result<(), RouterError> {
         messages_api_enabled,
         disable_grammar_support,
         max_client_batch_size,
+        disable_usage_stats,
+        disable_crash_reports,
     )
     .await?;
     Ok(())
diff --git a/router/src/server.rs b/router/src/server.rs
index c3b32c00..24c92873 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -7,6 +7,7 @@ use crate::kserve::{
     kerve_server_metadata, kserve_health_live, kserve_health_ready, kserve_model_infer,
     kserve_model_metadata, kserve_model_metadata_ready,
 };
+use crate::usage_stats;
 use crate::validation::ValidationError;
 use crate::{
     BestOfSequence, Details, ErrorResponse, FinishReason, FunctionName, GenerateParameters,
@@ -1502,8 +1503,10 @@ pub async fn run(
     _ngrok_authtoken: Option<String>,
     _ngrok_edge: Option<String>,
     messages_api_enabled: bool,
-    grammar_support: bool,
+    disable_grammar_support: bool,
     max_client_batch_size: usize,
+    disable_usage_stats: bool,
+    disable_crash_reports: bool,
 ) -> Result<(), WebServerError> {
     // CORS allowed origins
     // map to go inside the option and then map to parse from String to HeaderValue
@@ -1690,7 +1693,44 @@ pub async fn run(
         tracing::warn!("Rust input length validation and truncation is disabled");
     }
 
-    // if pipeline-tag == text-generation we default to return_full_text = true
+    // Only send usage stats when TGI is run in a container and stats are not disabled
+    let is_container = matches!(usage_stats::is_container(), Ok(true));
+
+    let user_agent = if !disable_usage_stats && is_container {
+        let reduced_args = usage_stats::Args::new(
+            config.clone(),
+            tokenizer_config.tokenizer_class.clone(),
+            max_concurrent_requests,
+            max_best_of,
+            max_stop_sequences,
+            max_top_n_tokens,
+            max_input_tokens,
+            max_total_tokens,
+            // waiting_served_ratio,
+            // max_batch_prefill_tokens,
+            // max_batch_total_tokens,
+            // max_waiting_tokens,
+            // max_batch_size,
+            revision.clone(),
+            validation_workers,
+            messages_api_enabled,
+            disable_grammar_support,
+            max_client_batch_size,
+            disable_usage_stats,
+            disable_crash_reports,
+        );
+        Some(usage_stats::UserAgent::new(reduced_args))
+    } else {
+        None
+    };
+
+    if let Some(ref ua) = user_agent {
+        let start_event =
+            usage_stats::UsageStatsEvent::new(ua.clone(), usage_stats::EventType::Start, None);
+        tokio::spawn(async move {
+            start_event.send().await;
+        });
+    };
+
     let compat_return_full_text = match &model_info.pipeline_tag {
         None => {
             tracing::warn!("no pipeline tag found for model {tokenizer_name}");
@@ -1698,7 +1738,94 @@ pub async fn run(
         }
         Some(pipeline_tag) => pipeline_tag.as_str() == "text-generation",
     };
+    let result = start(
+        backend,
+        max_concurrent_requests,
+        max_best_of,
+        max_stop_sequences,
+        max_top_n_tokens,
+        max_input_tokens,
+        max_total_tokens,
+        validation_workers,
+        api_key,
+        config,
+        (tokenizer, tokenizer_config),
+        (preprocessor_config, processor_config),
+        hostname,
+        port,
+        ngrok,
+        _ngrok_authtoken,
+        _ngrok_edge,
+        messages_api_enabled,
+        disable_grammar_support,
+        max_client_batch_size,
+        model_info,
+        compat_return_full_text,
+        allow_origin,
+    )
+    .await;
+    if let Some(ua) = user_agent {
+        match result {
+            Ok(_) => {
+                let stop_event = usage_stats::UsageStatsEvent::new(
+                    ua.clone(),
+                    usage_stats::EventType::Stop,
+                    None,
+                );
+                stop_event.send().await;
+                Ok(())
+            }
+            Err(e) => {
+                if !disable_crash_reports {
+                    let error_event = usage_stats::UsageStatsEvent::new(
+                        ua.clone(),
+                        usage_stats::EventType::Error,
+                        Some(e.to_string()),
+                    );
+                    error_event.send().await;
+                } else {
+                    let unknown_error_event = usage_stats::UsageStatsEvent::new(
+                        ua.clone(),
+                        usage_stats::EventType::Error,
+                        Some("unknown_error".to_string()),
+                    );
+                    unknown_error_event.send().await;
+                }
+                Err(e)
+            }
+        }
+    } else {
+        result
+    }
+}
+
+#[allow(clippy::too_many_arguments)]
+async fn start(
+    backend: impl Backend + Send + Sync + 'static,
+    max_concurrent_requests: usize,
+    max_best_of: usize,
+    max_stop_sequences: usize,
+    max_top_n_tokens: u32,
+    max_input_tokens: usize,
+    max_total_tokens: usize,
+    validation_workers: usize,
+    api_key: Option<String>,
+    config: Option<Config>,
+    (tokenizer, tokenizer_config): (Option<Tokenizer>, HubTokenizerConfig),
+    (preprocessor_config, processor_config): (Option<HubPreprocessorConfig>, HubProcessorConfig),
+    hostname: String,
+    port: u16,
+    ngrok: bool,
+    _ngrok_authtoken: Option<String>,
+    _ngrok_edge: Option<String>,
+    messages_api_enabled: bool,
+    disable_grammar_support: bool,
+    max_client_batch_size: usize,
+    model_info: HubModelInfo,
+    compat_return_full_text: bool,
+    allow_origin: Option<AllowOrigin>,
+) -> Result<(), WebServerError> {
 
     // Determine the server port based on the feature and environment variable.
     let port = if cfg!(feature = "google") {
         std::env::var("AIP_HTTP_PORT")
@@ -1727,7 +1854,7 @@ pub async fn run(
         max_top_n_tokens,
         max_input_tokens,
         max_total_tokens,
-        grammar_support,
+        disable_grammar_support,
     );
 
     let infer = Infer::new(
diff --git a/router/src/usage_stats.rs b/router/src/usage_stats.rs
index 8559ae90..fa9f3637 100644
--- a/router/src/usage_stats.rs
+++ b/router/src/usage_stats.rs
@@ -78,11 +78,11 @@ pub struct Args {
     max_top_n_tokens: u32,
     max_input_tokens: usize,
     max_total_tokens: usize,
-    waiting_served_ratio: f32,
-    max_batch_prefill_tokens: u32,
-    max_batch_total_tokens: Option<u32>,
-    max_waiting_tokens: usize,
-    max_batch_size: Option<usize>,
+    // waiting_served_ratio: f32,
+    // max_batch_prefill_tokens: u32,
+    // max_batch_total_tokens: Option<u32>,
+    // max_waiting_tokens: usize,
+    // max_batch_size: Option<usize>,
     revision: Option<String>,
     validation_workers: usize,
     messages_api_enabled: bool,
@@ -103,11 +103,11 @@ impl Args {
         max_top_n_tokens: u32,
         max_input_tokens: usize,
         max_total_tokens: usize,
-        waiting_served_ratio: f32,
-        max_batch_prefill_tokens: u32,
-        max_batch_total_tokens: Option<u32>,
-        max_waiting_tokens: usize,
-        max_batch_size: Option<usize>,
+        // waiting_served_ratio: f32,
+        // max_batch_prefill_tokens: u32,
+        // max_batch_total_tokens: Option<u32>,
+        // max_waiting_tokens: usize,
+        // max_batch_size: Option<usize>,
         revision: Option<String>,
         validation_workers: usize,
         messages_api_enabled: bool,
@@ -125,11 +125,11 @@ impl Args {
             max_top_n_tokens,
             max_input_tokens,
             max_total_tokens,
-            waiting_served_ratio,
-            max_batch_prefill_tokens,
-            max_batch_total_tokens,
-            max_waiting_tokens,
-            max_batch_size,
+            // waiting_served_ratio,
+            // max_batch_prefill_tokens,
+            // max_batch_total_tokens,
+            // max_waiting_tokens,
+            // max_batch_size,
             revision,
             validation_workers,
             messages_api_enabled,
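
Backport note: the control flow this patch wires into run() is a
fire-and-forget Start event spawned at boot, followed by an awaited Stop or
Error event once start() returns; with --disable-crash-reports the error
message is redacted to "unknown_error". Both new flags default to off and,
through clap's `env` attribute, should also be settable via the
DISABLE_USAGE_STATS and DISABLE_CRASH_REPORTS environment variables. The
sketch below mirrors that lifecycle with minimal stand-in types: UserAgent,
EventType, and UsageStatsEvent here are hypothetical placeholders for the
real definitions in router/src/usage_stats.rs (which the patch only calls),
so only the call shape is taken from the diff; it compiles against tokio
alone.

    use std::time::Duration;

    #[derive(Clone)]
    struct UserAgent; // stand-in for usage_stats::UserAgent

    enum EventType {
        Start,
        Stop,
        Error,
    }

    struct UsageStatsEvent {
        _ua: UserAgent,
        _event: EventType,
        _payload: Option<String>,
    }

    impl UsageStatsEvent {
        fn new(ua: UserAgent, event: EventType, payload: Option<String>) -> Self {
            Self { _ua: ua, _event: event, _payload: payload }
        }

        // The real send() reports to the telemetry endpoint; simulated here.
        async fn send(&self) {
            tokio::time::sleep(Duration::from_millis(10)).await;
        }
    }

    #[tokio::main]
    async fn main() {
        let ua = UserAgent;

        // Start is spawned, never awaited: a slow or unreachable telemetry
        // endpoint must not delay server boot.
        let start_event = UsageStatsEvent::new(ua.clone(), EventType::Start, None);
        tokio::spawn(async move { start_event.send().await });

        // Stand-in for `start(...).await` in the patch.
        let result: Result<(), String> = Ok(());

        // Stop/Error are awaited so the final event actually goes out
        // before the process exits.
        match result {
            Ok(()) => UsageStatsEvent::new(ua, EventType::Stop, None).send().await,
            Err(e) => UsageStatsEvent::new(ua, EventType::Error, Some(e)).send().await,
        }
    }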