From 8649de39906e7928f9d41942722f7896b86c1bf4 Mon Sep 17 00:00:00 2001
From: ErikKaumk
Date: Thu, 11 Jul 2024 15:08:49 +0200
Subject: [PATCH] draft of usage stats

---
 Cargo.lock                      |  40 ++++--
 docs/source/usage_statistics.md |  63 +++++++++
 launcher/Cargo.toml             |   1 +
 launcher/src/main.rs            |  20 +++
 router/Cargo.toml               |   3 +
 router/src/lib.rs               |   8 +-
 router/src/main.rs              |  74 +++++++++-
 router/src/usage_stats.rs       | 233 ++++++++++++++++++++++++++++++++
 8 files changed, 425 insertions(+), 17 deletions(-)
 create mode 100644 docs/source/usage_statistics.md
 create mode 100644 router/src/usage_stats.rs

diff --git a/Cargo.lock b/Cargo.lock
index 090e2e80..57cd5cc1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3424,9 +3424,9 @@ dependencies = [
 
 [[package]]
 name = "serde_json"
-version = "1.0.118"
+version = "1.0.120"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d947f6b3163d8857ea16c4fa0dd4840d52f3041039a85decd46867eb1abef2e4"
+checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5"
 dependencies = [
  "itoa",
  "ryu",
@@ -3672,15 +3672,16 @@ checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394"
 
 [[package]]
 name = "sysinfo"
-version = "0.30.12"
+version = "0.30.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "732ffa00f53e6b2af46208fba5718d9662a421049204e156328b66791ffa15ae"
+checksum = "0a5b4ddaee55fb2bea2bf0e5000747e5f5c0de765e5a5ff87f4cd106439f4bb3"
 dependencies = [
  "cfg-if",
  "core-foundation-sys",
  "libc",
  "ntapi",
  "once_cell",
+ "rayon",
  "windows",
 ]
 
@@ -3762,7 +3763,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-benchmark"
-version = "2.1.1-dev0"
+version = "2.1.2-dev0"
 dependencies = [
  "average",
  "clap",
@@ -3783,7 +3784,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-client"
-version = "2.1.1-dev0"
+version = "2.1.2-dev0"
 dependencies = [
  "async-trait",
  "base64 0.22.1",
@@ -3801,7 +3802,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-launcher"
-version = "2.1.1-dev0"
+version = "2.1.2-dev0"
 dependencies = [
  "clap",
  "ctrlc",
@@ -3812,6 +3813,7 @@ dependencies = [
  "reqwest",
  "serde",
  "serde_json",
+ "sysinfo",
  "thiserror",
  "tracing",
  "tracing-subscriber",
@@ -3820,7 +3822,7 @@
 
 [[package]]
 name = "text-generation-router"
-version = "2.1.1-dev0"
+version = "2.1.2-dev0"
 dependencies = [
  "async-stream",
  "axum 0.7.5",
@@ -3848,6 +3850,7 @@ dependencies = [
  "reqwest",
  "serde",
  "serde_json",
+ "sysinfo",
  "text-generation-client",
  "thiserror",
  "tokenizers",
@@ -3859,6 +3862,7 @@ dependencies = [
  "tracing-subscriber",
  "utoipa",
  "utoipa-swagger-ui",
+ "uuid",
  "vergen",
 ]
 
@@ -4530,9 +4534,25 @@ dependencies = [
 
 [[package]]
 name = "uuid"
-version = "1.9.1"
+version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5de17fd2f7da591098415cff336e12965a28061ddace43b59cb3c430179c9439"
+checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314"
+dependencies = [
+ "getrandom",
+ "rand",
+ "uuid-macro-internal",
+]
+
+[[package]]
+name = "uuid-macro-internal"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee1cd046f83ea2c4e920d6ee9f7c3537ef928d75dce5d84a87c2c5d6b3999a3a"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.68",
+]
 
 [[package]]
 name = "v_frame"
diff --git a/docs/source/usage_statistics.md b/docs/source/usage_statistics.md
new file mode 100644
index 00000000..995fc28f
--- /dev/null
+++ b/docs/source/usage_statistics.md
@@ -0,0 +1,63 @@
+
+# Collection of Usage Statistics
+
+Text Generation Inference collects anonymous usage statistics to help us improve the service. The collected data helps us understand how TGI is used and what causes failures. The data is collected transparently and any sensitive information is omitted.
+
+Data is sent twice: 1) on server startup and 2) when the server stops. Usage statistics are only enabled when TGI is running inside Docker, so that no data is collected when TGI runs directly on the host machine.
+
+## What data is collected
+
+The code that collects the data is available [here](https://github.com/huggingface/text-generation-inference/blob/main/router/src/usage_stats.rs).
+As of release 2.1.2, this is an example of the data collected:
+
+- From the TGI configuration:
+```json
+{
+  "event_type": "start",
+  "disable_grammar_support": false,
+  "json_output": false,
+  "max_batch_prefill_tokens": 4096,
+  "max_batch_size": null,
+  "max_batch_total_tokens": null,
+  "max_best_of": 2,
+  "max_client_batch_size": 4,
+  "max_concurrent_requests": 128,
+  "max_input_tokens": 1024,
+  "max_stop_sequences": 4,
+  "max_top_n_tokens": 5,
+  "max_total_tokens": 2048,
+  "max_waiting_tokens": 20,
+  "messages_api_enabled": false,
+  "model_config": {
+    "model_type": "bloom"
+  },
+  "ngrok": false,
+  "revision": null,
+  "tokenizer_config": {
+    "add_bos_token": null,
+    "add_eos_token": null,
+    "bos_token": "<s>",
+    "chat_template": null,
+    "completion_template": null,
+    "eos_token": "</s>",
+    "tokenizer_class": "BloomTokenizerFast"
+  },
+  "validation_workers": 2,
+  "waiting_served_ratio": 1.2,
+  "docker_label": "latest",
+  "git_sha": "245d3de94877d4910ea7876f6bd19b4d7d4a47d9",
+  "nvidia_env": "N/A",
+  "system_env": {
+    "architecture": "aarch64",
+    "cpu_count": 16,
+    "cpu_type": "Apple M3",
+    "platform": "macos-unix-aarch64",
+    "total_memory": 25769803767
+  },
+  "xpu_env": "N/A"
+}
+```
+
+## How to opt-out
+
+You can opt out by passing the `--disable-usage-stats` flag to the `text-generation-launcher` command; this disables all usage statistics. You can also pass `--disable-crash-reports`, which disables the sending of crash reports but still allows anonymous usage statistics.
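+
+For example, a launch that sends no telemetry at all (the model id here is only a placeholder):
+`text-generation-launcher --model-id <model-id> --disable-usage-stats`.
+The two flags are independent, so `--disable-crash-reports` can be passed on its own to keep anonymous statistics while suppressing crash reports.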
diff --git a/launcher/Cargo.toml b/launcher/Cargo.toml
index eb219423..4ab915de 100644
--- a/launcher/Cargo.toml
+++ b/launcher/Cargo.toml
@@ -14,6 +14,7 @@ nix = { version = "0.28.0", features = ["signal"] }
 once_cell = "1.19.0"
 serde = { version = "1.0.188", features = ["derive"] }
 serde_json = "1.0.107"
+sysinfo = "0.30.12"
 thiserror = "1.0.59"
 tracing = "0.1.37"
 tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index d2ca38e5..35eb6e04 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -457,6 +457,14 @@ struct Args {
     /// startup that will be available to callers via the `adapter_id` field in a request.
     #[clap(long, env)]
     lora_adapters: Option<String>,
+
+    /// Disable sending of all usage statistics
+    #[clap(default_value = "false", long, env)]
+    disable_usage_stats: bool,
+
+    /// Disable sending of crash reports, but allow anonymous usage statistics
+    #[clap(default_value = "false", long, env)]
+    disable_crash_reports: bool,
 }
 
 #[derive(Debug)]
@@ -1199,8 +1207,20 @@ fn spawn_webserver(
         format!("{}-0", args.shard_uds_path),
         "--tokenizer-name".to_string(),
         args.model_id,
     ];
 
+    // Pass usage stats flags to router
+    if args.disable_usage_stats {
+        router_args.push("--disable-usage-stats".to_string());
+    }
+    if args.disable_crash_reports {
+        router_args.push("--disable-crash-reports".to_string());
+    }
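+    // NOTE: the router declares these as bare boolean flags (clap sets them to
+    // `true` when they are present), so no value argument is forwarded.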
+
     // Grammar support
     if args.disable_grammar_support {
         router_args.push("--disable-grammar-support".to_string());
diff --git a/router/Cargo.toml b/router/Cargo.toml
index 5855ac86..1a7611bf 100644
--- a/router/Cargo.toml
+++ b/router/Cargo.toml
@@ -52,6 +52,9 @@ regex = "1.10.3"
 once_cell = "1.19.0"
 image = "0.25.1"
 base64 = { workspace = true }
+sysinfo = "0.30.13"
+uuid = { version = "1.9.1", default-features = false, features = ["v4", "fast-rng", "macro-diagnostics"] }
+
 
 [build-dependencies]
 vergen = { version = "8.2.5", features = ["build", "git", "gitcl"] }
diff --git a/router/src/lib.rs b/router/src/lib.rs
index 165b2ad2..fbbca8bb 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -7,6 +7,8 @@ mod validation;
 #[cfg(feature = "kserve")]
 mod kserve;
 
+pub mod usage_stats;
+
 use serde::{Deserialize, Serialize};
 use tracing::warn;
 use utoipa::ToSchema;
@@ -40,13 +42,13 @@ pub struct HubModelInfo {
     pub pipeline_tag: Option<String>,
 }
 
-#[derive(Debug, Clone, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct ChatTemplate {
     name: String,
     template: String,
 }
 
-#[derive(Debug, Clone, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 #[serde(untagged)]
 pub enum ChatTemplateVersions {
     Single(String),
@@ -55,7 +57,7 @@ pub enum ChatTemplateVersions {
 
 use std::path::Path;
 
-#[derive(Debug, Clone, Deserialize, Default)]
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
 pub struct HubTokenizerConfig {
     pub chat_template: Option<ChatTemplateVersions>,
     pub completion_template: Option<String>,
diff --git a/router/src/main.rs b/router/src/main.rs
index 21cd6649..fde9c6e3 100644
--- a/router/src/main.rs
+++ b/router/src/main.rs
@@ -23,6 +23,7 @@ use tower_http::cors::AllowOrigin;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
 use tracing_subscriber::{filter::LevelFilter, EnvFilter, Layer};
+use text_generation_router::usage_stats;
 
 /// App Configuration
 #[derive(Parser, Debug)]
@@ -87,6 +88,10 @@ struct Args {
     disable_grammar_support: bool,
     #[clap(default_value = "4", long, env)]
     max_client_batch_size: usize,
+    #[clap(long, env, default_value_t)]
+    disable_usage_stats: bool,
+    #[clap(long, env, default_value_t)]
+    disable_crash_reports: bool,
 }
 
 #[derive(Debug, Subcommand)]
@@ -128,9 +133,11 @@ async fn main() -> Result<(), RouterError> {
         messages_api_enabled,
         disable_grammar_support,
         max_client_batch_size,
+        disable_usage_stats,
+        disable_crash_reports,
         command,
     } = args;
 
     let print_schema_command = match command {
         Some(Commands::PrintSchema) => true,
         None => {
@@ -374,8 +381,49 @@ async fn main() -> Result<(), RouterError> {
         }
     };
 
+    // Only send usage stats when TGI is run in Docker
+    let is_docker = option_env!("DOCKER_LABEL").is_some();
+
+    let user_agent = if !disable_usage_stats && is_docker {
+        let reduced_args = usage_stats::Args::new(
+            config.clone(),
+            tokenizer_config.clone(),
+            max_concurrent_requests,
+            max_best_of,
+            max_stop_sequences,
+            max_top_n_tokens,
+            max_input_tokens,
+            max_total_tokens,
+            waiting_served_ratio,
+            max_batch_prefill_tokens,
+            max_batch_total_tokens,
+            max_waiting_tokens,
+            max_batch_size,
+            revision,
+            validation_workers,
+            json_output,
+            ngrok,
+            messages_api_enabled,
+            disable_grammar_support,
+            max_client_batch_size,
+            disable_usage_stats,
+            disable_crash_reports,
+        );
+        Some(usage_stats::UserAgent::new(reduced_args))
+    } else {
+        None
+    };
+
+    if let Some(ref ua) = user_agent {
+        let start_event =
+            usage_stats::UsageStatsEvent::new(ua.clone(), usage_stats::EventType::Start);
+        tokio::spawn(async move {
+            start_event.send().await;
+        });
+    };
+
     // Run server
-    server::run(
+    let result = server::run(
         master_shard_uds_path,
         model_info,
         compat_return_full_text,
@@ -406,8 +454,26 @@ async fn main() -> Result<(), RouterError> {
         max_client_batch_size,
         print_schema_command,
     )
-    .await?;
-    Ok(())
+    .await;
+
+    match result {
+        Ok(_) => {
+            if let Some(ref ua) = user_agent {
+                let stop_event = usage_stats::UsageStatsEvent::new(ua.clone(), usage_stats::EventType::Stop);
+                stop_event.send().await;
+            };
+            Ok(())
+        }
+        Err(e) => {
+            if let Some(ref ua) = user_agent {
+                if !disable_crash_reports {
+                    let error_event = usage_stats::UsageStatsEvent::new(ua.clone(), usage_stats::EventType::Error(e.to_string()));
+                    error_event.send().await;
+                }
+            };
+            Err(RouterError::WebServer(e))
+        }
+    }
 }
 
 /// Init logging using env variables LOG_LEVEL and LOG_FORMAT:
diff --git a/router/src/usage_stats.rs b/router/src/usage_stats.rs
new file mode 100644
index 00000000..bac1b9d8
--- /dev/null
+++ b/router/src/usage_stats.rs
@@ -0,0 +1,233 @@
+use crate::{config::Config, HubTokenizerConfig};
+use reqwest::header::HeaderMap;
+use serde::Serialize;
+use uuid::Uuid;
+use std::{fmt, process::Command, time::Duration};
+
+const TELEMETRY_URL: &str = "https://huggingface.co/api/telemetry/tgi";
+
+#[derive(Debug, Clone, Serialize)]
+pub struct UserAgent {
+    pub uid: String,
+    pub args: Args,
+    pub env: Env,
+}
+
+impl UserAgent {
+    pub fn new(reduced_args: Args) -> Self {
+        Self {
+            uid: Uuid::new_v4().to_string(),
+            args: reduced_args,
+            env: Env::new(),
+        }
+    }
+}
+
+#[derive(Serialize, Debug)]
+pub enum EventType {
+    Start,
+    Stop,
+    Error(String),
+}
+
+#[derive(Debug, Serialize)]
+pub struct UsageStatsEvent {
+    user_agent: UserAgent,
+    event_type: EventType,
+}
+
+impl UsageStatsEvent {
+    pub fn new(user_agent: UserAgent, event_type: EventType) -> Self {
+        Self {
+            user_agent,
+            event_type,
+        }
+    }
+    pub async fn send(&self) {
+        let mut headers = HeaderMap::new();
+        headers.insert("Content-Type", "application/json".parse().unwrap());
+        let body = serde_json::to_string(&self).unwrap();
+        let client = reqwest::Client::new();
+        // Fire and forget: telemetry failures must never take the server down.
+        let _ = client
+            .post(TELEMETRY_URL)
+            .headers(headers)
+            .body(body)
+            .timeout(Duration::from_secs(5))
+            .send()
+            .await;
+    }
+}
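+
+// A minimal usage sketch, mirroring how `main.rs` wires this up
+// (`reduced_args` stands for an `Args` built from the router's CLI values):
+//
+//     let user_agent = UserAgent::new(reduced_args);
+//     let event = UsageStatsEvent::new(user_agent.clone(), EventType::Start);
+//     tokio::spawn(async move { event.send().await });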
+
+#[derive(Debug, Clone, Serialize)]
+pub struct Args {
+    model_config: Option<Config>,
+    tokenizer_config: HubTokenizerConfig,
+    max_concurrent_requests: usize,
+    max_best_of: usize,
+    max_stop_sequences: usize,
+    max_top_n_tokens: u32,
+    max_input_tokens: usize,
+    max_total_tokens: usize,
+    waiting_served_ratio: f32,
+    max_batch_prefill_tokens: u32,
+    max_batch_total_tokens: Option<u32>,
+    max_waiting_tokens: usize,
+    max_batch_size: Option<usize>,
+    revision: Option<String>,
+    validation_workers: usize,
+    json_output: bool,
+    ngrok: bool,
+    messages_api_enabled: bool,
+    disable_grammar_support: bool,
+    max_client_batch_size: usize,
+    disable_usage_stats: bool,
+    disable_crash_reports: bool,
+}
+
+impl Args {
+    pub fn new(
+        model_config: Option<Config>,
+        tokenizer_config: HubTokenizerConfig,
+        max_concurrent_requests: usize,
+        max_best_of: usize,
+        max_stop_sequences: usize,
+        max_top_n_tokens: u32,
+        max_input_tokens: usize,
+        max_total_tokens: usize,
+        waiting_served_ratio: f32,
+        max_batch_prefill_tokens: u32,
+        max_batch_total_tokens: Option<u32>,
+        max_waiting_tokens: usize,
+        max_batch_size: Option<usize>,
+        revision: Option<String>,
+        validation_workers: usize,
+        json_output: bool,
+        ngrok: bool,
+        messages_api_enabled: bool,
+        disable_grammar_support: bool,
+        max_client_batch_size: usize,
+        disable_usage_stats: bool,
+        disable_crash_reports: bool,
+    ) -> Self {
+        Self {
+            model_config,
+            tokenizer_config,
+            max_concurrent_requests,
+            max_best_of,
+            max_stop_sequences,
+            max_top_n_tokens,
+            max_input_tokens,
+            max_total_tokens,
+            waiting_served_ratio,
+            max_batch_prefill_tokens,
+            max_batch_total_tokens,
+            max_waiting_tokens,
+            max_batch_size,
+            revision,
+            validation_workers,
+            json_output,
+            ngrok,
+            messages_api_enabled,
+            disable_grammar_support,
+            max_client_batch_size,
+            disable_usage_stats,
+            disable_crash_reports,
+        }
+    }
+}
+
+/// This is more or less a copy of the code from the `text-generation-launcher` crate to avoid a dependency
+#[derive(Serialize, Debug, Clone)]
+pub struct Env {
+    git_sha: &'static str,
+    docker_label: &'static str,
+    nvidia_env: String,
+    xpu_env: String,
+    system_env: SystemInfo,
+}
+
+#[derive(Serialize, Debug, Clone)]
+pub struct SystemInfo {
+    cpu_count: usize,
+    cpu_type: String,
+    total_memory: u64,
+    architecture: String,
+    platform: String,
+}
+
+impl SystemInfo {
+    fn new() -> Self {
+        let mut system = sysinfo::System::new_all();
+        system.refresh_all();
+
+        let cpu_count = system.cpus().len();
+        let cpu_type = system.cpus()[0].brand().to_string();
+        let total_memory = system.total_memory();
+        let architecture = std::env::consts::ARCH.to_string();
+        let platform = format!(
+            "{}-{}-{}",
+            std::env::consts::OS,
+            std::env::consts::FAMILY,
+            std::env::consts::ARCH
+        );
+        Self {
+            cpu_count,
+            cpu_type,
+            total_memory,
+            architecture,
+            platform,
+        }
+    }
+}
+
+impl fmt::Display for SystemInfo {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        writeln!(f, "CPU Count: {}", self.cpu_count)?;
+        writeln!(f, "CPU Type: {}", self.cpu_type)?;
+        writeln!(f, "Total Memory: {}", self.total_memory)?;
+        writeln!(f, "Architecture: {}", self.architecture)?;
+        writeln!(f, "Platform: {}", self.platform)?;
+        Ok(())
+    }
+}
+
+impl Env {
+    pub fn new() -> Self {
+        let nvidia_env = nvidia_smi();
+        let xpu_env = xpu_smi();
+        let system_env = SystemInfo::new();
+
+        Self {
+            system_env,
+            nvidia_env: nvidia_env.unwrap_or("N/A".to_string()),
+            xpu_env: xpu_env.unwrap_or("N/A".to_string()),
+            git_sha: option_env!("VERGEN_GIT_SHA").unwrap_or("N/A"),
+            docker_label: option_env!("DOCKER_LABEL").unwrap_or("N/A"),
+        }
+    }
+}
+
+impl fmt::Display for Env {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        writeln!(f, "Runtime environment:")?;
+        writeln!(f, "Commit sha: {}", self.git_sha)?;
+        writeln!(f, "Docker label: {}", self.docker_label)?;
+        writeln!(f, "nvidia-smi:\n{}", self.nvidia_env)?;
+        write!(f, "xpu-smi:\n{}\n", self.xpu_env)?;
+        write!(f, "System:\n{}", self.system_env)?;
+
+        Ok(())
+    }
+}
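+
+// Both probes below shell out to a vendor CLI; if the binary is not on PATH,
+// `Command::output` fails, the function returns `None`, and `Env::new` falls
+// back to "N/A".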
+
+fn nvidia_smi() -> Option<String> {
+    let output = Command::new("nvidia-smi").output().ok()?;
+    let nvidia_smi = String::from_utf8(output.stdout).ok()?;
+    let output = nvidia_smi.replace('\n', "\n ");
+    Some(output.trim().to_string())
+}
+
+fn xpu_smi() -> Option<String> {
+    let output = Command::new("xpu-smi").arg("discovery").output().ok()?;
+    let xpu_smi = String::from_utf8(output.stdout).ok()?;
+    let output = xpu_smi.replace('\n', "\n ");
+    Some(output.trim().to_string())
+}