draft of usage stats

2025-09-12 21:04:53 +00:00 · 2024-07-11 15:08:49 +02:00 · 2024-07-11 15:08:49 +02:00 · 8649de3990
commit 8649de3990
parent 245d3de948
8 changed files with 425 additions and 17 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -3424,9 +3424,9 @@ dependencies = [
 [[package]]
 name = "serde_json"
-version = "1.0.118"
+version = "1.0.120"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d947f6b3163d8857ea16c4fa0dd4840d52f3041039a85decd46867eb1abef2e4"
+checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5"
 dependencies = [
 "itoa",
 "ryu",
@ -3672,15 +3672,16 @@ checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394"
 [[package]]
 name = "sysinfo"
-version = "0.30.12"
+version = "0.30.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "732ffa00f53e6b2af46208fba5718d9662a421049204e156328b66791ffa15ae"
+checksum = "0a5b4ddaee55fb2bea2bf0e5000747e5f5c0de765e5a5ff87f4cd106439f4bb3"
 dependencies = [
 "cfg-if",
 "core-foundation-sys",
 "libc",
 "ntapi",
 "once_cell",
 "rayon",
 "windows",
 ]
@ -3762,7 +3763,7 @@ dependencies = [
 [[package]]
 name = "text-generation-benchmark"
-version = "2.1.1-dev0"
+version = "2.1.2-dev0"
 dependencies = [
 "average",
 "clap",
@ -3783,7 +3784,7 @@ dependencies = [
 [[package]]
 name = "text-generation-client"
-version = "2.1.1-dev0"
+version = "2.1.2-dev0"
 dependencies = [
 "async-trait",
 "base64 0.22.1",
@ -3801,7 +3802,7 @@ dependencies = [
 [[package]]
 name = "text-generation-launcher"
-version = "2.1.1-dev0"
+version = "2.1.2-dev0"
 dependencies = [
 "clap",
 "ctrlc",
@ -3812,6 +3813,7 @@ dependencies = [
 "reqwest",
 "serde",
 "serde_json",
 "sysinfo",
 "thiserror",
 "tracing",
 "tracing-subscriber",
@ -3820,7 +3822,7 @@ dependencies = [
 [[package]]
 name = "text-generation-router"
-version = "2.1.1-dev0"
+version = "2.1.2-dev0"
 dependencies = [
 "async-stream",
 "axum 0.7.5",
@ -3848,6 +3850,7 @@ dependencies = [
 "reqwest",
 "serde",
 "serde_json",
 "sysinfo",
 "text-generation-client",
 "thiserror",
 "tokenizers",
@ -3859,6 +3862,7 @@ dependencies = [
 "tracing-subscriber",
 "utoipa",
 "utoipa-swagger-ui",
 "uuid",
 "vergen",
 ]
@ -4530,9 +4534,25 @@ dependencies = [
 [[package]]
 name = "uuid"
-version = "1.9.1"
+version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5de17fd2f7da591098415cff336e12965a28061ddace43b59cb3c430179c9439"
+checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314"
 dependencies = [
 "getrandom",
 "rand",
 "uuid-macro-internal",
 ]
 [[package]]
 name = "uuid-macro-internal"
 version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ee1cd046f83ea2c4e920d6ee9f7c3537ef928d75dce5d84a87c2c5d6b3999a3a"
 dependencies = [
 "proc-macro2",
 "quote",
 "syn 2.0.68",
 ]
 [[package]]
 name = "v_frame"
--- a/docs/source/usage_statistics.md
+++ b/docs/source/usage_statistics.md
@ -0,0 +1,63 @@
 # Collection of Usage Statistics
 Text Generation Inference collects anonymous usage statistics to help us improve the service. The collected data is used to improve TGI and to understand what causes failures. The data is collected transparently and any sensitive information is omitted.
 Data is sent twice: 1) on server startup and 2) when the server stops. Also, usage statistics are only enabled when TGI is running in Docker, to avoid collecting data when TGI runs directly on the host machine.
 ## What data is collected
 The code that collects the data is available [here](https://github.com/huggingface/text-generation-inference/blob/main/router/src/usage_stats.rs).
 As of release 2.1.2 this is an example of the data collected:
 - From the TGI configuration:
 ```json
 {
  "event_type": "start",           
  "disable_grammar_support": false,
  "json_output": false,
  "max_batch_prefill_tokens": 4096,
  "max_batch_size": null,
  "max_batch_total_tokens": null,
  "max_best_of": 2,
  "max_client_batch_size": 4,
  "max_concurrent_requests": 128,
  "max_input_tokens": 1024,
  "max_stop_sequences": 4,
  "max_top_n_tokens": 5,
  "max_total_tokens": 2048,
  "max_waiting_tokens": 20,
  "messages_api_enabled": false,
  "model_config": {
      "model_type": "bloom"
  },
  "ngrok": false,
  "revision": null,
  "tokenizer_config": {
      "add_bos_token": null,
      "add_eos_token": null,
      "bos_token": "<s>",
      "chat_template": null,
      "completion_template": null,
      "eos_token": "</s>",
      "tokenizer_class": "BloomTokenizerFast"
  },
  "validation_workers": 2,
  "waiting_served_ratio": 1.2,
  "docker_label": "latest",
  "git_sha": "245d3de94877d4910ea7876f6bd19b4d7d4a47d9",
  "nvidia_env": "N/A",
  "system_env": {
      "architecture": "aarch64",
      "cpu_count": 16,
      "cpu_type": "Apple M3",
      "platform": "macos-unix-aarch64",
      "total_memory": 25769803767
  },
  "xpu_env": "N/A"
 }
 ```
 ## How to opt-out
 You can easily opt out by passing the `--disable-usage-stats` flag to the text-generation-launcher command. This will disable all usage statistics. You can also pass `--disable-crash-reports`, which disables sending of crash reports but still allows anonymous usage statistics.
--- a/launcher/Cargo.toml
+++ b/launcher/Cargo.toml
@ -14,6 +14,7 @@ nix = { version = "0.28.0", features = ["signal"] }
 once_cell = "1.19.0"
 serde = { version = "1.0.188", features = ["derive"] }
 serde_json = "1.0.107"
 sysinfo = "0.30.12"
 thiserror = "1.0.59"
 tracing = "0.1.37"
 tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@ -457,6 +457,14 @@ struct Args {
    /// startup that will be available to callers via the `adapter_id` field in a request.
    #[clap(long, env)]
    lora_adapters: Option<String>,
    /// Disable sending of all usage statistics
    #[clap(default_value = "false", long, env)]
    disable_usage_stats: bool,
    /// Disable sending of crash reports, but allow anonymous usage statistics
    #[clap(default_value = "false", long, env)]
    disable_crash_reports: bool,
 }
 #[derive(Debug)]
@ -1199,8 +1207,20 @@ fn spawn_webserver(
        format!("{}-0", args.shard_uds_path),
        "--tokenizer-name".to_string(),
        args.model_id,
    ];
    // Forward the telemetry opt-outs to the router. Both are value-less boolean
    // switches on the router side, so each flag is appended only when set and is
    // never followed by a "true"/"false" value.
    if args.disable_usage_stats {
        router_args.push("--disable-usage-stats".to_string());
    }
    if args.disable_crash_reports {
        router_args.push("--disable-crash-reports".to_string());
    }
    // Grammar support
    if args.disable_grammar_support {
        router_args.push("--disable-grammar-support".to_string());
--- a/router/Cargo.toml
+++ b/router/Cargo.toml
@ -52,6 +52,9 @@ regex = "1.10.3"
 once_cell = "1.19.0"
 image = "0.25.1"
 base64 = { workspace = true }
 sysinfo = "0.30.13"
 uuid = { version = "1.9.1", default-features = false, features = ["v4", "fast-rng", "macro-diagnostics"] }
 [build-dependencies]
 vergen = { version = "8.2.5", features = ["build", "git", "gitcl"] }
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@ -7,6 +7,8 @@ mod validation;
 #[cfg(feature = "kserve")]
 mod kserve;
 pub mod usage_stats;
 use serde::{Deserialize, Serialize};
 use tracing::warn;
 use utoipa::ToSchema;
@ -40,13 +42,13 @@ pub struct HubModelInfo {
    pub pipeline_tag: Option<String>,
 }
-#[derive(Debug, Clone, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct ChatTemplate {
    name: String,
    template: String,
 }
-#[derive(Debug, Clone, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 #[serde(untagged)]
 pub enum ChatTemplateVersions {
    Single(String),
@ -55,7 +57,7 @@ pub enum ChatTemplateVersions {
 use std::path::Path;
-#[derive(Debug, Clone, Deserialize, Default)]
+#[derive(Debug, Clone, Serialize ,Deserialize, Default)]
 pub struct HubTokenizerConfig {
    pub chat_template: Option<ChatTemplateVersions>,
    pub completion_template: Option<String>,
--- a/router/src/main.rs
+++ b/router/src/main.rs
@ -23,6 +23,7 @@ use tower_http::cors::AllowOrigin;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
 use tracing_subscriber::{filter::LevelFilter, EnvFilter, Layer};
 use text_generation_router::usage_stats;
 /// App Configuration
 #[derive(Parser, Debug)]
@ -87,6 +88,10 @@ struct Args {
    disable_grammar_support: bool,
    #[clap(default_value = "4", long, env)]
    max_client_batch_size: usize,
    #[clap(long, env, default_value_t)]
    disable_usage_stats: bool,
    #[clap(long, env, default_value_t)]
    disable_crash_reports: bool,
 }
 #[derive(Debug, Subcommand)]
@ -128,6 +133,8 @@ async fn main() -> Result<(), RouterError> {
        messages_api_enabled,
        disable_grammar_support,
        max_client_batch_size,
        disable_usage_stats,
        disable_crash_reports,
        command,
    } = args;
@ -374,8 +381,49 @@ async fn main() -> Result<(), RouterError> {
        }
    };
    // Only send usage stats when TGI is run in docker
    let is_docker = option_env!("DOCKER_LABEL").is_some();
    let user_agent = if !disable_usage_stats && is_docker {
        let reducded_args = usage_stats::Args::new(
            config.clone(),
            tokenizer_config.clone(),
            max_concurrent_requests,
            max_best_of,
            max_stop_sequences,
            max_top_n_tokens,
            max_input_tokens,
            max_total_tokens,
            waiting_served_ratio,
            max_batch_prefill_tokens,
            max_batch_total_tokens,
            max_waiting_tokens,
            max_batch_size,
            revision,
            validation_workers,
            json_output,
            ngrok,
            messages_api_enabled,
            disable_grammar_support,
            max_client_batch_size,
            disable_usage_stats,
            disable_crash_reports,
        );
        Some(usage_stats::UserAgent::new(reducded_args))
    }
    else {
        None
    };
    if let Some(ref ua) = user_agent {
        let start_event = usage_stats::UsageStatsEvent::new(ua.clone(), usage_stats::EventType::Start);
        tokio::spawn(async move {
            start_event.send().await;
        });
    }; 
    // Run server
-    server::run(
+    let result = server::run(
        master_shard_uds_path,
        model_info,
        compat_return_full_text,
@ -406,9 +454,27 @@ async fn main() -> Result<(), RouterError> {
        max_client_batch_size,
        print_schema_command,
    )
-    .await?;
+    .await;
    match result {
        Ok(_) => {
            if let Some(ref ua) = user_agent {
                let stop_event = usage_stats::UsageStatsEvent::new(ua.clone(), usage_stats::EventType::Stop);
                stop_event.send().await;
            };
            Ok(())
        }
        Err(e) => {
            if let Some(ref ua) = user_agent {
                if !disable_crash_reports {
                    let error_event = usage_stats::UsageStatsEvent::new(ua.clone(), usage_stats::EventType::Error(e.to_string()));
                    error_event.send().await;
                }
            };
            Err(RouterError::WebServer(e))
        }
    }
 }
 /// Init logging using env variables LOG_LEVEL and LOG_FORMAT:
 ///     - otlp_endpoint is an optional URL to an Open Telemetry collector
--- a/router/src/usage_stats.rs
+++ b/router/src/usage_stats.rs
@ -0,0 +1,233 @@
 use crate::{config::Config, HubTokenizerConfig};
 use reqwest::header::HeaderMap;
 use serde::Serialize;
 use uuid::Uuid;
 use std::{fmt, process::Command, time::Duration};
 /// Endpoint that usage-statistics events are POSTed to.
 const TELEMETRY_URL: &str = "https://huggingface.co/api/telemetry/tgi";
 /// Description of one running instance: a random id plus its arguments and environment.
 #[derive(Debug, Clone, Serialize)]
 pub struct UserAgent {
    /// Random UUID (v4) rendered as a string; generated in `UserAgent::new`.
    pub uid: String,
    /// Reduced set of launch arguments that is reported.
    pub args: Args,
    /// Build and host environment information.
    pub env: Env,
 }
 impl UserAgent {
    /// Builds a user agent for this process.
    ///
    /// A fresh random v4 UUID is generated as `uid`, `reduced_args` is stored as-is and the
    /// environment is captured through `Env::new()`.
    pub fn new(reduced_args: Args) -> Self {
        let uid = Uuid::new_v4().to_string();
        let env = Env::new();
        Self {
            uid,
            args: reduced_args,
            env,
        }
    }
 }
 /// Kind of lifecycle event being reported.
 ///
 /// No serde attributes are applied, so variants are serialized with serde's default enum
 /// representation.
 #[derive(Serialize, Debug)]
 pub enum  EventType {
    /// Reported before the server starts running.
    Start,
    /// Reported after the server returned successfully.
    Stop,
    /// Reported after the server returned an error; carries the error rendered as text.
    Error(String),
 }
 /// A single usage-statistics event: who is reporting and what happened.
 #[derive(Debug, Serialize)]
 pub struct UsageStatsEvent {
    /// Identity, arguments and environment of the reporting instance.
    user_agent: UserAgent,
    /// The lifecycle event being reported.
    event_type: EventType,
 }
 impl UsageStatsEvent {
    /// Creates an event of kind `event_type` reported by `user_agent`.
    pub fn new(user_agent: UserAgent, event_type: EventType) -> Self {
        Self {
            user_agent,
            event_type,
        }
    }
    /// Serializes the event to JSON and POSTs it to `TELEMETRY_URL`.
    ///
    /// Sending is best-effort: serialization or transport failures are ignored and the
    /// request is bounded by a 5 second timeout, so telemetry can never fail the caller.
    pub async fn send(&self) {
        let mut headers = HeaderMap::new();
        headers.insert(
            reqwest::header::CONTENT_TYPE,
            reqwest::header::HeaderValue::from_static("application/json"),
        );
        // Telemetry must never panic the server, so give up quietly if the payload
        // cannot be serialized.
        let body = match serde_json::to_string(self) {
            Ok(body) => body,
            Err(_) => return,
        };
        let client = reqwest::Client::new();
        let _ = client
            .post(TELEMETRY_URL)
            // Attach the headers so the body is actually labelled as JSON.
            .headers(headers)
            .body(body)
            .timeout(Duration::from_secs(5))
            .send()
            .await;
    }
 }
 /// Reduced set of router arguments that is included in a usage-statistics payload.
 ///
 /// Fields are serialized under their own names; see `Args::new` for how they are filled.
 #[derive(Debug, Clone, Serialize)]
 pub struct Args {
    model_config: Option<Config>,
    tokenizer_config: HubTokenizerConfig,
    max_concurrent_requests: usize,
    max_best_of: usize,
    max_stop_sequences: usize,
    max_top_n_tokens: u32,
    max_input_tokens: usize,
    max_total_tokens: usize,
    waiting_served_ratio: f32,
    max_batch_prefill_tokens: u32,
    max_batch_total_tokens: Option<u32>,
    max_waiting_tokens: usize,
    max_batch_size: Option<usize>,
    revision: Option<String>,
    validation_workers: usize,
    json_output: bool,
    ngrok: bool,
    messages_api_enabled: bool,
    disable_grammar_support: bool,
    max_client_batch_size: usize,
    disable_usage_stats: bool,
    disable_crash_reports: bool,
 }
 impl Args {
    /// Builds the reported argument set.
    ///
    /// Every parameter is stored unchanged in the field of the same name. The parameters
    /// are positional and several share a type, so call sites must keep this exact order.
    pub fn new(
        model_config: Option<Config>,
        tokenizer_config: HubTokenizerConfig,
        max_concurrent_requests: usize,
        max_best_of: usize,
        max_stop_sequences: usize,
        max_top_n_tokens: u32,
        max_input_tokens: usize,
        max_total_tokens: usize,
        waiting_served_ratio: f32,
        max_batch_prefill_tokens: u32,
        max_batch_total_tokens: Option<u32>,
        max_waiting_tokens: usize,
        max_batch_size: Option<usize>,
        revision: Option<String>,
        validation_workers: usize,
        json_output: bool,
        ngrok: bool,
        messages_api_enabled: bool,
        disable_grammar_support: bool,
        max_client_batch_size: usize,
        disable_usage_stats: bool,
        disable_crash_reports: bool,
    ) -> Self {
        Self {
            model_config,
            tokenizer_config,
            max_concurrent_requests,
            max_best_of,
            max_stop_sequences,
            max_top_n_tokens,
            max_input_tokens,
            max_total_tokens,
            waiting_served_ratio,
            max_batch_prefill_tokens,
            max_batch_total_tokens,
            max_waiting_tokens,
            max_batch_size,
            revision,
            validation_workers,
            json_output,
            ngrok,
            messages_api_enabled,
            disable_grammar_support,
            max_client_batch_size,
            disable_usage_stats,
            disable_crash_reports,
        }
    }
 }
 /// Build and runtime environment reported with usage statistics.
 ///
 /// This is more or less a copy of the code from the `text-generation-launcher` crate to avoid a dependency
 #[derive(Serialize, Debug, Clone)]
 pub struct Env {
    /// Value of `VERGEN_GIT_SHA` at compile time, or `"N/A"` when it was not set.
    git_sha: &'static str,
    /// Value of `DOCKER_LABEL` at compile time, or `"N/A"` when it was not set.
    docker_label: &'static str,
    /// Captured output of `nvidia-smi`, or `"N/A"` when it could not be obtained.
    nvidia_env: String,
    /// Captured output of `xpu-smi discovery`, or `"N/A"` when it could not be obtained.
    xpu_env: String,
    /// CPU, memory and platform details of the host.
    system_env: SystemInfo,
 }
 /// Host hardware and platform summary, filled in by `SystemInfo::new`.
 #[derive(Serialize, Debug, Clone)]
 pub struct SystemInfo {
    /// Number of CPUs reported by `sysinfo`.
    cpu_count: usize,
    /// Brand string of the first CPU reported by `sysinfo`.
    cpu_type: String,
    /// Total memory as reported by `sysinfo::System::total_memory`.
    total_memory: u64,
    /// `std::env::consts::ARCH` of the build target.
    architecture: String,
    /// `"{OS}-{FAMILY}-{ARCH}"` built from `std::env::consts`.
    platform: String,
 }
 impl SystemInfo {
    /// Collects CPU, memory and platform details for the current host.
    ///
    /// `cpu_type` is the brand string of the first CPU reported by `sysinfo`, or `"N/A"`
    /// when no CPU is reported at all.
    fn new() -> Self {
        // `new_all` already loads every component, so no additional refresh is needed.
        let system = sysinfo::System::new_all();
        let cpus = system.cpus();
        // Do not index into the list: an empty CPU list must not panic the router just
        // because telemetry is being assembled.
        let cpu_type = cpus
            .first()
            .map_or_else(|| "N/A".to_string(), |cpu| cpu.brand().to_string());
        Self {
            cpu_count: cpus.len(),
            cpu_type,
            total_memory: system.total_memory(),
            architecture: std::env::consts::ARCH.to_string(),
            platform: format!(
                "{}-{}-{}",
                std::env::consts::OS,
                std::env::consts::FAMILY,
                std::env::consts::ARCH
            ),
        }
    }
 }
 impl fmt::Display for SystemInfo {
    /// Writes one `Label: value` line per field, each terminated by a newline.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let Self {
            cpu_count,
            cpu_type,
            total_memory,
            architecture,
            platform,
        } = self;
        writeln!(f, "CPU Count: {}", cpu_count)?;
        writeln!(f, "CPU Type: {}", cpu_type)?;
        writeln!(f, "Total Memory: {}", total_memory)?;
        writeln!(f, "Architecture: {}", architecture)?;
        writeln!(f, "Platform: {}", platform)
    }
 }
 impl Env {
    /// Captures the environment of the running process.
    ///
    /// Runs `nvidia_smi()` and `xpu_smi()` (falling back to `"N/A"` when either yields
    /// nothing), gathers host details via `SystemInfo::new()`, and reads the
    /// `VERGEN_GIT_SHA` / `DOCKER_LABEL` values baked in at compile time.
    pub fn new() -> Self {
        let fallback = || "N/A".to_string();
        Self {
            nvidia_env: nvidia_smi().unwrap_or_else(fallback),
            xpu_env: xpu_smi().unwrap_or_else(fallback),
            system_env: SystemInfo::new(),
            git_sha: option_env!("VERGEN_GIT_SHA").unwrap_or("N/A"),
            docker_label: option_env!("DOCKER_LABEL").unwrap_or("N/A"),
        }
    }
 }
 impl fmt::Display for Env {
    /// Writes a human-readable, multi-line report of the captured environment.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let Self {
            git_sha,
            docker_label,
            nvidia_env,
            xpu_env,
            system_env,
        } = self;
        writeln!(f, "Runtime environment:")?;
        writeln!(f, "Commit sha: {}", git_sha)?;
        writeln!(f, "Docker label: {}", docker_label)?;
        writeln!(f, "nvidia-smi:\n{}", nvidia_env)?;
        write!(f, "xpu-smi:\n{}\n", xpu_env)?;
        // The system section already ends with its own newline.
        write!(f, "System:\n{}", system_env)
    }
 }
 /// Runs `nvidia-smi` and returns its standard output.
 ///
 /// Every line after the first is indented by three spaces and the result is trimmed.
 /// Returns `None` when the command cannot be run or its output is not valid UTF-8.
 fn nvidia_smi() -> Option<String> {
    Command::new("nvidia-smi")
        .output()
        .ok()
        .and_then(|out| String::from_utf8(out.stdout).ok())
        .map(|text| text.replace('\n', "\n   ").trim().to_string())
 }
 /// Runs `xpu-smi discovery` and returns its standard output.
 ///
 /// Every line after the first is indented by three spaces and the result is trimmed.
 /// Returns `None` when the command cannot be run or its output is not valid UTF-8.
 fn xpu_smi() -> Option<String> {
    Command::new("xpu-smi")
        .arg("discovery")
        .output()
        .ok()
        .and_then(|out| String::from_utf8(out.stdout).ok())
        .map(|text| text.replace('\n', "\n   ").trim().to_string())
 }