draft of usage stats

ErikKaum 2024-07-11 15:08:49 +02:00
parent 245d3de948
commit 8649de3990
8 changed files with 425 additions and 17 deletions

Cargo.lock (generated)

@@ -3424,9 +3424,9 @@ dependencies = [

 [[package]]
 name = "serde_json"
-version = "1.0.118"
+version = "1.0.120"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d947f6b3163d8857ea16c4fa0dd4840d52f3041039a85decd46867eb1abef2e4"
+checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5"
 dependencies = [
  "itoa",
  "ryu",
@@ -3672,15 +3672,16 @@ checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394"

 [[package]]
 name = "sysinfo"
-version = "0.30.12"
+version = "0.30.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "732ffa00f53e6b2af46208fba5718d9662a421049204e156328b66791ffa15ae"
+checksum = "0a5b4ddaee55fb2bea2bf0e5000747e5f5c0de765e5a5ff87f4cd106439f4bb3"
 dependencies = [
  "cfg-if",
  "core-foundation-sys",
  "libc",
  "ntapi",
  "once_cell",
+ "rayon",
  "windows",
 ]
@@ -3762,7 +3763,7 @@ dependencies = [

 [[package]]
 name = "text-generation-benchmark"
-version = "2.1.1-dev0"
+version = "2.1.2-dev0"
 dependencies = [
  "average",
  "clap",
@@ -3783,7 +3784,7 @@ dependencies = [

 [[package]]
 name = "text-generation-client"
-version = "2.1.1-dev0"
+version = "2.1.2-dev0"
 dependencies = [
  "async-trait",
  "base64 0.22.1",
@@ -3801,7 +3802,7 @@ dependencies = [

 [[package]]
 name = "text-generation-launcher"
-version = "2.1.1-dev0"
+version = "2.1.2-dev0"
 dependencies = [
  "clap",
  "ctrlc",
@@ -3812,6 +3813,7 @@ dependencies = [
  "reqwest",
  "serde",
  "serde_json",
+ "sysinfo",
  "thiserror",
  "tracing",
  "tracing-subscriber",
@@ -3820,7 +3822,7 @@ dependencies = [

 [[package]]
 name = "text-generation-router"
-version = "2.1.1-dev0"
+version = "2.1.2-dev0"
 dependencies = [
  "async-stream",
  "axum 0.7.5",
@@ -3848,6 +3850,7 @@ dependencies = [
  "reqwest",
  "serde",
  "serde_json",
+ "sysinfo",
  "text-generation-client",
  "thiserror",
  "tokenizers",
@@ -3859,6 +3862,7 @@ dependencies = [
  "tracing-subscriber",
  "utoipa",
  "utoipa-swagger-ui",
+ "uuid",
  "vergen",
 ]
@@ -4530,9 +4534,25 @@ dependencies = [

 [[package]]
 name = "uuid"
-version = "1.9.1"
+version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5de17fd2f7da591098415cff336e12965a28061ddace43b59cb3c430179c9439"
+checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314"
+dependencies = [
+ "getrandom",
+ "rand",
+ "uuid-macro-internal",
+]
+
+[[package]]
+name = "uuid-macro-internal"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee1cd046f83ea2c4e920d6ee9f7c3537ef928d75dce5d84a87c2c5d6b3999a3a"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.68",
+]

 [[package]]
 name = "v_frame"

docs/source/usage_statistics.md Normal file

@@ -0,0 +1,63 @@
# Collection of Usage Statistics
Text Generation Inference collects anonymous usage statistics to help us improve the service and to understand what causes failures. The data is collected transparently and any sensitive information is omitted.

Data is sent twice: 1) on server startup and 2) when the server stops. Usage statistics are also only enabled when TGI is running in Docker, to avoid collecting data when TGI runs directly on the host machine.
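Whether statistics are sent boils down to a compile-time probe of the `DOCKER_LABEL` build argument, which the TGI image build sets. A minimal sketch of the router's gating logic (the actual wiring lives in `router/src/main.rs`):

```rust
/// Sketch of the gating check, mirroring `router/src/main.rs`: statistics are
/// reported only when running in Docker (the image build sets `DOCKER_LABEL`)
/// and the user has not passed `--disable-usage-stats`.
fn usage_stats_enabled(disable_usage_stats: bool) -> bool {
    let is_docker = option_env!("DOCKER_LABEL").is_some();
    !disable_usage_stats && is_docker
}
```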
## What data is collected
The code that collects the data is available [here](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/usage_stats.rs).
As of release 2.1.2, this is an example of the data collected:
- From the TGI configuration:
```json
{
  "event_type": "start",
  "disable_grammar_support": false,
  "json_output": false,
  "max_batch_prefill_tokens": 4096,
  "max_batch_size": null,
  "max_batch_total_tokens": null,
  "max_best_of": 2,
  "max_client_batch_size": 4,
  "max_concurrent_requests": 128,
  "max_input_tokens": 1024,
  "max_stop_sequences": 4,
  "max_top_n_tokens": 5,
  "max_total_tokens": 2048,
  "max_waiting_tokens": 20,
  "messages_api_enabled": false,
  "model_config": {
    "model_type": "bloom"
  },
  "ngrok": false,
  "revision": null,
  "tokenizer_config": {
    "add_bos_token": null,
    "add_eos_token": null,
    "bos_token": "<s>",
    "chat_template": null,
    "completion_template": null,
    "eos_token": "</s>",
    "tokenizer_class": "BloomTokenizerFast"
  },
  "validation_workers": 2,
  "waiting_served_ratio": 1.2,
  "docker_label": "latest",
  "git_sha": "245d3de94877d4910ea7876f6bd19b4d7d4a47d9",
  "nvidia_env": "N/A",
  "system_env": {
    "architecture": "aarch64",
    "cpu_count": 16,
    "cpu_type": "Apple M3",
    "platform": "macos-unix-aarch64",
    "total_memory": 25769803767
  },
  "xpu_env": "N/A"
}
```
## How to opt-out
You can opt out by passing the `--disable-usage-stats` flag to the `text-generation-launcher` command. This disables all usage statistics. You can also pass `--disable-crash-reports`, which disables sending of crash reports but still allows anonymous usage statistics.
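For example, assuming an otherwise standard launch (`<model>` is a placeholder for whatever model you serve):

```bash
# Disable all usage statistics
text-generation-launcher --model-id <model> --disable-usage-stats

# Keep anonymous usage statistics, but never send crash reports
text-generation-launcher --model-id <model> --disable-crash-reports
```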

launcher/Cargo.toml

@@ -14,6 +14,7 @@ nix = { version = "0.28.0", features = ["signal"] }
 once_cell = "1.19.0"
 serde = { version = "1.0.188", features = ["derive"] }
 serde_json = "1.0.107"
+sysinfo = "0.30.12"
 thiserror = "1.0.59"
 tracing = "0.1.37"
 tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }

launcher/src/main.rs

@@ -457,6 +457,14 @@ struct Args {
     /// startup that will be available to callers via the `adapter_id` field in a request.
     #[clap(long, env)]
     lora_adapters: Option<String>,
+
+    /// Disable sending of all usage statistics
+    #[clap(default_value = "false", long, env)]
+    disable_usage_stats: bool,
+
+    /// Disable sending of crash reports, but allow anonymous usage statistics
+    #[clap(default_value = "false", long, env)]
+    disable_crash_reports: bool,
 }

 #[derive(Debug)]
@@ -1199,8 +1207,16 @@ fn spawn_webserver(
         format!("{}-0", args.shard_uds_path),
         "--tokenizer-name".to_string(),
         args.model_id,
     ];

+    // Pass usage stats flags to router (bare flags, matching the router's clap definitions)
+    if args.disable_usage_stats {
+        router_args.push("--disable-usage-stats".to_string());
+    }
+    if args.disable_crash_reports {
+        router_args.push("--disable-crash-reports".to_string());
+    }
+
     // Grammar support
     if args.disable_grammar_support {
         router_args.push("--disable-grammar-support".to_string());

router/Cargo.toml

@@ -52,6 +52,9 @@ regex = "1.10.3"
 once_cell = "1.19.0"
 image = "0.25.1"
 base64 = { workspace = true }
+sysinfo = "0.30.13"
+uuid = { version = "1.9.1", default-features = false, features = ["v4", "fast-rng", "macro-diagnostics"] }

 [build-dependencies]
 vergen = { version = "8.2.5", features = ["build", "git", "gitcl"] }

router/src/lib.rs

@@ -7,6 +7,8 @@ mod validation;
 #[cfg(feature = "kserve")]
 mod kserve;

+pub mod usage_stats;
+
 use serde::{Deserialize, Serialize};
 use tracing::warn;
 use utoipa::ToSchema;
@@ -40,13 +42,13 @@ pub struct HubModelInfo {
     pub pipeline_tag: Option<String>,
 }

-#[derive(Debug, Clone, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct ChatTemplate {
     name: String,
     template: String,
 }

-#[derive(Debug, Clone, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 #[serde(untagged)]
 pub enum ChatTemplateVersions {
     Single(String),
@@ -55,7 +57,7 @@ pub enum ChatTemplateVersions {
 use std::path::Path;

-#[derive(Debug, Clone, Deserialize, Default)]
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
 pub struct HubTokenizerConfig {
     pub chat_template: Option<ChatTemplateVersions>,
     pub completion_template: Option<String>,

router/src/main.rs

@@ -23,6 +23,7 @@ use tower_http::cors::AllowOrigin;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
 use tracing_subscriber::{filter::LevelFilter, EnvFilter, Layer};
+use text_generation_router::usage_stats;

 /// App Configuration
 #[derive(Parser, Debug)]
@@ -87,6 +88,10 @@ struct Args {
     disable_grammar_support: bool,
     #[clap(default_value = "4", long, env)]
     max_client_batch_size: usize,
+    #[clap(long, env, default_value_t)]
+    disable_usage_stats: bool,
+    #[clap(long, env, default_value_t)]
+    disable_crash_reports: bool,
 }

 #[derive(Debug, Subcommand)]
@@ -128,6 +133,8 @@ async fn main() -> Result<(), RouterError> {
         messages_api_enabled,
         disable_grammar_support,
         max_client_batch_size,
+        disable_usage_stats,
+        disable_crash_reports,
         command,
     } = args;
@@ -374,8 +381,49 @@ async fn main() -> Result<(), RouterError> {
         }
     };

+    // Only send usage stats when TGI is run in Docker
+    let is_docker = option_env!("DOCKER_LABEL").is_some();
+    let user_agent = if !disable_usage_stats && is_docker {
+        let reduced_args = usage_stats::Args::new(
+            config.clone(),
+            tokenizer_config.clone(),
+            max_concurrent_requests,
+            max_best_of,
+            max_stop_sequences,
+            max_top_n_tokens,
+            max_input_tokens,
+            max_total_tokens,
+            waiting_served_ratio,
+            max_batch_prefill_tokens,
+            max_batch_total_tokens,
+            max_waiting_tokens,
+            max_batch_size,
+            revision,
+            validation_workers,
+            json_output,
+            ngrok,
+            messages_api_enabled,
+            disable_grammar_support,
+            max_client_batch_size,
+            disable_usage_stats,
+            disable_crash_reports,
+        );
+        Some(usage_stats::UserAgent::new(reduced_args))
+    } else {
+        None
+    };
+
+    // Report the start event from a background task so startup is not blocked
+    if let Some(ref ua) = user_agent {
+        let start_event =
+            usage_stats::UsageStatsEvent::new(ua.clone(), usage_stats::EventType::Start);
+        tokio::spawn(async move {
+            start_event.send().await;
+        });
+    };
+
     // Run server
-    server::run(
+    let result = server::run(
         master_shard_uds_path,
         model_info,
         compat_return_full_text,
@@ -406,8 +454,26 @@ async fn main() -> Result<(), RouterError> {
         max_client_batch_size,
         print_schema_command,
     )
-    .await?;
-    Ok(())
+    .await;
+
+    // Report a stop event on clean shutdown, or an error event (unless crash
+    // reports are disabled) when the server exits with an error
+    match result {
+        Ok(_) => {
+            if let Some(ref ua) = user_agent {
+                let stop_event =
+                    usage_stats::UsageStatsEvent::new(ua.clone(), usage_stats::EventType::Stop);
+                stop_event.send().await;
+            };
+            Ok(())
+        }
+        Err(e) => {
+            if let Some(ref ua) = user_agent {
+                if !disable_crash_reports {
+                    let error_event = usage_stats::UsageStatsEvent::new(
+                        ua.clone(),
+                        usage_stats::EventType::Error(e.to_string()),
+                    );
+                    error_event.send().await;
+                }
+            };
+            Err(RouterError::WebServer(e))
+        }
+    }
 }

 /// Init logging using env variables LOG_LEVEL and LOG_FORMAT:

router/src/usage_stats.rs Normal file

@@ -0,0 +1,233 @@
use crate::{config::Config, HubTokenizerConfig};
use reqwest::header::HeaderMap;
use serde::Serialize;
use std::{fmt, process::Command, time::Duration};
use uuid::Uuid;

const TELEMETRY_URL: &str = "https://huggingface.co/api/telemetry/tgi";

#[derive(Debug, Clone, Serialize)]
pub struct UserAgent {
    pub uid: String,
    pub args: Args,
    pub env: Env,
}

impl UserAgent {
    pub fn new(reduced_args: Args) -> Self {
        Self {
            uid: Uuid::new_v4().to_string(),
            args: reduced_args,
            env: Env::new(),
        }
    }
}
#[derive(Serialize, Debug)]
pub enum EventType {
    Start,
    Stop,
    Error(String),
}

#[derive(Debug, Serialize)]
pub struct UsageStatsEvent {
    user_agent: UserAgent,
    event_type: EventType,
}

impl UsageStatsEvent {
    pub fn new(user_agent: UserAgent, event_type: EventType) -> Self {
        Self {
            user_agent,
            event_type,
        }
    }

    /// Best-effort, fire-and-forget: the result is discarded so telemetry can never break serving
    pub async fn send(&self) {
        let mut headers = HeaderMap::new();
        headers.insert("Content-Type", "application/json".parse().unwrap());
        let body = serde_json::to_string(&self).unwrap();
        let client = reqwest::Client::new();
        let _ = client
            .post(TELEMETRY_URL)
            .headers(headers)
            .body(body)
            .timeout(Duration::from_secs(5))
            .send()
            .await;
    }
}
#[derive(Debug, Clone, Serialize)]
pub struct Args {
    model_config: Option<Config>,
    tokenizer_config: HubTokenizerConfig,
    max_concurrent_requests: usize,
    max_best_of: usize,
    max_stop_sequences: usize,
    max_top_n_tokens: u32,
    max_input_tokens: usize,
    max_total_tokens: usize,
    waiting_served_ratio: f32,
    max_batch_prefill_tokens: u32,
    max_batch_total_tokens: Option<u32>,
    max_waiting_tokens: usize,
    max_batch_size: Option<usize>,
    revision: Option<String>,
    validation_workers: usize,
    json_output: bool,
    ngrok: bool,
    messages_api_enabled: bool,
    disable_grammar_support: bool,
    max_client_batch_size: usize,
    disable_usage_stats: bool,
    disable_crash_reports: bool,
}
impl Args {
    pub fn new(
        model_config: Option<Config>,
        tokenizer_config: HubTokenizerConfig,
        max_concurrent_requests: usize,
        max_best_of: usize,
        max_stop_sequences: usize,
        max_top_n_tokens: u32,
        max_input_tokens: usize,
        max_total_tokens: usize,
        waiting_served_ratio: f32,
        max_batch_prefill_tokens: u32,
        max_batch_total_tokens: Option<u32>,
        max_waiting_tokens: usize,
        max_batch_size: Option<usize>,
        revision: Option<String>,
        validation_workers: usize,
        json_output: bool,
        ngrok: bool,
        messages_api_enabled: bool,
        disable_grammar_support: bool,
        max_client_batch_size: usize,
        disable_usage_stats: bool,
        disable_crash_reports: bool,
    ) -> Self {
        Self {
            model_config,
            tokenizer_config,
            max_concurrent_requests,
            max_best_of,
            max_stop_sequences,
            max_top_n_tokens,
            max_input_tokens,
            max_total_tokens,
            waiting_served_ratio,
            max_batch_prefill_tokens,
            max_batch_total_tokens,
            max_waiting_tokens,
            max_batch_size,
            revision,
            validation_workers,
            json_output,
            ngrok,
            messages_api_enabled,
            disable_grammar_support,
            max_client_batch_size,
            disable_usage_stats,
            disable_crash_reports,
        }
    }
}
/// This is more or less a copy of the code from the `text-generation-launcher` crate to avoid a dependency
#[derive(Serialize, Debug, Clone)]
pub struct Env {
    git_sha: &'static str,
    docker_label: &'static str,
    nvidia_env: String,
    xpu_env: String,
    system_env: SystemInfo,
}

#[derive(Serialize, Debug, Clone)]
pub struct SystemInfo {
    cpu_count: usize,
    cpu_type: String,
    total_memory: u64,
    architecture: String,
    platform: String,
}
impl SystemInfo {
    fn new() -> Self {
        let mut system = sysinfo::System::new_all();
        system.refresh_all();

        let cpu_count = system.cpus().len();
        let cpu_type = system.cpus()[0].brand().to_string();
        let total_memory = system.total_memory();
        let architecture = std::env::consts::ARCH.to_string();
        let platform = format!(
            "{}-{}-{}",
            std::env::consts::OS,
            std::env::consts::FAMILY,
            std::env::consts::ARCH
        );
        Self {
            cpu_count,
            cpu_type,
            total_memory,
            architecture,
            platform,
        }
    }
}

impl fmt::Display for SystemInfo {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "CPU Count: {}", self.cpu_count)?;
        writeln!(f, "CPU Type: {}", self.cpu_type)?;
        writeln!(f, "Total Memory: {}", self.total_memory)?;
        writeln!(f, "Architecture: {}", self.architecture)?;
        writeln!(f, "Platform: {}", self.platform)?;
        Ok(())
    }
}
impl Env {
    pub fn new() -> Self {
        let nvidia_env = nvidia_smi();
        let xpu_env = xpu_smi();
        let system_env = SystemInfo::new();
        Self {
            system_env,
            nvidia_env: nvidia_env.unwrap_or("N/A".to_string()),
            xpu_env: xpu_env.unwrap_or("N/A".to_string()),
            git_sha: option_env!("VERGEN_GIT_SHA").unwrap_or("N/A"),
            docker_label: option_env!("DOCKER_LABEL").unwrap_or("N/A"),
        }
    }
}

impl fmt::Display for Env {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Runtime environment:")?;
        writeln!(f, "Commit sha: {}", self.git_sha)?;
        writeln!(f, "Docker label: {}", self.docker_label)?;
        writeln!(f, "nvidia-smi:\n{}", self.nvidia_env)?;
        write!(f, "xpu-smi:\n{}\n", self.xpu_env)?;
        write!(f, "System:\n{}", self.system_env)?;
        Ok(())
    }
}
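// Best-effort GPU probes: each returns None when the tool is unavailable, which `Env::new` maps to "N/A"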
fn nvidia_smi() -> Option<String> {
    let output = Command::new("nvidia-smi").output().ok()?;
    let nvidia_smi = String::from_utf8(output.stdout).ok()?;
    let output = nvidia_smi.replace('\n', "\n ");
    Some(output.trim().to_string())
}

fn xpu_smi() -> Option<String> {
    let output = Command::new("xpu-smi").arg("discovery").output().ok()?;
    let xpu_smi = String::from_utf8(output.stdout).ok()?;
    let output = xpu_smi.replace('\n', "\n ");
    Some(output.trim().to_string())
}