diff --git a/Cargo.lock b/Cargo.lock index 7660f1f1..923b5cbe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -801,6 +801,27 @@ dependencies = [ "typenum", ] +[[package]] +name = "csv" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + [[package]] name = "ctrlc" version = "3.4.4" @@ -3806,6 +3827,7 @@ dependencies = [ "axum-tracing-opentelemetry", "base64 0.22.1", "clap", + "csv", "futures", "futures-util", "hf-hub", diff --git a/router/Cargo.toml b/router/Cargo.toml index 342a1c48..0fc700a0 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -54,6 +54,7 @@ image = "0.25.1" base64 = { workspace = true } sysinfo = "0.30.13" uuid = { version = "1.9.1", default-features = false, features = ["v4", "fast-rng", "macro-diagnostics"] } +csv = "1.3.0" [build-dependencies] diff --git a/router/src/usage_stats.rs b/router/src/usage_stats.rs index 929765c8..234cb23a 100644 --- a/router/src/usage_stats.rs +++ b/router/src/usage_stats.rs @@ -3,6 +3,7 @@ use reqwest::header::HeaderMap; use serde::Serialize; use std::{fmt, process::Command, time::Duration}; use uuid::Uuid; +use csv::ReaderBuilder; const TELEMETRY_URL: &str = "https://huggingface.co/api/telemetry/tgi"; @@ -135,11 +136,85 @@ impl Args { pub struct Env { git_sha: &'static str, docker_label: &'static str, - nvidia_env: String, + nvidia_info: Option, xpu_env: String, system_env: SystemInfo, } +#[derive(Debug, Serialize, Clone)] +struct NvidiaSmiInfo { + name: String, + pci_bus_id: String, + driver_version: String, + pstate: String, + pcie_link_gen_max: String, + pcie_link_gen_current: String, + temperature_gpu: String, + utilization_gpu: String, + utilization_memory: String, + memory_total: String, + memory_free: String, + memory_used: String, + reset_status_reset_required: String, + reset_status_drain_and_reset_recommended: String, + compute_cap: String, + ecc_errors_corrected_volatile_total: String, + mig_mode_current: String, + power_draw_instant: String, + power_limit: String, +} + +impl NvidiaSmiInfo { + fn new() -> Option> { + let output = Command::new("nvidia-smi") + .args(&[ + "--query-gpu=name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.gpucurrent,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,reset_status.reset_required,reset_status.drain_and_reset_recommended,compute_cap,ecc.errors.corrected.volatile.total,mig.mode.current,power.draw.instant,power.limit", + "--format=csv" + ]) + .output() + .ok()?; + + if !output.status.success() { + return None; + } + + let stdout = String::from_utf8(output.stdout).ok()?; + + let mut rdr = ReaderBuilder::new() + .has_headers(true) + .from_reader(stdout.as_bytes()); + + let mut infos = Vec::new(); + + for result in rdr.records() { + let record = result.ok()?; + infos.push(NvidiaSmiInfo { + name: record[0].to_string(), + pci_bus_id: record[1].to_string(), + driver_version: record[2].to_string(), + pstate: record[3].to_string(), + pcie_link_gen_max: record[4].to_string(), + pcie_link_gen_current: record[5].to_string(), + temperature_gpu: record[6].to_string(), + utilization_gpu: record[7].to_string(), + utilization_memory: record[8].to_string(), + memory_total: record[9].to_string(), + memory_free: record[10].to_string(), + memory_used: record[11].to_string(), + reset_status_reset_required: record[12].to_string(), + reset_status_drain_and_reset_recommended: record[13].to_string(), + compute_cap: record[14].to_string(), + ecc_errors_corrected_volatile_total: record[15].to_string(), + mig_mode_current: record[16].to_string(), + power_draw_instant: record[17].to_string(), + power_limit: record[18].to_string(), + }); + } + + Some(infos) + } +} + #[derive(Serialize, Debug, Clone)] pub struct SystemInfo { cpu_count: usize, @@ -174,17 +249,6 @@ impl SystemInfo { } } -impl fmt::Display for SystemInfo { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - writeln!(f, "CPU Count: {}", self.cpu_count)?; - writeln!(f, "CPU Type: {}", self.cpu_type)?; - writeln!(f, "Total Memory: {}", self.total_memory)?; - writeln!(f, "Architecture: {}", self.architecture)?; - writeln!(f, "Platform: {}", self.platform)?; - Ok(()) - } -} - impl Default for Env { fn default() -> Self { Self::new() @@ -193,13 +257,11 @@ impl Default for Env { impl Env { pub fn new() -> Self { - let nvidia_env = nvidia_smi(); let xpu_env = xpu_smi(); - let system_env = SystemInfo::new(); Self { - system_env, - nvidia_env: nvidia_env.unwrap_or("N/A".to_string()), + system_env: SystemInfo::new(), + nvidia_info: NvidiaSmiInfo::new(), xpu_env: xpu_env.unwrap_or("N/A".to_string()), git_sha: option_env!("VERGEN_GIT_SHA").unwrap_or("N/A"), docker_label: option_env!("DOCKER_LABEL").unwrap_or("N/A"),