mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-12 04:44:52 +00:00
more robust nvidia smi
This commit is contained in:
parent
9fc54cd91c
commit
aecbce351c
22
Cargo.lock
generated
22
Cargo.lock
generated
@ -801,6 +801,27 @@ dependencies = [
|
|||||||
"typenum",
|
"typenum",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "csv"
|
||||||
|
version = "1.3.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe"
|
||||||
|
dependencies = [
|
||||||
|
"csv-core",
|
||||||
|
"itoa",
|
||||||
|
"ryu",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "csv-core"
|
||||||
|
version = "0.1.11"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ctrlc"
|
name = "ctrlc"
|
||||||
version = "3.4.4"
|
version = "3.4.4"
|
||||||
@ -3806,6 +3827,7 @@ dependencies = [
|
|||||||
"axum-tracing-opentelemetry",
|
"axum-tracing-opentelemetry",
|
||||||
"base64 0.22.1",
|
"base64 0.22.1",
|
||||||
"clap",
|
"clap",
|
||||||
|
"csv",
|
||||||
"futures",
|
"futures",
|
||||||
"futures-util",
|
"futures-util",
|
||||||
"hf-hub",
|
"hf-hub",
|
||||||
|
@ -54,6 +54,7 @@ image = "0.25.1"
|
|||||||
base64 = { workspace = true }
|
base64 = { workspace = true }
|
||||||
sysinfo = "0.30.13"
|
sysinfo = "0.30.13"
|
||||||
uuid = { version = "1.9.1", default-features = false, features = ["v4", "fast-rng", "macro-diagnostics"] }
|
uuid = { version = "1.9.1", default-features = false, features = ["v4", "fast-rng", "macro-diagnostics"] }
|
||||||
|
csv = "1.3.0"
|
||||||
|
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
|
@ -3,6 +3,7 @@ use reqwest::header::HeaderMap;
|
|||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use std::{fmt, process::Command, time::Duration};
|
use std::{fmt, process::Command, time::Duration};
|
||||||
use uuid::Uuid;
|
use uuid::Uuid;
|
||||||
|
use csv::ReaderBuilder;
|
||||||
|
|
||||||
const TELEMETRY_URL: &str = "https://huggingface.co/api/telemetry/tgi";
|
const TELEMETRY_URL: &str = "https://huggingface.co/api/telemetry/tgi";
|
||||||
|
|
||||||
@ -135,11 +136,85 @@ impl Args {
|
|||||||
pub struct Env {
|
pub struct Env {
|
||||||
git_sha: &'static str,
|
git_sha: &'static str,
|
||||||
docker_label: &'static str,
|
docker_label: &'static str,
|
||||||
nvidia_env: String,
|
nvidia_info: Option<NvidiaSmiInfo>,
|
||||||
xpu_env: String,
|
xpu_env: String,
|
||||||
system_env: SystemInfo,
|
system_env: SystemInfo,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Clone)]
|
||||||
|
struct NvidiaSmiInfo {
|
||||||
|
name: String,
|
||||||
|
pci_bus_id: String,
|
||||||
|
driver_version: String,
|
||||||
|
pstate: String,
|
||||||
|
pcie_link_gen_max: String,
|
||||||
|
pcie_link_gen_current: String,
|
||||||
|
temperature_gpu: String,
|
||||||
|
utilization_gpu: String,
|
||||||
|
utilization_memory: String,
|
||||||
|
memory_total: String,
|
||||||
|
memory_free: String,
|
||||||
|
memory_used: String,
|
||||||
|
reset_status_reset_required: String,
|
||||||
|
reset_status_drain_and_reset_recommended: String,
|
||||||
|
compute_cap: String,
|
||||||
|
ecc_errors_corrected_volatile_total: String,
|
||||||
|
mig_mode_current: String,
|
||||||
|
power_draw_instant: String,
|
||||||
|
power_limit: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl NvidiaSmiInfo {
|
||||||
|
fn new() -> Option<Vec<NvidiaSmiInfo>> {
|
||||||
|
let output = Command::new("nvidia-smi")
|
||||||
|
.args(&[
|
||||||
|
"--query-gpu=name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.gpucurrent,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,reset_status.reset_required,reset_status.drain_and_reset_recommended,compute_cap,ecc.errors.corrected.volatile.total,mig.mode.current,power.draw.instant,power.limit",
|
||||||
|
"--format=csv"
|
||||||
|
])
|
||||||
|
.output()
|
||||||
|
.ok()?;
|
||||||
|
|
||||||
|
if !output.status.success() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let stdout = String::from_utf8(output.stdout).ok()?;
|
||||||
|
|
||||||
|
let mut rdr = ReaderBuilder::new()
|
||||||
|
.has_headers(true)
|
||||||
|
.from_reader(stdout.as_bytes());
|
||||||
|
|
||||||
|
let mut infos = Vec::new();
|
||||||
|
|
||||||
|
for result in rdr.records() {
|
||||||
|
let record = result.ok()?;
|
||||||
|
infos.push(NvidiaSmiInfo {
|
||||||
|
name: record[0].to_string(),
|
||||||
|
pci_bus_id: record[1].to_string(),
|
||||||
|
driver_version: record[2].to_string(),
|
||||||
|
pstate: record[3].to_string(),
|
||||||
|
pcie_link_gen_max: record[4].to_string(),
|
||||||
|
pcie_link_gen_current: record[5].to_string(),
|
||||||
|
temperature_gpu: record[6].to_string(),
|
||||||
|
utilization_gpu: record[7].to_string(),
|
||||||
|
utilization_memory: record[8].to_string(),
|
||||||
|
memory_total: record[9].to_string(),
|
||||||
|
memory_free: record[10].to_string(),
|
||||||
|
memory_used: record[11].to_string(),
|
||||||
|
reset_status_reset_required: record[12].to_string(),
|
||||||
|
reset_status_drain_and_reset_recommended: record[13].to_string(),
|
||||||
|
compute_cap: record[14].to_string(),
|
||||||
|
ecc_errors_corrected_volatile_total: record[15].to_string(),
|
||||||
|
mig_mode_current: record[16].to_string(),
|
||||||
|
power_draw_instant: record[17].to_string(),
|
||||||
|
power_limit: record[18].to_string(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
Some(infos)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Debug, Clone)]
|
#[derive(Serialize, Debug, Clone)]
|
||||||
pub struct SystemInfo {
|
pub struct SystemInfo {
|
||||||
cpu_count: usize,
|
cpu_count: usize,
|
||||||
@ -174,17 +249,6 @@ impl SystemInfo {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl fmt::Display for SystemInfo {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
||||||
writeln!(f, "CPU Count: {}", self.cpu_count)?;
|
|
||||||
writeln!(f, "CPU Type: {}", self.cpu_type)?;
|
|
||||||
writeln!(f, "Total Memory: {}", self.total_memory)?;
|
|
||||||
writeln!(f, "Architecture: {}", self.architecture)?;
|
|
||||||
writeln!(f, "Platform: {}", self.platform)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for Env {
|
impl Default for Env {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self::new()
|
Self::new()
|
||||||
@ -193,13 +257,11 @@ impl Default for Env {
|
|||||||
|
|
||||||
impl Env {
|
impl Env {
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
let nvidia_env = nvidia_smi();
|
|
||||||
let xpu_env = xpu_smi();
|
let xpu_env = xpu_smi();
|
||||||
let system_env = SystemInfo::new();
|
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
system_env,
|
system_env: SystemInfo::new(),
|
||||||
nvidia_env: nvidia_env.unwrap_or("N/A".to_string()),
|
nvidia_info: NvidiaSmiInfo::new(),
|
||||||
xpu_env: xpu_env.unwrap_or("N/A".to_string()),
|
xpu_env: xpu_env.unwrap_or("N/A".to_string()),
|
||||||
git_sha: option_env!("VERGEN_GIT_SHA").unwrap_or("N/A"),
|
git_sha: option_env!("VERGEN_GIT_SHA").unwrap_or("N/A"),
|
||||||
docker_label: option_env!("DOCKER_LABEL").unwrap_or("N/A"),
|
docker_label: option_env!("DOCKER_LABEL").unwrap_or("N/A"),
|
||||||
|
Loading…
Reference in New Issue
Block a user