Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-04-22 15:32:08 +00:00
use xpu-smi to dump used memory (#2047)
* use xpu-smi to dump used memory

  XPU uses "ZE_AFFINITY_MASK" to control which cards are visible; its usage is like CUDA_VISIBLE_DEVICES.

  Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

* Update server/text_generation_server/utils/import_utils.py

  Co-authored-by: Daniël de Kok <me@github.danieldk.eu>

---------

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Co-authored-by: Daniël de Kok <me@github.danieldk.eu>
This commit is contained in:
parent 1952a0b03b
commit e49aed4713
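Context, not part of the commit message: ZE_AFFINITY_MASK is oneAPI Level Zero's device-visibility variable, so a comma-separated list such as ZE_AFFINITY_MASK=0,1 exposes two XPU cards to the launcher the same way CUDA_VISIBLE_DEVICES=0,1 would for CUDA (the indices are illustrative).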
Dockerfile_intel
@@ -49,7 +49,7 @@ RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dea
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
 | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
 
-RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build
+RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build pciutils
 
 # Text Generation Inference base env
 ENV HUGGINGFACE_HUB_CACHE=/data \
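A note on the added package: pciutils provides lspci, which xpu-smi presumably relies on for device discovery; the commit itself does not state the reason, so treat this as an assumption.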
launcher/src/main.rs
@@ -763,7 +763,10 @@ fn shutdown_shards(shutdown: Arc<AtomicBool>, shutdown_receiver: &mpsc::Receiver
 fn num_cuda_devices() -> Option<usize> {
     let devices = match env::var("CUDA_VISIBLE_DEVICES") {
         Ok(devices) => devices,
-        Err(_) => env::var("NVIDIA_VISIBLE_DEVICES").ok()?,
+        Err(_) => match env::var("NVIDIA_VISIBLE_DEVICES") {
+            Ok(devices) => devices,
+            Err(_) => env::var("ZE_AFFINITY_MASK").ok()?,
+        },
     };
     let n_devices = devices.split(',').count();
     Some(n_devices)
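For readers more at home in Python than Rust, here is a minimal sketch of the fallback chain num_cuda_devices() now implements; the function name and structure are illustrative, not part of the launcher:

    import os

    def num_visible_devices():
        # Check each vendor's visibility variable in the launcher's order;
        # all three hold a comma-separated list of device indices.
        for var in ("CUDA_VISIBLE_DEVICES", "NVIDIA_VISIBLE_DEVICES", "ZE_AFFINITY_MASK"):
            devices = os.environ.get(var)
            if devices is not None:
                return len(devices.split(","))
        return None  # no variable set: device count unknown

As in the Rust version, the value is only split on commas and counted, so an empty string still counts as one device.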
@@ -836,9 +839,9 @@ fn find_num_shards(
     let num_shard = match (sharded, num_shard) {
         (Some(true), None) => {
             // try to default to the number of available GPUs
-            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES");
+            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK");
             let n_devices = num_cuda_devices()
-                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES are not set");
+                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK are not set");
             if n_devices <= 1 {
                 return Err(LauncherError::NotEnoughCUDADevices(format!(
                     "`sharded` is true but only found {n_devices} CUDA devices"
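The practical effect: on an XPU-only host, launching with --sharded true and, say, ZE_AFFINITY_MASK=0,1 now defaults num_shard to 2, where the launcher previously panicked with the "not set" message because only the CUDA/NVIDIA variables were consulted (the indices are illustrative).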
server/text_generation_server/utils/import_utils.py
@@ -1,5 +1,6 @@
 import torch
 from loguru import logger
+import subprocess
 
 
 def is_xpu_available():
@@ -19,8 +20,12 @@ def get_cuda_free_memory(device, memory_fraction):
 
 
 def get_xpu_free_memory(device, memory_fraction):
-    total_gpu_memory = torch.xpu.get_device_properties(device).total_memory
-    free_memory = int(total_gpu_memory * 0.5)
+    total_memory = torch.xpu.get_device_properties(device).total_memory
+    device_id = device.index
+    query = f"xpu-smi dump -d {device_id} -m 18 -n 1"
+    output = subprocess.check_output(query.split()).decode("utf-8").split("\n")
+    used_memory = float(output[1].split(",")[-1]) * 1024 * 1024
+    free_memory = int(total_memory * 0.95 - used_memory)
     return free_memory
 
 
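The new code shells out to xpu-smi: -d {device_id} picks the card, -m 18 selects the metric the code reads as used device memory in MiB, and -n 1 takes a single sample. The output is CSV, so the used figure is the last field of the first data row. A self-contained sketch of that parsing against hypothetical output (the real header text and spacing may differ on actual hardware):

    # Hypothetical one-sample CSV as xpu-smi might print it; only the shape
    # matters: a header row, then a data row whose last field is MiB used.
    sample = (
        "Timestamp, DeviceId, GPU Memory Used (MiB)\n"
        "06:14:46.000, 0, 1234.56\n"
    )

    lines = sample.split("\n")
    used_memory = float(lines[1].split(",")[-1]) * 1024 * 1024  # MiB -> bytes
    print(int(used_memory))  # 1294529986

Compared with the old flat free_memory = int(total_gpu_memory * 0.5), the new total_memory * 0.95 - used_memory reserves 5% headroom and subtracts only what is actually in use, so far more of an otherwise idle card is made available.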