Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-04-22 15:32:08 +00:00
use xpu-smi to dump used memory (#2047)
* use xpu-smi to dump used memory

  XPU uses "ZE_AFFINITY_MASK" to control which cards are visible; its usage is like CUDA_VISIBLE_DEVICES.

  Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

* Update server/text_generation_server/utils/import_utils.py

  Co-authored-by: Daniël de Kok <me@github.danieldk.eu>

---------

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Co-authored-by: Daniël de Kok <me@github.danieldk.eu>
This commit is contained in:
parent 1952a0b03b
commit e49aed4713
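Context, not part of the commit message: ZE_AFFINITY_MASK is oneAPI Level Zero's device-visibility variable, so a comma-separated list such as ZE_AFFINITY_MASK=0,1 exposes two XPU cards to the launcher the same way CUDA_VISIBLE_DEVICES=0,1 would for CUDA (the indices are illustrative).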
Dockerfile_intel
@@ -49,7 +49,7 @@ RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dea
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
 | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
 
-RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build
+RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build pciutils
 
 # Text Generation Inference base env
 ENV HUGGINGFACE_HUB_CACHE=/data \
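A note on the added package: pciutils provides lspci, which xpu-smi presumably relies on for device discovery; the commit itself does not state the reason, so treat this as an assumption.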
launcher/src/main.rs
@@ -763,7 +763,10 @@ fn shutdown_shards(shutdown: Arc<AtomicBool>, shutdown_receiver: &mpsc::Receiver
 fn num_cuda_devices() -> Option<usize> {
     let devices = match env::var("CUDA_VISIBLE_DEVICES") {
         Ok(devices) => devices,
-        Err(_) => env::var("NVIDIA_VISIBLE_DEVICES").ok()?,
+        Err(_) => match env::var("NVIDIA_VISIBLE_DEVICES") {
+            Ok(devices) => devices,
+            Err(_) => env::var("ZE_AFFINITY_MASK").ok()?,
+        },
     };
     let n_devices = devices.split(',').count();
     Some(n_devices)
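For readers more at home in Python than Rust, here is a minimal sketch of the fallback chain num_cuda_devices() now implements; the function name and structure are illustrative, not part of the launcher:

    import os

    def num_visible_devices():
        # Check each vendor's visibility variable in the launcher's order;
        # all three hold a comma-separated list of device indices.
        for var in ("CUDA_VISIBLE_DEVICES", "NVIDIA_VISIBLE_DEVICES", "ZE_AFFINITY_MASK"):
            devices = os.environ.get(var)
            if devices is not None:
                return len(devices.split(","))
        return None  # no variable set: device count unknown

As in the Rust version, the value is only split on commas and counted, so an empty string still counts as one device.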
@@ -836,9 +839,9 @@ fn find_num_shards(
     let num_shard = match (sharded, num_shard) {
         (Some(true), None) => {
             // try to default to the number of available GPUs
-            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES");
+            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK");
             let n_devices = num_cuda_devices()
-                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES are not set");
+                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK are not set");
             if n_devices <= 1 {
                 return Err(LauncherError::NotEnoughCUDADevices(format!(
                     "`sharded` is true but only found {n_devices} CUDA devices"
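The practical effect: on an XPU-only host, launching with --sharded true and, say, ZE_AFFINITY_MASK=0,1 now defaults num_shard to 2, where the launcher previously panicked with the "not set" message because only the CUDA/NVIDIA variables were consulted (the indices are illustrative).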
server/text_generation_server/utils/import_utils.py
@@ -1,5 +1,6 @@
 import torch
 from loguru import logger
+import subprocess
 
 
 def is_xpu_available():
@@ -19,8 +20,12 @@ def get_cuda_free_memory(device, memory_fraction):
 
 
 def get_xpu_free_memory(device, memory_fraction):
-    total_gpu_memory = torch.xpu.get_device_properties(device).total_memory
-    free_memory = int(total_gpu_memory * 0.5)
+    total_memory = torch.xpu.get_device_properties(device).total_memory
+    device_id = device.index
+    query = f"xpu-smi dump -d {device_id} -m 18 -n 1"
+    output = subprocess.check_output(query.split()).decode("utf-8").split("\n")
+    used_memory = float(output[1].split(",")[-1]) * 1024 * 1024
+    free_memory = int(total_memory * 0.95 - used_memory)
     return free_memory
 
 
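The new code shells out to xpu-smi: -d {device_id} picks the card, -m 18 selects the metric the code reads as used device memory in MiB, and -n 1 takes a single sample. The output is CSV, so the used figure is the last field of the first data row. A self-contained sketch of that parsing against hypothetical output (the real header text and spacing may differ on actual hardware):

    # Hypothetical one-sample CSV as xpu-smi might print it; only the shape
    # matters: a header row, then a data row whose last field is MiB used.
    sample = (
        "Timestamp, DeviceId, GPU Memory Used (MiB)\n"
        "06:14:46.000, 0, 1234.56\n"
    )

    lines = sample.split("\n")
    used_memory = float(lines[1].split(",")[-1]) * 1024 * 1024  # MiB -> bytes
    print(int(used_memory))  # 1294529986

Compared with the old flat free_memory = int(total_gpu_memory * 0.5), the new total_memory * 0.95 - used_memory reserves 5% headroom and subtracts only what is actually in use, so far more of an otherwise idle card is made available.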