Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-09-10 20:04:52 +00:00

Commit dc53846456: Merge branch 'main' into ci_amd3
.github/workflows/load_test.yaml
vendored
70
.github/workflows/load_test.yaml
vendored
@@ -11,66 +11,24 @@ on:
     - 'main'
 
 jobs:
-  start-runner:
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: eu-central-1
-      EC2_AMI_ID: ami-0ab09c07cfd194259
-      EC2_INSTANCE_TYPE: g5.12xlarge
-      EC2_SUBNET_ID: subnet-988fd9f2,subnet-6f56db13,subnet-6a039326
-      EC2_SECURITY_GROUP: sg-072f92ae3082936c6
-    outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ${{ env.EC2_AMI_ID }}
-          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
-          subnet-id: ${{ env.EC2_SUBNET_ID }}
-          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
-          aws-resource-tags: > # optional, requires additional permissions
-            [
-              {"Key": "Name", "Value": "ec2-tgi-github-runner"},
-              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
-            ]
-
   load-tests:
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
-    needs: start-runner # required to start the main job when the runner is ready
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    runs-on: [self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci]
     env:
       DOCKER_VOLUME: /cache
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
 
-      - name: Prepare disks
-        run: |
-          sudo mkfs -t ext4 /dev/nvme1n1
-          sudo mkdir ${{ env.DOCKER_VOLUME }}
-          sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
-
       - name: Install k6
         run: |
           curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1
 
       - name: Start starcoder
         run: |
-          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v ${{ env.DOCKER_VOLUME }}:/data -e HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
+          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
           sleep 10
           wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
 
@@ -82,27 +40,3 @@ jobs:
         if: ${{ always() }}
         run: |
           docker stop tgi-starcoder || true
-
-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner
-      - load-tests
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: eu-central-1
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Stop EC2 runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
.github/workflows/tests.yaml (vendored, 2 changes)
@@ -72,7 +72,7 @@ jobs:
       - name: Run server tests
         run: |
           pip install pytest
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
           pytest -s -vv server/tests
       - name: Pre-commit checks
         run: |
@@ -49,7 +49,7 @@ RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dea
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
     | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
 
-RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build
+RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build pciutils
 
 # Text Generation Inference base env
 ENV HUGGINGFACE_HUB_CACHE=/data \
@@ -105,14 +105,14 @@ The Swagger UI is also available at: [https://huggingface.github.io/text-generat
 
 ### Using a private or gated model
 
-You have the option to utilize the `HUGGING_FACE_HUB_TOKEN` environment variable for configuring the token employed by
+You have the option to utilize the `HF_TOKEN` environment variable for configuring the token employed by
 `text-generation-inference`. This allows you to gain access to protected resources.
 
 For example, if you want to serve the gated Llama V2 model variants:
 
 1. Go to https://huggingface.co/settings/tokens
 2. Copy your cli READ token
-3. Export `HUGGING_FACE_HUB_TOKEN=<your cli READ token>`
+3. Export `HF_TOKEN=<your cli READ token>`
 
 or with Docker:
 
@@ -121,7 +121,7 @@ model=meta-llama/Llama-2-7b-chat-hf
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=<your cli READ token>
 
-docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model
+docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model
 ```
 
 ### A note on Shared Memory (shm)
@@ -153,7 +153,8 @@ this will impact performance.
 ### Distributed Tracing
 
 `text-generation-inference` is instrumented with distributed tracing using OpenTelemetry. You can use this feature
-by setting the address to an OTLP collector with the `--otlp-endpoint` argument.
+by setting the address to an OTLP collector with the `--otlp-endpoint` argument. The default service name can be
+overridden with the `--otlp-service-name` argument
 
 ### Architecture
 
@@ -147,7 +147,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     tracing::info!("Downloading tokenizer");
 
     // Parse Huggingface hub token
-    let auth_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
+    let auth_token = std::env::var("HF_TOKEN").or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")).ok();
 
     // Download and instantiate tokenizer
     // We need to download it outside of the Tokio runtime
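The lookup order introduced here (prefer `HF_TOKEN`, fall back to the legacy `HUGGING_FACE_HUB_TOKEN`) can be illustrated with a small Python sketch; the helper name is hypothetical and only mirrors the Rust logic above:

```python
import os

def resolve_hf_token():
    # Hypothetical helper mirroring the Rust change above: use HF_TOKEN when it
    # is set, otherwise fall back to the legacy HUGGING_FACE_HUB_TOKEN variable.
    return os.environ.get("HF_TOKEN", os.environ.get("HUGGING_FACE_HUB_TOKEN"))
```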
@@ -1,5 +1,5 @@
 from enum import Enum
-from pydantic import BaseModel, field_validator
+from pydantic import BaseModel, field_validator, ConfigDict
 from typing import Optional, List, Union, Any
 
 from text_generation.errors import ValidationError
@@ -452,5 +452,9 @@ class StreamResponse(BaseModel):
 
 # Inference API currently deployed model
 class DeployedModel(BaseModel):
+    # Disable warning for use of `model_` prefix in `model_id`. Be mindful about adding members
+    # with model_ prefixes, since this disables guardrails for colliding fields:
+    # https://github.com/pydantic/pydantic/issues/9177
+    model_config = ConfigDict(protected_namespaces=())
     model_id: str
     sha: str
@@ -70,6 +70,8 @@ Options:
       [env: JSON_OUTPUT=]
 --otlp-endpoint <OTLP_ENDPOINT>
       [env: OTLP_ENDPOINT=]
+--otlp-service-name <OTLP_SERVICE_NAME>
+      [env: OTLP_SERVICE_NAME=]
 --cors-allow-origin <CORS_ALLOW_ORIGIN>
       [env: CORS_ALLOW_ORIGIN=]
 --ngrok
@@ -138,6 +140,8 @@ Serve's command line parameters on the TGI repository are these:
 │ --logger-level TEXT [default: INFO] │
 │ --json-output --no-json-output [default: no-json-output] │
 │ --otlp-endpoint TEXT [default: None] │
+│ --otlp-service-name TEXT [default: │
+│ text-generation-inference...│
 │ --help Show this message and exit. │
 ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
 ```
@@ -2,13 +2,13 @@
 
 If the model you wish to serve is behind gated access or the model repository on Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens)
 
-If you're using the CLI, set the `HUGGING_FACE_HUB_TOKEN` environment variable. For example:
+If you're using the CLI, set the `HF_TOKEN` environment variable. For example:
 
 ```
-export HUGGING_FACE_HUB_TOKEN=<YOUR READ TOKEN>
+export HF_TOKEN=<YOUR READ TOKEN>
 ```
 
-If you would like to do it through Docker, you can provide your token by specifying `HUGGING_FACE_HUB_TOKEN` as shown below.
+If you would like to do it through Docker, you can provide your token by specifying `HF_TOKEN` as shown below.
 
 ```bash
 model=meta-llama/Llama-2-7b-chat-hf
@@ -17,7 +17,7 @@ token=<your READ token>
 
 docker run --gpus all \
     --shm-size 1g \
-    -e HUGGING_FACE_HUB_TOKEN=$token \
+    -e HF_TOKEN=$token \
     -p 8080:80 \
     -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.4 \
     --model-id $model
@@ -336,6 +336,13 @@ Options:
 --otlp-endpoint <OTLP_ENDPOINT>
     [env: OTLP_ENDPOINT=]
 
+```
+## OTLP_SERVICE_NAME
+```shell
+--otlp-service-name <OTLP_SERVICE_NAME>
+    [env: OTLP_SERVICE_NAME=]
+    [default: text-generation-inference.router]
+
 ```
 ## CORS_ALLOW_ORIGIN
 ```shell
@@ -1,38 +1,38 @@
-import sys
-import subprocess
-import contextlib
-import pytest
 import asyncio
-import os
-import docker
+import contextlib
 import json
 import math
+import os
+import random
+import re
 import shutil
+import subprocess
+import sys
 import tempfile
 import time
-import random
+from typing import Dict, List, Optional
 
-from docker.errors import NotFound
-from typing import Optional, List, Dict
-from syrupy.extensions.json import JSONSnapshotExtension
+import docker
+import pytest
 from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
-
+from docker.errors import NotFound
+from syrupy.extensions.json import JSONSnapshotExtension
 from text_generation import AsyncClient
 from text_generation.types import (
-    Response,
-    Details,
-    InputToken,
-    Token,
     BestOfSequence,
-    Grammar,
     ChatComplete,
     ChatCompletionChunk,
     ChatCompletionComplete,
     Completion,
+    Details,
+    Grammar,
+    InputToken,
+    Response,
+    Token,
 )
 
 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
-HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
+HF_TOKEN = os.getenv("HF_TOKEN", None)
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
 DOCKER_DEVICES = os.getenv("DOCKER_DEVICES")
 SYSTEM = os.getenv("SYSTEM", None)
@@ -455,8 +455,8 @@ def launcher(event_loop):
         if not use_flash_attention:
             env["USE_FLASH_ATTENTION"] = "false"
 
-        if HUGGING_FACE_HUB_TOKEN is not None:
-            env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN
+        if HF_TOKEN is not None:
+            env["HF_TOKEN"] = HF_TOKEN
 
         volumes = []
         if DOCKER_VOLUME:
@@ -413,6 +413,9 @@ struct Args {
     #[clap(long, env)]
     otlp_endpoint: Option<String>,
 
+    #[clap(default_value = "text-generation-inference.router", long, env)]
+    otlp_service_name: String,
+
     #[clap(long, env)]
     cors_allow_origin: Vec<String>,
     #[clap(long, env)]
@@ -483,6 +486,7 @@ fn shard_manager(
     max_batch_size: Option<usize>,
     max_input_tokens: usize,
     otlp_endpoint: Option<String>,
+    otlp_service_name: String,
     log_level: LevelFilter,
     status_sender: mpsc::Sender<ShardStatus>,
     shutdown: Arc<AtomicBool>,
@@ -548,12 +552,16 @@ fn shard_manager(
         (None, Some(factor)) => Some((RopeScaling::Linear, factor)),
     };
 
-    // OpenTelemetry
+    // OpenTelemetry Endpoint
     if let Some(otlp_endpoint) = otlp_endpoint {
         shard_args.push("--otlp-endpoint".to_string());
         shard_args.push(otlp_endpoint);
     }
 
+    // OpenTelemetry Service Name
+    shard_args.push("--otlp-service-name".to_string());
+    shard_args.push(otlp_service_name);
+
     // In case we use sliding window, we may ignore the sliding in flash for some backends depending on the parameter.
     shard_args.push("--max-input-tokens".to_string());
     shard_args.push(max_input_tokens.to_string());
@@ -592,7 +600,7 @@ fn shard_manager(
 
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };
 
     // Detect rope scaling
@@ -751,7 +759,10 @@ fn shutdown_shards(shutdown: Arc<AtomicBool>, shutdown_receiver: &mpsc::Receiver
 fn num_cuda_devices() -> Option<usize> {
     let devices = match env::var("CUDA_VISIBLE_DEVICES") {
         Ok(devices) => devices,
-        Err(_) => env::var("NVIDIA_VISIBLE_DEVICES").ok()?,
+        Err(_) => match env::var("NVIDIA_VISIBLE_DEVICES") {
+            Ok(devices) => devices,
+            Err(_) => env::var("ZE_AFFINITY_MASK").ok()?,
+        }
     };
     let n_devices = devices.split(',').count();
     Some(n_devices)
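The device-count fallback chain added here (CUDA, then NVIDIA, then Intel's `ZE_AFFINITY_MASK`) can be sketched in Python for illustration; the helper name is hypothetical and simply mirrors the launcher logic above:

```python
import os

def num_visible_devices():
    # First environment variable that is set wins; the device count is the
    # number of comma-separated entries, as in the Rust launcher.
    for var in ("CUDA_VISIBLE_DEVICES", "NVIDIA_VISIBLE_DEVICES", "ZE_AFFINITY_MASK"):
        value = os.environ.get(var)
        if value is not None:
            return len(value.split(","))
    return None
```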
@@ -824,9 +835,9 @@ fn find_num_shards(
     let num_shard = match (sharded, num_shard) {
         (Some(true), None) => {
             // try to default to the number of available GPUs
-            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES");
+            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK");
             let n_devices = num_cuda_devices()
-                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES are not set");
+                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK are not set");
             if n_devices <= 1 {
                 return Err(LauncherError::NotEnoughCUDADevices(format!(
                     "`sharded` is true but only found {n_devices} CUDA devices"
@@ -925,7 +936,7 @@ fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), L
 
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };
 
     // If args.weights_cache_override is some, pass it to the download process
@@ -1035,6 +1046,7 @@ fn spawn_shards(
         let shutdown = shutdown.clone();
         let shutdown_sender = shutdown_sender.clone();
         let otlp_endpoint = args.otlp_endpoint.clone();
+        let otlp_service_name = args.otlp_service_name.clone();
         let quantize = args.quantize;
         let speculate = args.speculate;
         let dtype = args.dtype;
@@ -1074,6 +1086,7 @@ fn spawn_shards(
                 max_batch_size,
                 max_input_tokens,
                 otlp_endpoint,
+                otlp_service_name,
                 max_log_level,
                 status_sender,
                 shutdown,
@@ -1207,6 +1220,12 @@ fn spawn_webserver(
         router_args.push(otlp_endpoint);
     }
 
+    // OpenTelemetry
+    let otlp_service_name = args.otlp_service_name;
+    router_args.push("--otlp-service-name".to_string());
+    router_args.push(otlp_service_name);
+
     // CORS origins
     for origin in args.cors_allow_origin.into_iter() {
         router_args.push("--cors-allow-origin".to_string());
@@ -1227,7 +1246,7 @@ fn spawn_webserver(
 
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };
 
     // Parse Compute type
@@ -570,7 +570,7 @@ impl ChatCompletion {
         };
         Self {
             id: String::new(),
-            object: "text_completion".into(),
+            object: "chat.completion".into(),
             created,
             model,
             system_fingerprint,
@@ -682,7 +682,7 @@ impl ChatCompletionChunk {
         };
         Self {
             id: String::new(),
-            object: "text_completion".to_string(),
+            object: "chat.completion.chunk".to_string(),
             created,
             model,
             system_fingerprint,
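These two fixes restore the `object` values that OpenAI-style chat clients use to distinguish a full response from a streamed chunk. A minimal client-side sketch of that discrimination, assuming nothing beyond the `object` field itself:

```python
def classify_chat_payload(payload: dict) -> str:
    # The `object` field is the discriminator between the two response kinds.
    if payload.get("object") == "chat.completion":
        return "full response"
    if payload.get("object") == "chat.completion.chunk":
        return "streamed chunk"
    raise ValueError(f"unexpected object type: {payload.get('object')!r}")
```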
@@ -65,6 +65,8 @@ struct Args {
     json_output: bool,
     #[clap(long, env)]
     otlp_endpoint: Option<String>,
+    #[clap(default_value = "text-generation-inference.router", long, env)]
+    otlp_service_name: String,
     #[clap(long, env)]
     cors_allow_origin: Option<Vec<String>>,
     #[clap(long, env)]
@@ -107,6 +109,7 @@ async fn main() -> Result<(), RouterError> {
         validation_workers,
         json_output,
         otlp_endpoint,
+        otlp_service_name,
         cors_allow_origin,
         ngrok,
         ngrok_authtoken,
@@ -117,7 +120,7 @@ async fn main() -> Result<(), RouterError> {
     } = args;
 
     // Launch Tokio runtime
-    init_logging(otlp_endpoint, json_output);
+    init_logging(otlp_endpoint, otlp_service_name, json_output);
 
     // Validate args
     if max_input_tokens >= max_total_tokens {
@@ -156,7 +159,7 @@ async fn main() -> Result<(), RouterError> {
     });
 
     // Parse Huggingface hub token
-    let authorization_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
+    let authorization_token = std::env::var("HF_TOKEN").or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")).ok();
 
     // Tokenizer instance
     // This will only be used to validate payloads
@@ -367,10 +370,11 @@ async fn main() -> Result<(), RouterError> {
 
 /// Init logging using env variables LOG_LEVEL and LOG_FORMAT:
 /// - otlp_endpoint is an optional URL to an Open Telemetry collector
+/// - otlp_service_name service name to appear in APM
 /// - LOG_LEVEL may be TRACE, DEBUG, INFO, WARN or ERROR (default to INFO)
 /// - LOG_FORMAT may be TEXT or JSON (default to TEXT)
 /// - LOG_COLORIZE may be "false" or "true" (default to "true" or ansi supported platforms)
-fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
+fn init_logging(otlp_endpoint: Option<String>, otlp_service_name: String, json_output: bool) {
     let mut layers = Vec::new();
 
     // STDOUT/STDERR layer
@@ -401,7 +405,7 @@ fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
             trace::config()
                 .with_resource(Resource::new(vec![KeyValue::new(
                     "service.name",
-                    "text-generation-inference.router",
+                    otlp_service_name,
                 )]))
                 .with_sampler(Sampler::AlwaysOn),
         )
@@ -42,6 +42,7 @@ def serve(
     logger_level: str = "INFO",
     json_output: bool = False,
     otlp_endpoint: Optional[str] = None,
+    otlp_service_name: str = "text-generation-inference.server",
     max_input_tokens: Optional[int] = None,
 ):
     if sharded:
@@ -76,7 +77,7 @@ def serve(
 
         # Setup OpenTelemetry distributed tracing
         if otlp_endpoint is not None:
-            setup_tracing(shard=os.getenv("RANK", 0), otlp_endpoint=otlp_endpoint)
+            setup_tracing(otlp_service_name=otlp_service_name, otlp_endpoint=otlp_endpoint)
 
         # Downgrade enum into str for easier management later on
         quantize = None if quantize is None else quantize.value
@@ -54,10 +54,8 @@ class UDSOpenTelemetryAioServerInterceptor(OpenTelemetryAioServerInterceptor):
         )
 
 
-def setup_tracing(shard: int, otlp_endpoint: str):
-    resource = Resource.create(
-        attributes={"service.name": f"text-generation-inference.server-{shard}"}
-    )
+def setup_tracing(otlp_service_name: str, otlp_endpoint: str):
+    resource = Resource.create(attributes={"service.name": otlp_service_name})
     span_exporter = OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True)
     span_processor = BatchSpanProcessor(span_exporter)
 
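A minimal usage sketch of the new signature; the import path and the collector address are assumptions for illustration, not taken from this diff:

```python
from text_generation_server.tracing import setup_tracing  # assumed module path

# Both values are illustrative; any reachable OTLP/gRPC collector works here.
setup_tracing(
    otlp_service_name="text-generation-inference.server",
    otlp_endpoint="http://localhost:4317",
)
```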
@@ -1,5 +1,6 @@
 import torch
 from loguru import logger
+import subprocess
 
 
 def is_xpu_available():
@@ -19,8 +20,12 @@ def get_cuda_free_memory(device, memory_fraction):
 
 
 def get_xpu_free_memory(device, memory_fraction):
-    total_gpu_memory = torch.xpu.get_device_properties(device).total_memory
-    free_memory = int(total_gpu_memory * 0.5)
+    total_memory = torch.xpu.get_device_properties(device).total_memory
+    device_id = device.index
+    query = f"xpu-smi dump -d {device_id} -m 18 -n 1"
+    output = subprocess.check_output(query.split()).decode("utf-8").split("\n")
+    used_memory = float(output[1].split(",")[-1]) * 1024 * 1024
+    free_memory = int(total_memory * 0.95 - used_memory)
     return free_memory
 
 
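The parsing above relies on `xpu-smi dump` printing a CSV header line followed by one data row whose last column is the used device memory in MiB. A minimal sketch of that assumption, using made-up sample output rather than real tool output:

```python
# Hypothetical sample of what `xpu-smi dump -d 0 -m 18 -n 1` is assumed to print:
sample_output = "Timestamp, DeviceId, GPU Memory Used (MiB)\n06:14:46.000, 0, 1024.00\n"

lines = sample_output.split("\n")
used_mib = float(lines[1].split(",")[-1])  # last column of the first data row
used_bytes = used_mib * 1024 * 1024
print(used_bytes)  # 1073741824.0
```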