From 3447c722fd5adc995c83c6c07f40dd7f44403f5d Mon Sep 17 00:00:00 2001 From: Lucain Date: Tue, 25 Jun 2024 09:23:12 +0200 Subject: [PATCH 1/5] Support `HF_TOKEN` environment variable (#2066) * Support HF_TOKEN environement variable * Load test. --------- Co-authored-by: Nicolas Patry --- .github/workflows/build.yaml | 2 +- .github/workflows/load_test.yaml | 70 +------------------ .github/workflows/tests.yaml | 2 +- README.md | 6 +- benchmark/src/main.rs | 2 +- .../basic_tutorials/gated_model_access.md | 8 +-- integration-tests/conftest.py | 38 +++++----- launcher/src/main.rs | 6 +- router/src/main.rs | 2 +- 9 files changed, 35 insertions(+), 101 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 22fa06e3..90fb9d45 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -178,6 +178,6 @@ jobs: export DOCKER_VOLUME=/mnt/cache export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }} export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }} - export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} + export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} echo $DOCKER_IMAGE pytest -s -vv integration-tests diff --git a/.github/workflows/load_test.yaml b/.github/workflows/load_test.yaml index fd22e395..a10c9428 100644 --- a/.github/workflows/load_test.yaml +++ b/.github/workflows/load_test.yaml @@ -11,66 +11,24 @@ on: - 'main' jobs: - start-runner: - name: Start self-hosted EC2 runner - runs-on: ubuntu-latest - env: - AWS_REGION: eu-central-1 - EC2_AMI_ID: ami-0ab09c07cfd194259 - EC2_INSTANCE_TYPE: g5.12xlarge - EC2_SUBNET_ID: subnet-988fd9f2,subnet-6f56db13,subnet-6a039326 - EC2_SECURITY_GROUP: sg-072f92ae3082936c6 - outputs: - label: ${{ steps.start-ec2-runner.outputs.label }} - ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} - steps: - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v1 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ env.AWS_REGION }} - - name: Start EC2 runner - id: start-ec2-runner - uses: philschmid/philschmid-ec2-github-runner@main - with: - mode: start - github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - ec2-image-id: ${{ env.EC2_AMI_ID }} - ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }} - subnet-id: ${{ env.EC2_SUBNET_ID }} - security-group-id: ${{ env.EC2_SECURITY_GROUP }} - aws-resource-tags: > # optional, requires additional permissions - [ - {"Key": "Name", "Value": "ec2-tgi-github-runner"}, - {"Key": "GitHubRepository", "Value": "${{ github.repository }}"} - ] - load-tests: concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true - needs: start-runner # required to start the main job when the runner is ready - runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner + runs-on: [self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci] env: DOCKER_VOLUME: /cache steps: - name: Checkout repository uses: actions/checkout@v3 - - name: Prepare disks - run: | - sudo mkfs -t ext4 /dev/nvme1n1 - sudo mkdir ${{ env.DOCKER_VOLUME }} - sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }} - - name: Install k6 run: | curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1 - name: Start starcoder run: | - docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v ${{ 
env.DOCKER_VOLUME }}:/data -e HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768 + docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768 sleep 10 wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health @@ -82,27 +40,3 @@ jobs: if: ${{ always() }} run: | docker stop tgi-starcoder || true - - stop-runner: - name: Stop self-hosted EC2 runner - needs: - - start-runner - - load-tests - runs-on: ubuntu-latest - env: - AWS_REGION: eu-central-1 - if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs - steps: - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v1 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ env.AWS_REGION }} - - name: Stop EC2 runner - uses: philschmid/philschmid-ec2-github-runner@main - with: - mode: stop - github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - label: ${{ needs.start-runner.outputs.label }} - ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }} diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index a8074ddd..e21344d1 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -72,7 +72,7 @@ jobs: - name: Run server tests run: | pip install pytest - export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} + export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} pytest -s -vv server/tests - name: Pre-commit checks run: | diff --git a/README.md b/README.md index 74616748..3b54af45 100644 --- a/README.md +++ b/README.md @@ -105,14 +105,14 @@ The Swagger UI is also available at: [https://huggingface.github.io/text-generat ### Using a private or gated model -You have the option to utilize the `HUGGING_FACE_HUB_TOKEN` environment variable for configuring the token employed by +You have the option to utilize the `HF_TOKEN` environment variable for configuring the token employed by `text-generation-inference`. This allows you to gain access to protected resources. For example, if you want to serve the gated Llama V2 model variants: 1. Go to https://huggingface.co/settings/tokens 2. Copy your cli READ token -3. Export `HUGGING_FACE_HUB_TOKEN=` +3. 
Export `HF_TOKEN=` or with Docker: @@ -121,7 +121,7 @@ model=meta-llama/Llama-2-7b-chat-hf volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run token= -docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model +docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model ``` ### A note on Shared Memory (shm) diff --git a/benchmark/src/main.rs b/benchmark/src/main.rs index b9d80b7a..603b4087 100644 --- a/benchmark/src/main.rs +++ b/benchmark/src/main.rs @@ -147,7 +147,7 @@ fn main() -> Result<(), Box> { tracing::info!("Downloading tokenizer"); // Parse Huggingface hub token - let auth_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok(); + let auth_token = std::env::var("HF_TOKEN").or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")).ok(); // Download and instantiate tokenizer // We need to download it outside of the Tokio runtime diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md index b49c59c9..ef3a1db7 100644 --- a/docs/source/basic_tutorials/gated_model_access.md +++ b/docs/source/basic_tutorials/gated_model_access.md @@ -2,13 +2,13 @@ If the model you wish to serve is behind gated access or the model repository on Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens) -If you're using the CLI, set the `HUGGING_FACE_HUB_TOKEN` environment variable. For example: +If you're using the CLI, set the `HF_TOKEN` environment variable. For example: ``` -export HUGGING_FACE_HUB_TOKEN= +export HF_TOKEN= ``` -If you would like to do it through Docker, you can provide your token by specifying `HUGGING_FACE_HUB_TOKEN` as shown below. +If you would like to do it through Docker, you can provide your token by specifying `HF_TOKEN` as shown below. 
```bash model=meta-llama/Llama-2-7b-chat-hf @@ -17,7 +17,7 @@ token= docker run --gpus all \ --shm-size 1g \ - -e HUGGING_FACE_HUB_TOKEN=$token \ + -e HF_TOKEN=$token \ -p 8080:80 \ -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.4 \ --model-id $model diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index 0b239484..13337165 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -1,38 +1,38 @@ -import sys -import subprocess -import contextlib -import pytest import asyncio -import os -import docker +import contextlib import json import math +import os +import random +import re import shutil +import subprocess +import sys import tempfile import time -import random +from typing import Dict, List, Optional -from docker.errors import NotFound -from typing import Optional, List, Dict -from syrupy.extensions.json import JSONSnapshotExtension +import docker +import pytest from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError - +from docker.errors import NotFound +from syrupy.extensions.json import JSONSnapshotExtension from text_generation import AsyncClient from text_generation.types import ( - Response, - Details, - InputToken, - Token, BestOfSequence, - Grammar, ChatComplete, ChatCompletionChunk, ChatCompletionComplete, Completion, + Details, + Grammar, + InputToken, + Response, + Token, ) DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None) -HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None) +HF_TOKEN = os.getenv("HF_TOKEN", None) DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data") DOCKER_DEVICES = os.getenv("DOCKER_DEVICES") @@ -447,8 +447,8 @@ def launcher(event_loop): if not use_flash_attention: env["USE_FLASH_ATTENTION"] = "false" - if HUGGING_FACE_HUB_TOKEN is not None: - env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN + if HF_TOKEN is not None: + env["HF_TOKEN"] = HF_TOKEN volumes = [] if DOCKER_VOLUME: diff --git a/launcher/src/main.rs b/launcher/src/main.rs index e4d5bb85..3e0c7a27 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -592,7 +592,7 @@ fn shard_manager( // Parse Inference API token if let Ok(api_token) = env::var("HF_API_TOKEN") { - envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into())) + envs.push(("HF_TOKEN".into(), api_token.into())) }; // Detect rope scaling @@ -925,7 +925,7 @@ fn download_convert_model(args: &Args, running: Arc) -> Result<(), L // Parse Inference API token if let Ok(api_token) = env::var("HF_API_TOKEN") { - envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into())) + envs.push(("HF_TOKEN".into(), api_token.into())) }; // If args.weights_cache_override is some, pass it to the download process @@ -1227,7 +1227,7 @@ fn spawn_webserver( // Parse Inference API token if let Ok(api_token) = env::var("HF_API_TOKEN") { - envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into())) + envs.push(("HF_TOKEN".into(), api_token.into())) }; // Parse Compute type diff --git a/router/src/main.rs b/router/src/main.rs index c4203dbc..013176f3 100644 --- a/router/src/main.rs +++ b/router/src/main.rs @@ -156,7 +156,7 @@ async fn main() -> Result<(), RouterError> { }); // Parse Huggingface hub token - let authorization_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok(); + let authorization_token = std::env::var("HF_TOKEN").or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")).ok(); // Tokenizer instance // This will only be used to validate payloads From 1869ee2f5767db42aafa20712736aea99ded2bfc Mon Sep 17 00:00:00 2001 From: 
KevinDuffy94 Date: Tue, 25 Jun 2024 08:33:01 +0100 Subject: [PATCH 2/5] Add OTLP Service Name Environment Variable (#2076) * Adding Service Name Environment variable for https://github.com/huggingface/text-generation-inference/issues/2069 * Update Docs * Update README.md * Update Launcher Docs * Update Launcher Docs Removing Option --- README.md | 3 ++- docs/source/architecture.md | 4 ++++ docs/source/basic_tutorials/launcher.md | 7 +++++++ launcher/src/main.rs | 18 +++++++++++++++++- router/src/main.rs | 10 +++++++--- server/text_generation_server/cli.py | 3 ++- server/text_generation_server/tracing.py | 6 ++---- 7 files changed, 41 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 3b54af45..2016b915 100644 --- a/README.md +++ b/README.md @@ -153,7 +153,8 @@ this will impact performance. ### Distributed Tracing `text-generation-inference` is instrumented with distributed tracing using OpenTelemetry. You can use this feature -by setting the address to an OTLP collector with the `--otlp-endpoint` argument. +by setting the address to an OTLP collector with the `--otlp-endpoint` argument. The default service name can be +overridden with the `--otlp-service-name` argument ### Architecture diff --git a/docs/source/architecture.md b/docs/source/architecture.md index b7885879..a8418817 100644 --- a/docs/source/architecture.md +++ b/docs/source/architecture.md @@ -70,6 +70,8 @@ Options: [env: JSON_OUTPUT=] --otlp-endpoint [env: OTLP_ENDPOINT=] + --otlp-service-name + [env: OTLP_SERVICE_NAME=] --cors-allow-origin [env: CORS_ALLOW_ORIGIN=] --ngrok @@ -138,6 +140,8 @@ Serve's command line parameters on the TGI repository are these: │ --logger-level TEXT [default: INFO] │ │ --json-output --no-json-output [default: no-json-output] │ │ --otlp-endpoint TEXT [default: None] │ +│ --otlp-service-name TEXT [default: │ +│ text-generation-inference...│ │ --help Show this message and exit. 
│ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` diff --git a/docs/source/basic_tutorials/launcher.md b/docs/source/basic_tutorials/launcher.md index 9246093e..f6175925 100644 --- a/docs/source/basic_tutorials/launcher.md +++ b/docs/source/basic_tutorials/launcher.md @@ -336,6 +336,13 @@ Options: --otlp-endpoint [env: OTLP_ENDPOINT=] +``` +## OTLP_SERVICE_NAME +```shell + --otlp-service-name + [env: OTLP_SERVICE_NAME=] + [default: text-generation-inference.router] + ``` ## CORS_ALLOW_ORIGIN ```shell diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 3e0c7a27..14d03429 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -413,6 +413,9 @@ struct Args { #[clap(long, env)] otlp_endpoint: Option, + #[clap(default_value = "text-generation-inference.router", long, env)] + otlp_service_name: String, + #[clap(long, env)] cors_allow_origin: Vec, #[clap(long, env)] @@ -483,6 +486,7 @@ fn shard_manager( max_batch_size: Option, max_input_tokens: usize, otlp_endpoint: Option, + otlp_service_name: String, log_level: LevelFilter, status_sender: mpsc::Sender, shutdown: Arc, @@ -548,12 +552,16 @@ fn shard_manager( (None, Some(factor)) => Some((RopeScaling::Linear, factor)), }; - // OpenTelemetry + // OpenTelemetry Endpoint if let Some(otlp_endpoint) = otlp_endpoint { shard_args.push("--otlp-endpoint".to_string()); shard_args.push(otlp_endpoint); } + // OpenTelemetry Service Name + shard_args.push("--otlp-service-name".to_string()); + shard_args.push(otlp_service_name); + // In case we use sliding window, we may ignore the sliding in flash for some backends depending on the parameter. shard_args.push("--max-input-tokens".to_string()); shard_args.push(max_input_tokens.to_string()); @@ -1035,6 +1043,7 @@ fn spawn_shards( let shutdown = shutdown.clone(); let shutdown_sender = shutdown_sender.clone(); let otlp_endpoint = args.otlp_endpoint.clone(); + let otlp_service_name = args.otlp_service_name.clone(); let quantize = args.quantize; let speculate = args.speculate; let dtype = args.dtype; @@ -1074,6 +1083,7 @@ fn spawn_shards( max_batch_size, max_input_tokens, otlp_endpoint, + otlp_service_name, max_log_level, status_sender, shutdown, @@ -1207,6 +1217,12 @@ fn spawn_webserver( router_args.push(otlp_endpoint); } + // OpenTelemetry + let otlp_service_name = args.otlp_service_name; + router_args.push("--otlp-service-name".to_string()); + router_args.push(otlp_service_name); + + // CORS origins for origin in args.cors_allow_origin.into_iter() { router_args.push("--cors-allow-origin".to_string()); diff --git a/router/src/main.rs b/router/src/main.rs index 013176f3..dcb9ce99 100644 --- a/router/src/main.rs +++ b/router/src/main.rs @@ -65,6 +65,8 @@ struct Args { json_output: bool, #[clap(long, env)] otlp_endpoint: Option, + #[clap(default_value = "text-generation-inference.router", long, env)] + otlp_service_name: String, #[clap(long, env)] cors_allow_origin: Option>, #[clap(long, env)] @@ -107,6 +109,7 @@ async fn main() -> Result<(), RouterError> { validation_workers, json_output, otlp_endpoint, + otlp_service_name, cors_allow_origin, ngrok, ngrok_authtoken, @@ -117,7 +120,7 @@ async fn main() -> Result<(), RouterError> { } = args; // Launch Tokio runtime - init_logging(otlp_endpoint, json_output); + init_logging(otlp_endpoint, otlp_service_name, json_output); // Validate args if max_input_tokens >= max_total_tokens { @@ -367,10 +370,11 @@ async fn main() -> Result<(), RouterError> { /// Init logging using env variables 
LOG_LEVEL and LOG_FORMAT: /// - otlp_endpoint is an optional URL to an Open Telemetry collector +/// - otlp_service_name service name to appear in APM /// - LOG_LEVEL may be TRACE, DEBUG, INFO, WARN or ERROR (default to INFO) /// - LOG_FORMAT may be TEXT or JSON (default to TEXT) /// - LOG_COLORIZE may be "false" or "true" (default to "true" or ansi supported platforms) -fn init_logging(otlp_endpoint: Option, json_output: bool) { +fn init_logging(otlp_endpoint: Option, otlp_service_name: String, json_output: bool) { let mut layers = Vec::new(); // STDOUT/STDERR layer @@ -401,7 +405,7 @@ fn init_logging(otlp_endpoint: Option, json_output: bool) { trace::config() .with_resource(Resource::new(vec![KeyValue::new( "service.name", - "text-generation-inference.router", + otlp_service_name, )])) .with_sampler(Sampler::AlwaysOn), ) diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py index 5d25bfc5..18cad071 100644 --- a/server/text_generation_server/cli.py +++ b/server/text_generation_server/cli.py @@ -42,6 +42,7 @@ def serve( logger_level: str = "INFO", json_output: bool = False, otlp_endpoint: Optional[str] = None, + otlp_service_name: str = "text-generation-inference.server", max_input_tokens: Optional[int] = None, ): if sharded: @@ -76,7 +77,7 @@ def serve( # Setup OpenTelemetry distributed tracing if otlp_endpoint is not None: - setup_tracing(shard=os.getenv("RANK", 0), otlp_endpoint=otlp_endpoint) + setup_tracing(otlp_service_name=otlp_service_name, otlp_endpoint=otlp_endpoint) # Downgrade enum into str for easier management later on quantize = None if quantize is None else quantize.value diff --git a/server/text_generation_server/tracing.py b/server/text_generation_server/tracing.py index bf03c379..bc7a04ee 100644 --- a/server/text_generation_server/tracing.py +++ b/server/text_generation_server/tracing.py @@ -54,10 +54,8 @@ class UDSOpenTelemetryAioServerInterceptor(OpenTelemetryAioServerInterceptor): ) -def setup_tracing(shard: int, otlp_endpoint: str): - resource = Resource.create( - attributes={"service.name": f"text-generation-inference.server-{shard}"} - ) +def setup_tracing(otlp_service_name: str, otlp_endpoint: str): + resource = Resource.create(attributes={"service.name": otlp_service_name}) span_exporter = OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True) span_processor = BatchSpanProcessor(span_exporter) From 5b2155b0f8ace2985b3d8e109fed88769d7395ee Mon Sep 17 00:00:00 2001 From: Jeff Date: Tue, 25 Jun 2024 04:10:32 -0400 Subject: [PATCH 3/5] corrected Pydantic warning. (#2095) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * corrected Pydantic warning. 
* Update clients/python/text_generation/types.py Co-authored-by: Daniël de Kok --------- Co-authored-by: Nicolas Patry Co-authored-by: Daniël de Kok --- clients/python/text_generation/types.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py index eb872ee6..497468d9 100644 --- a/clients/python/text_generation/types.py +++ b/clients/python/text_generation/types.py @@ -1,5 +1,5 @@ from enum import Enum -from pydantic import BaseModel, field_validator +from pydantic import BaseModel, field_validator, ConfigDict from typing import Optional, List, Union, Any from text_generation.errors import ValidationError @@ -452,5 +452,9 @@ class StreamResponse(BaseModel): # Inference API currently deployed model class DeployedModel(BaseModel): + # Disable warning for use of `model_` prefix in `model_id`. Be mindful about adding members + # with model_ prefixes, since this disables guardrails for colliding fields: + # https://github.com/pydantic/pydantic/issues/9177 + model_config = ConfigDict(protected_namespaces=()) model_id: str sha: str From 83634dc1227d24624514a9577050886e712ffb13 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Tue, 25 Jun 2024 16:15:46 +0800 Subject: [PATCH 4/5] use xpu-smi to dump used memory (#2047) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * use xpu-smi to dump used memory xpu use "ZE_AFFINITY_MASK" to control card, usage is like CUDA_VISIBLE_DEVICES Signed-off-by: Wang, Yi A * Update server/text_generation_server/utils/import_utils.py Co-authored-by: Daniël de Kok --------- Signed-off-by: Wang, Yi A Co-authored-by: Daniël de Kok --- Dockerfile_intel | 2 +- launcher/src/main.rs | 9 ++++++--- server/text_generation_server/utils/import_utils.py | 9 +++++++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/Dockerfile_intel b/Dockerfile_intel index 131f49ba..f09614d4 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -49,7 +49,7 @@ RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dea RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list -RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build +RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build pciutils # Text Generation Inference base env ENV HUGGINGFACE_HUB_CACHE=/data \ diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 14d03429..2e06c1ef 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -759,7 +759,10 @@ fn shutdown_shards(shutdown: Arc, shutdown_receiver: &mpsc::Receiver fn num_cuda_devices() -> Option { let devices = match env::var("CUDA_VISIBLE_DEVICES") { Ok(devices) => devices, - Err(_) => env::var("NVIDIA_VISIBLE_DEVICES").ok()?, + Err(_) => match env::var("NVIDIA_VISIBLE_DEVICES") { + Ok(devices) => devices, + Err(_) => env::var("ZE_AFFINITY_MASK").ok()?, + } }; let n_devices = devices.split(',').count(); Some(n_devices) @@ -832,9 +835,9 @@ fn find_num_shards( let num_shard = match (sharded, num_shard) { (Some(true), None) => { // try to default to the number of available GPUs - tracing::info!("Parsing num_shard from 
CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES"); + tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK"); let n_devices = num_cuda_devices() - .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES are not set"); + .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK are not set"); if n_devices <= 1 { return Err(LauncherError::NotEnoughCUDADevices(format!( "`sharded` is true but only found {n_devices} CUDA devices" diff --git a/server/text_generation_server/utils/import_utils.py b/server/text_generation_server/utils/import_utils.py index d79e36c2..c3929392 100644 --- a/server/text_generation_server/utils/import_utils.py +++ b/server/text_generation_server/utils/import_utils.py @@ -1,5 +1,6 @@ import torch from loguru import logger +import subprocess def is_xpu_available(): @@ -19,8 +20,12 @@ def get_cuda_free_memory(device, memory_fraction): def get_xpu_free_memory(device, memory_fraction): - total_gpu_memory = torch.xpu.get_device_properties(device).total_memory - free_memory = int(total_gpu_memory * 0.5) + total_memory = torch.xpu.get_device_properties(device).total_memory + device_id = device.index + query = f"xpu-smi dump -d {device_id} -m 18 -n 1" + output = subprocess.check_output(query.split()).decode("utf-8").split("\n") + used_memory = float(output[1].split(",")[-1]) * 1024 * 1024 + free_memory = int(total_memory * 0.95 - used_memory) return free_memory From b69f07804184b802f1489eb314c3c13a34807c16 Mon Sep 17 00:00:00 2001 From: sunxichen Date: Tue, 25 Jun 2024 16:59:50 +0800 Subject: [PATCH 5/5] fix ChatCompletion and ChatCompletionChunk object string not compatible with standard openai api (#2089) Co-authored-by: sunxichen --- router/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/router/src/lib.rs b/router/src/lib.rs index b0b93c13..5d201937 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -570,7 +570,7 @@ impl ChatCompletion { }; Self { id: String::new(), - object: "text_completion".into(), + object: "chat.completion".into(), created, model, system_fingerprint, @@ -682,7 +682,7 @@ impl ChatCompletionChunk { }; Self { id: String::new(), - object: "text_completion".to_string(), + object: "chat.completion.chunk".to_string(), created, model, system_fingerprint,
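
Taken together, the token handling from the first patch in this series reduces to a single environment-variable fallback. The sketch below is a minimal, standalone illustration of that lookup order, not the router's full code path (which also receives `HF_API_TOKEN` re-exported as `HF_TOKEN` by the launcher); it uses only the Rust standard library and illustrative log messages:

```rust
use std::env;

/// Minimal sketch of the lookup order now used by the router and benchmark:
/// prefer the new `HF_TOKEN`, fall back to the legacy `HUGGING_FACE_HUB_TOKEN`.
fn hub_token() -> Option<String> {
    env::var("HF_TOKEN")
        .or_else(|_| env::var("HUGGING_FACE_HUB_TOKEN"))
        .ok()
}

fn main() {
    match hub_token() {
        Some(_) => println!("Hub token found; gated or private models can be fetched"),
        None => println!("No Hub token set; only public models are reachable"),
    }
}
```

With this ordering, setting `HF_TOKEN` takes precedence, while existing deployments that still export `HUGGING_FACE_HUB_TOKEN` keep working unchanged.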