Merge branch 'main' into ci_amd3

fxmarty 2024-06-25 11:20:00 +02:00
commit dc53846456
16 changed files with 96 additions and 119 deletions

View File

@@ -11,66 +11,24 @@ on:
     - 'main'
 jobs:
-  start-runner:
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: eu-central-1
-      EC2_AMI_ID: ami-0ab09c07cfd194259
-      EC2_INSTANCE_TYPE: g5.12xlarge
-      EC2_SUBNET_ID: subnet-988fd9f2,subnet-6f56db13,subnet-6a039326
-      EC2_SECURITY_GROUP: sg-072f92ae3082936c6
-    outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ${{ env.EC2_AMI_ID }}
-          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
-          subnet-id: ${{ env.EC2_SUBNET_ID }}
-          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
-          aws-resource-tags: > # optional, requires additional permissions
-            [
-              {"Key": "Name", "Value": "ec2-tgi-github-runner"},
-              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
-            ]
   load-tests:
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
-    needs: start-runner # required to start the main job when the runner is ready
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    runs-on: [self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci]
     env:
       DOCKER_VOLUME: /cache
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
-      - name: Prepare disks
-        run: |
-          sudo mkfs -t ext4 /dev/nvme1n1
-          sudo mkdir ${{ env.DOCKER_VOLUME }}
-          sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
       - name: Install k6
         run: |
           curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1
       - name: Start starcoder
         run: |
-          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v ${{ env.DOCKER_VOLUME }}:/data -e HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
+          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
           sleep 10
           wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
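As an aside, the readiness gate in the hunk above simply polls the `/health` route until the container answers. A rough Python equivalent of that `wget` retry loop, illustrative only and not part of the workflow:

```python
# Illustrative sketch: poll the TGI /health endpoint until it responds,
# mirroring the wget retry loop in the workflow above (240 attempts, 10s timeout).
import time
import urllib.request

for _ in range(240):
    try:
        urllib.request.urlopen("http://localhost:3000/health", timeout=10)
        break
    except OSError:
        time.sleep(1)
else:
    raise RuntimeError("tgi-starcoder never became healthy")
```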
@@ -82,27 +40,3 @@ jobs:
         if: ${{ always() }}
         run: |
           docker stop tgi-starcoder || true
-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner
-      - load-tests
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: eu-central-1
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Stop EC2 runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}

View File

@@ -72,7 +72,7 @@ jobs:
       - name: Run server tests
         run: |
          pip install pytest
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
           pytest -s -vv server/tests
       - name: Pre-commit checks
         run: |

View File

@@ -49,7 +49,7 @@ RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dea
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
     | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list

-RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build
+RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build pciutils

 # Text Generation Inference base env
 ENV HUGGINGFACE_HUB_CACHE=/data \

View File

@@ -105,14 +105,14 @@ The Swagger UI is also available at: [https://huggingface.github.io/text-generat
 ### Using a private or gated model

-You have the option to utilize the `HUGGING_FACE_HUB_TOKEN` environment variable for configuring the token employed by
+You have the option to utilize the `HF_TOKEN` environment variable for configuring the token employed by
 `text-generation-inference`. This allows you to gain access to protected resources.

 For example, if you want to serve the gated Llama V2 model variants:

 1. Go to https://huggingface.co/settings/tokens
 2. Copy your cli READ token
-3. Export `HUGGING_FACE_HUB_TOKEN=<your cli READ token>`
+3. Export `HF_TOKEN=<your cli READ token>`

 or with Docker:
@@ -121,7 +121,7 @@ model=meta-llama/Llama-2-7b-chat-hf
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=<your cli READ token>

-docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model
+docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model
 ```

 ### A note on Shared Memory (shm)
@@ -153,7 +153,8 @@ this will impact performance.
 ### Distributed Tracing

 `text-generation-inference` is instrumented with distributed tracing using OpenTelemetry. You can use this feature
-by setting the address to an OTLP collector with the `--otlp-endpoint` argument.
+by setting the address to an OTLP collector with the `--otlp-endpoint` argument. The default service name can be
+overridden with the `--otlp-service-name` argument

 ### Architecture
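For orientation, the two tracing options described above also map to environment variables documented elsewhere in this commit (`OTLP_ENDPOINT`, `OTLP_SERVICE_NAME`). A hedged Python sketch of how a caller might resolve them, with the default below matching the router's documented default:

```python
# Hedged sketch: resolve the tracing options from their environment variables.
import os

otlp_endpoint = os.environ.get("OTLP_ENDPOINT")  # unset means tracing export stays off
otlp_service_name = os.environ.get("OTLP_SERVICE_NAME", "text-generation-inference.router")
print(otlp_endpoint, otlp_service_name)
```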

View File

@@ -147,7 +147,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     tracing::info!("Downloading tokenizer");

     // Parse Huggingface hub token
-    let auth_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
+    let auth_token = std::env::var("HF_TOKEN").or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")).ok();

     // Download and instantiate tokenizer
     // We need to download it outside of the Tokio runtime

View File

@@ -1,5 +1,5 @@
 from enum import Enum
-from pydantic import BaseModel, field_validator
+from pydantic import BaseModel, field_validator, ConfigDict
 from typing import Optional, List, Union, Any

 from text_generation.errors import ValidationError
@@ -452,5 +452,9 @@ class StreamResponse(BaseModel):

 # Inference API currently deployed model
 class DeployedModel(BaseModel):
+    # Disable warning for use of `model_` prefix in `model_id`. Be mindful about adding members
+    # with model_ prefixes, since this disables guardrails for colliding fields:
+    # https://github.com/pydantic/pydantic/issues/9177
+    model_config = ConfigDict(protected_namespaces=())
     model_id: str
     sha: str
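The comment added above references pydantic's protected `model_` namespace. A small, self-contained sketch, not taken from the client itself, of the warning this setting suppresses:

```python
# Minimal illustration of why protected_namespaces=() is needed: pydantic v2
# reserves the "model_" prefix and warns when a field such as model_id uses it.
from pydantic import BaseModel, ConfigDict

class DeployedModelExample(BaseModel):
    # Without this config line, pydantic emits a UserWarning about "model_id".
    model_config = ConfigDict(protected_namespaces=())

    model_id: str
    sha: str

print(DeployedModelExample(model_id="bigcode/starcoder", sha="abc123"))
```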

View File

@@ -70,6 +70,8 @@ Options:
       [env: JSON_OUTPUT=]
   --otlp-endpoint <OTLP_ENDPOINT>
       [env: OTLP_ENDPOINT=]
+  --otlp-service-name <OTLP_SERVICE_NAME>
+      [env: OTLP_SERVICE_NAME=]
   --cors-allow-origin <CORS_ALLOW_ORIGIN>
       [env: CORS_ALLOW_ORIGIN=]
   --ngrok
@@ -138,6 +140,8 @@ Serve's command line parameters on the TGI repository are these:
 │ --logger-level TEXT [default: INFO] │
 │ --json-output --no-json-output [default: no-json-output] │
 │ --otlp-endpoint TEXT [default: None] │
+│ --otlp-service-name TEXT [default: │
+│     text-generation-inference...│
 │ --help Show this message and exit. │
 ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
 ```

View File

@@ -2,13 +2,13 @@
 If the model you wish to serve is behind gated access or the model repository on Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens)

-If you're using the CLI, set the `HUGGING_FACE_HUB_TOKEN` environment variable. For example:
+If you're using the CLI, set the `HF_TOKEN` environment variable. For example:

 ```
-export HUGGING_FACE_HUB_TOKEN=<YOUR READ TOKEN>
+export HF_TOKEN=<YOUR READ TOKEN>
 ```

-If you would like to do it through Docker, you can provide your token by specifying `HUGGING_FACE_HUB_TOKEN` as shown below.
+If you would like to do it through Docker, you can provide your token by specifying `HF_TOKEN` as shown below.

 ```bash
 model=meta-llama/Llama-2-7b-chat-hf
@@ -17,7 +17,7 @@ token=<your READ token>
 docker run --gpus all \
     --shm-size 1g \
-    -e HUGGING_FACE_HUB_TOKEN=$token \
+    -e HF_TOKEN=$token \
     -p 8080:80 \
     -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.4 \
     --model-id $model

View File

@@ -336,6 +336,13 @@ Options:
       --otlp-endpoint <OTLP_ENDPOINT>
           [env: OTLP_ENDPOINT=]

+```
+## OTLP_SERVICE_NAME
+```shell
+      --otlp-service-name <OTLP_SERVICE_NAME>
+          [env: OTLP_SERVICE_NAME=]
+          [default: text-generation-inference.router]
+
 ```
 ## CORS_ALLOW_ORIGIN
 ```shell

View File

@@ -1,38 +1,38 @@
-import sys
-import subprocess
-import contextlib
-import pytest
-import asyncio
-import os
-import docker
-import json
-import math
-import shutil
-import tempfile
-import time
-import random
-from docker.errors import NotFound
-from typing import Optional, List, Dict
-from syrupy.extensions.json import JSONSnapshotExtension
-from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
+import asyncio
+import contextlib
+import json
+import math
+import os
+import random
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+from typing import Dict, List, Optional
+
+import docker
+import pytest
+from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
+from docker.errors import NotFound
+from syrupy.extensions.json import JSONSnapshotExtension
+
 from text_generation import AsyncClient
 from text_generation.types import (
-    Response,
-    Details,
-    InputToken,
-    Token,
     BestOfSequence,
-    Grammar,
     ChatComplete,
     ChatCompletionChunk,
     ChatCompletionComplete,
     Completion,
+    Details,
+    Grammar,
+    InputToken,
+    Response,
+    Token,
 )

 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
-HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
+HF_TOKEN = os.getenv("HF_TOKEN", None)
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
 DOCKER_DEVICES = os.getenv("DOCKER_DEVICES")
 SYSTEM = os.getenv("SYSTEM", None)
@@ -455,8 +455,8 @@ def launcher(event_loop):
         if not use_flash_attention:
             env["USE_FLASH_ATTENTION"] = "false"

-        if HUGGING_FACE_HUB_TOKEN is not None:
-            env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN
+        if HF_TOKEN is not None:
+            env["HF_TOKEN"] = HF_TOKEN

         volumes = []
         if DOCKER_VOLUME:
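The environment dictionary built above is eventually handed to the Docker SDK when the test container starts. A hedged sketch of that hand-off, standalone and not the repo's fixture code, with the image reused from the workflow earlier in this commit:

```python
# Hedged sketch: forward HF_TOKEN from the host into a test container via the Docker SDK.
import os
import docker

env = {}
if os.getenv("HF_TOKEN") is not None:
    env["HF_TOKEN"] = os.environ["HF_TOKEN"]

client = docker.from_env()
container = client.containers.run(
    "ghcr.io/huggingface/text-generation-inference:latest",
    environment=env,
    detach=True,
)
print(container.name)
```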

View File

@@ -413,6 +413,9 @@ struct Args {
     #[clap(long, env)]
     otlp_endpoint: Option<String>,
+    #[clap(default_value = "text-generation-inference.router", long, env)]
+    otlp_service_name: String,
     #[clap(long, env)]
     cors_allow_origin: Vec<String>,
     #[clap(long, env)]
@@ -483,6 +486,7 @@ fn shard_manager(
     max_batch_size: Option<usize>,
     max_input_tokens: usize,
     otlp_endpoint: Option<String>,
+    otlp_service_name: String,
     log_level: LevelFilter,
     status_sender: mpsc::Sender<ShardStatus>,
     shutdown: Arc<AtomicBool>,
@@ -548,12 +552,16 @@ fn shard_manager(
         (None, Some(factor)) => Some((RopeScaling::Linear, factor)),
     };

-    // OpenTelemetry
+    // OpenTelemetry Endpoint
     if let Some(otlp_endpoint) = otlp_endpoint {
         shard_args.push("--otlp-endpoint".to_string());
         shard_args.push(otlp_endpoint);
     }

+    // OpenTelemetry Service Name
+    shard_args.push("--otlp-service-name".to_string());
+    shard_args.push(otlp_service_name);
+
     // In case we use sliding window, we may ignore the sliding in flash for some backends depending on the parameter.
     shard_args.push("--max-input-tokens".to_string());
     shard_args.push(max_input_tokens.to_string());
@@ -592,7 +600,7 @@ fn shard_manager(
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };

     // Detect rope scaling
@@ -751,7 +759,10 @@ fn shutdown_shards(shutdown: Arc<AtomicBool>, shutdown_receiver: &mpsc::Receiver
 fn num_cuda_devices() -> Option<usize> {
     let devices = match env::var("CUDA_VISIBLE_DEVICES") {
         Ok(devices) => devices,
-        Err(_) => env::var("NVIDIA_VISIBLE_DEVICES").ok()?,
+        Err(_) => match env::var("NVIDIA_VISIBLE_DEVICES") {
+            Ok(devices) => devices,
+            Err(_) => env::var("ZE_AFFINITY_MASK").ok()?,
+        }
     };
     let n_devices = devices.split(',').count();
     Some(n_devices)
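The launcher's fallback chain above is written in Rust; as a rough, illustrative Python equivalent of the same lookup order (not part of the codebase):

```python
# Illustrative Python equivalent of the launcher's device-count fallback:
# CUDA_VISIBLE_DEVICES, then NVIDIA_VISIBLE_DEVICES, then ZE_AFFINITY_MASK.
import os
from typing import Optional

def num_devices() -> Optional[int]:
    for var in ("CUDA_VISIBLE_DEVICES", "NVIDIA_VISIBLE_DEVICES", "ZE_AFFINITY_MASK"):
        value = os.environ.get(var)
        if value is not None:
            return len(value.split(","))
    return None
```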
@@ -824,9 +835,9 @@ fn find_num_shards(
     let num_shard = match (sharded, num_shard) {
         (Some(true), None) => {
             // try to default to the number of available GPUs
-            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES");
+            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK");
             let n_devices = num_cuda_devices()
-                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES are not set");
+                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK are not set");
             if n_devices <= 1 {
                 return Err(LauncherError::NotEnoughCUDADevices(format!(
                     "`sharded` is true but only found {n_devices} CUDA devices"
@@ -925,7 +936,7 @@ fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), L
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };

     // If args.weights_cache_override is some, pass it to the download process
@@ -1035,6 +1046,7 @@ fn spawn_shards(
         let shutdown = shutdown.clone();
         let shutdown_sender = shutdown_sender.clone();
         let otlp_endpoint = args.otlp_endpoint.clone();
+        let otlp_service_name = args.otlp_service_name.clone();
         let quantize = args.quantize;
         let speculate = args.speculate;
         let dtype = args.dtype;
@@ -1074,6 +1086,7 @@ fn spawn_shards(
                 max_batch_size,
                 max_input_tokens,
                 otlp_endpoint,
+                otlp_service_name,
                 max_log_level,
                 status_sender,
                 shutdown,
@@ -1207,6 +1220,12 @@ fn spawn_webserver(
         router_args.push(otlp_endpoint);
     }

+    // OpenTelemetry
+    let otlp_service_name = args.otlp_service_name;
+    router_args.push("--otlp-service-name".to_string());
+    router_args.push(otlp_service_name);
+
     // CORS origins
     for origin in args.cors_allow_origin.into_iter() {
         router_args.push("--cors-allow-origin".to_string());
@@ -1227,7 +1246,7 @@ fn spawn_webserver(
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };

     // Parse Compute type

View File

@@ -570,7 +570,7 @@ impl ChatCompletion {
         };
         Self {
             id: String::new(),
-            object: "text_completion".into(),
+            object: "chat.completion".into(),
             created,
             model,
             system_fingerprint,
@@ -682,7 +682,7 @@ impl ChatCompletionChunk {
         };
         Self {
             id: String::new(),
-            object: "text_completion".to_string(),
+            object: "chat.completion.chunk".to_string(),
             created,
             model,
             system_fingerprint,

View File

@@ -65,6 +65,8 @@ struct Args {
     json_output: bool,
     #[clap(long, env)]
     otlp_endpoint: Option<String>,
+    #[clap(default_value = "text-generation-inference.router", long, env)]
+    otlp_service_name: String,
     #[clap(long, env)]
     cors_allow_origin: Option<Vec<String>>,
     #[clap(long, env)]
@@ -107,6 +109,7 @@ async fn main() -> Result<(), RouterError> {
         validation_workers,
         json_output,
         otlp_endpoint,
+        otlp_service_name,
         cors_allow_origin,
         ngrok,
         ngrok_authtoken,
@@ -117,7 +120,7 @@ async fn main() -> Result<(), RouterError> {
     } = args;

     // Launch Tokio runtime
-    init_logging(otlp_endpoint, json_output);
+    init_logging(otlp_endpoint, otlp_service_name, json_output);

     // Validate args
     if max_input_tokens >= max_total_tokens {
@@ -156,7 +159,7 @@ async fn main() -> Result<(), RouterError> {
     });

     // Parse Huggingface hub token
-    let authorization_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
+    let authorization_token = std::env::var("HF_TOKEN").or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")).ok();

     // Tokenizer instance
     // This will only be used to validate payloads
@@ -367,10 +370,11 @@ async fn main() -> Result<(), RouterError> {
 /// Init logging using env variables LOG_LEVEL and LOG_FORMAT:
 /// - otlp_endpoint is an optional URL to an Open Telemetry collector
+/// - otlp_service_name service name to appear in APM
 /// - LOG_LEVEL may be TRACE, DEBUG, INFO, WARN or ERROR (default to INFO)
 /// - LOG_FORMAT may be TEXT or JSON (default to TEXT)
 /// - LOG_COLORIZE may be "false" or "true" (default to "true" or ansi supported platforms)
-fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
+fn init_logging(otlp_endpoint: Option<String>, otlp_service_name: String, json_output: bool) {
     let mut layers = Vec::new();

     // STDOUT/STDERR layer
@@ -401,7 +405,7 @@ fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
                     trace::config()
                         .with_resource(Resource::new(vec![KeyValue::new(
                             "service.name",
-                            "text-generation-inference.router",
+                            otlp_service_name,
                         )]))
                         .with_sampler(Sampler::AlwaysOn),
                 )

View File

@@ -42,6 +42,7 @@ def serve(
     logger_level: str = "INFO",
     json_output: bool = False,
     otlp_endpoint: Optional[str] = None,
+    otlp_service_name: str = "text-generation-inference.server",
     max_input_tokens: Optional[int] = None,
 ):
     if sharded:
@@ -76,7 +77,7 @@ def serve(
         # Setup OpenTelemetry distributed tracing
         if otlp_endpoint is not None:
-            setup_tracing(shard=os.getenv("RANK", 0), otlp_endpoint=otlp_endpoint)
+            setup_tracing(otlp_service_name=otlp_service_name, otlp_endpoint=otlp_endpoint)

         # Downgrade enum into str for easier management later on
         quantize = None if quantize is None else quantize.value

View File

@@ -54,10 +54,8 @@ class UDSOpenTelemetryAioServerInterceptor(OpenTelemetryAioServerInterceptor):
     )


-def setup_tracing(shard: int, otlp_endpoint: str):
-    resource = Resource.create(
-        attributes={"service.name": f"text-generation-inference.server-{shard}"}
-    )
+def setup_tracing(otlp_service_name: str, otlp_endpoint: str):
+    resource = Resource.create(attributes={"service.name": otlp_service_name})
     span_exporter = OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True)
     span_processor = BatchSpanProcessor(span_exporter)
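For context, the `service.name` attribute set above is what APM backends group traces by. A minimal, standalone sketch of the same resource wiring using the public OpenTelemetry SDK (not the repo's module):

```python
# Minimal sketch: attach an explicit service.name to a tracer provider, as
# setup_tracing now does with the configurable otlp_service_name.
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider

resource = Resource.create(attributes={"service.name": "text-generation-inference.server"})
provider = TracerProvider(resource=resource)
print(provider.resource.attributes["service.name"])
```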

View File

@@ -1,5 +1,6 @@
 import torch
 from loguru import logger
+import subprocess


 def is_xpu_available():
@@ -19,8 +20,12 @@ def get_cuda_free_memory(device, memory_fraction):
 def get_xpu_free_memory(device, memory_fraction):
-    total_gpu_memory = torch.xpu.get_device_properties(device).total_memory
-    free_memory = int(total_gpu_memory * 0.5)
+    total_memory = torch.xpu.get_device_properties(device).total_memory
+    device_id = device.index
+    query = f"xpu-smi dump -d {device_id} -m 18 -n 1"
+    output = subprocess.check_output(query.split()).decode("utf-8").split("\n")
+    used_memory = float(output[1].split(",")[-1]) * 1024 * 1024
+    free_memory = int(total_memory * 0.95 - used_memory)
     return free_memory
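The new free-memory estimate parses `xpu-smi` CSV output (second line, last column, in MiB). A hedged sketch of just that parsing step on a fabricated sample, since the real tool's column layout is an assumption here:

```python
# Hedged sketch of the parsing above, using a made-up xpu-smi CSV sample line.
sample = (
    "Timestamp, DeviceId, GPU Memory Used (MiB)\n"
    "06:00:00.000, 0, 1024.0\n"
)
lines = sample.split("\n")
used_bytes = float(lines[1].split(",")[-1]) * 1024 * 1024
free_bytes = int(16 * 1024**3 * 0.95 - used_bytes)  # assume a 16 GiB card for illustration
print(used_bytes, free_bytes)
```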