fix(gaudi): remove use of `latest` for the Gaudi Docker image + rework the Gaudi benchmarking section to include best practices

This commit is contained in:
baptiste 2025-03-11 10:00:02 +00:00
parent 2fd0049929
commit a3dfcf571d
6 changed files with 37 additions and 266 deletions

View File

@@ -1,39 +0,0 @@
# TGI-Gaudi example
This example provides a simple way to use `tgi-gaudi` with continuous batching. It uses a small dataset, [DIBT/10k_prompts_ranked](https://huggingface.co/datasets/DIBT/10k_prompts_ranked), and reports basic performance numbers.
## Get started
### Install
```
pip install -r requirements.txt
```
### Setup TGI server
More details on running the TGI server are available [here](https://github.com/huggingface/tgi-gaudi/blob/habana-main/README.md#running-tgi-on-gaudi).
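For a quick start, a minimal single-card launch looks like the sketch below (it mirrors the Docker command from the linked README; `$model` and `$volume` are placeholders for your model ID and a local cache directory, and `HF_TOKEN` is only required for gated models):
```bash
model=meta-llama/Llama-2-7b-chat-hf   # should match the MODEL_ID used for the benchmark
volume=$PWD/data                      # mounted to /data to cache downloaded weights

docker run --runtime=habana --cap-add=sys_nice --ipc=host \
  -p 8080:80 -v $volume:/data -e HF_TOKEN=$HF_TOKEN \
  ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
  --model-id $model
```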
### Run benchmark
To run the benchmark, use the following command:
```
python run_generation.py --model_id MODEL_ID
```
where `MODEL_ID` should be set to the same value as in the TGI server instance.
> For gated models such as [Llama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set the environment variable `HF_TOKEN=<token>` with a valid Hugging Face Hub read token.
All available parameters are described in the table below; an example invocation follows the table.
<div align="left">
| Name | Default value | Description |
| ------------------------- | :---------------------------- | :------------------------------------------------------------ |
| SERVER_ADDRESS | http://localhost:8080 | The address and port at which the TGI server is available. |
| MODEL_ID | meta-llama/Llama-2-7b-chat-hf | Model ID used in the TGI server instance. |
| MAX_INPUT_LENGTH | 1024 | Maximum input length supported by the TGI server. |
| MAX_OUTPUT_LENGTH | 1024 | Maximum output length supported by the TGI server. |
| TOTAL_SAMPLE_COUNT | 2048 | Number of samples to run. |
| MAX_CONCURRENT_REQUESTS | 256 | The number of requests sent simultaneously to the TGI server. |
</div>
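For example, a benchmark run that sets every option explicitly could look like the following sketch (the lower-case flags correspond to the `argparse` options in `run_generation.py`):
```bash
export HF_TOKEN=<your Hugging Face read token>   # only needed for gated models such as Llama

python run_generation.py \
  --server_address http://localhost:8080 \
  --model_id meta-llama/Llama-2-7b-chat-hf \
  --max_input_length 1024 \
  --max_output_length 1024 \
  --total_sample_count 2048 \
  --max_concurrent_requests 256
```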

View File

@@ -1,4 +0,0 @@
huggingface_hub==0.23.5
requests==2.31.0
datasets==2.18.0
transformers>=4.37.0

View File

@@ -1,110 +0,0 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

import argparse
import requests
import time
from typing import List

from datasets import load_dataset
from transformers import AutoTokenizer

from tgi_client import TgiClient


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--server_address",
        type=str,
        default="http://localhost:8083",
        help="Address of the TGI server",
    )
    parser.add_argument(
        "--model_id",
        type=str,
        default="meta-llama/Llama-2-7b-chat-hf",
        help="Model id used in TGI server",
    )
    parser.add_argument(
        "--max_input_length",
        type=int,
        default=1024,
        help="Max input length for TGI model",
    )
    parser.add_argument(
        "--max_output_length",
        type=int,
        default=1024,
        help="Max output length for TGI model",
    )
    parser.add_argument(
        "--total_sample_count",
        type=int,
        default=2048,
        help="Total number of samples to generate",
    )
    parser.add_argument(
        "--max_concurrent_requests",
        type=int,
        default=256,
        help="Max number of concurrent requests",
    )
    parser.add_argument("--seed", type=int, default=42, help="Random seed for datasets")
    return parser.parse_args()


def read_dataset(
    max_input_length: int,
    total_sample_count: int,
    model_id: str,
    seed: int,
) -> List[str]:
    """
    Loads public dataset from HF: https://huggingface.co/datasets/DIBT/10k_prompts_ranked
    and filters out too long samples.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    dataset = load_dataset(
        "DIBT/10k_prompts_ranked", split="train", trust_remote_code=True
    )
    dataset = dataset.filter(
        lambda x: len(tokenizer(x["prompt"])["input_ids"]) < max_input_length
    )
    if len(dataset) > total_sample_count:
        dataset = dataset.select(range(total_sample_count))
    dataset = dataset.shuffle(seed=seed)
    return [sample["prompt"] for sample in dataset]


def is_tgi_available(server_address: str) -> bool:
    """
    Checks if TGI server is available under the specified address.
    """
    try:
        info = requests.get(f"{server_address}/info")
        return info.status_code == 200
    except Exception:
        return False


def main():
    args = get_args()
    dataset = read_dataset(
        args.max_input_length, args.total_sample_count, args.model_id, args.seed
    )

    if not is_tgi_available(args.server_address):
        raise RuntimeError("Cannot connect with TGI server!")

    tgi_client = TgiClient(args.server_address, args.max_concurrent_requests)
    timestamp = time.perf_counter_ns()
    tgi_client.run_generation(dataset, args.max_output_length)
    duration_s = (time.perf_counter_ns() - timestamp) * 1e-9
    tgi_client.print_performance_metrics(duration_s)


if __name__ == "__main__":
    main()

View File

@@ -1,88 +0,0 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

import os
import statistics
import threading
import time

import tqdm
from typing import List
from huggingface_hub import InferenceClient


def except_hook(args):
    print(f"Thread failed with error: {args.exc_value}")
    os._exit(1)


threading.excepthook = except_hook


class TgiClient:
    def __init__(self, server_address: str, max_num_threads: int) -> None:
        self._lock = threading.Lock()
        self._semaphore = threading.Semaphore(max_num_threads)
        self._client = InferenceClient(server_address)

        self._ttft = []
        self._tpot = []
        self._generated_tokens = []

    def run_generation(self, samples: List[str], max_new_tokens: int) -> None:
        """
        Run generation for every sample in dataset.
        Creates a separate thread for every sample.
        """
        threads: List[threading.Thread] = []
        for sample in tqdm.tqdm(samples):
            self._semaphore.acquire()
            threads.append(
                threading.Thread(
                    target=self._process_sample, args=[sample, max_new_tokens]
                )
            )
            threads[-1].start()
        for thread in threads:
            if thread is not None:
                thread.join()

    def _process_sample(self, sample: str, max_new_tokens: int) -> None:
        """
        Generates response stream for a single sample.
        Collects performance metrics.
        """
        timestamp = time.perf_counter_ns()
        response_stream = self._client.text_generation(
            sample, max_new_tokens=max_new_tokens, stream=True, details=True
        )
        out = ""
        for id, response in enumerate(response_stream):
            if id == 0:
                self._ttft.append(time.perf_counter_ns() - timestamp)
            else:
                self._tpot.append(time.perf_counter_ns() - timestamp)
            timestamp = time.perf_counter_ns()
            out += response.token.text
            if response.details:
                self._generated_tokens.append(response.details.generated_tokens)
        self._semaphore.release()

    def print_performance_metrics(self, duration_s: float) -> None:
        def line():
            print(32 * "-")

        line()
        print("----- Performance summary -----")
        line()
        print(f"Throughput: {sum(self._generated_tokens) / duration_s:.1f} tokens/s")
        print(f"Throughput: {len(self._generated_tokens) / duration_s:.1f} queries/s")
        line()
        print("First token latency:")
        print(f"\tMedian: \t{statistics.median(self._ttft)*1e-6:.2f}ms")
        print(f"\tAverage: \t{statistics.fmean(self._ttft)*1e-6:.2f}ms")
        line()
        print("Output token latency:")
        print(f"\tMedian: \t{statistics.median(self._tpot)*1e-6:.2f}ms")
        print(f"\tAverage: \t{statistics.fmean(self._tpot)*1e-6:.2f}ms")
        line()

View File

@@ -23,7 +23,7 @@ docker run -p 8080:80 \
 -e PREFILL_BATCH_BUCKET_SIZE=2 \
 -e BATCH_BUCKET_SIZE=32 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --max-input-tokens 1024 --max-total-tokens 2048 \
 --max-batch-prefill-tokens 2048 --max-batch-size 32 \
@@ -47,7 +47,7 @@ docker run -p 8080:80 \
 -e BATCH_BUCKET_SIZE=256 \
 -e PREFILL_BATCH_BUCKET_SIZE=4 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-tokens 1024 --max-total-tokens 2048 \
@@ -72,7 +72,7 @@ docker run -p 8080:80 \
 -e PREFILL_BATCH_BUCKET_SIZE=2 \
 -e BATCH_BUCKET_SIZE=32 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --max-input-tokens 1024 --max-total-tokens 2048 \
 --max-batch-prefill-tokens 2048 --max-batch-size 32 \
@@ -96,7 +96,7 @@ docker run -p 8080:80 \
 -e BATCH_BUCKET_SIZE=256 \
 -e PREFILL_BATCH_BUCKET_SIZE=4 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-tokens 1024 --max-total-tokens 2048 \
@@ -117,7 +117,7 @@ docker run -p 8080:80 \
 -v $volume:/data \
 -e PREFILL_BATCH_BUCKET_SIZE=1 \
 -e BATCH_BUCKET_SIZE=1 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
 --max-total-tokens 8192 --max-batch-size 4
@@ -147,7 +147,7 @@ docker run -p 8080:80 \
 -e PREFILL_BATCH_BUCKET_SIZE=2 \
 -e BATCH_BUCKET_SIZE=32 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --max-input-tokens 1024 --max-total-tokens 2048 \
 --max-batch-prefill-tokens 2048 --max-batch-size 32 \
@@ -174,7 +174,7 @@ docker run -p 8080:80 \
 -e BATCH_BUCKET_SIZE=256 \
 -e PREFILL_BATCH_BUCKET_SIZE=4 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-tokens 1024 --max-total-tokens 2048 \
@@ -202,7 +202,7 @@ docker run -p 8080:80 \
 -e PREFILL_BATCH_BUCKET_SIZE=2 \
 -e BATCH_BUCKET_SIZE=32 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --max-input-tokens 1024 --max-total-tokens 2048 \
 --max-batch-prefill-tokens 2048 --max-batch-size 32 \
@@ -229,7 +229,7 @@ docker run -p 8080:80 \
 -e BATCH_BUCKET_SIZE=256 \
 -e PREFILL_BATCH_BUCKET_SIZE=4 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-tokens 1024 --max-total-tokens 2048 \
@@ -253,7 +253,7 @@ docker run -p 8080:80 \
 -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
 -e PREFILL_BATCH_BUCKET_SIZE=1 \
 -e BATCH_BUCKET_SIZE=1 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
 --max-total-tokens 8192 --max-batch-size 4
@@ -275,7 +275,7 @@ docker run -p 8080:80 \
 -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
 -e PREFILL_BATCH_BUCKET_SIZE=1 \
 -e BATCH_BUCKET_SIZE=1 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \

View File

@@ -20,7 +20,7 @@ hf_token=YOUR_HF_ACCESS_TOKEN
 docker run --runtime=habana --cap-add=sys_nice --ipc=host \
 -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model
 ```
@@ -74,7 +74,7 @@ hf_token=YOUR_ACCESS_TOKEN
 docker run --runtime=habana --cap-add=sys_nice --ipc=host \
 -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model
 <text-generation-inference-launcher-arguments>
 ```
@@ -137,7 +137,7 @@ docker run -p 8080:80 \
 -e BATCH_BUCKET_SIZE=256 \
 -e PREFILL_BATCH_BUCKET_SIZE=4 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-tokens 1024 --max-total-tokens 2048 \
@@ -163,7 +163,7 @@ docker run -p 8080:80 \
 -v $volume:/data \
 -e PREFILL_BATCH_BUCKET_SIZE=1 \
 -e BATCH_BUCKET_SIZE=1 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
 --max-total-tokens 8192 --max-batch-size 4
@@ -181,18 +181,30 @@ curl -N 127.0.0.1:8080/generate \
 ### How to Benchmark Performance
-#### Option 1: Static Batching Benchmark (Recommended)
-To run static batching benchmark, please refer to [TGI's benchmark tool](https://github.com/huggingface/text-generation-inference/tree/main/benchmark).
+We recommend using the [inference-benchmarker tool](https://github.com/huggingface/inference-benchmarker) to benchmark performance on Gaudi hardware.
+This benchmark tool simulates user requests and measures the performance of the model on realistic scenarios.
 To run it on the same machine, you can do the following:
-* `docker exec -it <docker name> bash` , pick the docker started from step 2 using docker ps
-* `text-generation-benchmark -t <model-id>` , pass the model-id from docker run command
-* after the completion of tests, hit ctrl+c to see the performance data summary.
-> Note: This benchmark runs the model with bs=[1, 2, 4, 8, 16, 32], sequence_length=10 and decode_length=8 by default. if you want to run other configs, please check text-generation-benchmark -h and change the parameters.
-#### Option 2: Continuous Batching Benchmark
-To run continuous batching benchmark, please refer to [README in examples/benchmark folder in the gaudi backend](https://github.com/huggingface/text-generation-inference/tree/main/backends/gaudi/examples/benchmark).
+```bash
+MODEL=meta-llama/Llama-3.1-8B-Instruct
+HF_TOKEN=<your HF READ token>
+# run a benchmark to evaluate the performance of the model for chat use case
+# we mount results to the current directory
+docker run \
+  --rm \
+  -it \
+  --net host \
+  -v $(pwd):/opt/inference-benchmarker/results \
+  -e "HF_TOKEN=$HF_TOKEN" \
+  ghcr.io/huggingface/inference-benchmarker:latest \
+  inference-benchmarker \
+  --tokenizer-name "$MODEL" \
+  --url http://localhost:8080 \
+  --profile chat
+```
+Please refer to the [inference-benchmarker README](https://github.com/huggingface/inference-benchmarker) for more details.
 ### How to Profile Performance
@@ -218,7 +230,7 @@ docker run --runtime=habana --ipc=host --cap-add=sys_nice \
 -e PROF_PATH=/tmp/hpu_profile \
 -e PROF_RANKS=0 \
 -e PROF_RECORD_SHAPES=True \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model
 ```
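The traces written to `PROF_PATH` stay inside the container; one way to pull them out for inspection is sketched below (the container name `tgi-gaudi` is a placeholder, use `docker ps` to find yours):
```bash
# copy the HPU profiling traces out of the running TGI container
docker cp tgi-gaudi:/tmp/hpu_profile ./hpu_profile   # "tgi-gaudi" is a placeholder name
```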