fix(gaudi): remove use of latest for gaudi docker image + redid gaudi benchmarking section to include best practices

This commit is contained in:
baptiste 2025-03-11 10:00:02 +00:00
parent 2fd0049929
commit a3dfcf571d
6 changed files with 37 additions and 266 deletions

View File

@ -1,39 +0,0 @@
# TGI-Gaudi example
This example provides a simple way to use `tgi-gaudi` with continuous batching. It uses a small dataset, [DIBT/10k_prompts_ranked](https://huggingface.co/datasets/DIBT/10k_prompts_ranked), and presents basic performance numbers.
## Get started
### Install
```
pip install -r requirements.txt
```
### Setup TGI server
More details on running the TGI server are available [here](https://github.com/huggingface/tgi-gaudi/blob/habana-main/README.md#running-tgi-on-gaudi).
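For reference, a minimal single-card launch might look like the sketch below, reusing the Gaudi quick-start command from elsewhere in this repository; `model`, `volume`, and `hf_token` are placeholders you set for your own environment:
```bash
# Minimal TGI launch on a single Gaudi card (sketch; adjust model, cache volume and token)
model=meta-llama/Llama-2-7b-chat-hf
volume=$PWD/data            # directory used to cache model weights
hf_token=YOUR_HF_ACCESS_TOKEN

docker run --runtime=habana --cap-add=sys_nice --ipc=host \
  -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
  ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
  --model-id $model
```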
### Run benchmark
To run the benchmark, use the command below:
```
python run_generation.py --model_id MODEL_ID
```
where `MODEL_ID` should be set to the same value as in the TGI server instance.
> For gated models such as [Llama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set the environment variable `HF_TOKEN` to a valid Hugging Face Hub read token.
All configurable parameters are described in the table below; an example invocation follows the table.
<div align="left">
| Name | Default value | Description |
| ------------------------- | :---------------------------- | :------------------------------------------------------------ |
| SERVER_ADDRESS | http://localhost:8080 | The address and port at which the TGI server is available. |
| MODEL_ID | meta-llama/Llama-2-7b-chat-hf | Model ID used in the TGI server instance. |
| MAX_INPUT_LENGTH | 1024 | Maximum input length supported by the TGI server. |
| MAX_OUTPUT_LENGTH | 1024 | Maximum output length supported by the TGI server. |
| TOTAL_SAMPLE_COUNT | 2048 | Number of samples to run. |
| MAX_CONCURRENT_REQUESTS | 256 | The number of requests sent simultaneously to the TGI server. |
</div>
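For example, a run against the server started above, with a reduced sample count and concurrency, might look like the following sketch; each flag maps one-to-one onto the parameters in the table (the script name `run_generation.py` is assumed from this example's source):
```bash
# Benchmark a gated model: export a read token, then point the script at the TGI server (sketch)
export HF_TOKEN=<your Hugging Face Hub read token>
python run_generation.py \
  --model_id meta-llama/Llama-2-7b-chat-hf \
  --server_address http://localhost:8080 \
  --max_input_length 1024 \
  --max_output_length 1024 \
  --total_sample_count 512 \
  --max_concurrent_requests 128
```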

View File

@ -1,4 +0,0 @@
huggingface_hub==0.23.5
requests==2.31.0
datasets==2.18.0
transformers>=4.37.0

View File

@ -1,110 +0,0 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
import argparse
import requests
import time
from typing import List
from datasets import load_dataset
from transformers import AutoTokenizer
from tgi_client import TgiClient
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--server_address",
type=str,
default="http://localhost:8083",
help="Address of the TGI server",
)
parser.add_argument(
"--model_id",
type=str,
default="meta-llama/Llama-2-7b-chat-hf",
help="Model id used in TGI server",
)
parser.add_argument(
"--max_input_length",
type=int,
default=1024,
help="Max input length for TGI model",
)
parser.add_argument(
"--max_output_length",
type=int,
default=1024,
help="Max output length for TGI model",
)
parser.add_argument(
"--total_sample_count",
type=int,
default=2048,
help="Total number of samples to generate",
)
parser.add_argument(
"--max_concurrent_requests",
type=int,
default=256,
help="Max number of concurrent requests",
)
parser.add_argument("--seed", type=int, default=42, help="Random seed for datasets")
return parser.parse_args()
def read_dataset(
max_input_length: int,
total_sample_count: int,
model_id: str,
seed: int,
) -> List[str]:
"""
Loads public dataset from HF: https://huggingface.co/datasets/DIBT/10k_prompts_ranked
and filters out too long samples.
"""
tokenizer = AutoTokenizer.from_pretrained(model_id)
dataset = load_dataset(
"DIBT/10k_prompts_ranked", split="train", trust_remote_code=True
)
dataset = dataset.filter(
lambda x: len(tokenizer(x["prompt"])["input_ids"]) < max_input_length
)
if len(dataset) > total_sample_count:
dataset = dataset.select(range(total_sample_count))
dataset = dataset.shuffle(seed=seed)
return [sample["prompt"] for sample in dataset]
def is_tgi_available(server_address: str) -> bool:
"""
Checks if TGI server is available under the specified address.
"""
try:
info = requests.get(f"{server_address}/info")
return info.status_code == 200
except Exception:
return False
def main():
args = get_args()
dataset = read_dataset(
args.max_input_length, args.total_sample_count, args.model_id, args.seed
)
if not is_tgi_available(args.server_address):
raise RuntimeError("Cannot connect with TGI server!")
tgi_client = TgiClient(args.server_address, args.max_concurrent_requests)
timestamp = time.perf_counter_ns()
tgi_client.run_generation(dataset, args.max_output_length)
duration_s = (time.perf_counter_ns() - timestamp) * 1e-9
tgi_client.print_performance_metrics(duration_s)
if __name__ == "__main__":
main()

View File

@ -1,88 +0,0 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
import os
import statistics
import threading
import time
import tqdm
from typing import List
from huggingface_hub import InferenceClient
def except_hook(args):
print(f"Thread failed with error: {args.exc_value}")
os._exit(1)
threading.excepthook = except_hook
class TgiClient:
def __init__(self, server_address: str, max_num_threads: int) -> None:
self._lock = threading.Lock()
self._semaphore = threading.Semaphore(max_num_threads)
self._client = InferenceClient(server_address)
self._ttft = []
self._tpot = []
self._generated_tokens = []
def run_generation(self, samples: List[str], max_new_tokens: int) -> None:
"""
Run generation for every sample in dataset.
Creates a separate thread for every sample.
"""
threads: List[threading.Thread] = []
for sample in tqdm.tqdm(samples):
self._semaphore.acquire()
threads.append(
threading.Thread(
target=self._process_sample, args=[sample, max_new_tokens]
)
)
threads[-1].start()
for thread in threads:
if thread is not None:
thread.join()
def _process_sample(self, sample: str, max_new_tokens: int) -> None:
"""
Generates response stream for a single sample.
Collects performance metrics.
"""
timestamp = time.perf_counter_ns()
response_stream = self._client.text_generation(
sample, max_new_tokens=max_new_tokens, stream=True, details=True
)
out = ""
for id, response in enumerate(response_stream):
if id == 0:
self._ttft.append(time.perf_counter_ns() - timestamp)
else:
self._tpot.append(time.perf_counter_ns() - timestamp)
timestamp = time.perf_counter_ns()
out += response.token.text
if response.details:
self._generated_tokens.append(response.details.generated_tokens)
self._semaphore.release()
def print_performance_metrics(self, duration_s: float) -> None:
def line():
print(32 * "-")
line()
print("----- Performance summary -----")
line()
print(f"Throughput: {sum(self._generated_tokens) / duration_s:.1f} tokens/s")
print(f"Throughput: {len(self._generated_tokens) / duration_s:.1f} queries/s")
line()
print("First token latency:")
print(f"\tMedian: \t{statistics.median(self._ttft)*1e-6:.2f}ms")
print(f"\tAverage: \t{statistics.fmean(self._ttft)*1e-6:.2f}ms")
line()
print("Output token latency:")
print(f"\tMedian: \t{statistics.median(self._tpot)*1e-6:.2f}ms")
print(f"\tAverage: \t{statistics.fmean(self._tpot)*1e-6:.2f}ms")
line()

View File

@ -23,7 +23,7 @@ docker run -p 8080:80 \
-e PREFILL_BATCH_BUCKET_SIZE=2 \
-e BATCH_BUCKET_SIZE=32 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--max-input-tokens 1024 --max-total-tokens 2048 \
--max-batch-prefill-tokens 2048 --max-batch-size 32 \
@ -47,7 +47,7 @@ docker run -p 8080:80 \
-e BATCH_BUCKET_SIZE=256 \
-e PREFILL_BATCH_BUCKET_SIZE=4 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--sharded true --num-shard 8 \
--max-input-tokens 1024 --max-total-tokens 2048 \
@ -72,7 +72,7 @@ docker run -p 8080:80 \
-e PREFILL_BATCH_BUCKET_SIZE=2 \
-e BATCH_BUCKET_SIZE=32 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--max-input-tokens 1024 --max-total-tokens 2048 \
--max-batch-prefill-tokens 2048 --max-batch-size 32 \
@ -96,7 +96,7 @@ docker run -p 8080:80 \
-e BATCH_BUCKET_SIZE=256 \
-e PREFILL_BATCH_BUCKET_SIZE=4 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--sharded true --num-shard 8 \
--max-input-tokens 1024 --max-total-tokens 2048 \
@ -117,7 +117,7 @@ docker run -p 8080:80 \
-v $volume:/data \
-e PREFILL_BATCH_BUCKET_SIZE=1 \
-e BATCH_BUCKET_SIZE=1 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
--max-total-tokens 8192 --max-batch-size 4
@ -147,7 +147,7 @@ docker run -p 8080:80 \
-e PREFILL_BATCH_BUCKET_SIZE=2 \
-e BATCH_BUCKET_SIZE=32 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--max-input-tokens 1024 --max-total-tokens 2048 \
--max-batch-prefill-tokens 2048 --max-batch-size 32 \
@ -174,7 +174,7 @@ docker run -p 8080:80 \
-e BATCH_BUCKET_SIZE=256 \
-e PREFILL_BATCH_BUCKET_SIZE=4 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--sharded true --num-shard 8 \
--max-input-tokens 1024 --max-total-tokens 2048 \
@ -202,7 +202,7 @@ docker run -p 8080:80 \
-e PREFILL_BATCH_BUCKET_SIZE=2 \
-e BATCH_BUCKET_SIZE=32 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--max-input-tokens 1024 --max-total-tokens 2048 \
--max-batch-prefill-tokens 2048 --max-batch-size 32 \
@ -229,7 +229,7 @@ docker run -p 8080:80 \
-e BATCH_BUCKET_SIZE=256 \
-e PREFILL_BATCH_BUCKET_SIZE=4 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--sharded true --num-shard 8 \
--max-input-tokens 1024 --max-total-tokens 2048 \
@ -253,7 +253,7 @@ docker run -p 8080:80 \
-e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
-e PREFILL_BATCH_BUCKET_SIZE=1 \
-e BATCH_BUCKET_SIZE=1 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
--max-total-tokens 8192 --max-batch-size 4
@ -275,7 +275,7 @@ docker run -p 8080:80 \
-e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
-e PREFILL_BATCH_BUCKET_SIZE=1 \
-e BATCH_BUCKET_SIZE=1 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--sharded true --num-shard 8 \
--max-input-tokens 4096 --max-batch-prefill-tokens 16384 \

View File

@ -20,7 +20,7 @@ hf_token=YOUR_HF_ACCESS_TOKEN
docker run --runtime=habana --cap-add=sys_nice --ipc=host \
-p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model
```
@ -74,7 +74,7 @@ hf_token=YOUR_ACCESS_TOKEN
docker run --runtime=habana --cap-add=sys_nice --ipc=host \
-p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model
<text-generation-inference-launcher-arguments>
```
@ -137,7 +137,7 @@ docker run -p 8080:80 \
-e BATCH_BUCKET_SIZE=256 \
-e PREFILL_BATCH_BUCKET_SIZE=4 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--sharded true --num-shard 8 \
--max-input-tokens 1024 --max-total-tokens 2048 \
@ -163,7 +163,7 @@ docker run -p 8080:80 \
-v $volume:/data \
-e PREFILL_BATCH_BUCKET_SIZE=1 \
-e BATCH_BUCKET_SIZE=1 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
--max-total-tokens 8192 --max-batch-size 4
@ -181,18 +181,30 @@ curl -N 127.0.0.1:8080/generate \
### How to Benchmark Performance
#### Option 1: Static Batching Benchmark (Recommended)
To run a static batching benchmark, please refer to [TGI's benchmark tool](https://github.com/huggingface/text-generation-inference/tree/main/benchmark).
We recommend using the [inference-benchmarker tool](https://github.com/huggingface/inference-benchmarker) to benchmark performance on Gaudi hardware.
This benchmark tool simulates user requests and measures the performance of the model in realistic scenarios.
To run it on the same machine, you can do the following:
* Run `docker exec -it <container name> bash`, picking the container started in step 2 (find it with `docker ps`).
* Run `text-generation-benchmark -t <model-id>`, passing the same model ID used in the `docker run` command.
* After the tests complete, press Ctrl+C to see the performance summary.
```bash
MODEL=meta-llama/Llama-3.1-8B-Instruct
HF_TOKEN=<your HF READ token>
# run a benchmark to evaluate the performance of the model for chat use case
# we mount results to the current directory
docker run \
--rm \
-it \
--net host \
-v $(pwd):/opt/inference-benchmarker/results \
-e "HF_TOKEN=$HF_TOKEN" \
ghcr.io/huggingface/inference-benchmarker:latest \
inference-benchmarker \
--tokenizer-name "$MODEL" \
--url http://localhost:8080 \
--profile chat
```
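If the benchmark cannot connect, first confirm that the TGI server is reachable; a quick check against the `/info` endpoint (the same endpoint the example client polls) is usually enough:
```bash
# Returns model and server metadata when TGI is up and listening on port 8080
curl http://localhost:8080/info
```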
> Note: This benchmark runs the model with bs=[1, 2, 4, 8, 16, 32], sequence_length=10 and decode_length=8 by default. If you want to run other configurations, check `text-generation-benchmark -h` and adjust the parameters.
#### Option 2: Continuous Batching Benchmark
To run a continuous batching benchmark, please refer to the [README in the examples/benchmark folder of the Gaudi backend](https://github.com/huggingface/text-generation-inference/tree/main/backends/gaudi/examples/benchmark).
Please refer to the [inference-benchmarker README](https://github.com/huggingface/inference-benchmarker) for more details.
### How to Profile Performance
@ -218,7 +230,7 @@ docker run --runtime=habana --ipc=host --cap-add=sys_nice \
-e PROF_PATH=/tmp/hpu_profile \
-e PROF_RANKS=0 \
-e PROF_RECORD_SHAPES=True \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model
```
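The collected traces should end up inside the container at the path set by `PROF_PATH`; one way to pull them out for inspection is `docker cp` (the container name `tgi-gaudi` below is purely illustrative):
```bash
# Copy the HPU profile traces out of the running container (container name is an example)
docker cp tgi-gaudi:/tmp/hpu_profile ./hpu_profile
```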