Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-06-19 15:52:08 +00:00)
fix(gaudi): remove use of latest for gaudi docker image + redid gaudi benchmarking section to include best practices
This commit is contained in:
parent
2fd0049929
commit
a3dfcf571d
@@ -1,39 +0,0 @@
# TGI-Gaudi example

This example provides a simple way to use `tgi-gaudi` with continuous batching. It uses a small dataset, [DIBT/10k_prompts_ranked](https://huggingface.co/datasets/DIBT/10k_prompts_ranked), and presents basic performance numbers.

## Get started

### Install

```
pip install -r requirements.txt
```
### Setup TGI server

More details on running the TGI server are available [here](https://github.com/huggingface/tgi-gaudi/blob/habana-main/README.md#running-tgi-on-gaudi).
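For a quick start, a server launch similar to the Docker commands shown later in this document might look as follows (a minimal sketch only; the model, cache volume, and token value are placeholder assumptions to adapt to your setup):

```bash
model=meta-llama/Llama-2-7b-chat-hf   # placeholder; any model supported by TGI on Gaudi
volume=$PWD/data                      # host directory used as the model cache
hf_token=YOUR_HF_ACCESS_TOKEN         # required for gated models

docker run --runtime=habana --cap-add=sys_nice --ipc=host \
  -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
  ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
  --model-id $model
```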
### Run benchmark

To run the benchmark, use the command below:

```
python run_generation.py --model_id MODEL_ID
```

where `MODEL_ID` should be set to the same value as in the TGI server instance.

> For gated models such as [Llama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set the environment variable `HF_TOKEN=<token>` with a valid Hugging Face Hub read token.
All possible parameters are described in the table below:

<div align="left">

| Name                    | Default value                 | Description                                                    |
| ----------------------- | :---------------------------- | :------------------------------------------------------------- |
| SERVER_ADDRESS          | http://localhost:8080         | The address and port at which the TGI server is available.     |
| MODEL_ID                | meta-llama/Llama-2-7b-chat-hf | Model ID used in the TGI server instance.                      |
| MAX_INPUT_LENGTH        | 1024                          | Maximum input length supported by the TGI server.              |
| MAX_OUTPUT_LENGTH       | 1024                          | Maximum output length supported by the TGI server.             |
| TOTAL_SAMPLE_COUNT      | 2048                          | Number of samples to run.                                      |
| MAX_CONCURRENT_REQUESTS | 256                           | The number of requests sent simultaneously to the TGI server.  |

</div>
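For reference, a full invocation overriding the defaults above could look like the following (a sketch; the flag names mirror the argparse options defined in `run_generation.py` below, and the token value is a placeholder):

```bash
HF_TOKEN=<your HF read token> python run_generation.py \
    --server_address http://localhost:8080 \
    --model_id meta-llama/Llama-2-7b-chat-hf \
    --max_input_length 1024 \
    --max_output_length 1024 \
    --total_sample_count 2048 \
    --max_concurrent_requests 256
```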
@@ -1,4 +0,0 @@
huggingface_hub==0.23.5
requests==2.31.0
datasets==2.18.0
transformers>=4.37.0
@@ -1,110 +0,0 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

import argparse
import requests
import time
from typing import List

from datasets import load_dataset
from transformers import AutoTokenizer

from tgi_client import TgiClient


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--server_address",
        type=str,
        default="http://localhost:8083",
        help="Address of the TGI server",
    )
    parser.add_argument(
        "--model_id",
        type=str,
        default="meta-llama/Llama-2-7b-chat-hf",
        help="Model id used in TGI server",
    )
    parser.add_argument(
        "--max_input_length",
        type=int,
        default=1024,
        help="Max input length for TGI model",
    )
    parser.add_argument(
        "--max_output_length",
        type=int,
        default=1024,
        help="Max output length for TGI model",
    )
    parser.add_argument(
        "--total_sample_count",
        type=int,
        default=2048,
        help="Total number of samples to generate",
    )
    parser.add_argument(
        "--max_concurrent_requests",
        type=int,
        default=256,
        help="Max number of concurrent requests",
    )
    parser.add_argument("--seed", type=int, default=42, help="Random seed for datasets")

    return parser.parse_args()


def read_dataset(
    max_input_length: int,
    total_sample_count: int,
    model_id: str,
    seed: int,
) -> List[str]:
    """
    Loads public dataset from HF: https://huggingface.co/datasets/DIBT/10k_prompts_ranked
    and filters out too long samples.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    dataset = load_dataset(
        "DIBT/10k_prompts_ranked", split="train", trust_remote_code=True
    )
    dataset = dataset.filter(
        lambda x: len(tokenizer(x["prompt"])["input_ids"]) < max_input_length
    )
    if len(dataset) > total_sample_count:
        dataset = dataset.select(range(total_sample_count))

    dataset = dataset.shuffle(seed=seed)
    return [sample["prompt"] for sample in dataset]


def is_tgi_available(server_address: str) -> bool:
    """
    Checks if TGI server is available under the specified address.
    """
    try:
        info = requests.get(f"{server_address}/info")
        return info.status_code == 200
    except Exception:
        return False


def main():
    args = get_args()
    dataset = read_dataset(
        args.max_input_length, args.total_sample_count, args.model_id, args.seed
    )

    if not is_tgi_available(args.server_address):
        raise RuntimeError("Cannot connect with TGI server!")

    tgi_client = TgiClient(args.server_address, args.max_concurrent_requests)
    timestamp = time.perf_counter_ns()
    tgi_client.run_generation(dataset, args.max_output_length)
    duration_s = (time.perf_counter_ns() - timestamp) * 1e-9
    tgi_client.print_performance_metrics(duration_s)


if __name__ == "__main__":
    main()
@@ -1,88 +0,0 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

import os
import statistics
import threading
import time
import tqdm
from typing import List

from huggingface_hub import InferenceClient


def except_hook(args):
    print(f"Thread failed with error: {args.exc_value}")
    os._exit(1)


threading.excepthook = except_hook


class TgiClient:
    def __init__(self, server_address: str, max_num_threads: int) -> None:
        self._lock = threading.Lock()
        self._semaphore = threading.Semaphore(max_num_threads)
        self._client = InferenceClient(server_address)

        self._ttft = []
        self._tpot = []
        self._generated_tokens = []

    def run_generation(self, samples: List[str], max_new_tokens: int) -> None:
        """
        Run generation for every sample in dataset.
        Creates a separate thread for every sample.
        """
        threads: List[threading.Thread] = []
        for sample in tqdm.tqdm(samples):
            self._semaphore.acquire()
            threads.append(
                threading.Thread(
                    target=self._process_sample, args=[sample, max_new_tokens]
                )
            )
            threads[-1].start()
        for thread in threads:
            if thread is not None:
                thread.join()

    def _process_sample(self, sample: str, max_new_tokens: int) -> None:
        """
        Generates response stream for a single sample.
        Collects performance metrics.
        """
        timestamp = time.perf_counter_ns()
        response_stream = self._client.text_generation(
            sample, max_new_tokens=max_new_tokens, stream=True, details=True
        )
        out = ""
        for id, response in enumerate(response_stream):
            if id == 0:
                # time to first token (TTFT)
                self._ttft.append(time.perf_counter_ns() - timestamp)
            else:
                # time per output token (TPOT)
                self._tpot.append(time.perf_counter_ns() - timestamp)
            timestamp = time.perf_counter_ns()
            out += response.token.text
            if response.details:
                self._generated_tokens.append(response.details.generated_tokens)

        self._semaphore.release()

    def print_performance_metrics(self, duration_s: float) -> None:
        def line():
            print(32 * "-")

        line()
        print("----- Performance summary -----")
        line()
        print(f"Throughput: {sum(self._generated_tokens) / duration_s:.1f} tokens/s")
        print(f"Throughput: {len(self._generated_tokens) / duration_s:.1f} queries/s")
        line()
        print("First token latency:")
        print(f"\tMedian: \t{statistics.median(self._ttft)*1e-6:.2f}ms")
        print(f"\tAverage: \t{statistics.fmean(self._ttft)*1e-6:.2f}ms")
        line()
        print("Output token latency:")
        print(f"\tMedian: \t{statistics.median(self._tpot)*1e-6:.2f}ms")
        print(f"\tAverage: \t{statistics.fmean(self._tpot)*1e-6:.2f}ms")
        line()
@@ -23,7 +23,7 @@ docker run -p 8080:80 \
-e PREFILL_BATCH_BUCKET_SIZE=2 \
-e BATCH_BUCKET_SIZE=32 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--max-input-tokens 1024 --max-total-tokens 2048 \
--max-batch-prefill-tokens 2048 --max-batch-size 32 \
@@ -47,7 +47,7 @@ docker run -p 8080:80 \
-e BATCH_BUCKET_SIZE=256 \
-e PREFILL_BATCH_BUCKET_SIZE=4 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--sharded true --num-shard 8 \
--max-input-tokens 1024 --max-total-tokens 2048 \
@@ -72,7 +72,7 @@ docker run -p 8080:80 \
-e PREFILL_BATCH_BUCKET_SIZE=2 \
-e BATCH_BUCKET_SIZE=32 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--max-input-tokens 1024 --max-total-tokens 2048 \
--max-batch-prefill-tokens 2048 --max-batch-size 32 \
@@ -96,7 +96,7 @@ docker run -p 8080:80 \
-e BATCH_BUCKET_SIZE=256 \
-e PREFILL_BATCH_BUCKET_SIZE=4 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--sharded true --num-shard 8 \
--max-input-tokens 1024 --max-total-tokens 2048 \
@@ -117,7 +117,7 @@ docker run -p 8080:80 \
-v $volume:/data \
-e PREFILL_BATCH_BUCKET_SIZE=1 \
-e BATCH_BUCKET_SIZE=1 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
--max-total-tokens 8192 --max-batch-size 4
@@ -147,7 +147,7 @@ docker run -p 8080:80 \
-e PREFILL_BATCH_BUCKET_SIZE=2 \
-e BATCH_BUCKET_SIZE=32 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--max-input-tokens 1024 --max-total-tokens 2048 \
--max-batch-prefill-tokens 2048 --max-batch-size 32 \
@@ -174,7 +174,7 @@ docker run -p 8080:80 \
-e BATCH_BUCKET_SIZE=256 \
-e PREFILL_BATCH_BUCKET_SIZE=4 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--sharded true --num-shard 8 \
--max-input-tokens 1024 --max-total-tokens 2048 \
@@ -202,7 +202,7 @@ docker run -p 8080:80 \
-e PREFILL_BATCH_BUCKET_SIZE=2 \
-e BATCH_BUCKET_SIZE=32 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--max-input-tokens 1024 --max-total-tokens 2048 \
--max-batch-prefill-tokens 2048 --max-batch-size 32 \
@@ -229,7 +229,7 @@ docker run -p 8080:80 \
-e BATCH_BUCKET_SIZE=256 \
-e PREFILL_BATCH_BUCKET_SIZE=4 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--sharded true --num-shard 8 \
--max-input-tokens 1024 --max-total-tokens 2048 \
@@ -253,7 +253,7 @@ docker run -p 8080:80 \
-e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
-e PREFILL_BATCH_BUCKET_SIZE=1 \
-e BATCH_BUCKET_SIZE=1 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
--max-total-tokens 8192 --max-batch-size 4
@@ -275,7 +275,7 @@ docker run -p 8080:80 \
-e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
-e PREFILL_BATCH_BUCKET_SIZE=1 \
-e BATCH_BUCKET_SIZE=1 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--sharded true --num-shard 8 \
--max-input-tokens 4096 --max-batch-prefill-tokens 16384 \

@@ -20,7 +20,7 @@ hf_token=YOUR_HF_ACCESS_TOKEN

docker run --runtime=habana --cap-add=sys_nice --ipc=host \
-p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model
```

@@ -74,7 +74,7 @@ hf_token=YOUR_ACCESS_TOKEN

docker run --runtime=habana --cap-add=sys_nice --ipc=host \
-p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model
<text-generation-inference-launcher-arguments>
```
@@ -137,7 +137,7 @@ docker run -p 8080:80 \
-e BATCH_BUCKET_SIZE=256 \
-e PREFILL_BATCH_BUCKET_SIZE=4 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--sharded true --num-shard 8 \
--max-input-tokens 1024 --max-total-tokens 2048 \
@@ -163,7 +163,7 @@ docker run -p 8080:80 \
-v $volume:/data \
-e PREFILL_BATCH_BUCKET_SIZE=1 \
-e BATCH_BUCKET_SIZE=1 \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model \
--max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
--max-total-tokens 8192 --max-batch-size 4
@@ -181,18 +181,30 @@ curl -N 127.0.0.1:8080/generate \

### How to Benchmark Performance

#### Option 1: Static Batching Benchmark (Recommended)
To run static batching benchmark, please refer to [TGI's benchmark tool](https://github.com/huggingface/text-generation-inference/tree/main/benchmark).
We recommend using the [inference-benchmarker tool](https://github.com/huggingface/inference-benchmarker) to benchmark performance on Gaudi hardware.

This benchmark tool simulates user requests and measures the performance of the model in realistic scenarios.

To run it on the same machine, you can do the following:
* `docker exec -it <docker name> bash` — pick the container started in step 2, using `docker ps`
* `text-generation-benchmark -t <model-id>` — pass the model-id used in the docker run command
* After the tests complete, press Ctrl+C to see the performance data summary.

```bash
MODEL=meta-llama/Llama-3.1-8B-Instruct
HF_TOKEN=<your HF READ token>

# run a benchmark to evaluate the performance of the model for chat use case
# we mount results to the current directory
docker run \
        --rm \
        -it \
        --net host \
        -v $(pwd):/opt/inference-benchmarker/results \
        -e "HF_TOKEN=$HF_TOKEN" \
        ghcr.io/huggingface/inference-benchmarker:latest \
        inference-benchmarker \
        --tokenizer-name "$MODEL" \
        --url http://localhost:8080 \
        --profile chat
```

> Note: This benchmark runs the model with bs=[1, 2, 4, 8, 16, 32], sequence_length=10 and decode_length=8 by default. If you want to run other configurations, please check `text-generation-benchmark -h` and change the parameters.

#### Option 2: Continuous Batching Benchmark
To run continuous batching benchmark, please refer to the [README in the examples/benchmark folder of the Gaudi backend](https://github.com/huggingface/text-generation-inference/tree/main/backends/gaudi/examples/benchmark).
Please refer to the [inference-benchmarker README](https://github.com/huggingface/inference-benchmarker) for more details.

### How to Profile Performance

@@ -218,7 +230,7 @@ docker run --runtime=habana --ipc=host --cap-add=sys_nice \
-e PROF_PATH=/tmp/hpu_profile \
-e PROF_RANKS=0 \
-e PROF_RECORD_SHAPES=True \
ghcr.io/huggingface/text-generation-inference:latest-gaudi \
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
--model-id $model
```