From a3dfcf571d95604b2dc8875ae96212dfd3e8bc1a Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 11 Mar 2025 10:00:02 +0000 Subject: [PATCH] fix(gaudi): remove use of latest for gaudi docker image + redid gaudi benchmarking section to include best practices --- backends/gaudi/examples/benchmark/README.md | 39 ------- .../gaudi/examples/benchmark/requirements.txt | 4 - .../examples/benchmark/run_generation.py | 110 ------------------ .../gaudi/examples/benchmark/tgi_client.py | 88 -------------- .../docker_commands/docker_commands.md | 22 ++-- docs/source/backends/gaudi.mdx | 40 ++++--- 6 files changed, 37 insertions(+), 266 deletions(-) delete mode 100644 backends/gaudi/examples/benchmark/README.md delete mode 100644 backends/gaudi/examples/benchmark/requirements.txt delete mode 100644 backends/gaudi/examples/benchmark/run_generation.py delete mode 100644 backends/gaudi/examples/benchmark/tgi_client.py diff --git a/backends/gaudi/examples/benchmark/README.md b/backends/gaudi/examples/benchmark/README.md deleted file mode 100644 index 226595c6..00000000 --- a/backends/gaudi/examples/benchmark/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# TGI-Gaudi example - -This example provide a simple way of usage of `tgi-gaudi` with continuous batching. It uses a small dataset [DIBT/10k_prompts_ranked](https://huggingface.co/datasets/DIBT/10k_prompts_ranked) and present basic performance numbers. - -## Get started - -### Install - -``` -pip install -r requirements.txt -``` - -### Setup TGI server - -More details on runing the TGI server available [here](https://github.com/huggingface/tgi-gaudi/blob/habana-main/README.md#running-tgi-on-gaudi). - -### Run benchmark - -To run benchmark use below command: - -``` -python run_generation --model_id MODEL_ID -``` -where `MODEL_ID` should be set to the same value as in the TGI server instance. -> For gated models such as [LLama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set environment variable `HF_TOKEN=` with a valid Hugging Face Hub read token. - -All possible parameters are described in the below table: -
- -| Name | Default value | Description | -| ------------------------- | :---------------------------- | :------------------------------------------------------------ | -| SERVER_ADDRESS | http://localhost:8080 | The address and port at which the TGI server is available. | -| MODEL_ID | meta-llama/Llama-2-7b-chat-hf | Model ID used in the TGI server instance. | -| MAX_INPUT_LENGTH | 1024 | Maximum input length supported by the TGI server. | -| MAX_OUTPUT_LENGTH | 1024 | Maximum output length supported by the TGI server. | -| TOTAL_SAMPLE_COUNT | 2048 | Number of samples to run. | -| MAX_CONCURRENT_REQUESTS | 256 | The number of requests sent simultaneously to the TGI server. | - -
diff --git a/backends/gaudi/examples/benchmark/requirements.txt b/backends/gaudi/examples/benchmark/requirements.txt deleted file mode 100644 index c772c19e..00000000 --- a/backends/gaudi/examples/benchmark/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -huggingface_hub==0.23.5 -requests==2.31.0 -datasets==2.18.0 -transformers>=4.37.0 diff --git a/backends/gaudi/examples/benchmark/run_generation.py b/backends/gaudi/examples/benchmark/run_generation.py deleted file mode 100644 index 460b40d1..00000000 --- a/backends/gaudi/examples/benchmark/run_generation.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. - -import argparse -import requests -import time -from typing import List - -from datasets import load_dataset -from transformers import AutoTokenizer - -from tgi_client import TgiClient - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--server_address", - type=str, - default="http://localhost:8083", - help="Address of the TGI server", - ) - parser.add_argument( - "--model_id", - type=str, - default="meta-llama/Llama-2-7b-chat-hf", - help="Model id used in TGI server", - ) - parser.add_argument( - "--max_input_length", - type=int, - default=1024, - help="Max input length for TGI model", - ) - parser.add_argument( - "--max_output_length", - type=int, - default=1024, - help="Max output length for TGI model", - ) - parser.add_argument( - "--total_sample_count", - type=int, - default=2048, - help="Total number of samples to generate", - ) - parser.add_argument( - "--max_concurrent_requests", - type=int, - default=256, - help="Max number of concurrent requests", - ) - parser.add_argument("--seed", type=int, default=42, help="Random seed for datasets") - - return parser.parse_args() - - -def read_dataset( - max_input_length: int, - total_sample_count: int, - model_id: str, - seed: int, -) -> List[str]: - """ - Loads public dataset from HF: https://huggingface.co/datasets/DIBT/10k_prompts_ranked - and filters out too long samples. - """ - tokenizer = AutoTokenizer.from_pretrained(model_id) - - dataset = load_dataset( - "DIBT/10k_prompts_ranked", split="train", trust_remote_code=True - ) - dataset = dataset.filter( - lambda x: len(tokenizer(x["prompt"])["input_ids"]) < max_input_length - ) - if len(dataset) > total_sample_count: - dataset = dataset.select(range(total_sample_count)) - - dataset = dataset.shuffle(seed=seed) - return [sample["prompt"] for sample in dataset] - - -def is_tgi_available(server_address: str) -> bool: - """ - Checks if TGI server is available under the specified address. 
- """ - try: - info = requests.get(f"{server_address}/info") - return info.status_code == 200 - except Exception: - return False - - -def main(): - args = get_args() - dataset = read_dataset( - args.max_input_length, args.total_sample_count, args.model_id, args.seed - ) - - if not is_tgi_available(args.server_address): - raise RuntimeError("Cannot connect with TGI server!") - - tgi_client = TgiClient(args.server_address, args.max_concurrent_requests) - timestamp = time.perf_counter_ns() - tgi_client.run_generation(dataset, args.max_output_length) - duration_s = (time.perf_counter_ns() - timestamp) * 1e-9 - tgi_client.print_performance_metrics(duration_s) - - -if __name__ == "__main__": - main() diff --git a/backends/gaudi/examples/benchmark/tgi_client.py b/backends/gaudi/examples/benchmark/tgi_client.py deleted file mode 100644 index 66d63ab8..00000000 --- a/backends/gaudi/examples/benchmark/tgi_client.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. - -import os -import statistics -import threading -import time -import tqdm -from typing import List - -from huggingface_hub import InferenceClient - - -def except_hook(args): - print(f"Thread failed with error: {args.exc_value}") - os._exit(1) - - -threading.excepthook = except_hook - - -class TgiClient: - def __init__(self, server_address: str, max_num_threads: int) -> None: - self._lock = threading.Lock() - self._semaphore = threading.Semaphore(max_num_threads) - self._client = InferenceClient(server_address) - - self._ttft = [] - self._tpot = [] - self._generated_tokens = [] - - def run_generation(self, samples: List[str], max_new_tokens: int) -> None: - """ - Run generation for every sample in dataset. - Creates a separate thread for every sample. - """ - threads: List[threading.Thread] = [] - for sample in tqdm.tqdm(samples): - self._semaphore.acquire() - threads.append( - threading.Thread( - target=self._process_sample, args=[sample, max_new_tokens] - ) - ) - threads[-1].start() - for thread in threads: - if thread is not None: - thread.join() - - def _process_sample(self, sample: str, max_new_tokens: int) -> None: - """ - Generates response stream for a single sample. - Collects performance metrics. 
- """ - timestamp = time.perf_counter_ns() - response_stream = self._client.text_generation( - sample, max_new_tokens=max_new_tokens, stream=True, details=True - ) - out = "" - for id, response in enumerate(response_stream): - if id == 0: - self._ttft.append(time.perf_counter_ns() - timestamp) - else: - self._tpot.append(time.perf_counter_ns() - timestamp) - timestamp = time.perf_counter_ns() - out += response.token.text - if response.details: - self._generated_tokens.append(response.details.generated_tokens) - - self._semaphore.release() - - def print_performance_metrics(self, duration_s: float) -> None: - def line(): - print(32 * "-") - - line() - print("----- Performance summary -----") - line() - print(f"Throughput: {sum(self._generated_tokens) / duration_s:.1f} tokens/s") - print(f"Throughput: {len(self._generated_tokens) / duration_s:.1f} queries/s") - line() - print("First token latency:") - print(f"\tMedian: \t{statistics.median(self._ttft)*1e-6:.2f}ms") - print(f"\tAverage: \t{statistics.fmean(self._ttft)*1e-6:.2f}ms") - line() - print("Output token latency:") - print(f"\tMedian: \t{statistics.median(self._tpot)*1e-6:.2f}ms") - print(f"\tAverage: \t{statistics.fmean(self._tpot)*1e-6:.2f}ms") - line() diff --git a/backends/gaudi/examples/docker_commands/docker_commands.md b/backends/gaudi/examples/docker_commands/docker_commands.md index e540e272..59701289 100644 --- a/backends/gaudi/examples/docker_commands/docker_commands.md +++ b/backends/gaudi/examples/docker_commands/docker_commands.md @@ -23,7 +23,7 @@ docker run -p 8080:80 \ -e PREFILL_BATCH_BUCKET_SIZE=2 \ -e BATCH_BUCKET_SIZE=32 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --max-input-tokens 1024 --max-total-tokens 2048 \ --max-batch-prefill-tokens 2048 --max-batch-size 32 \ @@ -47,7 +47,7 @@ docker run -p 8080:80 \ -e BATCH_BUCKET_SIZE=256 \ -e PREFILL_BATCH_BUCKET_SIZE=4 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -72,7 +72,7 @@ docker run -p 8080:80 \ -e PREFILL_BATCH_BUCKET_SIZE=2 \ -e BATCH_BUCKET_SIZE=32 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --max-input-tokens 1024 --max-total-tokens 2048 \ --max-batch-prefill-tokens 2048 --max-batch-size 32 \ @@ -96,7 +96,7 @@ docker run -p 8080:80 \ -e BATCH_BUCKET_SIZE=256 \ -e PREFILL_BATCH_BUCKET_SIZE=4 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -117,7 +117,7 @@ docker run -p 8080:80 \ -v $volume:/data \ -e PREFILL_BATCH_BUCKET_SIZE=1 \ -e BATCH_BUCKET_SIZE=1 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ --max-total-tokens 8192 --max-batch-size 4 @@ -147,7 +147,7 @@ docker run -p 8080:80 \ -e PREFILL_BATCH_BUCKET_SIZE=2 \ -e BATCH_BUCKET_SIZE=32 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \ - 
ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --max-input-tokens 1024 --max-total-tokens 2048 \ --max-batch-prefill-tokens 2048 --max-batch-size 32 \ @@ -174,7 +174,7 @@ docker run -p 8080:80 \ -e BATCH_BUCKET_SIZE=256 \ -e PREFILL_BATCH_BUCKET_SIZE=4 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -202,7 +202,7 @@ docker run -p 8080:80 \ -e PREFILL_BATCH_BUCKET_SIZE=2 \ -e BATCH_BUCKET_SIZE=32 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --max-input-tokens 1024 --max-total-tokens 2048 \ --max-batch-prefill-tokens 2048 --max-batch-size 32 \ @@ -229,7 +229,7 @@ docker run -p 8080:80 \ -e BATCH_BUCKET_SIZE=256 \ -e PREFILL_BATCH_BUCKET_SIZE=4 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -253,7 +253,7 @@ docker run -p 8080:80 \ -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ -e PREFILL_BATCH_BUCKET_SIZE=1 \ -e BATCH_BUCKET_SIZE=1 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ --max-total-tokens 8192 --max-batch-size 4 @@ -275,7 +275,7 @@ docker run -p 8080:80 \ -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ -e PREFILL_BATCH_BUCKET_SIZE=1 \ -e BATCH_BUCKET_SIZE=1 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ diff --git a/docs/source/backends/gaudi.mdx b/docs/source/backends/gaudi.mdx index 3a6f631c..c7f73618 100644 --- a/docs/source/backends/gaudi.mdx +++ b/docs/source/backends/gaudi.mdx @@ -20,7 +20,7 @@ hf_token=YOUR_HF_ACCESS_TOKEN docker run --runtime=habana --cap-add=sys_nice --ipc=host \ -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model ``` @@ -74,7 +74,7 @@ hf_token=YOUR_ACCESS_TOKEN docker run --runtime=habana --cap-add=sys_nice --ipc=host \ -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model ``` @@ -137,7 +137,7 @@ docker run -p 8080:80 \ -e BATCH_BUCKET_SIZE=256 \ -e PREFILL_BATCH_BUCKET_SIZE=4 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -163,7 +163,7 @@ docker run -p 8080:80 \ -v $volume:/data \ -e PREFILL_BATCH_BUCKET_SIZE=1 \ -e BATCH_BUCKET_SIZE=1 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + 
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ --max-total-tokens 8192 --max-batch-size 4 @@ -181,18 +181,30 @@ curl -N 127.0.0.1:8080/generate \ ### How to Benchmark Performance -#### Option 1: Static Batching Benchmark (Recommended) -To run static batching benchmark, please refer to [TGI's benchmark tool](https://github.com/huggingface/text-generation-inference/tree/main/benchmark). +We recommend using the [inference-benchmarker tool](https://github.com/huggingface/inference-benchmarker) to benchmark performance on Gaudi hardware. + +This benchmark tool simulates user requests and measures the performance of the model on realistic scenarios. To run it on the same machine, you can do the following: -* `docker exec -it bash` , pick the docker started from step 2 using docker ps -* `text-generation-benchmark -t ` , pass the model-id from docker run command -* after the completion of tests, hit ctrl+c to see the performance data summary. +```bash +MODEL=meta-llama/Llama-3.1-8B-Instruct +HF_TOKEN= +# run a benchmark to evaluate the performance of the model for chat use case +# we mount results to the current directory +docker run \ + --rm \ + -it \ + --net host \ + -v $(pwd):/opt/inference-benchmarker/results \ + -e "HF_TOKEN=$HF_TOKEN" \ + ghcr.io/huggingface/inference-benchmarker:latest \ + inference-benchmarker \ + --tokenizer-name "$MODEL" \ + --url http://localhost:8080 \ + --profile chat +``` -> Note: This benchmark runs the model with bs=[1, 2, 4, 8, 16, 32], sequence_length=10 and decode_length=8 by default. if you want to run other configs, please check text-generation-benchmark -h and change the parameters. - -#### Option 2: Continuous Batching Benchmark -To run continuous batching benchmark, please refer to [README in examples/benchmark folder in the gaudi backend](https://github.com/huggingface/text-generation-inference/tree/main/backends/gaudi/examples/benchmark). +Please refer to the [inference-benchmarker README](https://github.com/huggingface/inference-benchmarker) for more details. ### How to Profile Performance @@ -218,7 +230,7 @@ docker run --runtime=habana --ipc=host --cap-add=sys_nice \ -e PROF_PATH=/tmp/hpu_profile \ -e PROF_RANKS=0 \ -e PROF_RECORD_SHAPES=True \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model ```
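Before launching a full inference-benchmarker or profiling run, it can be worth confirming that the endpoint is up and streaming tokens. The snippet below is a minimal sketch adapted from the removed `tgi_client.py` and `run_generation.py` helpers; it assumes the server started with the commands above is reachable at `http://localhost:8080`, and the prompt and `max_new_tokens` value are illustrative only.

```python
# Minimal smoke test for a running TGI Gaudi endpoint (a sketch, not part of the
# shipped examples). Assumes the server is reachable at http://localhost:8080.
import time

import requests
from huggingface_hub import InferenceClient

SERVER_ADDRESS = "http://localhost:8080"  # adjust to match your docker run command

# Health check against the /info route, as the removed run_generation.py did.
info = requests.get(f"{SERVER_ADDRESS}/info", timeout=10)
info.raise_for_status()
print("Serving model:", info.json().get("model_id"))

# Stream one generation and record a rough time-to-first-token, mirroring the
# TTFT measurement in the removed tgi_client.py.
client = InferenceClient(SERVER_ADDRESS)
start = time.perf_counter()
ttft_s = None
text = ""
for i, chunk in enumerate(
    client.text_generation(
        "What is Deep Learning?", max_new_tokens=32, stream=True, details=True
    )
):
    if i == 0:
        ttft_s = time.perf_counter() - start
    text += chunk.token.text

if ttft_s is not None:
    print(f"Time to first token: {ttft_s * 1e3:.1f} ms")
print("Output:", text)
```

If this completes and the reported model matches the `--model-id` passed to the server, the endpoint should be ready for the benchmarking and profiling runs described above.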