fix(gaudi): remove use of `latest` for the Gaudi Docker image + rework the Gaudi benchmarking section to include best practices

This commit is contained in:
baptiste 2025-03-11 10:00:02 +00:00
parent 2fd0049929
commit a3dfcf571d
6 changed files with 37 additions and 266 deletions

View File

@@ -1,39 +0,0 @@
# TGI-Gaudi example
This example provides a simple way to use `tgi-gaudi` with continuous batching. It uses a small dataset, [DIBT/10k_prompts_ranked](https://huggingface.co/datasets/DIBT/10k_prompts_ranked), and reports basic performance numbers.
## Get started
### Install
```
pip install -r requirements.txt
```
### Setup TGI server
More details on running the TGI server are available [here](https://github.com/huggingface/tgi-gaudi/blob/habana-main/README.md#running-tgi-on-gaudi).
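For a quick start, a minimal single-card launch looks like the sketch below (it mirrors the Docker command from the linked README; `$model` and `$volume` are placeholders for your model ID and a local cache directory, and `HF_TOKEN` is only required for gated models):
```bash
model=meta-llama/Llama-2-7b-chat-hf   # should match the MODEL_ID used for the benchmark
volume=$PWD/data                      # mounted to /data to cache downloaded weights

docker run --runtime=habana --cap-add=sys_nice --ipc=host \
  -p 8080:80 -v $volume:/data -e HF_TOKEN=$HF_TOKEN \
  ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
  --model-id $model
```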
### Run benchmark
To run the benchmark, use the following command:
```
python run_generation.py --model_id MODEL_ID
```
where `MODEL_ID` should be set to the same value as in the TGI server instance.
> For gated models such as [Llama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set the environment variable `HF_TOKEN=<token>` with a valid Hugging Face Hub read token.
All available parameters are described in the table below; an example invocation follows the table.
<div align="left">
| Name | Default value | Description |
| ------------------------- | :---------------------------- | :------------------------------------------------------------ |
| SERVER_ADDRESS | http://localhost:8080 | The address and port at which the TGI server is available. |
| MODEL_ID | meta-llama/Llama-2-7b-chat-hf | Model ID used in the TGI server instance. |
| MAX_INPUT_LENGTH | 1024 | Maximum input length supported by the TGI server. |
| MAX_OUTPUT_LENGTH | 1024 | Maximum output length supported by the TGI server. |
| TOTAL_SAMPLE_COUNT | 2048 | Number of samples to run. |
| MAX_CONCURRENT_REQUESTS | 256 | The number of requests sent simultaneously to the TGI server. |
</div>
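For example, a benchmark run that sets every option explicitly could look like the following sketch (the lower-case flags correspond to the `argparse` options in `run_generation.py`):
```bash
export HF_TOKEN=<your Hugging Face read token>   # only needed for gated models such as Llama

python run_generation.py \
  --server_address http://localhost:8080 \
  --model_id meta-llama/Llama-2-7b-chat-hf \
  --max_input_length 1024 \
  --max_output_length 1024 \
  --total_sample_count 2048 \
  --max_concurrent_requests 256
```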

View File

@@ -1,4 +0,0 @@
huggingface_hub==0.23.5
requests==2.31.0
datasets==2.18.0
transformers>=4.37.0

View File

@@ -1,110 +0,0 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

import argparse
import requests
import time
from typing import List

from datasets import load_dataset
from transformers import AutoTokenizer

from tgi_client import TgiClient


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--server_address",
        type=str,
        default="http://localhost:8083",
        help="Address of the TGI server",
    )
    parser.add_argument(
        "--model_id",
        type=str,
        default="meta-llama/Llama-2-7b-chat-hf",
        help="Model id used in TGI server",
    )
    parser.add_argument(
        "--max_input_length",
        type=int,
        default=1024,
        help="Max input length for TGI model",
    )
    parser.add_argument(
        "--max_output_length",
        type=int,
        default=1024,
        help="Max output length for TGI model",
    )
    parser.add_argument(
        "--total_sample_count",
        type=int,
        default=2048,
        help="Total number of samples to generate",
    )
    parser.add_argument(
        "--max_concurrent_requests",
        type=int,
        default=256,
        help="Max number of concurrent requests",
    )
    parser.add_argument("--seed", type=int, default=42, help="Random seed for datasets")
    return parser.parse_args()


def read_dataset(
    max_input_length: int,
    total_sample_count: int,
    model_id: str,
    seed: int,
) -> List[str]:
    """
    Loads public dataset from HF: https://huggingface.co/datasets/DIBT/10k_prompts_ranked
    and filters out too long samples.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    dataset = load_dataset(
        "DIBT/10k_prompts_ranked", split="train", trust_remote_code=True
    )
    dataset = dataset.filter(
        lambda x: len(tokenizer(x["prompt"])["input_ids"]) < max_input_length
    )
    if len(dataset) > total_sample_count:
        dataset = dataset.select(range(total_sample_count))
    dataset = dataset.shuffle(seed=seed)
    return [sample["prompt"] for sample in dataset]


def is_tgi_available(server_address: str) -> bool:
    """
    Checks if TGI server is available under the specified address.
    """
    try:
        info = requests.get(f"{server_address}/info")
        return info.status_code == 200
    except Exception:
        return False


def main():
    args = get_args()
    dataset = read_dataset(
        args.max_input_length, args.total_sample_count, args.model_id, args.seed
    )

    if not is_tgi_available(args.server_address):
        raise RuntimeError("Cannot connect with TGI server!")

    tgi_client = TgiClient(args.server_address, args.max_concurrent_requests)
    timestamp = time.perf_counter_ns()
    tgi_client.run_generation(dataset, args.max_output_length)
    duration_s = (time.perf_counter_ns() - timestamp) * 1e-9
    tgi_client.print_performance_metrics(duration_s)


if __name__ == "__main__":
    main()

View File

@@ -1,88 +0,0 @@
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

import os
import statistics
import threading
import time

import tqdm
from typing import List
from huggingface_hub import InferenceClient


def except_hook(args):
    print(f"Thread failed with error: {args.exc_value}")
    os._exit(1)


threading.excepthook = except_hook


class TgiClient:
    def __init__(self, server_address: str, max_num_threads: int) -> None:
        self._lock = threading.Lock()
        self._semaphore = threading.Semaphore(max_num_threads)
        self._client = InferenceClient(server_address)

        self._ttft = []
        self._tpot = []
        self._generated_tokens = []

    def run_generation(self, samples: List[str], max_new_tokens: int) -> None:
        """
        Run generation for every sample in dataset.
        Creates a separate thread for every sample.
        """
        threads: List[threading.Thread] = []
        for sample in tqdm.tqdm(samples):
            self._semaphore.acquire()
            threads.append(
                threading.Thread(
                    target=self._process_sample, args=[sample, max_new_tokens]
                )
            )
            threads[-1].start()
        for thread in threads:
            if thread is not None:
                thread.join()

    def _process_sample(self, sample: str, max_new_tokens: int) -> None:
        """
        Generates response stream for a single sample.
        Collects performance metrics.
        """
        timestamp = time.perf_counter_ns()
        response_stream = self._client.text_generation(
            sample, max_new_tokens=max_new_tokens, stream=True, details=True
        )
        out = ""
        for id, response in enumerate(response_stream):
            if id == 0:
                self._ttft.append(time.perf_counter_ns() - timestamp)
            else:
                self._tpot.append(time.perf_counter_ns() - timestamp)
            timestamp = time.perf_counter_ns()
            out += response.token.text
            if response.details:
                self._generated_tokens.append(response.details.generated_tokens)
        self._semaphore.release()

    def print_performance_metrics(self, duration_s: float) -> None:
        def line():
            print(32 * "-")

        line()
        print("----- Performance summary -----")
        line()
        print(f"Throughput: {sum(self._generated_tokens) / duration_s:.1f} tokens/s")
        print(f"Throughput: {len(self._generated_tokens) / duration_s:.1f} queries/s")
        line()
        print("First token latency:")
        print(f"\tMedian: \t{statistics.median(self._ttft)*1e-6:.2f}ms")
        print(f"\tAverage: \t{statistics.fmean(self._ttft)*1e-6:.2f}ms")
        line()
        print("Output token latency:")
        print(f"\tMedian: \t{statistics.median(self._tpot)*1e-6:.2f}ms")
        print(f"\tAverage: \t{statistics.fmean(self._tpot)*1e-6:.2f}ms")
        line()

View File

@@ -23,7 +23,7 @@ docker run -p 8080:80 \
 -e PREFILL_BATCH_BUCKET_SIZE=2 \
 -e BATCH_BUCKET_SIZE=32 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --max-input-tokens 1024 --max-total-tokens 2048 \
 --max-batch-prefill-tokens 2048 --max-batch-size 32 \
@@ -47,7 +47,7 @@ docker run -p 8080:80 \
 -e BATCH_BUCKET_SIZE=256 \
 -e PREFILL_BATCH_BUCKET_SIZE=4 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-tokens 1024 --max-total-tokens 2048 \
@@ -72,7 +72,7 @@ docker run -p 8080:80 \
 -e PREFILL_BATCH_BUCKET_SIZE=2 \
 -e BATCH_BUCKET_SIZE=32 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --max-input-tokens 1024 --max-total-tokens 2048 \
 --max-batch-prefill-tokens 2048 --max-batch-size 32 \
@@ -96,7 +96,7 @@ docker run -p 8080:80 \
 -e BATCH_BUCKET_SIZE=256 \
 -e PREFILL_BATCH_BUCKET_SIZE=4 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-tokens 1024 --max-total-tokens 2048 \
@@ -117,7 +117,7 @@ docker run -p 8080:80 \
 -v $volume:/data \
 -e PREFILL_BATCH_BUCKET_SIZE=1 \
 -e BATCH_BUCKET_SIZE=1 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
 --max-total-tokens 8192 --max-batch-size 4
@@ -147,7 +147,7 @@ docker run -p 8080:80 \
 -e PREFILL_BATCH_BUCKET_SIZE=2 \
 -e BATCH_BUCKET_SIZE=32 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --max-input-tokens 1024 --max-total-tokens 2048 \
 --max-batch-prefill-tokens 2048 --max-batch-size 32 \
@@ -174,7 +174,7 @@ docker run -p 8080:80 \
 -e BATCH_BUCKET_SIZE=256 \
 -e PREFILL_BATCH_BUCKET_SIZE=4 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-tokens 1024 --max-total-tokens 2048 \
@@ -202,7 +202,7 @@ docker run -p 8080:80 \
 -e PREFILL_BATCH_BUCKET_SIZE=2 \
 -e BATCH_BUCKET_SIZE=32 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --max-input-tokens 1024 --max-total-tokens 2048 \
 --max-batch-prefill-tokens 2048 --max-batch-size 32 \
@@ -229,7 +229,7 @@ docker run -p 8080:80 \
 -e BATCH_BUCKET_SIZE=256 \
 -e PREFILL_BATCH_BUCKET_SIZE=4 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-tokens 1024 --max-total-tokens 2048 \
@@ -253,7 +253,7 @@ docker run -p 8080:80 \
 -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
 -e PREFILL_BATCH_BUCKET_SIZE=1 \
 -e BATCH_BUCKET_SIZE=1 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
 --max-total-tokens 8192 --max-batch-size 4
@@ -275,7 +275,7 @@ docker run -p 8080:80 \
 -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
 -e PREFILL_BATCH_BUCKET_SIZE=1 \
 -e BATCH_BUCKET_SIZE=1 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \

View File

@@ -20,7 +20,7 @@ hf_token=YOUR_HF_ACCESS_TOKEN
 docker run --runtime=habana --cap-add=sys_nice --ipc=host \
 -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model
 ```
@@ -74,7 +74,7 @@ hf_token=YOUR_ACCESS_TOKEN
 docker run --runtime=habana --cap-add=sys_nice --ipc=host \
 -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model
 <text-generation-inference-launcher-arguments>
 ```
@@ -137,7 +137,7 @@ docker run -p 8080:80 \
 -e BATCH_BUCKET_SIZE=256 \
 -e PREFILL_BATCH_BUCKET_SIZE=4 \
 -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-tokens 1024 --max-total-tokens 2048 \
@@ -163,7 +163,7 @@ docker run -p 8080:80 \
 -v $volume:/data \
 -e PREFILL_BATCH_BUCKET_SIZE=1 \
 -e BATCH_BUCKET_SIZE=1 \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model \
 --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
 --max-total-tokens 8192 --max-batch-size 4
@@ -181,18 +181,30 @@ curl -N 127.0.0.1:8080/generate \
 ### How to Benchmark Performance
-#### Option 1: Static Batching Benchmark (Recommended)
-To run static batching benchmark, please refer to [TGI's benchmark tool](https://github.com/huggingface/text-generation-inference/tree/main/benchmark).
+We recommend using the [inference-benchmarker tool](https://github.com/huggingface/inference-benchmarker) to benchmark performance on Gaudi hardware.
+This benchmark tool simulates user requests and measures the performance of the model on realistic scenarios.
 To run it on the same machine, you can do the following:
-* `docker exec -it <docker name> bash` , pick the docker started from step 2 using docker ps
-* `text-generation-benchmark -t <model-id>` , pass the model-id from docker run command
-* after the completion of tests, hit ctrl+c to see the performance data summary.
-> Note: This benchmark runs the model with bs=[1, 2, 4, 8, 16, 32], sequence_length=10 and decode_length=8 by default. if you want to run other configs, please check text-generation-benchmark -h and change the parameters.
-#### Option 2: Continuous Batching Benchmark
-To run continuous batching benchmark, please refer to [README in examples/benchmark folder in the gaudi backend](https://github.com/huggingface/text-generation-inference/tree/main/backends/gaudi/examples/benchmark).
+```bash
+MODEL=meta-llama/Llama-3.1-8B-Instruct
+HF_TOKEN=<your HF READ token>
+# run a benchmark to evaluate the performance of the model for chat use case
+# we mount results to the current directory
+docker run \
+  --rm \
+  -it \
+  --net host \
+  -v $(pwd):/opt/inference-benchmarker/results \
+  -e "HF_TOKEN=$HF_TOKEN" \
+  ghcr.io/huggingface/inference-benchmarker:latest \
+  inference-benchmarker \
+  --tokenizer-name "$MODEL" \
+  --url http://localhost:8080 \
+  --profile chat
+```
+Please refer to the [inference-benchmarker README](https://github.com/huggingface/inference-benchmarker) for more details.
 ### How to Profile Performance
@@ -218,7 +230,7 @@ docker run --runtime=habana --ipc=host --cap-add=sys_nice \
 -e PROF_PATH=/tmp/hpu_profile \
 -e PROF_RANKS=0 \
 -e PROF_RECORD_SHAPES=True \
-ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
 --model-id $model
 ```
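The traces written to `PROF_PATH` stay inside the container; one way to pull them out for inspection is sketched below (the container name `tgi-gaudi` is a placeholder, use `docker ps` to find yours):
```bash
# copy the HPU profiling traces out of the running TGI container
docker cp tgi-gaudi:/tmp/hpu_profile ./hpu_profile   # "tgi-gaudi" is a placeholder name
```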