From a3dfcf571d95604b2dc8875ae96212dfd3e8bc1a Mon Sep 17 00:00:00 2001 From: baptiste Date: Tue, 11 Mar 2025 10:00:02 +0000 Subject: [PATCH] fix(gaudi): remove use of latest for gaudi docker image + redid gaudi benchmarking section to include best practices --- backends/gaudi/examples/benchmark/README.md | 39 ------- .../gaudi/examples/benchmark/requirements.txt | 4 - .../examples/benchmark/run_generation.py | 110 ------------------ .../gaudi/examples/benchmark/tgi_client.py | 88 -------------- .../docker_commands/docker_commands.md | 22 ++-- docs/source/backends/gaudi.mdx | 40 ++++--- 6 files changed, 37 insertions(+), 266 deletions(-) delete mode 100644 backends/gaudi/examples/benchmark/README.md delete mode 100644 backends/gaudi/examples/benchmark/requirements.txt delete mode 100644 backends/gaudi/examples/benchmark/run_generation.py delete mode 100644 backends/gaudi/examples/benchmark/tgi_client.py diff --git a/backends/gaudi/examples/benchmark/README.md b/backends/gaudi/examples/benchmark/README.md deleted file mode 100644 index 226595c6..00000000 --- a/backends/gaudi/examples/benchmark/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# TGI-Gaudi example - -This example provide a simple way of usage of `tgi-gaudi` with continuous batching. It uses a small dataset [DIBT/10k_prompts_ranked](https://huggingface.co/datasets/DIBT/10k_prompts_ranked) and present basic performance numbers. - -## Get started - -### Install - -``` -pip install -r requirements.txt -``` - -### Setup TGI server - -More details on runing the TGI server available [here](https://github.com/huggingface/tgi-gaudi/blob/habana-main/README.md#running-tgi-on-gaudi). - -### Run benchmark - -To run benchmark use below command: - -``` -python run_generation --model_id MODEL_ID -``` -where `MODEL_ID` should be set to the same value as in the TGI server instance. -> For gated models such as [LLama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set environment variable `HF_TOKEN=` with a valid Hugging Face Hub read token. - -All possible parameters are described in the below table: -
- -| Name | Default value | Description | -| ------------------------- | :---------------------------- | :------------------------------------------------------------ | -| SERVER_ADDRESS | http://localhost:8080 | The address and port at which the TGI server is available. | -| MODEL_ID | meta-llama/Llama-2-7b-chat-hf | Model ID used in the TGI server instance. | -| MAX_INPUT_LENGTH | 1024 | Maximum input length supported by the TGI server. | -| MAX_OUTPUT_LENGTH | 1024 | Maximum output length supported by the TGI server. | -| TOTAL_SAMPLE_COUNT | 2048 | Number of samples to run. | -| MAX_CONCURRENT_REQUESTS | 256 | The number of requests sent simultaneously to the TGI server. | - -
diff --git a/backends/gaudi/examples/benchmark/requirements.txt b/backends/gaudi/examples/benchmark/requirements.txt deleted file mode 100644 index c772c19e..00000000 --- a/backends/gaudi/examples/benchmark/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -huggingface_hub==0.23.5 -requests==2.31.0 -datasets==2.18.0 -transformers>=4.37.0 diff --git a/backends/gaudi/examples/benchmark/run_generation.py b/backends/gaudi/examples/benchmark/run_generation.py deleted file mode 100644 index 460b40d1..00000000 --- a/backends/gaudi/examples/benchmark/run_generation.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. - -import argparse -import requests -import time -from typing import List - -from datasets import load_dataset -from transformers import AutoTokenizer - -from tgi_client import TgiClient - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--server_address", - type=str, - default="http://localhost:8083", - help="Address of the TGI server", - ) - parser.add_argument( - "--model_id", - type=str, - default="meta-llama/Llama-2-7b-chat-hf", - help="Model id used in TGI server", - ) - parser.add_argument( - "--max_input_length", - type=int, - default=1024, - help="Max input length for TGI model", - ) - parser.add_argument( - "--max_output_length", - type=int, - default=1024, - help="Max output length for TGI model", - ) - parser.add_argument( - "--total_sample_count", - type=int, - default=2048, - help="Total number of samples to generate", - ) - parser.add_argument( - "--max_concurrent_requests", - type=int, - default=256, - help="Max number of concurrent requests", - ) - parser.add_argument("--seed", type=int, default=42, help="Random seed for datasets") - - return parser.parse_args() - - -def read_dataset( - max_input_length: int, - total_sample_count: int, - model_id: str, - seed: int, -) -> List[str]: - """ - Loads public dataset from HF: https://huggingface.co/datasets/DIBT/10k_prompts_ranked - and filters out too long samples. - """ - tokenizer = AutoTokenizer.from_pretrained(model_id) - - dataset = load_dataset( - "DIBT/10k_prompts_ranked", split="train", trust_remote_code=True - ) - dataset = dataset.filter( - lambda x: len(tokenizer(x["prompt"])["input_ids"]) < max_input_length - ) - if len(dataset) > total_sample_count: - dataset = dataset.select(range(total_sample_count)) - - dataset = dataset.shuffle(seed=seed) - return [sample["prompt"] for sample in dataset] - - -def is_tgi_available(server_address: str) -> bool: - """ - Checks if TGI server is available under the specified address. 
- """ - try: - info = requests.get(f"{server_address}/info") - return info.status_code == 200 - except Exception: - return False - - -def main(): - args = get_args() - dataset = read_dataset( - args.max_input_length, args.total_sample_count, args.model_id, args.seed - ) - - if not is_tgi_available(args.server_address): - raise RuntimeError("Cannot connect with TGI server!") - - tgi_client = TgiClient(args.server_address, args.max_concurrent_requests) - timestamp = time.perf_counter_ns() - tgi_client.run_generation(dataset, args.max_output_length) - duration_s = (time.perf_counter_ns() - timestamp) * 1e-9 - tgi_client.print_performance_metrics(duration_s) - - -if __name__ == "__main__": - main() diff --git a/backends/gaudi/examples/benchmark/tgi_client.py b/backends/gaudi/examples/benchmark/tgi_client.py deleted file mode 100644 index 66d63ab8..00000000 --- a/backends/gaudi/examples/benchmark/tgi_client.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. - -import os -import statistics -import threading -import time -import tqdm -from typing import List - -from huggingface_hub import InferenceClient - - -def except_hook(args): - print(f"Thread failed with error: {args.exc_value}") - os._exit(1) - - -threading.excepthook = except_hook - - -class TgiClient: - def __init__(self, server_address: str, max_num_threads: int) -> None: - self._lock = threading.Lock() - self._semaphore = threading.Semaphore(max_num_threads) - self._client = InferenceClient(server_address) - - self._ttft = [] - self._tpot = [] - self._generated_tokens = [] - - def run_generation(self, samples: List[str], max_new_tokens: int) -> None: - """ - Run generation for every sample in dataset. - Creates a separate thread for every sample. - """ - threads: List[threading.Thread] = [] - for sample in tqdm.tqdm(samples): - self._semaphore.acquire() - threads.append( - threading.Thread( - target=self._process_sample, args=[sample, max_new_tokens] - ) - ) - threads[-1].start() - for thread in threads: - if thread is not None: - thread.join() - - def _process_sample(self, sample: str, max_new_tokens: int) -> None: - """ - Generates response stream for a single sample. - Collects performance metrics. 
- """ - timestamp = time.perf_counter_ns() - response_stream = self._client.text_generation( - sample, max_new_tokens=max_new_tokens, stream=True, details=True - ) - out = "" - for id, response in enumerate(response_stream): - if id == 0: - self._ttft.append(time.perf_counter_ns() - timestamp) - else: - self._tpot.append(time.perf_counter_ns() - timestamp) - timestamp = time.perf_counter_ns() - out += response.token.text - if response.details: - self._generated_tokens.append(response.details.generated_tokens) - - self._semaphore.release() - - def print_performance_metrics(self, duration_s: float) -> None: - def line(): - print(32 * "-") - - line() - print("----- Performance summary -----") - line() - print(f"Throughput: {sum(self._generated_tokens) / duration_s:.1f} tokens/s") - print(f"Throughput: {len(self._generated_tokens) / duration_s:.1f} queries/s") - line() - print("First token latency:") - print(f"\tMedian: \t{statistics.median(self._ttft)*1e-6:.2f}ms") - print(f"\tAverage: \t{statistics.fmean(self._ttft)*1e-6:.2f}ms") - line() - print("Output token latency:") - print(f"\tMedian: \t{statistics.median(self._tpot)*1e-6:.2f}ms") - print(f"\tAverage: \t{statistics.fmean(self._tpot)*1e-6:.2f}ms") - line() diff --git a/backends/gaudi/examples/docker_commands/docker_commands.md b/backends/gaudi/examples/docker_commands/docker_commands.md index e540e272..59701289 100644 --- a/backends/gaudi/examples/docker_commands/docker_commands.md +++ b/backends/gaudi/examples/docker_commands/docker_commands.md @@ -23,7 +23,7 @@ docker run -p 8080:80 \ -e PREFILL_BATCH_BUCKET_SIZE=2 \ -e BATCH_BUCKET_SIZE=32 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --max-input-tokens 1024 --max-total-tokens 2048 \ --max-batch-prefill-tokens 2048 --max-batch-size 32 \ @@ -47,7 +47,7 @@ docker run -p 8080:80 \ -e BATCH_BUCKET_SIZE=256 \ -e PREFILL_BATCH_BUCKET_SIZE=4 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -72,7 +72,7 @@ docker run -p 8080:80 \ -e PREFILL_BATCH_BUCKET_SIZE=2 \ -e BATCH_BUCKET_SIZE=32 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --max-input-tokens 1024 --max-total-tokens 2048 \ --max-batch-prefill-tokens 2048 --max-batch-size 32 \ @@ -96,7 +96,7 @@ docker run -p 8080:80 \ -e BATCH_BUCKET_SIZE=256 \ -e PREFILL_BATCH_BUCKET_SIZE=4 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -117,7 +117,7 @@ docker run -p 8080:80 \ -v $volume:/data \ -e PREFILL_BATCH_BUCKET_SIZE=1 \ -e BATCH_BUCKET_SIZE=1 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ --max-total-tokens 8192 --max-batch-size 4 @@ -147,7 +147,7 @@ docker run -p 8080:80 \ -e PREFILL_BATCH_BUCKET_SIZE=2 \ -e BATCH_BUCKET_SIZE=32 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \ - 
ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --max-input-tokens 1024 --max-total-tokens 2048 \ --max-batch-prefill-tokens 2048 --max-batch-size 32 \ @@ -174,7 +174,7 @@ docker run -p 8080:80 \ -e BATCH_BUCKET_SIZE=256 \ -e PREFILL_BATCH_BUCKET_SIZE=4 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -202,7 +202,7 @@ docker run -p 8080:80 \ -e PREFILL_BATCH_BUCKET_SIZE=2 \ -e BATCH_BUCKET_SIZE=32 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --max-input-tokens 1024 --max-total-tokens 2048 \ --max-batch-prefill-tokens 2048 --max-batch-size 32 \ @@ -229,7 +229,7 @@ docker run -p 8080:80 \ -e BATCH_BUCKET_SIZE=256 \ -e PREFILL_BATCH_BUCKET_SIZE=4 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -253,7 +253,7 @@ docker run -p 8080:80 \ -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ -e PREFILL_BATCH_BUCKET_SIZE=1 \ -e BATCH_BUCKET_SIZE=1 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ --max-total-tokens 8192 --max-batch-size 4 @@ -275,7 +275,7 @@ docker run -p 8080:80 \ -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ -e PREFILL_BATCH_BUCKET_SIZE=1 \ -e BATCH_BUCKET_SIZE=1 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ diff --git a/docs/source/backends/gaudi.mdx b/docs/source/backends/gaudi.mdx index 3a6f631c..c7f73618 100644 --- a/docs/source/backends/gaudi.mdx +++ b/docs/source/backends/gaudi.mdx @@ -20,7 +20,7 @@ hf_token=YOUR_HF_ACCESS_TOKEN docker run --runtime=habana --cap-add=sys_nice --ipc=host \ -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model ``` @@ -74,7 +74,7 @@ hf_token=YOUR_ACCESS_TOKEN docker run --runtime=habana --cap-add=sys_nice --ipc=host \ -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model ``` @@ -137,7 +137,7 @@ docker run -p 8080:80 \ -e BATCH_BUCKET_SIZE=256 \ -e PREFILL_BATCH_BUCKET_SIZE=4 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -163,7 +163,7 @@ docker run -p 8080:80 \ -v $volume:/data \ -e PREFILL_BATCH_BUCKET_SIZE=1 \ -e BATCH_BUCKET_SIZE=1 \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + 
ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model \ --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ --max-total-tokens 8192 --max-batch-size 4 @@ -181,18 +181,30 @@ curl -N 127.0.0.1:8080/generate \ ### How to Benchmark Performance -#### Option 1: Static Batching Benchmark (Recommended) -To run static batching benchmark, please refer to [TGI's benchmark tool](https://github.com/huggingface/text-generation-inference/tree/main/benchmark). +We recommend using the [inference-benchmarker tool](https://github.com/huggingface/inference-benchmarker) to benchmark performance on Gaudi hardware. + +This benchmark tool simulates user requests and measures the performance of the model on realistic scenarios. To run it on the same machine, you can do the following: -* `docker exec -it bash` , pick the docker started from step 2 using docker ps -* `text-generation-benchmark -t ` , pass the model-id from docker run command -* after the completion of tests, hit ctrl+c to see the performance data summary. +```bash +MODEL=meta-llama/Llama-3.1-8B-Instruct +HF_TOKEN= +# run a benchmark to evaluate the performance of the model for chat use case +# we mount results to the current directory +docker run \ + --rm \ + -it \ + --net host \ + -v $(pwd):/opt/inference-benchmarker/results \ + -e "HF_TOKEN=$HF_TOKEN" \ + ghcr.io/huggingface/inference-benchmarker:latest \ + inference-benchmarker \ + --tokenizer-name "$MODEL" \ + --url http://localhost:8080 \ + --profile chat +``` -> Note: This benchmark runs the model with bs=[1, 2, 4, 8, 16, 32], sequence_length=10 and decode_length=8 by default. if you want to run other configs, please check text-generation-benchmark -h and change the parameters. - -#### Option 2: Continuous Batching Benchmark -To run continuous batching benchmark, please refer to [README in examples/benchmark folder in the gaudi backend](https://github.com/huggingface/text-generation-inference/tree/main/backends/gaudi/examples/benchmark). +Please refer to the [inference-benchmarker README](https://github.com/huggingface/inference-benchmarker) for more details. ### How to Profile Performance @@ -218,7 +230,7 @@ docker run --runtime=habana --ipc=host --cap-add=sys_nice \ -e PROF_PATH=/tmp/hpu_profile \ -e PROF_RANKS=0 \ -e PROF_RECORD_SHAPES=True \ - ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ --model-id $model ```
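Before launching a full inference-benchmarker or profiling run, it can be worth confirming that the endpoint is up and streaming tokens. The snippet below is a minimal sketch adapted from the removed `tgi_client.py` and `run_generation.py` helpers; it assumes the server started with the commands above is reachable at `http://localhost:8080`, and the prompt and `max_new_tokens` value are illustrative only.

```python
# Minimal smoke test for a running TGI Gaudi endpoint (a sketch, not part of the
# shipped examples). Assumes the server is reachable at http://localhost:8080.
import time

import requests
from huggingface_hub import InferenceClient

SERVER_ADDRESS = "http://localhost:8080"  # adjust to match your docker run command

# Health check against the /info route, as the removed run_generation.py did.
info = requests.get(f"{SERVER_ADDRESS}/info", timeout=10)
info.raise_for_status()
print("Serving model:", info.json().get("model_id"))

# Stream one generation and record a rough time-to-first-token, mirroring the
# TTFT measurement in the removed tgi_client.py.
client = InferenceClient(SERVER_ADDRESS)
start = time.perf_counter()
ttft_s = None
text = ""
for i, chunk in enumerate(
    client.text_generation(
        "What is Deep Learning?", max_new_tokens=32, stream=True, details=True
    )
):
    if i == 0:
        ttft_s = time.perf_counter() - start
    text += chunk.token.text

if ttft_s is not None:
    print(f"Time to first token: {ttft_s * 1e3:.1f} ms")
print("Output:", text)
```

If this completes and the reported model matches the `--model-id` passed to the server, the endpoint should be ready for the benchmarking and profiling runs described above.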