From 02fa6adb20a98799521486db2accb43d330064f5 Mon Sep 17 00:00:00 2001 From: baptiste Date: Mon, 10 Mar 2025 09:01:45 +0000 Subject: [PATCH] feat(gaudi): release ready (docs, docker image and vlm ready) --- .github/workflows/build.yaml | 9 + .github/workflows/ci_build.yaml | 3 +- .gitignore | 1 + Makefile | 3 + backends/gaudi/examples/benchmark/README.md | 39 +++ .../gaudi/examples/benchmark/requirements.txt | 4 + .../examples/benchmark/run_generation.py | 110 +++++++ .../gaudi/examples/benchmark/tgi_client.py | 88 +++++ .../docker_commands/docker_commands.md | 283 ++++++++++++++++ .../models/vlm_causal_lm.py | 25 +- docs/source/_toctree.yml | 2 + docs/source/backends/gaudi.mdx | 303 ++++++++++++++++++ docs/source/installation_gaudi.md | 2 +- 13 files changed, 862 insertions(+), 10 deletions(-) create mode 100644 backends/gaudi/examples/benchmark/README.md create mode 100644 backends/gaudi/examples/benchmark/requirements.txt create mode 100644 backends/gaudi/examples/benchmark/run_generation.py create mode 100644 backends/gaudi/examples/benchmark/tgi_client.py create mode 100644 backends/gaudi/examples/docker_commands/docker_commands.md create mode 100644 docs/source/backends/gaudi.mdx diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index e849f7c0..99f29d7e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -124,6 +124,15 @@ jobs: export extra_pytest="--neuron" export target="" ;; + gaudi) + export dockerfile="Dockerfile_gaudi" + export label_extension="-gaudi" + export docker_volume="/mnt/cache" + export docker_devices="" + export runs_on="ubuntu-latest" + export platform="" + export extra_pytest="" + export target="" esac echo $dockerfile echo "Dockerfile=${dockerfile}" diff --git a/.github/workflows/ci_build.yaml b/.github/workflows/ci_build.yaml index 752c6ddd..f0d39399 100644 --- a/.github/workflows/ci_build.yaml +++ b/.github/workflows/ci_build.yaml @@ -21,6 +21,7 @@ on: - "Dockerfile_amd" - "Dockerfile_intel" - "Dockerfile.neuron" + - "Dockerfile_gaudi" branches: - "main" workflow_dispatch: @@ -38,7 +39,7 @@ jobs: # fail-fast is true by default fail-fast: false matrix: - hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu", "neuron"] + hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu", "neuron", "gaudi"] uses: ./.github/workflows/build.yaml # calls the one above ^ permissions: contents: write diff --git a/.gitignore b/.gitignore index 7d6c7564..8a6bda72 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ server/fbgemmm hl-smi_log*.txt .graph_dumps out +hqt_output diff --git a/Makefile b/Makefile index 3068a06f..2ecdd45c 100644 --- a/Makefile +++ b/Makefile @@ -53,3 +53,6 @@ run-falcon-7b-instruct-quantize: clean: rm -rf target aml + +preview_doc: + doc-builder preview text-generation-inference docs/source --not_python_module diff --git a/backends/gaudi/examples/benchmark/README.md b/backends/gaudi/examples/benchmark/README.md new file mode 100644 index 00000000..226595c6 --- /dev/null +++ b/backends/gaudi/examples/benchmark/README.md @@ -0,0 +1,39 @@ +# TGI-Gaudi example + +This example provide a simple way of usage of `tgi-gaudi` with continuous batching. It uses a small dataset [DIBT/10k_prompts_ranked](https://huggingface.co/datasets/DIBT/10k_prompts_ranked) and present basic performance numbers. 
+
+## Get started
+
+### Install
+
+```
+pip install -r requirements.txt
+```
+
+### Setup TGI server
+
+More details on running the TGI server are available [here](https://github.com/huggingface/tgi-gaudi/blob/habana-main/README.md#running-tgi-on-gaudi).
+
+### Run benchmark
+
+To run the benchmark, use the command below:
+
+```
+python run_generation.py --model_id MODEL_ID
+```
+where `MODEL_ID` should be set to the same value as in the TGI server instance.
+> For gated models such as [Llama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to set the environment variable `HF_TOKEN` to a valid Hugging Face Hub read token.
+
+All supported parameters are described in the table below; an example invocation follows the table:
+ +| Name | Default value | Description | +| ------------------------- | :---------------------------- | :------------------------------------------------------------ | +| SERVER_ADDRESS | http://localhost:8080 | The address and port at which the TGI server is available. | +| MODEL_ID | meta-llama/Llama-2-7b-chat-hf | Model ID used in the TGI server instance. | +| MAX_INPUT_LENGTH | 1024 | Maximum input length supported by the TGI server. | +| MAX_OUTPUT_LENGTH | 1024 | Maximum output length supported by the TGI server. | +| TOTAL_SAMPLE_COUNT | 2048 | Number of samples to run. | +| MAX_CONCURRENT_REQUESTS | 256 | The number of requests sent simultaneously to the TGI server. | + +
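+
+For example, the following invocation spells out every parameter explicitly (illustrative values; the lowercase flags correspond to the parameters listed in the table above):
+
+```
+python run_generation.py \
+    --server_address http://localhost:8080 \
+    --model_id meta-llama/Llama-2-7b-chat-hf \
+    --max_input_length 1024 \
+    --max_output_length 1024 \
+    --total_sample_count 2048 \
+    --max_concurrent_requests 256
+```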
diff --git a/backends/gaudi/examples/benchmark/requirements.txt b/backends/gaudi/examples/benchmark/requirements.txt new file mode 100644 index 00000000..c772c19e --- /dev/null +++ b/backends/gaudi/examples/benchmark/requirements.txt @@ -0,0 +1,4 @@ +huggingface_hub==0.23.5 +requests==2.31.0 +datasets==2.18.0 +transformers>=4.37.0 diff --git a/backends/gaudi/examples/benchmark/run_generation.py b/backends/gaudi/examples/benchmark/run_generation.py new file mode 100644 index 00000000..460b40d1 --- /dev/null +++ b/backends/gaudi/examples/benchmark/run_generation.py @@ -0,0 +1,110 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. + +import argparse +import requests +import time +from typing import List + +from datasets import load_dataset +from transformers import AutoTokenizer + +from tgi_client import TgiClient + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--server_address", + type=str, + default="http://localhost:8083", + help="Address of the TGI server", + ) + parser.add_argument( + "--model_id", + type=str, + default="meta-llama/Llama-2-7b-chat-hf", + help="Model id used in TGI server", + ) + parser.add_argument( + "--max_input_length", + type=int, + default=1024, + help="Max input length for TGI model", + ) + parser.add_argument( + "--max_output_length", + type=int, + default=1024, + help="Max output length for TGI model", + ) + parser.add_argument( + "--total_sample_count", + type=int, + default=2048, + help="Total number of samples to generate", + ) + parser.add_argument( + "--max_concurrent_requests", + type=int, + default=256, + help="Max number of concurrent requests", + ) + parser.add_argument("--seed", type=int, default=42, help="Random seed for datasets") + + return parser.parse_args() + + +def read_dataset( + max_input_length: int, + total_sample_count: int, + model_id: str, + seed: int, +) -> List[str]: + """ + Loads public dataset from HF: https://huggingface.co/datasets/DIBT/10k_prompts_ranked + and filters out too long samples. + """ + tokenizer = AutoTokenizer.from_pretrained(model_id) + + dataset = load_dataset( + "DIBT/10k_prompts_ranked", split="train", trust_remote_code=True + ) + dataset = dataset.filter( + lambda x: len(tokenizer(x["prompt"])["input_ids"]) < max_input_length + ) + if len(dataset) > total_sample_count: + dataset = dataset.select(range(total_sample_count)) + + dataset = dataset.shuffle(seed=seed) + return [sample["prompt"] for sample in dataset] + + +def is_tgi_available(server_address: str) -> bool: + """ + Checks if TGI server is available under the specified address. 
+ """ + try: + info = requests.get(f"{server_address}/info") + return info.status_code == 200 + except Exception: + return False + + +def main(): + args = get_args() + dataset = read_dataset( + args.max_input_length, args.total_sample_count, args.model_id, args.seed + ) + + if not is_tgi_available(args.server_address): + raise RuntimeError("Cannot connect with TGI server!") + + tgi_client = TgiClient(args.server_address, args.max_concurrent_requests) + timestamp = time.perf_counter_ns() + tgi_client.run_generation(dataset, args.max_output_length) + duration_s = (time.perf_counter_ns() - timestamp) * 1e-9 + tgi_client.print_performance_metrics(duration_s) + + +if __name__ == "__main__": + main() diff --git a/backends/gaudi/examples/benchmark/tgi_client.py b/backends/gaudi/examples/benchmark/tgi_client.py new file mode 100644 index 00000000..66d63ab8 --- /dev/null +++ b/backends/gaudi/examples/benchmark/tgi_client.py @@ -0,0 +1,88 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. + +import os +import statistics +import threading +import time +import tqdm +from typing import List + +from huggingface_hub import InferenceClient + + +def except_hook(args): + print(f"Thread failed with error: {args.exc_value}") + os._exit(1) + + +threading.excepthook = except_hook + + +class TgiClient: + def __init__(self, server_address: str, max_num_threads: int) -> None: + self._lock = threading.Lock() + self._semaphore = threading.Semaphore(max_num_threads) + self._client = InferenceClient(server_address) + + self._ttft = [] + self._tpot = [] + self._generated_tokens = [] + + def run_generation(self, samples: List[str], max_new_tokens: int) -> None: + """ + Run generation for every sample in dataset. + Creates a separate thread for every sample. + """ + threads: List[threading.Thread] = [] + for sample in tqdm.tqdm(samples): + self._semaphore.acquire() + threads.append( + threading.Thread( + target=self._process_sample, args=[sample, max_new_tokens] + ) + ) + threads[-1].start() + for thread in threads: + if thread is not None: + thread.join() + + def _process_sample(self, sample: str, max_new_tokens: int) -> None: + """ + Generates response stream for a single sample. + Collects performance metrics. 
+ """ + timestamp = time.perf_counter_ns() + response_stream = self._client.text_generation( + sample, max_new_tokens=max_new_tokens, stream=True, details=True + ) + out = "" + for id, response in enumerate(response_stream): + if id == 0: + self._ttft.append(time.perf_counter_ns() - timestamp) + else: + self._tpot.append(time.perf_counter_ns() - timestamp) + timestamp = time.perf_counter_ns() + out += response.token.text + if response.details: + self._generated_tokens.append(response.details.generated_tokens) + + self._semaphore.release() + + def print_performance_metrics(self, duration_s: float) -> None: + def line(): + print(32 * "-") + + line() + print("----- Performance summary -----") + line() + print(f"Throughput: {sum(self._generated_tokens) / duration_s:.1f} tokens/s") + print(f"Throughput: {len(self._generated_tokens) / duration_s:.1f} queries/s") + line() + print("First token latency:") + print(f"\tMedian: \t{statistics.median(self._ttft)*1e-6:.2f}ms") + print(f"\tAverage: \t{statistics.fmean(self._ttft)*1e-6:.2f}ms") + line() + print("Output token latency:") + print(f"\tMedian: \t{statistics.median(self._tpot)*1e-6:.2f}ms") + print(f"\tAverage: \t{statistics.fmean(self._tpot)*1e-6:.2f}ms") + line() diff --git a/backends/gaudi/examples/docker_commands/docker_commands.md b/backends/gaudi/examples/docker_commands/docker_commands.md new file mode 100644 index 00000000..e540e272 --- /dev/null +++ b/backends/gaudi/examples/docker_commands/docker_commands.md @@ -0,0 +1,283 @@ +# Examples of Docker Commands for Gaudi Backend + +This page gives a list of examples of docker run commands for some of the most popular models. + +> **Note:** The parameters are chosen for Gaudi2 hardware to maximize performance on this given hardware, please adjust the parameters based on your hardware. For example, if you are using Gaudi3, you may want to increase the batch size. 
+ +## Default Precision (BF16) + +### Llama3.1-8B on 1 card (BF16) + +```bash +model=meta-llama/Meta-Llama-3.1-8B-Instruct +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e PREFILL_BATCH_BUCKET_SIZE=2 \ + -e BATCH_BUCKET_SIZE=32 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \ + ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + --model-id $model \ + --max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 2048 --max-batch-size 32 \ + --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64 +``` + +### Llama3.1-70B 8 cards (BF16) + +```bash +model=meta-llama/Meta-Llama-3.1-70B-Instruct +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e BATCH_BUCKET_SIZE=256 \ + -e PREFILL_BATCH_BUCKET_SIZE=4 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ + ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + --model-id $model \ + --sharded true --num-shard 8 \ + --max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 4096 --max-batch-size 256 \ + --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512 +``` + +### Llama2-7B on 1 Card (BF16) + +```bash +model=meta-llama/Llama-2-7b-chat-hf +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e PREFILL_BATCH_BUCKET_SIZE=2 \ + -e BATCH_BUCKET_SIZE=32 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \ + ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + --model-id $model \ + --max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 2048 --max-batch-size 32 \ + --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64 +``` + +### Llama2-70B on 8 cards (BF16) + +```bash +model=meta-llama/Llama-2-70b-chat-hf +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e BATCH_BUCKET_SIZE=256 \ + -e PREFILL_BATCH_BUCKET_SIZE=4 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ + ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + --model-id $model \ + --sharded true --num-shard 8 \ + --max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 4096 --max-batch-size 256 \ + --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512 +``` + +### Llava-v1.6-Mistral-7B on 1 card (BF16) + +```bash +model=llava-hf/llava-v1.6-mistral-7b-hf +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -e PREFILL_BATCH_BUCKET_SIZE=1 \ + -e BATCH_BUCKET_SIZE=1 \ + 
ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + --model-id $model \ + --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ + --max-total-tokens 8192 --max-batch-size 4 +``` + +## FP8 Precision + +Please refer to the [FP8 Precision](https://huggingface.co/docs/text-generation-inference/backends/gaudi_new#how-to-use-different-precision-formats) section for more details. You need to measure the statistics of the model first before running the model in FP8 precision. + +## Llama3.1-8B on 1 Card (FP8) + +```bash +model=meta-llama/Meta-Llama-3.1-8B-Instruct +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -v $PWD/quantization_config:/usr/src/quantization_config \ + -v $PWD/hqt_output:/usr/src/hqt_output \ + -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e PREFILL_BATCH_BUCKET_SIZE=2 \ + -e BATCH_BUCKET_SIZE=32 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \ + ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + --model-id $model \ + --max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 2048 --max-batch-size 32 \ + --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64 +``` + +## Llama3.1-70B on 8 cards (FP8) + +```bash +model=meta-llama/Meta-Llama-3.1-70B-Instruct +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -v $PWD/quantization_config:/usr/src/quantization_config \ + -v $PWD/hqt_output:/usr/src/hqt_output \ + -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e BATCH_BUCKET_SIZE=256 \ + -e PREFILL_BATCH_BUCKET_SIZE=4 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ + ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + --model-id $model \ + --sharded true --num-shard 8 \ + --max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 4096 --max-batch-size 256 \ + --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512 +``` + +## Llama2-7B on 1 Card (FP8) + +```bash +model=meta-llama/Llama-2-7b-chat-hf +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -v $PWD/quantization_config:/usr/src/quantization_config \ + -v $PWD/hqt_output:/usr/src/hqt_output \ + -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e PREFILL_BATCH_BUCKET_SIZE=2 \ + -e BATCH_BUCKET_SIZE=32 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \ + ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + --model-id $model \ + --max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 2048 --max-batch-size 32 \ + --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64 +``` + +## Llama2-70B on 8 Cards (FP8) + +```bash +model=meta-llama/Llama-2-70b-chat-hf +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + 
--cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -v $PWD/quantization_config:/usr/src/quantization_config \ + -v $PWD/hqt_output:/usr/src/hqt_output \ + -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e BATCH_BUCKET_SIZE=256 \ + -e PREFILL_BATCH_BUCKET_SIZE=4 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ + ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + --model-id $model \ + --sharded true --num-shard 8 \ + --max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 4096 --max-batch-size 256 \ + --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512 +``` + +## Llava-v1.6-Mistral-7B on 1 Card (FP8) + +```bash +model=llava-hf/llava-v1.6-mistral-7b-hf +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -v $PWD/quantization_config:/usr/src/quantization_config \ + -v $PWD/hqt_output:/usr/src/hqt_output \ + -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ + -e PREFILL_BATCH_BUCKET_SIZE=1 \ + -e BATCH_BUCKET_SIZE=1 \ + ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + --model-id $model \ + --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ + --max-total-tokens 8192 --max-batch-size 4 +``` + +## Llava-v1.6-Mistral-7B on 8 Cards (FP8) + +```bash +model=llava-hf/llava-v1.6-mistral-7b-hf +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -v $PWD/quantization_config:/usr/src/quantization_config \ + -v $PWD/hqt_output:/usr/src/hqt_output \ + -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ + -e PREFILL_BATCH_BUCKET_SIZE=1 \ + -e BATCH_BUCKET_SIZE=1 \ + ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + --model-id $model \ + --sharded true --num-shard 8 \ + --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ + --max-total-tokens 8192 --max-batch-size 4 +``` diff --git a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py index d4f4c1af..cef761b4 100644 --- a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py +++ b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py @@ -68,10 +68,14 @@ IDEFICS2_IMAGE_TOKEN = "" IMAGES = re.compile(r"!\[[^\]]*\]\((.*?)\s*(\"(?:.*[^\"])\")?\s*\)") BASE_IMAGE_TOKENS = int(os.environ.get("BASE_IMAGE_TOKENS", 2048)) MAX_TOTAL_TOKENS = int(os.environ.get("MAX_TOTAL_TOKENS", 8192)) -MAX_BATCH_TOTAL_TOKENS = int(os.environ.get("MAX_BATCH_TOTAL_TOKENS", 131072)) PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get("PAD_SEQUENCE_TO_MULTIPLE_OF", 256)) CHUNK_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] LAZY_MODE = int(os.environ.get("PT_HPU_LAZY_MODE", 1)) +MAX_BATCH_SIZE = ( + int(os.environ.get("MAX_BATCH_SIZE")) + if os.environ.get("MAX_BATCH_SIZE") is not None + else None +) PREFILL_WARMUP_BATCH_SIZE_LIST = [] PREFILL_WARMUP_SEQLEN_LIST = [] @@ -1197,7 +1201,9 @@ class VlmCausalLM(Model): return self.batch_from_pb(batch, is_warmup) - def warmup(self, request) -> None: + def warmup( + self, request: generate_pb2.WarmupRequest + ) -> Tuple[Optional[int], Optional[int], Optional[int]]: is_warmup = True batch = self.batch_from_pb(request.batch, is_warmup) 
@@ -1210,7 +1216,8 @@ class VlmCausalLM(Model): f"You need to decrease `--max-batch-prefill-tokens`" ) - global BASE_IMAGE_TOKENS, MAX_TOTAL_TOKENS, MAX_BATCH_TOTAL_TOKENS, PREFILL_WARMUP_BATCH_SIZE_LIST, PREFILL_WARMUP_SEQLEN_LIST, DECODE_WARMUP_BATCH_SIZE_LIST + global BASE_IMAGE_TOKENS, MAX_TOTAL_TOKENS, PREFILL_WARMUP_BATCH_SIZE_LIST, PREFILL_WARMUP_SEQLEN_LIST, DECODE_WARMUP_BATCH_SIZE_LIST + MAX_TOTAL_TOKENS = request.max_total_tokens max_input_length = batch.input_ids.shape[1] max_prefill_batch_size = batch.input_ids.shape[0] PREFILL_WARMUP_BATCH_SIZE_LIST = [] @@ -1264,7 +1271,7 @@ class VlmCausalLM(Model): f"Memory stats: {mem_stats} " ) - max_decode_batch_size = math.floor(MAX_BATCH_TOTAL_TOKENS / MAX_TOTAL_TOKENS) + max_decode_batch_size = MAX_BATCH_SIZE batch_size = max_prefill_batch_size * 2 # Decode warmup with bigger batch_size try: @@ -1310,14 +1317,12 @@ class VlmCausalLM(Model): batches.append(prefill_batch) _, decode_batch, _ = self.generate_token(batches, is_warmup) DECODE_WARMUP_BATCH_SIZE_LIST.append(max_decode_batch_size) - max_batch_total_tokens = max_decode_batch_size * MAX_TOTAL_TOKENS - MAX_BATCH_TOTAL_TOKENS = max_batch_total_tokens except Exception: raise RuntimeError( f"Not enough memory to handle batch_size({batch_size}) decode warmup." f"Decode batch size list:{DECODE_WARMUP_BATCH_SIZE_LIST}" f"max_decode_batch_size is {max_decode_batch_size}" - f"You need to decrease env `MAX_BATCH_TOTAL_TOKENS` or '--max_batch_total_tokens'" + f"You need to decrease env `MAX_BATCH_SIZE` or '--max_batch_size'" ) mem_stats = get_hpu_memory_stats(self.device) @@ -1327,4 +1332,8 @@ class VlmCausalLM(Model): f"Memory stats: {mem_stats}" ) - return MAX_BATCH_TOTAL_TOKENS + max_supported_total_tokens = MAX_BATCH_SIZE * MAX_TOTAL_TOKENS + max_input_tokens = max_input_length + max_total_tokens = MAX_TOTAL_TOKENS + + return max_supported_total_tokens, max_input_tokens, max_total_tokens diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 37b57d6f..4c6f0151 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -52,6 +52,8 @@ - sections: - local: backends/neuron title: Neuron + - local: backends/gaudi + title: Gaudi - local: backends/trtllm title: TensorRT-LLM - local: backends/llamacpp diff --git a/docs/source/backends/gaudi.mdx b/docs/source/backends/gaudi.mdx new file mode 100644 index 00000000..3a6f631c --- /dev/null +++ b/docs/source/backends/gaudi.mdx @@ -0,0 +1,303 @@ +# Gaudi Backend for Text Generation Inference + +## Overview +Text Generation Inference (TGI) has been optimized to run on Gaudi hardware via the Gaudi backend for TGI. 
+ +## Supported Hardware +- **Gaudi1**: Available on [AWS EC2 DL1 instances](https://aws.amazon.com/ec2/instance-types/dl1/) +- **Gaudi2**: Available on [Intel Cloud](https://console.cloud.intel.com/docs/reference/ai_instances.html) +- **Gaudi3**: Available on [Intel Cloud](https://console.cloud.intel.com/docs/reference/ai_instances.html) + +## Tutorial: Getting Started with TGI on Gaudi + +### Basic Usage +The easiest way to run TGI on Gaudi is to use the official Docker image: + +```bash +model=meta-llama/Meta-Llama-3.1-8B-Instruct +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run +hf_token=YOUR_HF_ACCESS_TOKEN + +docker run --runtime=habana --cap-add=sys_nice --ipc=host \ + -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ + ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + --model-id $model +``` + +Once you see the `connected` log, the server is ready to accept requests: +> 2024-05-22T19:31:48.302239Z INFO text_generation_router: router/src/main.rs:378: Connected + +You can find your `YOUR_HF_ACCESS_TOKEN` at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens). This is necessary to access gated models like llama3.1. + +### Making Your First Request +You can send a request from a separate terminal: + +```bash +curl 127.0.0.1:8080/generate \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":32}}' \ + -H 'Content-Type: application/json' +``` + +## How-to Guides + +### How to Run Specific Models + +The following models have been validated on Gaudi2: + +| Model | Model ID | BF16 | | FP8 | | +|-----------------------|----------------------------------------|-------------|------------|-------------|------------| +| | | Single Card | Multi-Card | Single Card | Multi-Card | +| Llama2-7B | meta-llama/Llama-2-7b-chat-hf | ✔ | ✔ | ✔ | ✔ | +| Llama2-70B | meta-llama/Llama-2-70b-chat-hf | | ✔ | | ✔ | +| Llama3-8B | meta-llama/Meta-Llama-3.1-8B-Instruct | ✔ | ✔ | ✔ | ✔ | +| Llama3-70B | meta-llama/Meta-Llama-3-70B-Instruct | | ✔ | | ✔ | +| Llama3.1-8B | meta-llama/Meta-Llama-3.1-8B-Instruct | ✔ | ✔ | ✔ | ✔ | +| Llama3.1-70B | meta-llama/Meta-Llama-3.1-70B-Instruct | | ✔ | | ✔ | +| CodeLlama-13B | codellama/CodeLlama-13b-hf | ✔ | ✔ | ✔ | ✔ | +| Mixtral-8x7B | mistralai/Mixtral-8x7B-Instruct-v0.1 | ✔ | ✔ | ✔ | ✔ | +| Mistral-7B | mistralai/Mistral-7B-Instruct-v0.3 | ✔ | ✔ | ✔ | ✔ | +| Falcon-180B | tiiuae/falcon-180B-chat | | ✔ | | ✔ | +| Qwen2-72B | Qwen/Qwen2-72B-Instruct | | ✔ | | ✔ | +| Starcoder2-3b | bigcode/starcoder2-3b | ✔ | ✔ | ✔ | | +| Starcoder2-15b | bigcode/starcoder2-15b | ✔ | ✔ | ✔ | | +| Starcoder | bigcode/starcoder | ✔ | ✔ | ✔ | ✔ | +| Gemma-7b | google/gemma-7b-it | ✔ | ✔ | ✔ | ✔ | +| Llava-v1.6-Mistral-7B | llava-hf/llava-v1.6-mistral-7b-hf | ✔ | ✔ | ✔ | ✔ | + +To run any of these models: + +```bash +model=MODEL_ID_THAT_YOU_WANT_TO_RUN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run +hf_token=YOUR_ACCESS_TOKEN + +docker run --runtime=habana --cap-add=sys_nice --ipc=host \ + -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ + ghcr.io/huggingface/text-generation-inference:latest-gaudi \ + --model-id $model + +``` + +For the full list of service parameters, refer to the [launcher-arguments page](https://huggingface.co/docs/text-generation-inference/reference/launcher). 
+
+The validated docker commands can be found in the [examples/docker_commands folder](https://github.com/huggingface/text-generation-inference/tree/main/backends/gaudi/examples/docker_commands).
+
+> Note: `--runtime=habana --cap-add=sys_nice --ipc=host` is required to enable docker to use the Gaudi hardware (more details [here](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html)).
+
+### How to Enable Multi-Card Inference (Sharding)
+
+TGI-Gaudi supports sharding for multi-card inference, allowing you to distribute the load across multiple Gaudi cards.
+
+For example, on a machine with 8 Gaudi cards, you can run:
+
+```bash
+docker run --runtime=habana --ipc=host --cap-add=sys_nice \
+    -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
+    tgi-gaudi \
+    --model-id $model --sharded true --num-shard 8
+```
+
+<Tip>
+We recommend always using sharding when running on a multi-card machine.
+</Tip>
+
+### How to Use Different Precision Formats
+
+#### BF16 Precision (Default)
+By default, all models run with BF16 precision on Gaudi hardware.
+
+#### FP8 Precision
+TGI-Gaudi supports FP8 precision inference with [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html).
+
+To run FP8 inference:
+
+1. Measure the statistics of the model using the [Optimum Habana measurement script](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation#running-with-fp8)
+2. Run the model in TGI with the `QUANT_CONFIG` setting, e.g. `-e QUANT_CONFIG=./quantization_config/maxabs_quant.json`.
+
+The following FP8 command example assumes that the measurement from step 1 has already been completed.
+
+Example for Llama3.1-70B on 8 cards with FP8 precision:
+
+```bash
+model=meta-llama/Meta-Llama-3.1-70B-Instruct
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+    --runtime=habana \
+    --cap-add=sys_nice \
+    --ipc=host \
+    -v $volume:/data \
+    -v $PWD/quantization_config:/usr/src/quantization_config \
+    -v $PWD/hqt_output:/usr/src/hqt_output \
+    -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
+    -e HF_TOKEN=$hf_token \
+    -e MAX_TOTAL_TOKENS=2048 \
+    -e BATCH_BUCKET_SIZE=256 \
+    -e PREFILL_BATCH_BUCKET_SIZE=4 \
+    -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
+    ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+    --model-id $model \
+    --sharded true --num-shard 8 \
+    --max-input-tokens 1024 --max-total-tokens 2048 \
+    --max-batch-prefill-tokens 4096 --max-batch-size 256 \
+    --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
+```
+
+### How to Run Vision-Language Models (VLMs)
+
+Gaudi supports vision-language model (VLM) inference.
+
+Example for Llava-v1.6-Mistral-7B on 1 card:
+
+Start the TGI server via the following command:
+```bash
+model=llava-hf/llava-v1.6-mistral-7b-hf
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+    --runtime=habana \
+    --cap-add=sys_nice \
+    --ipc=host \
+    -v $volume:/data \
+    -e PREFILL_BATCH_BUCKET_SIZE=1 \
+    -e BATCH_BUCKET_SIZE=1 \
+    ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+    --model-id $model \
+    --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
+    --max-total-tokens 8192 --max-batch-size 4
+```
+
+You can then send a request to the server via the following command:
+```bash
+curl -N 127.0.0.1:8080/generate \
+    -X POST \
+    -d '{"inputs":"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n","parameters":{"max_new_tokens":32}}' \
+    -H 'Content-Type: application/json'
+```
+
+> Note: In Llava-v1.6-Mistral-7B, an image usually accounts for 2000 input tokens. For example, an image of size 512x512 is represented by 2800 tokens. Thus, `max-input-tokens` must be larger than the number of tokens associated with the image. Otherwise, the image may be truncated. We set `BASE_IMAGE_TOKENS=2048` as the default image token value. This is the minimum value of `max-input-tokens`. You can override the environment variable `BASE_IMAGE_TOKENS` to change this value. The warmup will generate graphs with input lengths from `BASE_IMAGE_TOKENS` to `max-input-tokens`. For Llava-v1.6-Mistral-7B, `max-batch-prefill-tokens` is set to 16384, which follows from `max-batch-prefill-tokens` = `prefill_batch_size` * `max-input-tokens` (4 * 4096).
+
+### How to Benchmark Performance
+
+#### Option 1: Static Batching Benchmark (Recommended)
+To run the static batching benchmark, please refer to [TGI's benchmark tool](https://github.com/huggingface/text-generation-inference/tree/main/benchmark).
+
+To run it on the same machine, you can do the following:
+* `docker exec -it <container name> bash`, picking the TGI container started above via `docker ps`
+* `text-generation-benchmark -t <model-id>`, passing the same model-id as in the docker run command
+* After the tests complete, hit Ctrl+C to see the performance data summary.
+
+> Note: This benchmark runs the model with bs=[1, 2, 4, 8, 16, 32], sequence_length=10 and decode_length=8 by default. If you want to run other configurations, please check `text-generation-benchmark -h` and change the parameters.
+
+#### Option 2: Continuous Batching Benchmark
+To run the continuous batching benchmark, please refer to the [README in the examples/benchmark folder of the Gaudi backend](https://github.com/huggingface/text-generation-inference/tree/main/backends/gaudi/examples/benchmark). A minimal end-to-end sketch is shown below.
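+
+Assuming a TGI server is already running with the same model, the workflow roughly looks like this (a sketch based on the benchmark example in `backends/gaudi/examples/benchmark`; adjust the address, token, and model to your setup):
+
+```bash
+cd backends/gaudi/examples/benchmark
+pip install -r requirements.txt
+# Only needed for gated models such as Llama
+export HF_TOKEN=YOUR_ACCESS_TOKEN
+python run_generation.py \
+    --model_id meta-llama/Llama-2-7b-chat-hf \
+    --server_address http://localhost:8080
+```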
+
+### How to Profile Performance
+
+To collect a performance profile, set the following environment variables:
+
+| Name | Value(s) | Default | Description |
+|--------------------| :--------- | :--------------- | :------------------------------------------------------- |
+| PROF_WAITSTEP | integer | 0 | Control profile wait steps |
+| PROF_WARMUPSTEP | integer | 0 | Control profile warmup steps |
+| PROF_STEP | integer | 0 | Enable/disable profile, control profile active steps |
+| PROF_PATH | string | /tmp/hpu_profile | Define profile folder |
+| PROF_RANKS | string | 0 | Comma-separated list of ranks to profile |
+| PROF_RECORD_SHAPES | True/False | False | Control record_shapes option in the profiler |
+
+To use these environment variables, add them to your docker run command with the `-e` flag. For example:
+
+```bash
+docker run --runtime=habana --ipc=host --cap-add=sys_nice \
+    -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
+    -e PROF_WAITSTEP=10 \
+    -e PROF_WARMUPSTEP=10 \
+    -e PROF_STEP=1 \
+    -e PROF_PATH=/tmp/hpu_profile \
+    -e PROF_RANKS=0 \
+    -e PROF_RECORD_SHAPES=True \
+    ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+    --model-id $model
+```
+
+## Explanation: Understanding TGI on Gaudi
+
+### The Warmup Process
+
+To ensure optimal performance, warmup is performed at the beginning of each server run. This process creates queries with various input shapes based on the provided parameters and runs basic TGI operations (prefill, decode, concatenate).
+
+Note: Model warmup can take several minutes, especially for FP8 inference. For faster subsequent runs, refer to [Disk Caching Eviction Policy](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html#disk-caching-eviction-policy).
+
+### Understanding Parameter Tuning
+
+#### Sequence Length Parameters
+- `--max-input-tokens` is the maximum possible input prompt length. Default value is `4095`.
+- `--max-total-tokens` is the maximum possible total length of the sequence (input and output). Default value is `4096`.
+
+#### Batch Size Parameters
+- For the prefill operation, set `--max-batch-prefill-tokens` to `bs * max-input-tokens`, where `bs` is your expected maximum prefill batch size.
+- For the decode operation, set `--max-batch-size` to `bs`, where `bs` is your expected maximum decode batch size.
+- Please note that batch sizes will always be padded to the nearest multiple of `BATCH_BUCKET_SIZE` and `PREFILL_BATCH_BUCKET_SIZE` (a combined example is given at the end of this section).
+
+#### Performance and Memory Parameters
+- `PAD_SEQUENCE_TO_MULTIPLE_OF` determines the sizes of the input length buckets. Since warmup creates several graphs for each bucket, it is important to adjust that value proportionally to the input sequence length; otherwise, out-of-memory issues can be observed.
+- `ENABLE_HPU_GRAPH` enables HPU graphs, which are crucial for performance. The recommended value is `true`.
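+
+For example, for an expected prefill batch size of 2 and decode batch size of 32 with 1024 input / 2048 total tokens (the values used in the validated Llama2-7B example; illustrative, not a tuned recommendation, and assuming `$model`, `$volume`, and `$hf_token` are set as in the earlier examples), the flags combine as follows:
+
+```bash
+# max-batch-prefill-tokens = 2 (prefill bs) * 1024 (max-input-tokens) = 2048
+# max-batch-size           = 32 (decode bs)
+docker run -p 8080:80 --runtime=habana --cap-add=sys_nice --ipc=host \
+    -v $volume:/data -e HF_TOKEN=$hf_token \
+    -e MAX_TOTAL_TOKENS=2048 \
+    -e PREFILL_BATCH_BUCKET_SIZE=2 \
+    -e BATCH_BUCKET_SIZE=32 \
+    -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
+    ghcr.io/huggingface/text-generation-inference:latest-gaudi \
+    --model-id $model \
+    --max-input-tokens 1024 --max-total-tokens 2048 \
+    --max-batch-prefill-tokens 2048 --max-batch-size 32
+```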
+
+## Reference
+
+This section contains reference information about the Gaudi backend.
+
+### Environment Variables
+
+The following table contains the environment variables that can be used to configure the Gaudi backend:
+
+| Name | Value(s) | Default | Description | Usage |
+|-----------------------------| :--------- | :--------------- | :---------------------------------------------------------------------------------------------------------------------------- | :--------------------------- |
+| ENABLE_HPU_GRAPH | True/False | True | Enable HPU graphs or not | add -e in docker run command |
+| LIMIT_HPU_GRAPH | True/False | True | Skip HPU graph usage for prefill to save memory, set to `True` for large sequence/decoding lengths (e.g. 300/212) | add -e in docker run command |
+| BATCH_BUCKET_SIZE | integer | 8 | Batch size for the decode operation will be rounded to the nearest multiple of this number. This limits the number of cached graphs | add -e in docker run command |
+| PREFILL_BATCH_BUCKET_SIZE | integer | 4 | Batch size for the prefill operation will be rounded to the nearest multiple of this number. This limits the number of cached graphs | add -e in docker run command |
+| PAD_SEQUENCE_TO_MULTIPLE_OF | integer | 128 | For the prefill operation, sequences will be padded to a multiple of the provided value. | add -e in docker run command |
+| SKIP_TOKENIZER_IN_TGI | True/False | False | Skip the tokenizer for input/output processing | add -e in docker run command |
+| WARMUP_ENABLED | True/False | True | Enable warmup during server initialization to recompile all graphs. This can increase TGI setup time. | add -e in docker run command |
+| QUEUE_THRESHOLD_MS | integer | 120 | Controls the threshold beyond which requests are considered overdue and handled with priority. Shorter requests are prioritized otherwise. | add -e in docker run command |
+| USE_FLASH_ATTENTION | True/False | True | Whether to enable Habana Flash Attention, provided that the model supports it. Please refer to https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html?highlight=fusedsdpa#using-fused-scaled-dot-product-attention-fusedsdpa | add -e in docker run command |
+| FLASH_ATTENTION_RECOMPUTE | True/False | True | Whether to enable Habana Flash Attention in recompute mode on first token generation. | add -e in docker run command |
+
+## Contributing
+
+Contributions to the TGI-Gaudi project are welcome. Please refer to the [contributing guide](https://github.com/huggingface/text-generation-inference/blob/main/CONTRIBUTING.md).
+
+### Building the Docker Image from Source
+
+To build the Docker image from source:
+
+```bash
+make -C backends/gaudi image
+```
+
+This builds the image and saves it as `tgi-gaudi`. You can then run TGI-Gaudi with this image:
+
+```bash
+model=meta-llama/Meta-Llama-3.1-8B-Instruct
+volume=$PWD/data
+hf_token=YOUR_ACCESS_TOKEN
+
+docker run --runtime=habana --ipc=host --cap-add=sys_nice \
+    -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
+    tgi-gaudi \
+    --model-id $model
+```
+
+For more details, see the [README of the Gaudi backend](https://github.com/huggingface/text-generation-inference/blob/main/backends/gaudi/README.md) and the [Makefile of the Gaudi backend](https://github.com/huggingface/text-generation-inference/blob/main/backends/gaudi/Makefile).
diff --git a/docs/source/installation_gaudi.md b/docs/source/installation_gaudi.md
index 1ddf2b47..51aa667d 100644
--- a/docs/source/installation_gaudi.md
+++ b/docs/source/installation_gaudi.md
@@ -1,3 +1,3 @@
 # Using TGI with Intel Gaudi
 
-Check out this [repository](https://github.com/huggingface/tgi-gaudi) to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index).
+You can use TGI on Intel Gaudi through the [TGI Gaudi backend](https://huggingface.co/docs/text-generation-inference/backends/gaudi).