feat: Add load tests

This commit is contained in:
Hugo Larcher 2024-07-11 11:45:24 +02:00
parent 8511669cb2
commit 8d358d9c61
No known key found for this signature in database
GPG Key ID: 3DAF63124699CA2B
20 changed files with 2617 additions and 166 deletions

View File

@@ -43,3 +43,5 @@ jobs:
# https://github.com/actions/runner/issues/2206
release-tests: ${{ inputs.release-tests == true }}
secrets: inherit
load_tests:
uses: ./.github/workflows/load_test.yaml

View File

@@ -3,6 +3,7 @@ name: Nightly load test
on:
schedule:
- cron: '0 0 * * 1-5'
workflow_call:
pull_request:
paths:
@@ -10,33 +11,92 @@ on:
branches:
- 'main'
env:
AWS_DEFAULT_REGION: us-east-1
AWS_ACCESS_KEY_ID: ${{ secrets.S3_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_AWS_SECRET_ACCESS_KEY }}
jobs:
load-tests:
concurrency:
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
runs-on: [self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci]
runs-on: [ self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci ]
env:
DOCKER_VOLUME: /cache
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Install k6
- name: Install awscli
run: |
curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1
sudo apt-get update
sudo apt-get install -y awscli
- name: Start starcoder
- name: Install poetry
run: |
docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HF_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
sleep 10
wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
curl -sSL https://install.python-poetry.org | python3 -
export PATH="$HOME/.local/bin:$PATH"
poetry --version
- name: Run k6
- name: Install texlive minimal
run: |
./k6 run load_tests/starcoder_load.js
sudo apt-get update
sudo apt-get install -y texlive-latex-extra texlive-fonts-recommended dvipng cm-super
- name: Stop starcoder
if: ${{ always() }}
- name: Install Go 1.21
uses: actions/setup-go@v2
with:
go-version: 1.21
- name: Install Python 3.11
uses: actions/setup-python@v2
with:
python-version: 3.11
- name: Download artifacts from previous runs
uses: actions/github-script@v6
continue-on-error: true
env:
WORKFLOW_FILENAME: load_test.yaml
ARTIFACT_NAME: benchmark_results_csv
ARTIFACT_FILENAME: benchmark_results_csv.zip
UNZIP_DIR: /tmp/artifacts
with:
script: |
const script = require('./load_tests/download_artifact.js')
await script({github, context, core})
- name: Run load test
run: |
docker stop tgi-starcoder || true
cd load_tests
make load-test
shell: bash
- name: Archive test results artifacts
uses: actions/upload-artifact@v4
with:
name: benchmark_results_plots
path: |
load_tests/output/*
- name: Upload to S3
run: |
aws s3 cp load_tests/output/ s3://text-generation-inference-ci/${{ github.sha }} --recursive
- uses: actions/github-script@v6
if: github.event_name == 'pull_request'
with:
script: |
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: '🚀 Load test results are in:\n\n'+
'## Variable length prompts\n'+
'<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/sharegpt_conversations_constant_arrival_rate.png" width=200>\n' +
'<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/sharegpt_conversations_constant_vus.png" width=200>\n\n' +
'## Constant length prompts\n'+
'<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/constant_tokens_constant_vus.png" width=200>\n' +
'<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/constant_tokens_constant_arrival_rate.png" width=200>\n'
})

View File

@@ -1,9 +1,24 @@
.PHONY: download-dataset load-test build-k6
download-dataset:
@if [ ! -f ./benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json ]; then \
echo "Downloading dataset"; \
curl -L -o benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json; \
else \
echo "Dataset already downloaded"; \
fi
ShareGPT_V3_unfiltered_cleaned_split.json:
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
load-test: download-dataset build-k6
poetry install && poetry run python load_test.py
prepare_share: ShareGPT_V3_unfiltered_cleaned_split.json
python filter.py
prepare_orca:
python orca.py
build-k6:
mkdir -p /tmp/xk6 && \
cd /tmp/xk6 && \
git clone https://github.com/mstoykov/xk6-sse.git && \
cd xk6-sse && \
git checkout useSobek && \
go install go.k6.io/xk6/cmd/xk6@latest && \
xk6 build --with github.com/phymbert/xk6-sse=. && \
mkdir -p ~/.local/bin/ && \
mv k6 /tmp/k6-sse && \
rm -rf /tmp/xk6 && \
/tmp/k6-sse --version

View File

View File

@@ -0,0 +1,151 @@
import subprocess
import threading
from typing import Dict, List
import docker
from docker.models.containers import Container
from loguru import logger
from benchmarks.utils import kill
class InferenceEngineRunner:
def __init__(self, model: str):
self.model = model
def run(self, parameters: list[tuple]):
raise NotImplementedError("This method should be implemented by the subclass")
def stop(self):
raise NotImplementedError("This method should be implemented by the subclass")
class TGIRunner(InferenceEngineRunner):
def __init__(self, model: str):
super().__init__(model)
self.process = None
self.thread = None
self.model = model
def run(self, parameters: list[tuple]):
params = ""
for p in parameters:
params += f" --{p[0]} {str(p[1])}"
# start a TGI subprocess with the given parameter
args = f"text-generation-launcher --port 8080 --model-id {self.model} --huggingface-hub-cache /scratch {params}"
logger.info(f"Running TGI with parameters: {args}")
self.process = subprocess.Popen(args,
stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
for line in iter(self.process.stdout.readline, b""):
print(line.decode("utf-8"))
# wait for TGI to listen to the port
if b"Connected" in line:
break
if b"Error" in line:
raise Exception(f"Error starting TGI: {line}")
# continue to stream the logs in a thread
def stream_logs():
for line in iter(self.process.stdout.readline, b""):
print(line.decode("utf-8"))
if self.process.returncode is not None:
raise Exception("Error starting TGI")
self.thread = threading.Thread(target=stream_logs)
self.thread.start()
def stop(self):
logger.warning(f"Killing TGI with PID {self.process.pid}")
if self.process:
kill(self.process.pid)
if self.thread:
self.thread.join()
class TGIDockerRunner(InferenceEngineRunner):
def __init__(self,
model: str,
image: str = "ghcr.io/huggingface/text-generation-inference:latest",
volumes=None):
super().__init__(model)
if volumes is None:
volumes = []
self.container = None
self.image = image
self.volumes = volumes
def run(self, parameters: list[tuple]):
params = f"--model-id {self.model} --port 8080"
for p in parameters:
params += f" --{p[0]} {str(p[1])}"
logger.info(f"Running TGI with parameters: {params}")
volumes = {}
for v in self.volumes:
volumes[v[0]] = {"bind": v[1], "mode": "rw"}
self.container = run_docker(self.image, params,
"Connected",
"Error",
volumes=volumes)
def stop(self):
if self.container:
self.container.stop()
class VLLMDockerRunner(InferenceEngineRunner):
def __init__(self,
model: str,
image: str = "vllm/vllm-openai:latest",
volumes=None):
super().__init__(model)
if volumes is None:
volumes = []
self.container = None
self.image = image
self.volumes = volumes
def run(self, parameters: list[tuple]):
parameters.append(("max-num-seqs", "256"))
params = f"--model {self.model} --tensor-parallel-size {get_num_gpus()} --port 8080"
for p in parameters:
params += f" --{p[0]} {str(p[1])}"
logger.info(f"Running VLLM with parameters: {params}")
volumes = {}
for v in self.volumes:
volumes[v[0]] = {"bind": v[1], "mode": "rw"}
self.container = run_docker(self.image, params, "Uvicorn running",
"Error ",
volumes=volumes)
def stop(self):
if self.container:
self.container.stop()
def run_docker(image: str, args: str, success_sentinel: str,
error_sentinel: str, volumes=None) -> Container:
if volumes is None:
volumes = {}
client = docker.from_env()
# request one device id per GPU reported by nvidia-smi
devices = [f"{i}" for i in
range(get_num_gpus())]
container = client.containers.run(image, args,
detach=True,
device_requests=[
docker.types.DeviceRequest(device_ids=devices, capabilities=[['gpu']])
],
volumes=volumes,
shm_size="1g",
ports={"8080/tcp": 8080})
for line in container.logs(stream=True):
print(line.decode("utf-8"), end="")
if success_sentinel.encode("utf-8") in line:
break
if error_sentinel.encode("utf-8") in line:
container.stop()
raise Exception(f"Error starting container: {line}")
return container
def get_num_gpus() -> int:
return len(subprocess.run(["nvidia-smi", "-L"], capture_output=True).stdout.splitlines())
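
For reference, a minimal usage sketch of the runners above (illustrative only; the model id and --max-concurrent-requests value mirror load_tests/load_test.py further down, and parameters are passed as (flag, value) tuples):

from benchmarks.engine import TGIDockerRunner

# hypothetical driver, mirroring load_tests/load_test.py
runner = TGIDockerRunner("Qwen/Qwen2-7B")
# each tuple is rendered as "--<flag> <value>" for the launcher
runner.run([("max-concurrent-requests", 8000)])
# ... benchmark against http://localhost:8080/v1/chat/completions ...
runner.stop()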

233 load_tests/benchmarks/k6.py Normal file
View File

@@ -0,0 +1,233 @@
import json
import os
import subprocess
import tempfile
from enum import Enum
from typing import Any, Dict, List
import numpy as np
from jinja2 import Environment, PackageLoader, select_autoescape
from loguru import logger
from transformers import LlamaTokenizerFast
from benchmarks.utils import kill
env = Environment(
loader=PackageLoader("benchmarks"),
autoescape=select_autoescape()
)
class ExecutorInputType(Enum):
CONSTANT_TOKENS = "constant_tokens"
SHAREGPT_CONVERSATIONS = "sharegpt_conversations"
class K6Executor:
def __init__(self, name, template_name, executor_input_type=ExecutorInputType.SHAREGPT_CONVERSATIONS):
self.template_name = template_name
self.variables = {}
self.rendered_file = None
self.name = name
self.executor_input_type = executor_input_type
if executor_input_type == ExecutorInputType.CONSTANT_TOKENS:
self.input_filename = "inputs_constant_tokens.json"
elif executor_input_type == ExecutorInputType.SHAREGPT_CONVERSATIONS:
self.input_filename = "inputs_variable_tokens.json"
def render(self):
template = env.get_template(self.template_name)
_, path = tempfile.mkstemp("k6", "benchmark")
cwd = os.getcwd()
with open(path, "w") as f:
f.write(template.render(cwd=cwd, input_filename=self.input_filename, **self.variables))
self.rendered_file = path
def __str__(self):
# returns an underscore separated string of the variables for filename generation
params = "_".join([f"{k}_{v}" for k, v in sorted(self.variables.items()) if type(v) == str or type(v) == int])
return f"{self.executor_input_type.value}_{params}"
class K6ConstantArrivalRateExecutor(K6Executor):
def __init__(self, pre_allocated_vus: int, rate_per_second: int, duration: str,
executor_input_type: ExecutorInputType):
super().__init__("constant_arrival_rate", "k6_constant_arrival_rate.js.j2", executor_input_type)
self.variables = {
"pre_allocated_vus": pre_allocated_vus, # it's also the max vus
"rate": rate_per_second,
"duration": duration
}
class K6RampingArrivalRateExecutor(K6Executor):
def __init__(self, pre_allocated_vus: int, start_rate: int, time_unit: str, stages: List[Dict[str, Any]],
executor_input_type: ExecutorInputType):
super().__init__("ramping_arrival_rate", "k6_ramping_arrival_rate.js.j2", executor_input_type)
self.variables = {
"pre_allocated_vus": pre_allocated_vus,
"start_rate": start_rate,
"time_unit": time_unit,
"stages": stages
}
class K6ConstantVUsExecutor(K6Executor):
def __init__(self, vus: int, duration: str, executor_input_type: ExecutorInputType):
super().__init__("constant_vus", "k6_constant_vus.js.j2", executor_input_type)
self.variables = {
"vus": vus,
"duration": duration
}
class K6Config:
def __init__(self, name: str, executor: K6Executor,
tokenizer=LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer"),
conversations_input_file=None,
input_num_tokens=200,
max_new_tokens=200,
extra_info=None
):
self.executor = executor
# max_new_token will be set in k6 template
self.executor.variables["max_new_tokens"] = max_new_tokens
self.name = name
self.tokenizer = tokenizer
self.extra_info = extra_info
if conversations_input_file is None:
self.conversation_input_file = "benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json"
else:
self.conversation_input_file = conversations_input_file
self.input_num_tokens = input_num_tokens
def __str__(self):
return f"K6Config(name={self.name} executor={self.executor})"
class K6Benchmark:
def __init__(self, k6_config: K6Config, output_dir: str):
self.process = None
self.k6_config = k6_config
self.output_dir = output_dir
self.input_tokens_len = k6_config.input_num_tokens
self._prepare_inputs()
def _prepare_inputs(self):
get_tokens_count = lambda txt: len(self.k6_config.tokenizer.encode(txt))
MAX_SAMPLES = 5000
# create a first input file with a constant number of tokens
# check if the file already exists
if not os.path.exists("inputs_constant_tokens.json"):
logger.info(f'Preparing input file with {self.input_tokens_len} input tokens')
outputs = []
with open(self.k6_config.conversation_input_file, "r") as f:
data = json.load(f)
for doc in data:
for conversation in doc["conversations"]:
if not conversation["from"] == "human":
continue
if get_tokens_count(conversation["value"]) < self.input_tokens_len:
continue
# encode the message
encoding = self.k6_config.tokenizer(conversation["value"], truncation=True,
max_length=self.input_tokens_len)
# find last encoded characters
span = encoding.token_to_chars(len(encoding["input_ids"]) - 1)
outputs.append(
{"message": conversation["value"][0:span.end], "num_tokens": len(encoding["input_ids"])})
if len(outputs) >= MAX_SAMPLES: # limit the number of inputs
break
with open("inputs_constant_tokens.json", "w") as f:
f.write(json.dumps(outputs))
# create a second input file with a sampling of inputs
# check if the file already exists
if not os.path.exists("inputs_variable_tokens.json"):
logger.info(
f'Preparing input file by randomly sampling shareGPT conversations at "{self.k6_config.conversation_input_file}"')
outputs = []
with open(self.k6_config.conversation_input_file, "r") as f:
data = json.load(f)
num_docs = len(data)
# generate random indexes to sample the data
indexes = np.random.choice(num_docs, 200, replace=False)
for i in indexes:
doc = data[i]
for conversation in doc["conversations"]:
if not conversation["from"] == "human":
continue
# encode the message without truncation
encoding = self.k6_config.tokenizer(conversation["value"])
outputs.append(
{"message": conversation["value"], "num_tokens": len(encoding["input_ids"])})
if len(outputs) >= MAX_SAMPLES: # limit the number of inputs
break
with open("inputs_variable_tokens.json", "w") as f:
f.write(json.dumps(outputs))
def run(self):
self.k6_config.executor.render()
args = f"/tmp/k6-sse run --out json=results.json {self.k6_config.executor.rendered_file}"
logger.info(f"Running k6 with parameters: {args}")
logger.info(f"K6Config is: {self.k6_config}")
# start a k6 subprocess
self.process = subprocess.Popen(args,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
while buffer := os.read(self.process.stdout.fileno(),
2048): # read the output of the process, don't buffer on new lines
print(buffer.decode(), end='')
self.process.wait()
logger.info(f"K6 process finished with return code {self.process.returncode}")
logger.info(f"Writing results to {self.get_results_path()}")
self.add_config_to_summary()
self.add_config_to_results()
def stop(self):
if self.process:
kill(self.process.pid)
def add_config_to_summary(self):
with open("summary.json", "r") as f:
summary = json.load(f)
summary["k6_config"] = {
"name": self.k6_config.name,
"input_type": self.k6_config.executor.executor_input_type.value,
"extra_info": self.k6_config.extra_info,
**self.k6_config.executor.variables
}
# create directory if it doesn't exist
os.makedirs(self._get_output_dir(), exist_ok=True)
with open(self.get_summary_path(), "w") as f2:
json.dump(summary, f2)
def add_config_to_results(self):
with open("results.json", "r") as f:
results = f.readlines()
# append the k6 config to the results in jsonlines format
results.append("\n")
results.append(json.dumps({
"name": self.k6_config.name,
"input_type": self.k6_config.executor.executor_input_type.value,
"extra_info": self.k6_config.extra_info,
**self.k6_config.executor.variables
}))
# create directory if it doesn't exist
os.makedirs(self._get_output_dir(), exist_ok=True)
with open(self.get_results_path(), "w") as f2:
f2.writelines(results)
def _get_output_dir(self):
# check if output_dir is relative or absolute
if self.output_dir.startswith("/"):
return f"{self.output_dir}/{self.k6_config.executor.name}"
else:
return f"{os.getcwd()}/{self.output_dir}/{self.k6_config.executor.name}"
def _get_output_path(self):
return f"{self._get_output_dir()}/{self.k6_config.name}_{self.k6_config.executor}"
def get_results_path(self):
return f"{self._get_output_path()}.json"
def get_summary_path(self):
return f"{self._get_output_path()}.summary.json"

View File

@@ -0,0 +1,16 @@
{% include 'main.js.j2' %}
export function get_options() {
return {
scenarios: {
load_test: {
executor: 'constant-arrival-rate',
gracefulStop: '0s',
duration: '{{ duration }}',
preAllocatedVUs: {{ pre_allocated_vus }},
rate: {{ rate }},
timeUnit: '1s',
},
},
};
}

View File

@@ -0,0 +1,14 @@
{% include 'main.js.j2' %}
export function get_options() {
return {
scenarios: {
load_test: {
executor: 'constant-vus',
gracefulStop: '0s',
duration: '{{ duration }}',
vus: {{ vus }},
},
},
};
}

View File

@@ -0,0 +1,20 @@
{% include 'main.js.j2' %}
export function get_options() {
return {
scenarios: {
load_test: {
executor: 'ramping-arrival-rate',
gracefulStop: '0s',
preAllocatedVUs: {{ pre_allocated_vus }},
timeUnit: '{{ time_unit }}',
startRate: {{ start_rate }},
stages: [
{%- for stage in stages %}
{target: {{ stage.target }}, duration: '{{ stage.duration }}'},
{%- endfor %}
],
},
},
};
}

View File

@@ -0,0 +1,131 @@
import {check, fail} from 'k6';
import sse from "k6/x/sse"
import {scenario} from 'k6/execution';
import http from 'k6/http';
import {Trend, Counter} from 'k6/metrics';
const host = "127.0.0.1:8080";
const model_id = "Qwen/Qwen2-72B";
const endToEndLatency = new Trend('end_to_end_latency', true);
const requestThroughput = new Counter('request_throughput');
const tokenThroughput = new Counter('tokens_throughput');
const timeToFirstToken = new Trend('time_to_first_token', true);
const interTokenLatency = new Trend('inter_token_latency', true); // in microseconds
const tokensReceived = new Trend('tokens_received');
const max_new_tokens = {{ max_new_tokens }};
const shareGPT = JSON.parse(open("{{ cwd }}/{{ input_filename }}"))
export function handleSummary(data) {
return {
'summary.json': JSON.stringify(data),
};
}
function generate_payload(gpt, max_new_tokens) {
let input = gpt["message"];
return {
"messages": [{"role": "user", "content": input}],
"temperature": 0,
"model": `${model_id}`,
"max_tokens": max_new_tokens,
"stream": true
};
}
export const options = get_options();
export default function run() {
const headers = {'Content-Type': 'application/json'};
const query = shareGPT[scenario.iterationInTest % shareGPT.length];
const payload = JSON.stringify(generate_payload(query, max_new_tokens));
const url = `http://${host}/v1/chat/completions`;
const params = {
method: 'POST',
body: payload,
headers
}
const startTime = Date.now();
let firstTokenTime = null;
let lastTokenTime = null;
let tokensCount = 0;
let response = ""
const res = sse.open(url, params, function (client) {
client.on('event', function (event) {
// console.log(event.data)
if (parseInt(event.id) === 4) {
client.close()
}
if (event.data.includes("[DONE]") || event.data === "") {
return
}
try {
const data = JSON.parse(event.data);
if (!('choices' in data)) {
fail('http_200')
return;
}
const content = data['choices'][0]['delta']['content']
if (content !== undefined) {
response += data['choices'][0]['delta']['content']
tokensCount += 1;
}
// Measure time to first token
if (!firstTokenTime) {
firstTokenTime = Date.now();
timeToFirstToken.add(firstTokenTime - startTime);
}
// Measure inter-token latency
const currentTime = Date.now();
if (lastTokenTime) {
interTokenLatency.add((currentTime - lastTokenTime) * 1000.);
}
lastTokenTime = currentTime;
if ('finish_reason' in data['choices'][0]) {
if (data['choices'][0]['finish_reason'] != null) {
const endTime = Date.now();
const deltaMs = endTime - startTime;
endToEndLatency.add(deltaMs)
requestThroughput.add(1);
tokenThroughput.add(tokensCount);
tokensReceived.add(tokensCount);
}
}
} catch (e) {
// catch any errors that occur during the event processing
// increase the fail count of the 'http_200' check
check(true, {
'http_200': (val) => false,
})
fail('http_200')
}
})
client.on('error', function (e) {
console.log('An unexpected error occurred: ', e.error())
})
})
if (tokensCount === 0) {
// something went wrong with generation
fail('http_200')
}
if (res.status >= 400 && res.status < 500) {
return;
}
check(res, {
'http_200': (res) => res.status === 200,
});
}

View File

@@ -0,0 +1,31 @@
import os
from unittest import TestCase
from benchmarks.k6 import K6RampingArrivalRateExecutor, K6Config, K6ConstantVUsExecutor, K6Benchmark, ExecutorInputType
class K6RampingArrivalRateExecutorTest(TestCase):
def test_render(self):
executor = K6RampingArrivalRateExecutor(
100,
1,
"1s",
[
{"target": 1, "duration": "30s"},
{"target": 100, "duration": "30s"}
],
ExecutorInputType.SHAREGPT_CONVERSATIONS)
executor.render()
self.assertIsNotNone(executor.rendered_file)
with open(executor.rendered_file, "r") as f:
content = f.read()
self.assertTrue("stages: [" in content)
self.assertTrue("target: 1, duration: '30s'" in content)
self.assertTrue(os.getcwd() in content)
class K6BenchmarkTest(TestCase):
def test_prepare_inputs(self):
executor = K6ConstantVUsExecutor(1, '1m', ExecutorInputType.SHAREGPT_CONVERSATIONS)
config = K6Config("test", executor, input_num_tokens=500)
bench = K6Benchmark(config, "output")

View File

@@ -0,0 +1,25 @@
from itertools import chain
import psutil
class SweepParameter:
def __init__(self, name, start, end, step):
self.name = name
self.start = start
self.end = end
self.step = step
def __str__(self):
return f"{self.name} from {self.start} to {self.end} in steps of {self.step}"
def __iter__(self):
for value in chain(range(self.start, self.end, self.step), [self.end]):
yield value
def kill(proc_pid):
process = psutil.Process(proc_pid)
for proc in process.children(recursive=True):
proc.kill()
process.kill()
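
A quick illustration of how SweepParameter iterates (illustrative sketch; assumes the module is importable as benchmarks.utils): the end value is always yielded last, even when the step overshoots it.

from benchmarks.utils import SweepParameter

# sweep a hypothetical flag from 8 to 100 in steps of 32; range() stops at 72,
# so __iter__ chains the end value (100) on explicitly
sweep = SweepParameter("max-concurrent-requests", 8, 100, 32)
print(sweep)        # max-concurrent-requests from 8 to 100 in steps of 32
print(list(sweep))  # [8, 40, 72, 100]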

View File

@@ -1,94 +0,0 @@
import { check } from 'k6';
import { scenario } from 'k6/execution';
import http from 'k6/http';
import { Trend, Counter } from 'k6/metrics';
const host = __ENV.HOST;
const model_id = __ENV.MODEL_ID;
const timePerToken = new Trend('time_per_token', true);
const tokens = new Counter('tokens');
const new_tokens = new Counter('new_tokens');
const input_tokens = new Counter('input_tokens');
const max_new_tokens = 50;
// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
const shareGPT = JSON.parse(open("small.json"))
export function get_options() {
return {
thresholds: {
http_req_failed: ['rate==0'],
// time_per_token: [{
// threshold: `p(50)<${5 * reference_latency_ms}`,
// abortOnFail: true,
// delayAbortEval: '10s'
// }],
},
scenarios: {
// single_user: {
// executor: 'constant-arrival-rate',
// duration: '60s',
// preAllocatedVUs: 1,
// rate: 20,
// timeUnit: '1s',
// },
load_test: {
executor: 'constant-arrival-rate',
duration: '60s',
preAllocatedVUs: 100,
rate: 1,
timeUnit: '1s',
},
// breakpoint: {
// executor: 'ramping-arrival-rate', //Assure load increase if the system slows
// preAllocatedVUs: 300,
// stages: [
// { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
// ],
// },
// throughput: {
// executor: 'shared-iterations',
// vus: 100,
// iterations: 200,
// maxDuration: '40s',
// },
},
};
}
function generate_payload(gpt, max_new_tokens) {
const input = gpt["conversations"][0]["value"];
return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
}
export const options = get_options();
export default function run() {
const headers = { 'Content-Type': 'application/json' };
const query = shareGPT[scenario.iterationInTest % shareGPT.length];
const payload = JSON.stringify(generate_payload(query, max_new_tokens));
const res = http.post(`http://${host}/v1/chat/completions`, payload, {
headers,
});
if (res.status >= 400 && res.status < 500) {
return;
}
check(res, {
'Post status is 200': (res) => res.status === 200,
});
const duration = res.timings.duration;
if (res.status === 200) {
const body = res.json();
const completion_tokens = body.usage.completion_tokens;
const latency_ms_per_token = duration / completion_tokens;
timePerToken.add(latency_ms_per_token);
const prompt_tokens = body.usage.prompt_tokens;
input_tokens.add(prompt_tokens);
new_tokens.add(completion_tokens);
tokens.add(completion_tokens + prompt_tokens);
}
}

View File

@@ -0,0 +1,98 @@
module.exports = async ({
github,
context,
core
}) => {
const owner = context.repo.owner;
const repo = context.repo.repo;
const workflows = await github.rest.actions.listRepoWorkflows({
owner,
repo
})
const workflow = workflows.data.workflows.find(w => w.path.includes(process.env.WORKFLOW_FILENAME));
if (!workflow) {
core.setFailed("No workflow found");
return;
}
const runs = await github.rest.actions.listWorkflowRuns({
owner,
repo,
workflow_id: workflow.id,
status: "success",
per_page: 1
})
if (runs.data.total_count === 0) {
core.setFailed("No runs found");
return;
}
const lastRelease = await github.rest.repos.getLatestRelease({
owner,
repo
});
const lastReleaseTag = lastRelease.data.tag_name;
const tagRef = `tags/${lastReleaseTag}`;
const lastReleaseCommit = await github.rest.git.getRef({
owner,
repo,
ref: tagRef
});
const lastReleaseSha = lastReleaseCommit.data.object.sha;
const lastReleaseRun = await github.rest.actions.listWorkflowRuns({
owner,
repo,
workflow_id: workflow.id,
head_sha: lastReleaseSha,
status: "success",
per_page: 1
});
let lastReleaseArtifacts = {data: {artifacts: []}};
if (lastReleaseRun.data.total_count > 0) {
lastReleaseArtifacts = await github.rest.actions.listWorkflowRunArtifacts({
owner,
repo,
run_id: lastReleaseRun.data.workflow_runs[0].id
});
}
const lastArtifacts = await github.rest.actions.listWorkflowRunArtifacts({
owner,
repo,
run_id: runs.data.workflow_runs[0].id
});
const lastReleaseArtifact = lastReleaseArtifacts.data.artifacts.find(artifact => artifact.name === process.env.ARTIFACT_NAME);
const lastArtifact = lastArtifacts.data.artifacts.find(artifact => artifact.name === process.env.ARTIFACT_NAME);
if (lastReleaseArtifact) {
await downloadArtifact(github, owner, repo, lastReleaseArtifact, lastReleaseTag);
} else {
console.log("No release artifact found")
}
if (lastArtifact) {
await downloadArtifact(github, owner, repo, lastArtifact, lastArtifact.workflow_run.head_sha);
} else {
console.log("No last run artifact found")
}
}
async function downloadArtifact(github, owner, repo, artifact, suffix) {
const response = await github.rest.actions.downloadArtifact({
owner,
repo,
artifact_id: artifact.id,
archive_format: 'zip'
});
require('fs').writeFileSync(process.env.ARTIFACT_FILENAME, Buffer.from(response.data));
// create directory to unzip
require('fs').mkdirSync(`${process.env.UNZIP_DIR}/${artifact.workflow_run.head_sha}`, {recursive: true});
require('child_process').execSync(`unzip -o ${process.env.ARTIFACT_FILENAME} -d ${process.env.UNZIP_DIR}/${suffix}`);
console.log(`Artifact ${process.env.ARTIFACT_FILENAME} for ${suffix} downloaded successfully`);
}

View File

@@ -1,26 +0,0 @@
import json
def main():
with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f:
data = json.load(f)
# Select only the first 2k conversations that start with a human.
max = 2000
conversations = []
for conversation in data:
conv = conversation.get("conversations")
if conv and conv[0]["from"] == "human":
# Trim the rest of the output
conversation["conversations"] = conversation["conversations"][:1]
conversations.append(conversation)
if len(conversation) >= max:
break
with open("./small.json", "w") as f:
data = json.dump(conversations, f, indent=4)
if __name__ == "__main__":
main()

103 load_tests/load_test.py Normal file
View File

@@ -0,0 +1,103 @@
import os
import time
import traceback
from benchmarks.engine import TGIDockerRunner
from benchmarks.k6 import K6Config, K6Benchmark, K6ConstantArrivalRateExecutor, K6ConstantVUsExecutor, ExecutorInputType
from loguru import logger
import pandas as pd
import GPUtil
from parse_load_test import TestType, parse_json_files, plot_metrics
def run_full_test(engine_name: str):
vus_concurrences = list(range(0, 1024, 40))
vus_concurrences[0] = 1
vus_concurrences.append(1024)
arrival_rates = list(range(0, 200, 10))
arrival_rates[0] = 1
arrival_rates.append(200)
for input_type in [ExecutorInputType.SHAREGPT_CONVERSATIONS, ExecutorInputType.CONSTANT_TOKENS]:
for c in arrival_rates:
logger.info(f'Running k6 with constant arrival rate for {c} req/s with input type {input_type.value}')
k6_executor = K6ConstantArrivalRateExecutor(2000, c, '60s', input_type)
k6_config = K6Config(f'{engine_name}', k6_executor, input_num_tokens=200)
benchmark = K6Benchmark(k6_config, f'results/{input_type.value}/')
benchmark.run()
for c in vus_concurrences:
logger.info(f'Running k6 with constant VUs with concurrency {c} with input type {input_type.value}')
k6_executor = K6ConstantVUsExecutor(c, '60s', input_type)
k6_config = K6Config(f'{engine_name}', k6_executor, input_num_tokens=200)
benchmark = K6Benchmark(k6_config, f'results/{input_type.value}/')
benchmark.run()
def merge_previous_results(csv_path: str, df: pd.DataFrame, version_id: str) -> pd.DataFrame:
if os.path.exists(csv_path):
previous_df = pd.read_csv(csv_path)
previous_df['name'] = previous_df['name'].str.replace('tgi', f'tgi_{version_id}')
df = pd.concat([previous_df, df])
return df
def main():
model = 'Qwen/Qwen2-7B'
runner = TGIDockerRunner(model)
max_concurrent_requests = 8000
# run TGI
try:
logger.info('Running TGI')
runner.run([('max-concurrent-requests', max_concurrent_requests)])
logger.info('TGI is running')
run_full_test('tgi')
except Exception as e:
logger.error(f'Error: {e}')
# print the stack trace
print(traceback.format_exc())
finally:
runner.stop()
time.sleep(5)
for input_type in [ExecutorInputType.SHAREGPT_CONVERSATIONS, ExecutorInputType.CONSTANT_TOKENS]:
for test_type in [TestType.CONSTANT_VUS, TestType.CONSTANT_ARRIVAL_RATE]:
directory = os.path.join('results', input_type.value.lower(), test_type.value.lower())
# check if directory exists
if not os.path.exists(directory):
logger.error(f'Directory {directory} does not exist')
continue
dfs = parse_json_files(directory, test_type)
# create output directory if it does not exist
os.makedirs('output', exist_ok=True)
# save the data to a csv file
path = os.path.join(os.getcwd(), 'output', f'{input_type.value.lower()}_{test_type.value.lower()}.csv')
dfs.to_csv(path)
# check if we have previous results CSV file by listing /tmp/artifacts/<input_type> directory,
# merge them if they exist
prev_root = '/tmp/artifacts'
try:
if os.path.exists(prev_root):
directories = [item for item in os.listdir(prev_root) if
os.path.isdir(os.path.join(prev_root, item))]
for d in directories:
for f in os.listdir(f'{prev_root}/{d}'):
if f.endswith(f'{input_type.value.lower()}_{test_type.value.lower()}.csv'):
csv_path = os.path.join('/tmp/artifacts', d, f)
# only keep short commit hash
d = d[:7]
dfs = merge_previous_results(csv_path, dfs, d)
except Exception as e:
logger.error(f'Error while merging previous results, skipping: {e}')
plot_metrics(f'{model} {get_gpu_names()}', dfs, test_type,
f'output/{input_type.value.lower()}_{test_type.value.lower()}')
def get_gpu_names() -> str:
gpus = GPUtil.getGPUs()
if len(gpus) == 0:
return ''
return f'{len(gpus)}x{gpus[0].name if gpus else "No GPU available"}'
if __name__ == '__main__':
main()

View File

@@ -1,27 +0,0 @@
import json
import datasets
import tqdm
def main():
dataset = datasets.load_dataset("Open-Orca/OpenOrca", split="train")
# Select only the first 2k conversations that start with a human.
max = min(2000, len(dataset))
conversations = []
for item in tqdm.tqdm(dataset, total=max):
conversation = {
"conversations": [
{"from": "human", "value": item["question"]},
],
"id": item["id"],
}
conversations.append(conversation)
if len(conversations) >= max:
break
with open("./small.json", "w") as f:
data = json.dump(conversations, f, indent=4)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,115 @@
import json
import os
from enum import Enum
import pandas as pd
from loguru import logger
from matplotlib import pyplot as plt
import scienceplots
plt.style.use('science')
class TestType(Enum):
CONSTANT_VUS = "constant_vus"
CONSTANT_ARRIVAL_RATE = "constant_arrival_rate"
def parse_json_files(directory: str, test_type: TestType) -> pd.DataFrame:
metrics_to_keep = {'inter_token_latency': {'y': 'Time (ms)'}, 'end_to_end_latency': {'y': 'Time (ms)'},
'time_to_first_token': {'y': 'Time (ms)'}, 'tokens_throughput': {'y': 'Tokens/s'},
'tokens_received': {'y': 'Count'}}
df = pd.DataFrame()
for file in os.listdir(directory):
if file.endswith("summary.json"):
filepath = os.path.join(directory, file)
with open(filepath, 'r') as f:
data = json.load(f)
if test_type == TestType.CONSTANT_VUS:
entry = {
"vus": data['k6_config']['vus'],
"duration": data['k6_config']['duration']
}
elif test_type == TestType.CONSTANT_ARRIVAL_RATE:
entry = {
'pre_allocated_vus': data['k6_config']['pre_allocated_vus'],
'rate': data['k6_config']['rate'],
'duration': data['k6_config']['duration']
}
entry['test_duration'] = data['state']['testRunDurationMs'] / 1000.
entry['requests_ok'] = data['root_group']['checks'][0]['passes']
entry['requests_fail'] = data['root_group']['checks'][0]['fails']
entry['dropped_iterations'] = data['metrics']['dropped_iterations']['values'][
'count'] if 'dropped_iterations' in data['metrics'] else 0
# add up requests_fail and dropped_iterations to get total dropped requests
entry['dropped_requests'] = entry['requests_fail'] + entry['dropped_iterations']
entry['error_rate'] = entry['dropped_requests'] / (
entry['requests_ok'] + entry['dropped_requests']) * 100.0
entry['name'] = data['k6_config']['name']
for metric, values in sorted(data['metrics'].items()):
if metric in metrics_to_keep:
for value_key, value in values['values'].items():
if value_key == 'p(90)' or value_key == 'count': # Only keep p(90) values if trend
entry[metric] = value
if 'tokens_throughput' in entry and 'test_duration' in entry:
entry['tokens_throughput'] = entry['tokens_throughput'] / (entry['test_duration'])
if 'inter_token_latency' in entry:
entry['inter_token_latency'] = entry['inter_token_latency'] / 1000.
df = pd.concat([df, pd.DataFrame(entry, index=[0])])
return df
def plot_metrics(model_name:str, df: pd.DataFrame, test_type: TestType, save_name: str):
vus_param = ''
if test_type == TestType.CONSTANT_VUS:
vus_param = 'vus'
else:
vus_param = 'rate'
fig, axs = plt.subplots(3, 2, figsize=(15, 20))
fig.tight_layout(pad=6.0)
fig.subplots_adjust(hspace=0.2, wspace=0.2, bottom=0.15, top=0.92)
names = sorted(df['name'].unique())
metrics = {'inter_token_latency': {'y': 'Time (ms)'}, 'time_to_first_token': {'y': 'Time (ms)'},
'end_to_end_latency': {'y': 'Time (ms)'}, 'tokens_throughput': {'y': 'Tokens/s'},
'requests_ok': {'y': 'Count'}, 'error_rate': {'y': 'Count'}}
titles = ['Inter Token Latency P90 (lower is better)', 'TTFT P90 (lower is better)',
'End to End Latency P90 (lower is better)', 'Request Output Throughput P90 (higher is better)',
'Successful requests (higher is better)', 'Error rate (lower is better)']
labels = ['Time (ms)', 'Time (ms)', 'Time (ms)', 'Tokens/s', 'Count', '%']
colors = ['#FF9D00', '#2F5BA1']
# Plot each metric in its respective subplot
for ax, metric, title, label in zip(axs.flatten(), metrics, titles, labels):
for i, name in enumerate(names):
df_sorted = df[df['name'] == name].sort_values(by=vus_param)
ax.plot(df_sorted[vus_param], df_sorted[metric], marker='o', label=f"{name}", color=colors[i])
ax.set_title(title)
ax.tick_params(axis='x', rotation=0)
ax.set_ylabel(label)
if test_type == TestType.CONSTANT_VUS:
ax.set_xlabel('VUS')
else:
ax.set_xlabel('Requests/s')
# Add grid lines for better readability
ax.grid(True, which='both', axis='y', linestyle='--', linewidth=0.5)
ax.set_axisbelow(True) # Ensure grid lines are below the bars
ax.legend(title='Engine', loc='upper right')
# show title on top of the figure
if test_type == TestType.CONSTANT_VUS:
plt.suptitle(f'Constant VUs Load Test\n{model_name}', fontsize=16)
elif test_type == TestType.CONSTANT_ARRIVAL_RATE:
plt.suptitle(f'Constant Arrival Rate Load Test\n{model_name}', fontsize=16)
logger.info(f"Saving plot to {save_name}.png")
plt.savefig(f"{save_name}.png")
def main():
for test_type in [TestType.CONSTANT_VUS, TestType.CONSTANT_ARRIVAL_RATE]:
directory = f"results/{test_type.value.lower()}"
dfs = parse_json_files(directory, test_type)
plot_metrics('', dfs, test_type, test_type.value.lower())
if __name__ == "__main__":
main()

1561 load_tests/poetry.lock generated Normal file

File diff suppressed because it is too large.

23 load_tests/pyproject.toml Normal file
View File

@@ -0,0 +1,23 @@
[tool.poetry]
name = "tgi-benchmarks"
version = "0.1.0"
description = ""
authors = ["Hugo Larcher <hugo.larcher@huggingface.co>"]
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.11"
numpy = "<2.0"
pandas = "^2.2.2"
scienceplots = "^2.1.1"
docker = "^7.1.0"
loguru = "^0.7.2"
psutil = "^6.0.0"
jinja2 = "^3.1.4"
transformers = "^4.42.3"
gputil = "^1.4.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"