From 4642fd27ad555e07878d824cd85deae6cd9d9f27 Mon Sep 17 00:00:00 2001
From: Hugo Larcher <hugo.larcher@huggingface.co>
Date: Tue, 16 Jul 2024 11:15:29 +0200
Subject: [PATCH] fix: Compute comparison table

---
 .github/workflows/ci_build.yaml  |  2 --
 .github/workflows/load_test.yaml | 19 +++++++++----
 load_tests/load_test.py          | 48 +++++++++++++++++++++++++++++++-
 load_tests/parse_load_test.py    |  1 +
 load_tests/poetry.lock           | 16 ++++++++++-
 load_tests/pyproject.toml        |  1 +
 6 files changed, 78 insertions(+), 9 deletions(-)
diff --git a/.github/workflows/ci_build.yaml b/.github/workflows/ci_build.yaml
index bbfee495..d62297e4 100644
--- a/.github/workflows/ci_build.yaml
+++ b/.github/workflows/ci_build.yaml
@@ -43,5 +43,3 @@ jobs:
       # https://github.com/actions/runner/issues/2206
       release-tests: ${{ inputs.release-tests == true }}
     secrets: inherit
-  load_tests:
-    uses: ./.github/workflows/load_test.yaml
\ No newline at end of file
diff --git a/.github/workflows/load_test.yaml b/.github/workflows/load_test.yaml
index 9bc6309e..0399e6d1 100644
--- a/.github/workflows/load_test.yaml
+++ b/.github/workflows/load_test.yaml
@@ -4,6 +4,7 @@ on:
   schedule:
     - cron: '0 0 * * 1-5'
   workflow_call:
+  workflow_dispatch:
 
   pull_request:
     paths:
@@ -15,6 +16,7 @@ env:
   AWS_DEFAULT_REGION: us-east-1
   AWS_ACCESS_KEY_ID: ${{ secrets.S3_AWS_ACCESS_KEY_ID }}
   AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_AWS_SECRET_ACCESS_KEY }}
+  LOAD_TEST_ISSUE: 2235
 
 jobs:
   load-tests:
@@ -76,7 +78,7 @@ jobs:
       - name: Archive test results artifacts
         uses: actions/upload-artifact@v4
         with:
-          name: benchmark_results_plots
+          name: benchmark_results
           path: |
             load_tests/output/*
 
@@ -88,15 +90,22 @@ jobs:
         if: github.event_name == 'pull_request'
         with:
           script: |
+            let content=require('fs').readFileSync('load_tests/output/benchmark_avg_delta.md', 'utf-8');
             github.rest.issues.createComment({
-              issue_number: context.issue.number,
+              issue_number: process.env.LOAD_TEST_ISSUE,
               owner: context.repo.owner,
               repo: context.repo.repo,
-              body: '🚀 Load test results are in:\n\n'+
+              body: '🚀 Load test results are in for commit [${{ github.sha }}](https://github.com/huggingface/text-generation-inference/commit/${{ github.sha }})\n\n'+
                 '## Variable length prompts\n'+
+                '<p float="left">\n'+
                 '<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/sharegpt_conversations_constant_arrival_rate.png" width=200>\n' +
-                '<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/sharegpt_conversations_constant_vus.png" width=200>\n\n' +
+                '<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/sharegpt_conversations_constant_vus.png" width=200>\n' +
+                '</p>\n\n' +
                 '## Constant length prompts\n'+
+                '<p float="left">\n'+
                 '<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/constant_tokens_constant_vus.png" width=200>\n' +
-                '<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/constant_tokens_constant_arrival_rate.png" width=200>\n'
+                '<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/constant_tokens_constant_arrival_rate.png" width=200>\n'+
+                '</p>\n\n' +
+                '## Delta to last release\n\n'+
+                content
             })
diff --git a/load_tests/load_test.py b/load_tests/load_test.py
index 573fbe89..dced4a6f 100644
--- a/load_tests/load_test.py
+++ b/load_tests/load_test.py
@@ -41,6 +41,48 @@ def merge_previous_results(csv_path: str, df: pd.DataFrame, version_id: str) ->
     return df
 
 
+def percentage_diff(x):  # percentage difference between x[0] and x[1], relative to their mean
+    # fewer than two values means there is nothing to compare against, so report no change
+    if len(x) < 2:
+        return 0
+    xsum = (x[1] + x[0])
+    if xsum == 0:  # the mean would be zero: skip the division below and report no change
+        return 0
+    return abs(x[1] - x[0]) / (xsum / 2) * 100  # only the first two elements are read; abs() drops the direction of the change
+
+
+def compute_avg_delta(df: pd.DataFrame, metric: str, test_type: TestType) -> float:  # mean of the per-group percentage_diff of `metric`
+    if test_type == TestType.CONSTANT_VUS:
+        param = 'vus'  # column used to group rows for this test type
+    elif test_type == TestType.CONSTANT_ARRIVAL_RATE:
+        param = 'rate'  # column used to group rows for this test type
+    else:
+        return 0.0  # any other test type has no grouping column here: report no delta
+    filtered = df[df[param].notna()].groupby(param)[metric]  # drop rows missing the grouping column, then collect `metric` per distinct `param` value
+    return filtered.apply(lambda x: percentage_diff(sorted(x.values))).mean()  # per group: sort ascending, so percentage_diff sees the two smallest values; then average over groups
+
+
+def compute_avg_table(df: pd.DataFrame):  # writes a markdown table of average deltas to a file; returns nothing
+    # only keep rows whose 'name' starts with 'tgi' or 'v' (the current version and semver rows) for comparison
+    df = df[df['name'].str.startswith(('tgi', 'v'))]
+    # compute the average delta for each (input type, test type, metric) combination
+    avg_table = pd.DataFrame()  # one row is concatenated onto this per combination
+    for input_type in [ExecutorInputType.SHAREGPT_CONVERSATIONS, ExecutorInputType.CONSTANT_TOKENS]:
+        df_avg = df[df['input_type'] == input_type.value]  # rows recorded for this input type only
+        for test_type in [TestType.CONSTANT_VUS, TestType.CONSTANT_ARRIVAL_RATE]:
+            for metric in df.columns:  # walk the columns in frame order, keeping only the reported metrics below
+                if metric in ['inter_token_latency', 'time_to_first_token', 'end_to_end_latency',
+                              'tokens_throughput', 'requests_ok', 'error_rate']:
+                    avg_delta = compute_avg_delta(df_avg, metric, test_type)
+                    avg_table = pd.concat([avg_table, pd.DataFrame(
+                        {'metric': metric, 'input_type': input_type.value, 'test_type': test_type.value,
+                         'avg_delta': avg_delta}, index=[0])])  # index=[0] lets pandas build a one-row frame from scalar values
+    # write the result as a 'github'-format markdown table to output/benchmark_avg_delta.md under the current working directory
+    path = os.path.join(os.getcwd(), 'output', f'benchmark_avg_delta.md')
+    avg_table.to_markdown(path, index=False, tablefmt='github',  # index=False leaves the frame index out of the table
+                          headers=['Metric', 'Input Type', 'Test Type', 'Avg Delta (%)'])
+
+
 def main():
     model = 'Qwen/Qwen2-7B'
     runner = TGIDockerRunner(model)
@@ -59,6 +101,7 @@ def main():
         runner.stop()
         time.sleep(5)
 
+    all_dfs = pd.DataFrame()
     for input_type in [ExecutorInputType.SHAREGPT_CONVERSATIONS, ExecutorInputType.CONSTANT_TOKENS]:
         for test_type in [TestType.CONSTANT_VUS, TestType.CONSTANT_ARRIVAL_RATE]:
             directory = os.path.join('results', input_type.value.lower(), test_type.value.lower())
@@ -84,12 +127,15 @@ def main():
                             if f.endswith(f'{input_type.value.lower()}_{test_type.value.lower()}.csv'):
                                 csv_path = os.path.join('/tmp/artifacts', d, f)
                                 # only keep short commit hash
-                                d = d[:7]
+                                if len(d) > 7:
+                                    d = d[:7]
                                 dfs = merge_previous_results(csv_path, dfs, d)
             except Exception as e:
                 logger.error(f'Error while merging previous results, skipping: {e}')
             plot_metrics(f'{model} {get_gpu_names()}', dfs, test_type,
                          f'output/{input_type.value.lower()}_{test_type.value.lower()}')
+            all_dfs = pd.concat([all_dfs, dfs])
+    compute_avg_table(all_dfs)
 
 
 def get_gpu_names() -> str:
diff --git a/load_tests/parse_load_test.py b/load_tests/parse_load_test.py
index 0535d5cc..401b7d6d 100644
--- a/load_tests/parse_load_test.py
+++ b/load_tests/parse_load_test.py
@@ -36,6 +36,7 @@ def parse_json_files(directory: str, test_type: TestType) -> pd.DataFrame:
                         'rate': data['k6_config']['rate'],
                         'duration': data['k6_config']['duration']
                     }
+                entry['input_type'] = data['k6_config']['input_type']
                 entry['test_duration'] = data['state']['testRunDurationMs'] / 1000.
                 entry['requests_ok'] = data['root_group']['checks'][0]['passes']
                 entry['requests_fail'] = data['root_group']['checks'][0]['fails']
diff --git a/load_tests/poetry.lock b/load_tests/poetry.lock
index d0d34571..81f7bb32 100644
--- a/load_tests/poetry.lock
+++ b/load_tests/poetry.lock
@@ -1297,6 +1297,20 @@ files = [
     {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
 ]
 
+[[package]]
+name = "tabulate"
+version = "0.9.0"
+description = "Pretty-print tabular data"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"},
+    {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"},
+]
+
+[package.extras]
+widechars = ["wcwidth"]
+
 [[package]]
 name = "tokenizers"
 version = "0.19.1"
@@ -1558,4 +1572,4 @@ dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "7678b38af0a7b76457fa91758a1b1021404c5981ca1298863dece6158d67a867"
+content-hash = "e58b659457f8a7dc54ca3f5e7c247351d90f7e741ecdbbf1fa94f4a597da8844"
diff --git a/load_tests/pyproject.toml b/load_tests/pyproject.toml
index 4b19149d..1a3b0be0 100644
--- a/load_tests/pyproject.toml
+++ b/load_tests/pyproject.toml
@@ -16,6 +16,7 @@ psutil = "^6.0.0"
 jinja2 = "^3.1.4"
 transformers = "^4.42.3"
 gputil = "^1.4.0"
+tabulate = "^0.9.0"
 
 
 [build-system]