From 4642fd27ad555e07878d824cd85deae6cd9d9f27 Mon Sep 17 00:00:00 2001 From: Hugo Larcher Date: Tue, 16 Jul 2024 11:15:29 +0200 Subject: [PATCH] fix: Compute comparison table --- .github/workflows/ci_build.yaml | 2 -- .github/workflows/load_test.yaml | 19 +++++++++---- load_tests/load_test.py | 48 +++++++++++++++++++++++++++++++- load_tests/parse_load_test.py | 1 + load_tests/poetry.lock | 16 ++++++++++- load_tests/pyproject.toml | 1 + 6 files changed, 78 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci_build.yaml b/.github/workflows/ci_build.yaml index bbfee495..d62297e4 100644 --- a/.github/workflows/ci_build.yaml +++ b/.github/workflows/ci_build.yaml @@ -43,5 +43,3 @@ jobs: # https://github.com/actions/runner/issues/2206 release-tests: ${{ inputs.release-tests == true }} secrets: inherit - load_tests: - uses: ./.github/workflows/load_test.yaml \ No newline at end of file diff --git a/.github/workflows/load_test.yaml b/.github/workflows/load_test.yaml index 9bc6309e..0399e6d1 100644 --- a/.github/workflows/load_test.yaml +++ b/.github/workflows/load_test.yaml @@ -4,6 +4,7 @@ on: schedule: - cron: '0 0 * * 1-5' workflow_call: + workflow_dispatch: pull_request: paths: @@ -15,6 +16,7 @@ env: AWS_DEFAULT_REGION: us-east-1 AWS_ACCESS_KEY_ID: ${{ secrets.S3_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_AWS_SECRET_ACCESS_KEY }} + LOAD_TEST_ISSUE: 2235 jobs: load-tests: @@ -76,7 +78,7 @@ jobs: - name: Archive test results artifacts uses: actions/upload-artifact@v4 with: - name: benchmark_results_plots + name: benchmark_results path: | load_tests/output/* @@ -88,15 +90,22 @@ jobs: if: github.event_name == 'pull_request' with: script: | + let content=require('fs').readFileSync('load_tests/output/benchmark_avg_delta.md', 'utf-8'); github.rest.issues.createComment({ - issue_number: context.issue.number, + issue_number: process.env.LOAD_TEST_ISSUE, owner: context.repo.owner, repo: context.repo.repo, - body: '🚀 Load test results are in:\n\n'+ + body: '🚀 Load test results are in for commit [${{ github.sha }}](https://github.com/huggingface/text-generation-inference/commit/${{ github.sha }})\n\n'+ '## Variable length prompts\n'+ + '

\n'+ '\n' + - '\n\n' + + '\n' + + '

\n\n' + '## Constant length prompts\n'+ + '

\n'+ '\n' + - '\n' + '\n'+ + '

\n\n' + + '## Delta to last release\n\n'+ + content }) diff --git a/load_tests/load_test.py b/load_tests/load_test.py index 573fbe89..dced4a6f 100644 --- a/load_tests/load_test.py +++ b/load_tests/load_test.py @@ -41,6 +41,48 @@ def merge_previous_results(csv_path: str, df: pd.DataFrame, version_id: str) -> return df +def percentage_diff(x): + # in case we have no value to compare + if len(x) < 2: + return 0 + xsum = (x[1] + x[0]) + if xsum == 0: + return 0 + return abs(x[1] - x[0]) / (xsum / 2) * 100 + + +def compute_avg_delta(df: pd.DataFrame, metric: str, test_type: TestType) -> float: + if test_type == TestType.CONSTANT_VUS: + param = 'vus' + elif test_type == TestType.CONSTANT_ARRIVAL_RATE: + param = 'rate' + else: + return 0.0 + filtered = df[df[param].notna()].groupby(param)[metric] + return filtered.apply(lambda x: percentage_diff(sorted(x.values))).mean() + + +def compute_avg_table(df: pd.DataFrame): + # only keep the current version and semver rows for comparison + df = df[df['name'].str.startswith(('tgi', 'v'))] + # compute the average delta for each metric and test type + avg_table = pd.DataFrame() + for input_type in [ExecutorInputType.SHAREGPT_CONVERSATIONS, ExecutorInputType.CONSTANT_TOKENS]: + df_avg = df[df['input_type'] == input_type.value] + for test_type in [TestType.CONSTANT_VUS, TestType.CONSTANT_ARRIVAL_RATE]: + for metric in df.columns: + if metric in ['inter_token_latency', 'time_to_first_token', 'end_to_end_latency', + 'tokens_throughput', 'requests_ok', 'error_rate']: + avg_delta = compute_avg_delta(df_avg, metric, test_type) + avg_table = pd.concat([avg_table, pd.DataFrame( + {'metric': metric, 'input_type': input_type.value, 'test_type': test_type.value, + 'avg_delta': avg_delta}, index=[0])]) + # write the result to a markdown formatted table in a file + path = os.path.join(os.getcwd(), 'output', f'benchmark_avg_delta.md') + avg_table.to_markdown(path, index=False, tablefmt='github', + headers=['Metric', 'Input Type', 'Test Type', 'Avg Delta (%)']) + + def main(): model = 'Qwen/Qwen2-7B' runner = TGIDockerRunner(model) @@ -59,6 +101,7 @@ def main(): runner.stop() time.sleep(5) + all_dfs = pd.DataFrame() for input_type in [ExecutorInputType.SHAREGPT_CONVERSATIONS, ExecutorInputType.CONSTANT_TOKENS]: for test_type in [TestType.CONSTANT_VUS, TestType.CONSTANT_ARRIVAL_RATE]: directory = os.path.join('results', input_type.value.lower(), test_type.value.lower()) @@ -84,12 +127,15 @@ def main(): if f.endswith(f'{input_type.value.lower()}_{test_type.value.lower()}.csv'): csv_path = os.path.join('/tmp/artifacts', d, f) # only keep short commit hash - d = d[:7] + if len(d) > 7: + d = d[:7] dfs = merge_previous_results(csv_path, dfs, d) except Exception as e: logger.error(f'Error while merging previous results, skipping: {e}') plot_metrics(f'{model} {get_gpu_names()}', dfs, test_type, f'output/{input_type.value.lower()}_{test_type.value.lower()}') + all_dfs = pd.concat([all_dfs, dfs]) + compute_avg_table(all_dfs) def get_gpu_names() -> str: diff --git a/load_tests/parse_load_test.py b/load_tests/parse_load_test.py index 0535d5cc..401b7d6d 100644 --- a/load_tests/parse_load_test.py +++ b/load_tests/parse_load_test.py @@ -36,6 +36,7 @@ def parse_json_files(directory: str, test_type: TestType) -> pd.DataFrame: 'rate': data['k6_config']['rate'], 'duration': data['k6_config']['duration'] } + entry['input_type'] = data['k6_config']['input_type'] entry['test_duration'] = data['state']['testRunDurationMs'] / 1000. entry['requests_ok'] = data['root_group']['checks'][0]['passes'] entry['requests_fail'] = data['root_group']['checks'][0]['fails'] diff --git a/load_tests/poetry.lock b/load_tests/poetry.lock index d0d34571..81f7bb32 100644 --- a/load_tests/poetry.lock +++ b/load_tests/poetry.lock @@ -1297,6 +1297,20 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "tabulate" +version = "0.9.0" +description = "Pretty-print tabular data" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, + {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, +] + +[package.extras] +widechars = ["wcwidth"] + [[package]] name = "tokenizers" version = "0.19.1" @@ -1558,4 +1572,4 @@ dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "7678b38af0a7b76457fa91758a1b1021404c5981ca1298863dece6158d67a867" +content-hash = "e58b659457f8a7dc54ca3f5e7c247351d90f7e741ecdbbf1fa94f4a597da8844" diff --git a/load_tests/pyproject.toml b/load_tests/pyproject.toml index 4b19149d..1a3b0be0 100644 --- a/load_tests/pyproject.toml +++ b/load_tests/pyproject.toml @@ -16,6 +16,7 @@ psutil = "^6.0.0" jinja2 = "^3.1.4" transformers = "^4.42.3" gputil = "^1.4.0" +tabulate = "^0.9.0" [build-system]