feat: Add load tests

This commit is contained in:
Hugo Larcher 2024-07-11 11:45:24 +02:00
parent 8511669cb2
commit 8d358d9c61
No known key found for this signature in database
GPG Key ID: 3DAF63124699CA2B
20 changed files with 2617 additions and 166 deletions

View File

@@ -43,3 +43,5 @@ jobs:
# https://github.com/actions/runner/issues/2206
release-tests: ${{ inputs.release-tests == true }}
secrets: inherit
load_tests:
uses: ./.github/workflows/load_test.yaml

View File

@@ -3,6 +3,7 @@ name: Nightly load test
on:
schedule:
- cron: '0 0 * * 1-5'
workflow_call:
pull_request:
paths:
@@ -10,33 +11,92 @@ on:
branches:
- 'main'
env:
AWS_DEFAULT_REGION: us-east-1
AWS_ACCESS_KEY_ID: ${{ secrets.S3_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_AWS_SECRET_ACCESS_KEY }}
jobs:
load-tests:
concurrency:
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
runs-on: [self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci]
runs-on: [ self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci ]
env:
DOCKER_VOLUME: /cache
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Install k6
- name: Install awscli
run: |
curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1
sudo apt-get update
sudo apt-get install -y awscli
- name: Start starcoder
- name: Install poetry
run: |
docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HF_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
sleep 10
wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
curl -sSL https://install.python-poetry.org | python3 -
export PATH="$HOME/.local/bin:$PATH"
poetry --version
- name: Run k6
- name: Install texlive minimal
run: |
./k6 run load_tests/starcoder_load.js
sudo apt-get update
sudo apt-get install -y texlive-latex-extra texlive-fonts-recommended dvipng cm-super
- name: Stop starcoder
if: ${{ always() }}
- name: Install Go 1.21
uses: actions/setup-go@v2
with:
go-version: 1.21
- name: Install Python 3.11
uses: actions/setup-python@v2
with:
python-version: 3.11
- name: Download artifacts from previous runs
uses: actions/github-script@v6
continue-on-error: true
env:
WORKFLOW_FILENAME: load_test.yaml
ARTIFACT_NAME: benchmark_results_csv
ARTIFACT_FILENAME: benchmark_results_csv.zip
UNZIP_DIR: /tmp/artifacts
with:
script: |
const script = require('./load_tests/download_artifact.js')
await script({github, context, core})
- name: Run load test
run: |
docker stop tgi-starcoder || true
cd load_tests
make load-test
shell: bash
- name: Archive test results artifacts
uses: actions/upload-artifact@v4
with:
name: benchmark_results_plots
path: |
load_tests/output/*
- name: Upload to S3
run: |
aws s3 cp load_tests/output/ s3://text-generation-inference-ci/${{ github.sha }} --recursive
- uses: actions/github-script@v6
if: github.event_name == 'pull_request'
with:
script: |
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: '🚀 Load test results are in:\n\n'+
'## Variable length prompts\n'+
'<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/sharegpt_conversations_constant_arrival_rate.png" width=200>\n' +
'<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/sharegpt_conversations_constant_vus.png" width=200>\n\n' +
'## Constant length prompts\n'+
'<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/constant_tokens_constant_vus.png" width=200>\n' +
'<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/constant_tokens_constant_arrival_rate.png" width=200>\n'
})

View File

@@ -1,9 +1,24 @@
.PHONY: download-dataset load-test build-k6
download-dataset:
@if [ ! -f ./benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json ]; then \
echo "Downloading dataset"; \
curl -L -o benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json; \
else \
echo "Dataset already downloaded"; \
fi
ShareGPT_V3_unfiltered_cleaned_split.json:
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
load-test: download-dataset build-k6
poetry install && poetry run python load_test.py
prepare_share: ShareGPT_V3_unfiltered_cleaned_split.json
python filter.py
prepare_orca:
python orca.py
build-k6:
mkdir -p /tmp/xk6 && \
cd /tmp/xk6 && \
git clone https://github.com/mstoykov/xk6-sse.git && \
cd xk6-sse && \
git checkout useSobek && \
go install go.k6.io/xk6/cmd/xk6@latest && \
xk6 build --with github.com/phymbert/xk6-sse=. && \
mkdir -p ~/.local/bin/ && \
mv k6 /tmp/k6-sse && \
rm -rf /tmp/xk6 && \
/tmp/k6-sse --version

View File

View File

@@ -0,0 +1,151 @@
import subprocess
import threading
from typing import Dict, List
import docker
from docker.models.containers import Container
from loguru import logger
from benchmarks.utils import kill
class InferenceEngineRunner:
def __init__(self, model: str):
self.model = model
def run(self, parameters: list[tuple]):
raise NotImplementedError("This method should be implemented by the subclass")
def stop(self):
raise NotImplementedError("This method should be implemented by the subclass")
class TGIRunner(InferenceEngineRunner):
def __init__(self, model: str):
super().__init__(model)
self.process = None
self.thread = None
self.model = model
def run(self, parameters: list[tuple]):
params = ""
for p in parameters:
params += f" --{p[0]} {str(p[1])}"
# start a TGI subprocess with the given parameter
args = f"text-generation-launcher --port 8080 --model-id {self.model} --huggingface-hub-cache /scratch {params}"
logger.info(f"Running TGI with parameters: {args}")
self.process = subprocess.Popen(args,
stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
for line in iter(self.process.stdout.readline, b""):
print(line.decode("utf-8"))
# wait for TGI to listen to the port
if b"Connected" in line:
break
if b"Error" in line:
raise Exception(f"Error starting TGI: {line}")
# continue to stream the logs in a thread
def stream_logs():
for line in iter(self.process.stdout.readline, b""):
print(line.decode("utf-8"))
if self.process.returncode is not None:
raise Exception("Error starting TGI")
self.thread = threading.Thread(target=stream_logs)
self.thread.start()
def stop(self):
logger.warning(f"Killing TGI with PID {self.process.pid}")
if self.process:
kill(self.process.pid)
if self.thread:
self.thread.join()
class TGIDockerRunner(InferenceEngineRunner):
def __init__(self,
model: str,
image: str = "ghcr.io/huggingface/text-generation-inference:latest",
volumes=None):
super().__init__(model)
if volumes is None:
volumes = []
self.container = None
self.image = image
self.volumes = volumes
def run(self, parameters: list[tuple]):
params = f"--model-id {self.model} --port 8080"
for p in parameters:
params += f" --{p[0]} {str(p[1])}"
logger.info(f"Running TGI with parameters: {params}")
volumes = {}
for v in self.volumes:
volumes[v[0]] = {"bind": v[1], "mode": "rw"}
self.container = run_docker(self.image, params,
"Connected",
"Error",
volumes=volumes)
def stop(self):
if self.container:
self.container.stop()
class VLLMDockerRunner(InferenceEngineRunner):
def __init__(self,
model: str,
image: str = "vllm/vllm-openai:latest",
volumes=None):
super().__init__(model)
if volumes is None:
volumes = []
self.container = None
self.image = image
self.volumes = volumes
def run(self, parameters: list[tuple]):
parameters.append(("max-num-seqs", "256"))
params = f"--model {self.model} --tensor-parallel-size {get_num_gpus()} --port 8080"
for p in parameters:
params += f" --{p[0]} {str(p[1])}"
logger.info(f"Running VLLM with parameters: {params}")
volumes = {}
for v in self.volumes:
volumes[v[0]] = {"bind": v[1], "mode": "rw"}
self.container = run_docker(self.image, params, "Uvicorn running",
"Error ",
volumes=volumes)
def stop(self):
if self.container:
self.container.stop()
def run_docker(image: str, args: str, success_sentinel: str,
error_sentinel: str, volumes=None) -> Container:
if volumes is None:
volumes = {}
client = docker.from_env()
# request one device id per GPU reported by nvidia-smi
devices = [f"{i}" for i in
range(get_num_gpus())]
container = client.containers.run(image, args,
detach=True,
device_requests=[
docker.types.DeviceRequest(device_ids=devices, capabilities=[['gpu']])
],
volumes=volumes,
shm_size="1g",
ports={"8080/tcp": 8080})
for line in container.logs(stream=True):
print(line.decode("utf-8"), end="")
if success_sentinel.encode("utf-8") in line:
break
if error_sentinel.encode("utf-8") in line:
container.stop()
raise Exception(f"Error starting container: {line}")
return container
def get_num_gpus() -> int:
return len(subprocess.run(["nvidia-smi", "-L"], capture_output=True).stdout.splitlines())
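
For reference, a minimal usage sketch of the runners above (illustrative only; the model id and --max-concurrent-requests value mirror load_tests/load_test.py further down, and parameters are passed as (flag, value) tuples):

from benchmarks.engine import TGIDockerRunner

# hypothetical driver, mirroring load_tests/load_test.py
runner = TGIDockerRunner("Qwen/Qwen2-7B")
# each tuple is rendered as "--<flag> <value>" for the launcher
runner.run([("max-concurrent-requests", 8000)])
# ... benchmark against http://localhost:8080/v1/chat/completions ...
runner.stop()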

233 load_tests/benchmarks/k6.py Normal file
View File

@@ -0,0 +1,233 @@
import json
import os
import subprocess
import tempfile
from enum import Enum
from typing import Any, Dict, List
import numpy as np
from jinja2 import Environment, PackageLoader, select_autoescape
from loguru import logger
from transformers import LlamaTokenizerFast
from benchmarks.utils import kill
env = Environment(
loader=PackageLoader("benchmarks"),
autoescape=select_autoescape()
)
class ExecutorInputType(Enum):
CONSTANT_TOKENS = "constant_tokens"
SHAREGPT_CONVERSATIONS = "sharegpt_conversations"
class K6Executor:
def __init__(self, name, template_name, executor_input_type=ExecutorInputType.SHAREGPT_CONVERSATIONS):
self.template_name = template_name
self.variables = {}
self.rendered_file = None
self.name = name
self.executor_input_type = executor_input_type
if executor_input_type == ExecutorInputType.CONSTANT_TOKENS:
self.input_filename = "inputs_constant_tokens.json"
elif executor_input_type == ExecutorInputType.SHAREGPT_CONVERSATIONS:
self.input_filename = "inputs_variable_tokens.json"
def render(self):
template = env.get_template(self.template_name)
_, path = tempfile.mkstemp("k6", "benchmark")
cwd = os.getcwd()
with open(path, "w") as f:
f.write(template.render(cwd=cwd, input_filename=self.input_filename, **self.variables))
self.rendered_file = path
def __str__(self):
# returns an underscore separated string of the variables for filename generation
params = "_".join([f"{k}_{v}" for k, v in sorted(self.variables.items()) if type(v) == str or type(v) == int])
return f"{self.executor_input_type.value}_{params}"
class K6ConstantArrivalRateExecutor(K6Executor):
def __init__(self, pre_allocated_vus: int, rate_per_second: int, duration: str,
executor_input_type: ExecutorInputType):
super().__init__("constant_arrival_rate", "k6_constant_arrival_rate.js.j2", executor_input_type)
self.variables = {
"pre_allocated_vus": pre_allocated_vus, # it's also the max vus
"rate": rate_per_second,
"duration": duration
}
class K6RampingArrivalRateExecutor(K6Executor):
def __init__(self, pre_allocated_vus: int, start_rate: int, time_unit: str, stages: List[Dict[str, Any]],
executor_input_type: ExecutorInputType):
super().__init__("ramping_arrival_rate", "k6_ramping_arrival_rate.js.j2", executor_input_type)
self.variables = {
"pre_allocated_vus": pre_allocated_vus,
"start_rate": start_rate,
"time_unit": time_unit,
"stages": stages
}
class K6ConstantVUsExecutor(K6Executor):
def __init__(self, vus: int, duration: str, executor_input_type: ExecutorInputType):
super().__init__("constant_vus", "k6_constant_vus.js.j2", executor_input_type)
self.variables = {
"vus": vus,
"duration": duration
}
class K6Config:
def __init__(self, name: str, executor: K6Executor,
tokenizer=LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer"),
conversations_input_file=None,
input_num_tokens=200,
max_new_tokens=200,
extra_info=None
):
self.executor = executor
# max_new_token will be set in k6 template
self.executor.variables["max_new_tokens"] = max_new_tokens
self.name = name
self.tokenizer = tokenizer
self.extra_info = extra_info
if conversations_input_file is None:
self.conversation_input_file = "benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json"
else:
self.conversation_input_file = conversations_input_file
self.input_num_tokens = input_num_tokens
def __str__(self):
return f"K6Config(name={self.name} executor={self.executor})"
class K6Benchmark:
def __init__(self, k6_config: K6Config, output_dir: str):
self.process = None
self.k6_config = k6_config
self.output_dir = output_dir
self.input_tokens_len = k6_config.input_num_tokens
self._prepare_inputs()
def _prepare_inputs(self):
get_tokens_count = lambda txt: len(self.k6_config.tokenizer.encode(txt))
MAX_SAMPLES = 5000
# create a first input file with a constant number of tokens
# check if the file already exists
if not os.path.exists("inputs_constant_tokens.json"):
logger.info(f'Preparing input file with {self.input_tokens_len} input tokens')
outputs = []
with open(self.k6_config.conversation_input_file, "r") as f:
data = json.load(f)
for doc in data:
for conversation in doc["conversations"]:
if not conversation["from"] == "human":
continue
if get_tokens_count(conversation["value"]) < self.input_tokens_len:
continue
# encode the message
encoding = self.k6_config.tokenizer(conversation["value"], truncation=True,
max_length=self.input_tokens_len)
# find last encoded characters
span = encoding.token_to_chars(len(encoding["input_ids"]) - 1)
outputs.append(
{"message": conversation["value"][0:span.end], "num_tokens": len(encoding["input_ids"])})
if len(outputs) >= MAX_SAMPLES: # limit the number of inputs
break
with open("inputs_constant_tokens.json", "w") as f:
f.write(json.dumps(outputs))
# create a second input file with a sampling of inputs
# check if the file already exists
if not os.path.exists("inputs_variable_tokens.json"):
logger.info(
f'Preparing input file by randomly sampling shareGPT conversations at "{self.k6_config.conversation_input_file}"')
outputs = []
with open(self.k6_config.conversation_input_file, "r") as f:
data = json.load(f)
num_docs = len(data)
# generate random indexes to sample the data
indexes = np.random.choice(num_docs, 200, replace=False)
for i in indexes:
doc = data[i]
for conversation in doc["conversations"]:
if not conversation["from"] == "human":
continue
# encode the message without truncation
encoding = self.k6_config.tokenizer(conversation["value"])
outputs.append(
{"message": conversation["value"], "num_tokens": len(encoding["input_ids"])})
if len(outputs) >= MAX_SAMPLES: # limit the number of inputs
break
with open("inputs_variable_tokens.json", "w") as f:
f.write(json.dumps(outputs))
def run(self):
self.k6_config.executor.render()
args = f"/tmp/k6-sse run --out json=results.json {self.k6_config.executor.rendered_file}"
logger.info(f"Running k6 with parameters: {args}")
logger.info(f"K6Config is: {self.k6_config}")
# start a k6 subprocess
self.process = subprocess.Popen(args,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
while buffer := os.read(self.process.stdout.fileno(),
2048): # read the output of the process, don't buffer on new lines
print(buffer.decode(), end='')
self.process.wait()
logger.info(f"K6 process finished with return code {self.process.returncode}")
logger.info(f"Writing results to {self.get_results_path()}")
self.add_config_to_summary()
self.add_config_to_results()
def stop(self):
if self.process:
kill(self.process.pid)
def add_config_to_summary(self):
with open("summary.json", "r") as f:
summary = json.load(f)
summary["k6_config"] = {
"name": self.k6_config.name,
"input_type": self.k6_config.executor.executor_input_type.value,
"extra_info": self.k6_config.extra_info,
**self.k6_config.executor.variables
}
# create directory if it doesn't exist
os.makedirs(self._get_output_dir(), exist_ok=True)
with open(self.get_summary_path(), "w") as f2:
json.dump(summary, f2)
def add_config_to_results(self):
with open("results.json", "r") as f:
results = f.readlines()
# append the k6 config to the results in jsonlines format
results.append("\n")
results.append(json.dumps({
"name": self.k6_config.name,
"input_type": self.k6_config.executor.executor_input_type.value,
"extra_info": self.k6_config.extra_info,
**self.k6_config.executor.variables
}))
# create directory if it doesn't exist
os.makedirs(self._get_output_dir(), exist_ok=True)
with open(self.get_results_path(), "w") as f2:
f2.writelines(results)
def _get_output_dir(self):
# check if output_dir is relative or absolute
if self.output_dir.startswith("/"):
return f"{self.output_dir}/{self.k6_config.executor.name}"
else:
return f"{os.getcwd()}/{self.output_dir}/{self.k6_config.executor.name}"
def _get_output_path(self):
return f"{self._get_output_dir()}/{self.k6_config.name}_{self.k6_config.executor}"
def get_results_path(self):
return f"{self._get_output_path()}.json"
def get_summary_path(self):
return f"{self._get_output_path()}.summary.json"

View File

@@ -0,0 +1,16 @@
{% include 'main.js.j2' %}
export function get_options() {
return {
scenarios: {
load_test: {
executor: 'constant-arrival-rate',
gracefulStop: '0s',
duration: '{{ duration }}',
preAllocatedVUs: {{ pre_allocated_vus }},
rate: {{ rate }},
timeUnit: '1s',
},
},
};
}

View File

@@ -0,0 +1,14 @@
{% include 'main.js.j2' %}
export function get_options() {
return {
scenarios: {
load_test: {
executor: 'constant-vus',
gracefulStop: '0s',
duration: '{{ duration }}',
vus: {{ vus }},
},
},
};
}

View File

@@ -0,0 +1,20 @@
{% include 'main.js.j2' %}
export function get_options() {
return {
scenarios: {
load_test: {
executor: 'ramping-arrival-rate',
gracefulStop: '0s',
preAllocatedVUs: {{ pre_allocated_vus }},
timeUnit: '{{ time_unit }}',
startRate: {{ start_rate }},
stages: [
{%- for stage in stages %}
{target: {{ stage.target }}, duration: '{{ stage.duration }}'},
{%- endfor %}
],
},
},
};
}

View File

@@ -0,0 +1,131 @@
import {check, fail} from 'k6';
import sse from "k6/x/sse"
import {scenario} from 'k6/execution';
import http from 'k6/http';
import {Trend, Counter} from 'k6/metrics';
const host = "127.0.0.1:8080";
const model_id = "Qwen/Qwen2-72B";
const endToEndLatency = new Trend('end_to_end_latency', true);
const requestThroughput = new Counter('request_throughput');
const tokenThroughput = new Counter('tokens_throughput');
const timeToFirstToken = new Trend('time_to_first_token', true);
const interTokenLatency = new Trend('inter_token_latency', true); // in microseconds
const tokensReceived = new Trend('tokens_received');
const max_new_tokens = {{ max_new_tokens }};
const shareGPT = JSON.parse(open("{{ cwd }}/{{ input_filename }}"))
export function handleSummary(data) {
return {
'summary.json': JSON.stringify(data),
};
}
function generate_payload(gpt, max_new_tokens) {
let input = gpt["message"];
return {
"messages": [{"role": "user", "content": input}],
"temperature": 0,
"model": `${model_id}`,
"max_tokens": max_new_tokens,
"stream": true
};
}
export const options = get_options();
export default function run() {
const headers = {'Content-Type': 'application/json'};
const query = shareGPT[scenario.iterationInTest % shareGPT.length];
const payload = JSON.stringify(generate_payload(query, max_new_tokens));
const url = `http://${host}/v1/chat/completions`;
const params = {
method: 'POST',
body: payload,
headers
}
const startTime = Date.now();
let firstTokenTime = null;
let lastTokenTime = null;
let tokensCount = 0;
let response = ""
const res = sse.open(url, params, function (client) {
client.on('event', function (event) {
// console.log(event.data)
if (parseInt(event.id) === 4) {
client.close()
}
if (event.data.includes("[DONE]") || event.data === "") {
return
}
try {
const data = JSON.parse(event.data);
if (!('choices' in data)) {
fail('http_200')
return;
}
const content = data['choices'][0]['delta']['content']
if (content !== undefined) {
response += data['choices'][0]['delta']['content']
tokensCount += 1;
}
// Measure time to first token
if (!firstTokenTime) {
firstTokenTime = Date.now();
timeToFirstToken.add(firstTokenTime - startTime);
}
// Measure inter-token latency
const currentTime = Date.now();
if (lastTokenTime) {
interTokenLatency.add((currentTime - lastTokenTime) * 1000.);
}
lastTokenTime = currentTime;
if ('finish_reason' in data['choices'][0]) {
if (data['choices'][0]['finish_reason'] != null) {
const endTime = Date.now();
const deltaMs = endTime - startTime;
endToEndLatency.add(deltaMs)
requestThroughput.add(1);
tokenThroughput.add(tokensCount);
tokensReceived.add(tokensCount);
}
}
} catch (e) {
// catch any errors that occur during the event processing
// increase the fail count of the 'http_200' check
check(true, {
'http_200': (val) => false,
})
fail('http_200')
}
})
client.on('error', function (e) {
console.log('An unexpected error occurred: ', e.error())
})
})
if (tokensCount === 0) {
// something went wrong with generation
fail('http_200')
}
if (res.status >= 400 && res.status < 500) {
return;
}
check(res, {
'http_200': (res) => res.status === 200,
});
}

View File

@@ -0,0 +1,31 @@
import os
from unittest import TestCase
from benchmarks.k6 import K6RampingArrivalRateExecutor, K6Config, K6ConstantVUsExecutor, K6Benchmark, ExecutorInputType
class K6RampingArrivalRateExecutorTest(TestCase):
def test_render(self):
executor = K6RampingArrivalRateExecutor(
100,
1,
"1s",
[
{"target": 1, "duration": "30s"},
{"target": 100, "duration": "30s"}
],
ExecutorInputType.SHAREGPT_CONVERSATIONS)
executor.render()
self.assertIsNotNone(executor.rendered_file)
with open(executor.rendered_file, "r") as f:
content = f.read()
self.assertTrue("stages: [" in content)
self.assertTrue("target: 1, duration: '30s'" in content)
self.assertTrue(os.getcwd() in content)
class K6BenchmarkTest(TestCase):
def test_prepare_inputs(self):
executor = K6ConstantVUsExecutor(1, '1m', ExecutorInputType.SHAREGPT_CONVERSATIONS)
config = K6Config("test", executor, input_num_tokens=500)
bench = K6Benchmark(config, "output")

View File

@@ -0,0 +1,25 @@
from itertools import chain
import psutil
class SweepParameter:
def __init__(self, name, start, end, step):
self.name = name
self.start = start
self.end = end
self.step = step
def __str__(self):
return f"{self.name} from {self.start} to {self.end} in steps of {self.step}"
def __iter__(self):
for value in chain(range(self.start, self.end, self.step), [self.end]):
yield value
def kill(proc_pid):
process = psutil.Process(proc_pid)
for proc in process.children(recursive=True):
proc.kill()
process.kill()
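
A quick illustration of how SweepParameter iterates (illustrative sketch; assumes the module is importable as benchmarks.utils): the end value is always yielded last, even when the step overshoots it.

from benchmarks.utils import SweepParameter

# sweep a hypothetical flag from 8 to 100 in steps of 32; range() stops at 72,
# so __iter__ chains the end value (100) on explicitly
sweep = SweepParameter("max-concurrent-requests", 8, 100, 32)
print(sweep)        # max-concurrent-requests from 8 to 100 in steps of 32
print(list(sweep))  # [8, 40, 72, 100]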

View File

@@ -1,94 +0,0 @@
import { check } from 'k6';
import { scenario } from 'k6/execution';
import http from 'k6/http';
import { Trend, Counter } from 'k6/metrics';
const host = __ENV.HOST;
const model_id = __ENV.MODEL_ID;
const timePerToken = new Trend('time_per_token', true);
const tokens = new Counter('tokens');
const new_tokens = new Counter('new_tokens');
const input_tokens = new Counter('input_tokens');
const max_new_tokens = 50;
// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
const shareGPT = JSON.parse(open("small.json"))
export function get_options() {
return {
thresholds: {
http_req_failed: ['rate==0'],
// time_per_token: [{
// threshold: `p(50)<${5 * reference_latency_ms}`,
// abortOnFail: true,
// delayAbortEval: '10s'
// }],
},
scenarios: {
// single_user: {
// executor: 'constant-arrival-rate',
// duration: '60s',
// preAllocatedVUs: 1,
// rate: 20,
// timeUnit: '1s',
// },
load_test: {
executor: 'constant-arrival-rate',
duration: '60s',
preAllocatedVUs: 100,
rate: 1,
timeUnit: '1s',
},
// breakpoint: {
// executor: 'ramping-arrival-rate', //Assure load increase if the system slows
// preAllocatedVUs: 300,
// stages: [
// { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
// ],
// },
// throughput: {
// executor: 'shared-iterations',
// vus: 100,
// iterations: 200,
// maxDuration: '40s',
// },
},
};
}
function generate_payload(gpt, max_new_tokens) {
const input = gpt["conversations"][0]["value"];
return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
}
export const options = get_options();
export default function run() {
const headers = { 'Content-Type': 'application/json' };
const query = shareGPT[scenario.iterationInTest % shareGPT.length];
const payload = JSON.stringify(generate_payload(query, max_new_tokens));
const res = http.post(`http://${host}/v1/chat/completions`, payload, {
headers,
});
if (res.status >= 400 && res.status < 500) {
return;
}
check(res, {
'Post status is 200': (res) => res.status === 200,
});
const duration = res.timings.duration;
if (res.status === 200) {
const body = res.json();
const completion_tokens = body.usage.completion_tokens;
const latency_ms_per_token = duration / completion_tokens;
timePerToken.add(latency_ms_per_token);
const prompt_tokens = body.usage.prompt_tokens;
input_tokens.add(prompt_tokens);
new_tokens.add(completion_tokens);
tokens.add(completion_tokens + prompt_tokens);
}
}

View File

@@ -0,0 +1,98 @@
module.exports = async ({
github,
context,
core
}) => {
const owner = context.repo.owner;
const repo = context.repo.repo;
const workflows = await github.rest.actions.listRepoWorkflows({
owner,
repo
})
const workflow = workflows.data.workflows.find(w => w.path.includes(process.env.WORKFLOW_FILENAME));
if (!workflow) {
core.setFailed("No workflow found");
return;
}
const runs = await github.rest.actions.listWorkflowRuns({
owner,
repo,
workflow_id: workflow.id,
status: "success",
per_page: 1
})
if (runs.data.total_count === 0) {
core.setFailed("No runs found");
return;
}
const lastRelease = await github.rest.repos.getLatestRelease({
owner,
repo
});
const lastReleaseTag = lastRelease.data.tag_name;
const tagRef = `tags/${lastReleaseTag}`;
const lastReleaseCommit = await github.rest.git.getRef({
owner,
repo,
ref: tagRef
});
const lastReleaseSha = lastReleaseCommit.data.object.sha;
const lastReleaseRun = await github.rest.actions.listWorkflowRuns({
owner,
repo,
workflow_id: workflow.id,
head_sha: lastReleaseSha,
status: "success",
per_page: 1
});
let lastReleaseArtifacts = {data: {artifacts: []}};
if (lastReleaseRun.data.total_count > 0) {
lastReleaseArtifacts = await github.rest.actions.listWorkflowRunArtifacts({
owner,
repo,
run_id: lastReleaseRun.data.workflow_runs[0].id
});
}
const lastArtifacts = await github.rest.actions.listWorkflowRunArtifacts({
owner,
repo,
run_id: runs.data.workflow_runs[0].id
});
const lastReleaseArtifact = lastReleaseArtifacts.data.artifacts.find(artifact => artifact.name === process.env.ARTIFACT_NAME);
const lastArtifact = lastArtifacts.data.artifacts.find(artifact => artifact.name === process.env.ARTIFACT_NAME);
if (lastReleaseArtifact) {
await downloadArtifact(github, owner, repo, lastReleaseArtifact, lastReleaseTag);
} else {
console.log("No release artifact found")
}
if (lastArtifact) {
await downloadArtifact(github, owner, repo, lastArtifact, lastArtifact.workflow_run.head_sha);
} else {
console.log("No last run artifact found")
}
}
async function downloadArtifact(github, owner, repo, artifact, suffix) {
const response = await github.rest.actions.downloadArtifact({
owner,
repo,
artifact_id: artifact.id,
archive_format: 'zip'
});
require('fs').writeFileSync(process.env.ARTIFACT_FILENAME, Buffer.from(response.data));
// create directory to unzip
require('fs').mkdirSync(`${process.env.UNZIP_DIR}/${artifact.workflow_run.head_sha}`, {recursive: true});
require('child_process').execSync(`unzip -o ${process.env.ARTIFACT_FILENAME} -d ${process.env.UNZIP_DIR}/${suffix}`);
console.log(`Artifact ${process.env.ARTIFACT_FILENAME} for ${suffix} downloaded successfully`);
}

View File

@@ -1,26 +0,0 @@
import json
def main():
with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f:
data = json.load(f)
# Select only the first 2k conversations that start with a human.
max = 2000
conversations = []
for conversation in data:
conv = conversation.get("conversations")
if conv and conv[0]["from"] == "human":
# Trim the rest of the output
conversation["conversations"] = conversation["conversations"][:1]
conversations.append(conversation)
if len(conversation) >= max:
break
with open("./small.json", "w") as f:
data = json.dump(conversations, f, indent=4)
if __name__ == "__main__":
main()

103 load_tests/load_test.py Normal file
View File

@@ -0,0 +1,103 @@
import os
import time
import traceback
from benchmarks.engine import TGIDockerRunner
from benchmarks.k6 import K6Config, K6Benchmark, K6ConstantArrivalRateExecutor, K6ConstantVUsExecutor, ExecutorInputType
from loguru import logger
import pandas as pd
import GPUtil
from parse_load_test import TestType, parse_json_files, plot_metrics
def run_full_test(engine_name: str):
vus_concurrences = list(range(0, 1024, 40))
vus_concurrences[0] = 1
vus_concurrences.append(1024)
arrival_rates = list(range(0, 200, 10))
arrival_rates[0] = 1
arrival_rates.append(200)
for input_type in [ExecutorInputType.SHAREGPT_CONVERSATIONS, ExecutorInputType.CONSTANT_TOKENS]:
for c in arrival_rates:
logger.info(f'Running k6 with constant arrival rate for {c} req/s with input type {input_type.value}')
k6_executor = K6ConstantArrivalRateExecutor(2000, c, '60s', input_type)
k6_config = K6Config(f'{engine_name}', k6_executor, input_num_tokens=200)
benchmark = K6Benchmark(k6_config, f'results/{input_type.value}/')
benchmark.run()
for c in vus_concurrences:
logger.info(f'Running k6 with constant VUs with concurrency {c} with input type {input_type.value}')
k6_executor = K6ConstantVUsExecutor(c, '60s', input_type)
k6_config = K6Config(f'{engine_name}', k6_executor, input_num_tokens=200)
benchmark = K6Benchmark(k6_config, f'results/{input_type.value}/')
benchmark.run()
def merge_previous_results(csv_path: str, df: pd.DataFrame, version_id: str) -> pd.DataFrame:
if os.path.exists(csv_path):
previous_df = pd.read_csv(csv_path)
previous_df['name'] = previous_df['name'].str.replace('tgi', f'tgi_{version_id}')
df = pd.concat([previous_df, df])
return df
def main():
model = 'Qwen/Qwen2-7B'
runner = TGIDockerRunner(model)
max_concurrent_requests = 8000
# run TGI
try:
logger.info('Running TGI')
runner.run([('max-concurrent-requests', max_concurrent_requests)])
logger.info('TGI is running')
run_full_test('tgi')
except Exception as e:
logger.error(f'Error: {e}')
# print the stack trace
print(traceback.format_exc())
finally:
runner.stop()
time.sleep(5)
for input_type in [ExecutorInputType.SHAREGPT_CONVERSATIONS, ExecutorInputType.CONSTANT_TOKENS]:
for test_type in [TestType.CONSTANT_VUS, TestType.CONSTANT_ARRIVAL_RATE]:
directory = os.path.join('results', input_type.value.lower(), test_type.value.lower())
# check if directory exists
if not os.path.exists(directory):
logger.error(f'Directory {directory} does not exist')
continue
dfs = parse_json_files(directory, test_type)
# create output directory if it does not exist
os.makedirs('output', exist_ok=True)
# save the data to a csv file
path = os.path.join(os.getcwd(), 'output', f'{input_type.value.lower()}_{test_type.value.lower()}.csv')
dfs.to_csv(path)
# check if we have previous results CSV file by listing /tmp/artifacts/<input_type> directory,
# merge them if they exist
prev_root = '/tmp/artifacts'
try:
if os.path.exists(prev_root):
directories = [item for item in os.listdir(prev_root) if
os.path.isdir(os.path.join(prev_root, item))]
for d in directories:
for f in os.listdir(f'{prev_root}/{d}'):
if f.endswith(f'{input_type.value.lower()}_{test_type.value.lower()}.csv'):
csv_path = os.path.join('/tmp/artifacts', d, f)
# only keep short commit hash
d = d[:7]
dfs = merge_previous_results(csv_path, dfs, d)
except Exception as e:
logger.error(f'Error while merging previous results, skipping: {e}')
plot_metrics(f'{model} {get_gpu_names()}', dfs, test_type,
f'output/{input_type.value.lower()}_{test_type.value.lower()}')
def get_gpu_names() -> str:
gpus = GPUtil.getGPUs()
if len(gpus) == 0:
return ''
return f'{len(gpus)}x{gpus[0].name if gpus else "No GPU available"}'
if __name__ == '__main__':
main()

View File

@@ -1,27 +0,0 @@
import json
import datasets
import tqdm
def main():
dataset = datasets.load_dataset("Open-Orca/OpenOrca", split="train")
# Select only the first 2k conversations that start with a human.
max = min(2000, len(dataset))
conversations = []
for item in tqdm.tqdm(dataset, total=max):
conversation = {
"conversations": [
{"from": "human", "value": item["question"]},
],
"id": item["id"],
}
conversations.append(conversation)
if len(conversations) >= max:
break
with open("./small.json", "w") as f:
data = json.dump(conversations, f, indent=4)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,115 @@
import json
import os
from enum import Enum
import pandas as pd
from loguru import logger
from matplotlib import pyplot as plt
import scienceplots
plt.style.use('science')
class TestType(Enum):
CONSTANT_VUS = "constant_vus"
CONSTANT_ARRIVAL_RATE = "constant_arrival_rate"
def parse_json_files(directory: str, test_type: TestType) -> pd.DataFrame:
metrics_to_keep = {'inter_token_latency': {'y': 'Time (ms)'}, 'end_to_end_latency': {'y': 'Time (ms)'},
'time_to_first_token': {'y': 'Time (ms)'}, 'tokens_throughput': {'y': 'Tokens/s'},
'tokens_received': {'y': 'Count'}}
df = pd.DataFrame()
for file in os.listdir(directory):
if file.endswith("summary.json"):
filepath = os.path.join(directory, file)
with open(filepath, 'r') as f:
data = json.load(f)
if test_type == TestType.CONSTANT_VUS:
entry = {
"vus": data['k6_config']['vus'],
"duration": data['k6_config']['duration']
}
elif test_type == TestType.CONSTANT_ARRIVAL_RATE:
entry = {
'pre_allocated_vus': data['k6_config']['pre_allocated_vus'],
'rate': data['k6_config']['rate'],
'duration': data['k6_config']['duration']
}
entry['test_duration'] = data['state']['testRunDurationMs'] / 1000.
entry['requests_ok'] = data['root_group']['checks'][0]['passes']
entry['requests_fail'] = data['root_group']['checks'][0]['fails']
entry['dropped_iterations'] = data['metrics']['dropped_iterations']['values'][
'count'] if 'dropped_iterations' in data['metrics'] else 0
# add up requests_fail and dropped_iterations to get total dropped requests
entry['dropped_requests'] = entry['requests_fail'] + entry['dropped_iterations']
entry['error_rate'] = entry['dropped_requests'] / (
entry['requests_ok'] + entry['dropped_requests']) * 100.0
entry['name'] = data['k6_config']['name']
for metric, values in sorted(data['metrics'].items()):
if metric in metrics_to_keep:
for value_key, value in values['values'].items():
if value_key == 'p(90)' or value_key == 'count': # Only keep p(90) values if trend
entry[metric] = value
if 'tokens_throughput' in entry and 'test_duration' in entry:
entry['tokens_throughput'] = entry['tokens_throughput'] / (entry['test_duration'])
if 'inter_token_latency' in entry:
entry['inter_token_latency'] = entry['inter_token_latency'] / 1000.
df = pd.concat([df, pd.DataFrame(entry, index=[0])])
return df
def plot_metrics(model_name:str, df: pd.DataFrame, test_type: TestType, save_name: str):
vus_param = ''
if test_type == TestType.CONSTANT_VUS:
vus_param = 'vus'
else:
vus_param = 'rate'
fig, axs = plt.subplots(3, 2, figsize=(15, 20))
fig.tight_layout(pad=6.0)
fig.subplots_adjust(hspace=0.2, wspace=0.2, bottom=0.15, top=0.92)
names = sorted(df['name'].unique())
metrics = {'inter_token_latency': {'y': 'Time (ms)'}, 'time_to_first_token': {'y': 'Time (ms)'},
'end_to_end_latency': {'y': 'Time (ms)'}, 'tokens_throughput': {'y': 'Tokens/s'},
'requests_ok': {'y': 'Count'}, 'error_rate': {'y': 'Count'}}
titles = ['Inter Token Latency P90 (lower is better)', 'TTFT P90 (lower is better)',
'End to End Latency P90 (lower is better)', 'Request Output Throughput P90 (higher is better)',
'Successful requests (higher is better)', 'Error rate (lower is better)']
labels = ['Time (ms)', 'Time (ms)', 'Time (ms)', 'Tokens/s', 'Count', '%']
colors = ['#FF9D00', '#2F5BA1']
# Plot each metric in its respective subplot
for ax, metric, title, label in zip(axs.flatten(), metrics, titles, labels):
for i, name in enumerate(names):
df_sorted = df[df['name'] == name].sort_values(by=vus_param)
ax.plot(df_sorted[vus_param], df_sorted[metric], marker='o', label=f"{name}", color=colors[i])
ax.set_title(title)
ax.tick_params(axis='x', rotation=0)
ax.set_ylabel(label)
if test_type == TestType.CONSTANT_VUS:
ax.set_xlabel('VUS')
else:
ax.set_xlabel('Requests/s')
# Add grid lines for better readability
ax.grid(True, which='both', axis='y', linestyle='--', linewidth=0.5)
ax.set_axisbelow(True) # Ensure grid lines are below the bars
ax.legend(title='Engine', loc='upper right')
# show title on top of the figure
if test_type == TestType.CONSTANT_VUS:
plt.suptitle(f'Constant VUs Load Test\n{model_name}', fontsize=16)
elif test_type == TestType.CONSTANT_ARRIVAL_RATE:
plt.suptitle(f'Constant Arrival Rate Load Test\n{model_name}', fontsize=16)
logger.info(f"Saving plot to {save_name}.png")
plt.savefig(f"{save_name}.png")
def main():
for test_type in [TestType.CONSTANT_VUS, TestType.CONSTANT_ARRIVAL_RATE]:
directory = f"results/{test_type.value.lower()}"
dfs = parse_json_files(directory, test_type)
plot_metrics('', dfs, test_type, test_type.value.lower())
if __name__ == "__main__":
main()

1561 load_tests/poetry.lock generated Normal file

File diff suppressed because it is too large.

23 load_tests/pyproject.toml Normal file
View File

@@ -0,0 +1,23 @@
[tool.poetry]
name = "tgi-benchmarks"
version = "0.1.0"
description = ""
authors = ["Hugo Larcher <hugo.larcher@huggingface.co>"]
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.11"
numpy = "<2.0"
pandas = "^2.2.2"
scienceplots = "^2.1.1"
docker = "^7.1.0"
loguru = "^0.7.2"
psutil = "^6.0.0"
jinja2 = "^3.1.4"
transformers = "^4.42.3"
gputil = "^1.4.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"