Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-24 16:32:12 +00:00)
feat: Add load tests
This commit is contained in: parent 8511669cb2, commit 8d358d9c61
.github/workflows/ci_build.yaml (vendored): 2 lines changed
@@ -43,3 +43,5 @@ jobs:
       # https://github.com/actions/runner/issues/2206
       release-tests: ${{ inputs.release-tests == true }}
     secrets: inherit
+  load_tests:
+    uses: ./.github/workflows/load_test.yaml
.github/workflows/load_test.yaml (vendored): 84 lines changed
@@ -3,6 +3,7 @@ name: Nightly load test
 on:
   schedule:
     - cron: '0 0 * * 1-5'
+  workflow_call:

   pull_request:
     paths:
@@ -10,33 +11,92 @@ on:
     branches:
       - 'main'

+env:
+  AWS_DEFAULT_REGION: us-east-1
+  AWS_ACCESS_KEY_ID: ${{ secrets.S3_AWS_ACCESS_KEY_ID }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_AWS_SECRET_ACCESS_KEY }}
+
 jobs:
   load-tests:
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
-    runs-on: [self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci]
+    runs-on: [ self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci ]
     env:
       DOCKER_VOLUME: /cache
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3

-      - name: Install k6
+      - name: Install awscli
         run: |
-          curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1
+          sudo apt-get update
+          sudo apt-get install -y awscli

-      - name: Start starcoder
+      - name: Install poetry
         run: |
-          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HF_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
-          sleep 10
-          wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
+          curl -sSL https://install.python-poetry.org | python3 -
+          export PATH="$HOME/.local/bin:$PATH"
+          poetry --version

-      - name: Run k6
+      - name: Install texlive minimal
         run: |
-          ./k6 run load_tests/starcoder_load.js
+          sudo apt-get update
+          sudo apt-get install -y texlive-latex-extra texlive-fonts-recommended dvipng cm-super

-      - name: Stop starcoder
-        if: ${{ always() }}
+      - name: Install Go 1.21
+        uses: actions/setup-go@v2
+        with:
+          go-version: 1.21
+
+      - name: Install Python 3.11
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.11
+
+      - name: Download artifacts from previous runs
+        uses: actions/github-script@v6
+        continue-on-error: true
+        env:
+          WORKFLOW_FILENAME: load_test.yaml
+          ARTIFACT_NAME: benchmark_results_csv
+          ARTIFACT_FILENAME: benchmark_results_csv.zip
+          UNZIP_DIR: /tmp/artifacts
+        with:
+          script: |
+            const script = require('./load_tests/download_artifact.js')
+            await script({github, context, core})
+
+      - name: Run load test
         run: |
-          docker stop tgi-starcoder || true
+          cd load_tests
+          make load-test
+        shell: bash
+
+      - name: Archive test results artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark_results_plots
+          path: |
+            load_tests/output/*
+
+      - name: Upload to S3
+        run: |
+          aws s3 cp load_tests/output/ s3://text-generation-inference-ci/${{ github.sha }} --recursive
+
+      - uses: actions/github-script@v6
+        if: github.event_name == 'pull_request'
+        with:
+          script: |
+            github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: '🚀 Load test results are in:\n\n'+
+                '## Variable length prompts\n'+
+                '<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/sharegpt_conversations_constant_arrival_rate.png" width=200>\n' +
+                '<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/sharegpt_conversations_constant_vus.png" width=200>\n\n' +
+                '## Constant length prompts\n'+
+                '<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/constant_tokens_constant_vus.png" width=200>\n' +
+                '<img src="http://text-generation-inference-ci.s3-website-us-east-1.amazonaws.com/${{github.sha}}/constant_tokens_constant_arrival_rate.png" width=200>\n'
+            })
load_tests/Makefile (modified)
@@ -1,9 +1,24 @@
+.PHONY: download-dataset load-test build-k6
+download-dataset:
+	@if [ ! -f ./benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json ]; then \
+		echo "Downloading dataset"; \
+		curl -L -o benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json; \
+	else \
+		echo "Dataset already downloaded"; \
+	fi

-ShareGPT_V3_unfiltered_cleaned_split.json:
-	wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+load-test: download-dataset build-k6
+	poetry install && poetry run python load_test.py

-prepare_share: ShareGPT_V3_unfiltered_cleaned_split.json
-	python filter.py
-
-prepare_orca:
-	python orca.py
+build-k6:
+	mkdir -p /tmp/xk6 && \
+	cd /tmp/xk6 && \
+	git clone https://github.com/mstoykov/xk6-sse.git && \
+	cd xk6-sse && \
+	git checkout useSobek && \
+	go install go.k6.io/xk6/cmd/xk6@latest && \
+	xk6 build --with github.com/phymbert/xk6-sse=. && \
+	mkdir -p ~/.local/bin/ && \
+	mv k6 /tmp/k6-sse && \
+	rm -rf /tmp/xk6 && \
+	/tmp/k6-sse --version
load_tests/benchmarks/__init__.py (new, empty file)
load_tests/benchmarks/engine.py (new file, 151 lines)
@@ -0,0 +1,151 @@
import subprocess
import threading
from typing import Dict, List

import docker
from docker.models.containers import Container
from loguru import logger

from benchmarks.utils import kill


class InferenceEngineRunner:
    def __init__(self, model: str):
        self.model = model

    def run(self, parameters: list[tuple]):
        NotImplementedError("This method should be implemented by the subclass")

    def stop(self):
        NotImplementedError("This method should be implemented by the subclass")


class TGIRunner(InferenceEngineRunner):
    def __init__(self, model: str):
        super().__init__(model)
        self.process = None
        self.model = model

    def run(self, parameters: list[tuple]):
        params = ""
        for p in parameters:
            params += f"--{p[0]} {str(p[1])}"
        # start a TGI subprocess with the given parameter
        args = f"text-generation-launcher --port 8080 --model-id {self.model} --huggingface-hub-cache /scratch {params}"
        logger.info(f"Running TGI with parameters: {args}")
        self.process = subprocess.Popen(args,
                                        stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        for line in iter(self.process.stdout.readline, b""):
            print(line.decode("utf-8"))
            # wait for TGI to listen to the port
            if b"Connected" in line:
                break
            if b"Error" in line:
                raise Exception(f"Error starting TGI: {line}")

        # continue to stream the logs in a thread
        def stream_logs():
            for line in iter(self.process.stdout.readline, b""):
                print(line.decode("utf-8"))

        if self.process.returncode is not None:
            raise Exception("Error starting TGI")
        self.thread = threading.Thread(target=stream_logs)
        self.thread.start()

    def stop(self):
        logger.warning(f"Killing TGI with PID {self.process.pid}")
        if self.process:
            kill(self.process.pid)
        if self.thread:
            self.thread.join()


class TGIDockerRunner(InferenceEngineRunner):
    def __init__(self,
                 model: str,
                 image: str = "ghcr.io/huggingface/text-generation-inference:latest",
                 volumes=None):
        super().__init__(model)
        if volumes is None:
            volumes = []
        self.container = None
        self.image = image
        self.volumes = volumes

    def run(self, parameters: list[tuple]):
        params = f"--model-id {self.model} --port 8080"
        for p in parameters:
            params += f" --{p[0]} {str(p[1])}"
        logger.info(f"Running TGI with parameters: {params}")
        volumes = {}
        for v in self.volumes:
            volumes[v[0]] = {"bind": v[1], "mode": "rw"}
        self.container = run_docker(self.image, params,
                                    "Connected",
                                    "Error",
                                    volumes=volumes)

    def stop(self):
        if self.container:
            self.container.stop()


class VLLMDockerRunner(InferenceEngineRunner):
    def __init__(self,
                 model: str,
                 image: str = "vllm/vllm-openai:latest",
                 volumes=None):
        super().__init__(model)
        if volumes is None:
            volumes = []
        self.container = None
        self.image = image
        self.volumes = volumes

    def run(self, parameters: list[tuple]):
        parameters.append(("max-num-seqs", "256"))
        params = f"--model {self.model} --tensor-parallel-size {get_num_gpus()} --port 8080"
        for p in parameters:
            params += f" --{p[0]} {str(p[1])}"
        logger.info(f"Running VLLM with parameters: {params}")
        volumes = {}
        for v in self.volumes:
            volumes[v[0]] = {"bind": v[1], "mode": "rw"}
        self.container = run_docker(self.image, params, "Uvicorn running",
                                    "Error ",
                                    volumes=volumes)

    def stop(self):
        if self.container:
            self.container.stop()


def run_docker(image: str, args: str, success_sentinel: str,
               error_sentinel: str, volumes=None) -> Container:
    if volumes is None:
        volumes = {}
    client = docker.from_env()
    # retrieve the GPU devices from CUDA_VISIBLE_DEVICES
    devices = [f"{i}" for i in
               range(get_num_gpus())]
    container = client.containers.run(image, args,
                                      detach=True,
                                      device_requests=[
                                          docker.types.DeviceRequest(device_ids=devices, capabilities=[['gpu']])
                                      ],
                                      volumes=volumes,
                                      shm_size="1g",
                                      ports={"8080/tcp": 8080})
    for line in container.logs(stream=True):
        print(line.decode("utf-8"), end="")
        if success_sentinel.encode("utf-8") in line:
            break
        if error_sentinel.encode("utf-8") in line:
            container.stop()
            raise Exception(f"Error starting container: {line}")
    return container


def get_num_gpus() -> int:
    return len(subprocess.run(["nvidia-smi", "-L"], capture_output=True).stdout.splitlines())
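For orientation, here is a minimal usage sketch of the runner classes above, mirroring how the new load_tests/load_test.py (further down in this diff) drives them; the volume mapping is an illustrative assumption, not part of this commit.

from benchmarks.engine import TGIDockerRunner

# Sketch only: each parameter is a (launcher-flag, value) tuple appended to the TGI args.
runner = TGIDockerRunner("Qwen/Qwen2-7B", volumes=[("/mnt/cache", "/data")])  # volume mapping is hypothetical
try:
    runner.run([("max-concurrent-requests", 8000)])  # blocks until the "Connected" log sentinel appears
finally:
    runner.stop()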
load_tests/benchmarks/k6.py (new file, 233 lines)
@@ -0,0 +1,233 @@
import json
import os
import subprocess
import tempfile
from enum import Enum
from typing import Any, Dict, List

import numpy as np
from jinja2 import Environment, PackageLoader, select_autoescape
from loguru import logger
from transformers import LlamaTokenizerFast

from benchmarks.utils import kill

env = Environment(
    loader=PackageLoader("benchmarks"),
    autoescape=select_autoescape()
)


class ExecutorInputType(Enum):
    CONSTANT_TOKENS = "constant_tokens"
    SHAREGPT_CONVERSATIONS = "sharegpt_conversations"


class K6Executor:
    def __init__(self, name, template_name, executor_input_type=ExecutorInputType.SHAREGPT_CONVERSATIONS):
        self.template_name = template_name
        self.variables = {}
        self.rendered_file = None
        self.name = name
        self.executor_input_type = executor_input_type
        if executor_input_type == ExecutorInputType.CONSTANT_TOKENS:
            self.input_filename = "inputs_constant_tokens.json"
        elif executor_input_type == ExecutorInputType.SHAREGPT_CONVERSATIONS:
            self.input_filename = "inputs_variable_tokens.json"

    def render(self):
        template = env.get_template(self.template_name)
        _, path = tempfile.mkstemp("k6", "benchmark")
        cwd = os.getcwd()
        with open(path, "w") as f:
            f.write(template.render(cwd=cwd, input_filename=self.input_filename, **self.variables))
        self.rendered_file = path

    def __str__(self):
        # returns an underscore separated string of the variables for filename generation
        params = "_".join([f"{k}_{v}" for k, v in sorted(self.variables.items()) if type(v) == str or type(v) == int])
        return f"{self.executor_input_type.value}_{params}"


class K6ConstantArrivalRateExecutor(K6Executor):
    def __init__(self, pre_allocated_vus: int, rate_per_second: int, duration: str,
                 executor_input_type: ExecutorInputType):
        super().__init__("constant_arrival_rate", "k6_constant_arrival_rate.js.j2", executor_input_type)
        self.variables = {
            "pre_allocated_vus": pre_allocated_vus,  # it's also the max vus
            "rate": rate_per_second,
            "duration": duration
        }


class K6RampingArrivalRateExecutor(K6Executor):
    def __init__(self, pre_allocated_vus: int, start_rate: int, time_unit: str, stages: List[Dict[str, Any]],
                 executor_input_type: ExecutorInputType):
        super().__init__("ramping_arrival_rate", "k6_ramping_arrival_rate.js.j2", executor_input_type)
        self.variables = {
            "pre_allocated_vus": pre_allocated_vus,
            "start_rate": start_rate,
            "time_unit": time_unit,
            "stages": stages
        }


class K6ConstantVUsExecutor(K6Executor):
    def __init__(self, vus: int, duration: str, executor_input_type: ExecutorInputType):
        super().__init__("constant_vus", "k6_constant_vus.js.j2", executor_input_type)
        self.variables = {
            "vus": vus,
            "duration": duration
        }


class K6Config:
    def __init__(self, name: str, executor: K6Executor,
                 tokenizer=LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer"),
                 conversations_input_file=None,
                 input_num_tokens=200,
                 max_new_tokens=200,
                 extra_info=None
                 ):
        self.executor = executor
        # max_new_token will be set in k6 template
        self.executor.variables["max_new_tokens"] = max_new_tokens
        self.name = name
        self.tokenizer = tokenizer
        self.extra_info = extra_info
        if conversations_input_file is None:
            self.conversation_input_file = "benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json"
        self.input_num_tokens = input_num_tokens

    def __str__(self):
        return f"K6Config(name={self.name} executor={self.executor})"


class K6Benchmark:
    def __init__(self, k6_config: K6Config, output_dir: str):
        self.process = None
        self.k6_config = k6_config
        self.output_dir = output_dir
        self.input_tokens_len = k6_config.input_num_tokens
        self._prepare_inputs()

    def _prepare_inputs(self):
        get_tokens_count = lambda txt: len(self.k6_config.tokenizer.encode(txt))
        MAX_SAMPLES = 5000

        # create a first input file with a constant number of tokens
        # check if the file already exists
        if not os.path.exists("inputs_constant_tokens.json"):
            logger.info(f'Preparing input file with {self.input_tokens_len} input tokens')
            outputs = []
            with open(self.k6_config.conversation_input_file, "r") as f:
                data = json.load(f)
                for doc in data:
                    for conversation in doc["conversations"]:
                        if not conversation["from"] == "human":
                            continue
                        if get_tokens_count(conversation["value"]) < self.input_tokens_len:
                            continue
                        # encode the message
                        encoding = self.k6_config.tokenizer(conversation["value"], truncation=True,
                                                            max_length=self.input_tokens_len)
                        # find last encoded characters
                        span = encoding.token_to_chars(len(encoding["input_ids"]) - 1)
                        outputs.append(
                            {"message": conversation["value"][0:span.end], "num_tokens": len(encoding["input_ids"])})
                    if len(outputs) >= MAX_SAMPLES:  # limit the number of inputs
                        break
            with open("inputs_constant_tokens.json", "w") as f:
                f.write(json.dumps(outputs))

        # create a second input file with a sampling of inputs
        # check if the file already exists
        if not os.path.exists("inputs_variable_tokens.json"):
            logger.info(
                f'Preparing input file by randomly sampling shareGPT conversations at "{self.k6_config.conversation_input_file}"')
            outputs = []
            with open(self.k6_config.conversation_input_file, "r") as f:
                data = json.load(f)
                num_docs = len(data)
                # generate random indexes to sample the data
                indexes = np.random.choice(num_docs, 200, replace=False)
                for i in indexes:
                    doc = data[i]
                    for conversation in doc["conversations"]:
                        if not conversation["from"] == "human":
                            continue
                        # encode the message without truncation
                        encoding = self.k6_config.tokenizer(conversation["value"])
                        outputs.append(
                            {"message": conversation["value"], "num_tokens": len(encoding["input_ids"])})
                    if len(outputs) >= MAX_SAMPLES:  # limit the number of inputs
                        break
            with open("inputs_variable_tokens.json", "w") as f:
                f.write(json.dumps(outputs))

    def run(self):
        self.k6_config.executor.render()
        args = f"/tmp/k6-sse run --out json=results.json {self.k6_config.executor.rendered_file}"
        logger.info(f"Running k6 with parameters: {args}")
        logger.info(f"K6Config is: {self.k6_config}")
        # start a k6 subprocess
        self.process = subprocess.Popen(args,
                                        stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
        while buffer := os.read(self.process.stdout.fileno(),
                                2048):  # read the output of the process, don't buffer on new lines
            print(buffer.decode(), end='')
        self.process.wait()
        logger.info(f"K6 process finished with return code {self.process.returncode}")
        logger.info(f"Writing results to {self.get_results_path()}")
        self.add_config_to_summary()
        self.add_config_to_results()

    def stop(self):
        if self.process:
            kill(self.process.pid)

    def add_config_to_summary(self):
        with open("summary.json", "r") as f:
            summary = json.load(f)
            summary["k6_config"] = {
                "name": self.k6_config.name,
                "input_type": self.k6_config.executor.executor_input_type.value,
                "extra_info": self.k6_config.extra_info,
                **self.k6_config.executor.variables
            }
            # create directory if it doesn't exist
            os.makedirs(self._get_output_dir(), exist_ok=True)
            with open(self.get_summary_path(), "w") as f2:
                json.dump(summary, f2)

    def add_config_to_results(self):
        with open("results.json", "r") as f:
            results = f.readlines()
            # append the k6 config to the results in jsonlines format
            results += "\n"
            results += json.dumps({
                "name": self.k6_config.name,
                "input_type": self.k6_config.executor.executor_input_type.value,
                "extra_info": self.k6_config.extra_info,
                **self.k6_config.executor.variables
            })
            # create directory if it doesn't exist
            os.makedirs(self._get_output_dir(), exist_ok=True)
            with open(self.get_results_path(), "w") as f2:
                f2.writelines(results)

    def _get_output_dir(self):
        # check if output_dir is relative or absolute
        if self.output_dir.startswith("/"):
            return f"{self.output_dir}/{self.k6_config.executor.name}"
        else:
            return f"{os.getcwd()}/{self.output_dir}/{self.k6_config.executor.name}"

    def _get_output_path(self):
        return f"{self._get_output_dir()}/{self.k6_config.name}_{self.k6_config.executor}"

    def get_results_path(self):
        return f"{self._get_output_path()}.json"

    def get_summary_path(self):
        return f"{self._get_output_path()}.summary.json"
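As a usage sketch (consistent with test_k6.py and load_test.py later in this diff; the output directory is arbitrary and the ShareGPT dataset is assumed to be present under benchmarks/), an executor, a config and a benchmark compose as follows:

from benchmarks.k6 import (ExecutorInputType, K6Benchmark, K6Config,
                           K6ConstantArrivalRateExecutor)

# 10 requests/s for 60s, prompts sampled from ShareGPT conversations
executor = K6ConstantArrivalRateExecutor(2000, 10, "60s",
                                         ExecutorInputType.SHAREGPT_CONVERSATIONS)
config = K6Config("tgi", executor, input_num_tokens=200)
benchmark = K6Benchmark(config, "results/sharegpt_conversations/")  # output path is illustrative
benchmark.run()  # renders the Jinja template, then shells out to /tmp/k6-sse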
load_tests/benchmarks/templates/k6_constant_arrival_rate.js.j2 (new file, 16 lines)
@@ -0,0 +1,16 @@
{% include 'main.js.j2' %}

export function get_options() {
    return {
        scenarios: {
            load_test: {
                executor: 'constant-arrival-rate',
                gracefulStop: '0s',
                duration: '{{ duration }}',
                preAllocatedVUs: {{ pre_allocated_vus }},
                rate: {{ rate }},
                timeUnit: '1s',
            },
        },
    };
}
load_tests/benchmarks/templates/k6_constant_vus.js.j2 (new file, 14 lines)
@@ -0,0 +1,14 @@
{% include 'main.js.j2' %}

export function get_options() {
    return {
        scenarios: {
            load_test: {
                executor: 'constant-vus',
                gracefulStop: '0s',
                duration: '{{ duration }}',
                vus: {{ vus }},
            },
        },
    };
}
load_tests/benchmarks/templates/k6_ramping_arrival_rate.js.j2 (new file, 20 lines)
@@ -0,0 +1,20 @@
{% include 'main.js.j2' %}

export function get_options() {
    return {
        scenarios: {
            load_test: {
                executor: 'ramping-arrival-rate',
                gracefulStop: '0s',
                preAllocatedVUs: {{ pre_allocated_vus }},
                timeUnit: '{{ time_unit }}',
                startRate: {{ start_rate }},
                stages: [
                    {%- for stage in stages %}
                    {target: {{ stage.target }}, duration: '{{ stage.duration }}'},
                    {%- endfor %}
                ],
            },
        },
    };
}
load_tests/benchmarks/templates/main.js.j2 (new file, 131 lines)
@@ -0,0 +1,131 @@
import {check, fail} from 'k6';
import sse from "k6/x/sse"
import {scenario} from 'k6/execution';
import http from 'k6/http';
import {Trend, Counter} from 'k6/metrics';

const host = "127.0.0.1:8080";
const model_id = "Qwen/Qwen2-72B";

const endToEndLatency = new Trend('end_to_end_latency', true);
const requestThroughput = new Counter('request_throughput');
const tokenThroughput = new Counter('tokens_throughput');

const timeToFirstToken = new Trend('time_to_first_token', true);
const interTokenLatency = new Trend('inter_token_latency', true); // is microseconds

const tokensReceived = new Trend('tokens_received');

const max_new_tokens = {{ max_new_tokens }};

const shareGPT = JSON.parse(open("{{ cwd }}/{{ input_filename }}"))

export function handleSummary(data) {
    return {
        'summary.json': JSON.stringify(data),
    };
}

function generate_payload(gpt, max_new_tokens) {
    let input = gpt["message"];
    return {
        "messages": [{"role": "user", "content": input}],
        "temperature": 0,
        "model": `${model_id}`,
        "max_tokens": max_new_tokens,
        "stream": true
    };
}

export const options = get_options();

export default function run() {
    const headers = {'Content-Type': 'application/json'};
    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
    const payload = JSON.stringify(generate_payload(query, max_new_tokens));
    const url = `http://${host}/v1/chat/completions`;
    const params = {
        method: 'POST',
        body: payload,
        headers
    }

    const startTime = Date.now();
    let firstTokenTime = null;
    let lastTokenTime = null;
    let tokensCount = 0;
    let response = ""

    const res = sse.open(url, params, function (client) {
        client.on('event', function (event) {
            // console.log(event.data)
            if (parseInt(event.id) === 4) {
                client.close()
            }
            if (event.data.includes("[DONE]") || event.data === "") {
                return
            }
            try {
                const data = JSON.parse(event.data);
                if (!'choices' in data) {
                    fail('http_200')
                    return;
                }
                const content = data['choices'][0]['delta']['content']
                if (content !== undefined) {
                    response += data['choices'][0]['delta']['content']
                    tokensCount += 1;
                }

                // Measure time to first token
                if (!firstTokenTime) {
                    firstTokenTime = Date.now();
                    timeToFirstToken.add(firstTokenTime - startTime);
                }

                // Measure inter-token latency
                const currentTime = Date.now();
                if (lastTokenTime) {
                    interTokenLatency.add((currentTime - lastTokenTime) * 1000.);
                }
                lastTokenTime = currentTime;

                if ('finish_reason' in data['choices'][0]) {
                    if (data['choices'][0]['finish_reason'] != null) {
                        const endTime = Date.now();
                        const deltaMs = endTime - startTime;
                        endToEndLatency.add(deltaMs)
                        requestThroughput.add(1);
                        tokenThroughput.add(tokensCount);
                        tokensReceived.add(tokensCount);
                    }
                }
            } catch (e) {
                // catch any errors that occur during the event processing
                // increase the fail count of the 'http_200' check
                check(true, {
                    'http_200': (val) => false,
                })
                fail('http_200')
            }
        })

        client.on('error', function (e) {
            console.log('An unexpected error occurred: ', e.error())
        })
    })

    if (tokensCount === 0) {
        // something went wrong with generation
        fail('http_200')
    }

    if (res.status >= 400 && res.status < 500) {
        return;
    }

    check(res, {
        'http_200': (res) => res.status === 200,
    });

}
load_tests/benchmarks/test_k6.py (new file, 31 lines)
@@ -0,0 +1,31 @@
import os
from unittest import TestCase

from benchmarks.k6 import K6RampingArrivalRateExecutor, K6Config, K6ConstantVUsExecutor, K6Benchmark, ExecutorInputType


class K6RampingArrivalRateExecutorTest(TestCase):
    def test_render(self):
        executor = K6RampingArrivalRateExecutor(
            100,
            1,
            "1s",
            [
                {"target": 1, "duration": "30s"},
                {"target": 100, "duration": "30s"}
            ],
            ExecutorInputType.SHAREGPT_CONVERSATIONS)
        executor.render()
        self.assertIsNotNone(executor.rendered_file)
        with open(executor.rendered_file, "r") as f:
            content = f.read()
            self.assertTrue("stages: [" in content)
            self.assertTrue("target: 1, duration: '30s'" in content)
            self.assertTrue(os.getcwd() in content)


class K6BenchmarkTest(TestCase):
    def test_prepare_inputs(self):
        executor = K6ConstantVUsExecutor(1, '1m')
        config = K6Config("test", executor, input_num_tokens=500)
        bench = K6Benchmark(config, "output")
load_tests/benchmarks/utils.py (new file, 25 lines)
@@ -0,0 +1,25 @@
from itertools import chain

import psutil


class SweepParameter:
    def __init__(self, name, start, end, step):
        self.name = name
        self.start = start
        self.end = end
        self.step = step

    def __str__(self):
        return f"{self.name} from {self.start} to {self.end} in steps of {self.step}"

    def __iter__(self):
        for value in chain(range(self.start, self.end, self.step), [self.end]):
            yield value


def kill(proc_pid):
    process = psutil.Process(proc_pid)
    for proc in process.children(recursive=True):
        proc.kill()
    process.kill()
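SweepParameter is not referenced elsewhere in this commit; a small sketch of its iteration contract (the end value is always appended after the range):

from benchmarks.utils import SweepParameter

rate = SweepParameter("rate", 1, 10, 3)
print(rate)        # rate from 1 to 10 in steps of 3
print(list(rate))  # [1, 4, 7, 10]: range(start, end, step) plus the end value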
Deleted file (94 lines removed)
@@ -1,94 +0,0 @@
import { check } from 'k6';
import { scenario } from 'k6/execution';
import http from 'k6/http';
import { Trend, Counter } from 'k6/metrics';

const host = __ENV.HOST;
const model_id = __ENV.MODEL_ID;
const timePerToken = new Trend('time_per_token', true);
const tokens = new Counter('tokens');
const new_tokens = new Counter('new_tokens');
const input_tokens = new Counter('input_tokens');
const max_new_tokens = 50;

// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
const shareGPT = JSON.parse(open("small.json"))


export function get_options() {
    return {
        thresholds: {
            http_req_failed: ['rate==0'],
            // time_per_token: [{
            //     threshold: `p(50)<${5 * reference_latency_ms}`,
            //     abortOnFail: true,
            //     delayAbortEval: '10s'
            // }],
        },
        scenarios: {
            // single_user: {
            //     executor: 'constant-arrival-rate',
            //     duration: '60s',
            //     preAllocatedVUs: 1,
            //     rate: 20,
            //     timeUnit: '1s',
            // },
            load_test: {
                executor: 'constant-arrival-rate',
                duration: '60s',
                preAllocatedVUs: 100,
                rate: 1,
                timeUnit: '1s',
            },
            // breakpoint: {
            //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
            //     preAllocatedVUs: 300,
            //     stages: [
            //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
            //     ],
            // },
            // throughput: {
            //     executor: 'shared-iterations',
            //     vus: 100,
            //     iterations: 200,
            //     maxDuration: '40s',
            // },
        },
    };
}

function generate_payload(gpt, max_new_tokens) {
    const input = gpt["conversations"][0]["value"];
    return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
}

export const options = get_options();

export default function run() {
    const headers = { 'Content-Type': 'application/json' };
    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
    const payload = JSON.stringify(generate_payload(query, max_new_tokens));
    const res = http.post(`http://${host}/v1/chat/completions`, payload, {
        headers,
    });
    if (res.status >= 400 && res.status < 500) {
        return;
    }

    check(res, {
        'Post status is 200': (res) => res.status === 200,
    });
    const duration = res.timings.duration;

    if (res.status === 200) {
        const body = res.json();
        const completion_tokens = body.usage.completion_tokens;
        const latency_ms_per_token = duration / completion_tokens;
        timePerToken.add(latency_ms_per_token);
        const prompt_tokens = body.usage.prompt_tokens;
        input_tokens.add(prompt_tokens);
        new_tokens.add(completion_tokens);
        tokens.add(completion_tokens + prompt_tokens);
    }
}
load_tests/download_artifact.js (new file, 98 lines)
@@ -0,0 +1,98 @@
module.exports = async ({
    github,
    context,
    core
}) => {
    const owner = context.repo.owner;
    const repo = context.repo.repo;

    const workflows = await github.rest.actions.listRepoWorkflows({
        owner,
        repo
    })

    const workflow = workflows.data.workflows.find(w => w.path.includes(process.env.WORKFLOW_FILENAME));

    if (!workflow) {
        core.setFailed("No workflow found");
        return;
    }

    const runs = await github.rest.actions.listWorkflowRuns({
        owner,
        repo,
        workflow_id: workflow.id,
        status: "success",
        per_page: 1
    })

    if (runs.data.total_count === 0) {
        core.setFailed("No runs found");
        return;
    }

    const lastRelease = await github.rest.repos.getLatestRelease({
        owner,
        repo
    });

    const lastReleaseTag = lastRelease.data.tag_name;
    const tagRef = `tags/${lastReleaseTag}`;
    const lastReleaseCommit = await github.rest.git.getRef({
        owner,
        repo,
        ref: tagRef
    });
    const lastReleaseSha = lastReleaseCommit.data.object.sha;
    const lastReleaseRun = await github.rest.actions.listWorkflowRuns({
        owner,
        repo,
        workflow_id: workflow.id,
        head_sha: lastReleaseSha,
        status: "success",
        per_page: 1
    });
    let lastReleaseArtifacts = {data: {artifacts: []}};
    if (lastReleaseRun.data.total_count > 0) {
        lastReleaseArtifacts = await github.rest.actions.listWorkflowRunArtifacts({
            owner,
            repo,
            run_id: lastReleaseRun.data.workflow_runs[0].id
        });
    }

    const lastArtifacts = await github.rest.actions.listWorkflowRunArtifacts({
        owner,
        repo,
        run_id: runs.data.workflow_runs[0].id
    });

    const lastReleaseArtifact = lastReleaseArtifacts.data.artifacts.find(artifact => artifact.name === process.env.ARTIFACT_NAME);
    const lastArtifact = lastArtifacts.data.artifacts.find(artifact => artifact.name === process.env.ARTIFACT_NAME);

    if (lastReleaseArtifact) {
        await downloadArtifact(github, owner, repo, lastReleaseArtifact, lastReleaseTag);
    } else {
        console.log("No release artifact found")
    }
    if (lastArtifact) {
        await downloadArtifact(github, owner, repo, lastArtifact, lastArtifact.workflow_run.head_sha);
    } else {
        console.log("No last run artifact found")
    }
}

async function downloadArtifact(github, owner, repo, artifact, suffix) {
    const response = await github.rest.actions.downloadArtifact({
        owner,
        repo,
        artifact_id: artifact.id,
        archive_format: 'zip'
    });
    require('fs').writeFileSync(process.env.ARTIFACT_FILENAME, Buffer.from(response.data));
    // create directory to unzip
    require('fs').mkdirSync(`${process.env.UNZIP_DIR}/${artifact.workflow_run.head_sha}`, {recursive: true});
    require('child_process').execSync(`unzip -o ${process.env.ARTIFACT_FILENAME} -d ${process.env.UNZIP_DIR}/${suffix}`);

    console.log(`Artifact ${process.env.ARTIFACT_FILENAME} for ${suffix} downloaded successfully`);
}
Deleted file (26 lines removed)
@@ -1,26 +0,0 @@
import json


def main():
    with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f:
        data = json.load(f)

    # Select only the first 2k conversations that start with a human.
    max = 2000
    conversations = []
    for conversation in data:
        conv = conversation.get("conversations")
        if conv and conv[0]["from"] == "human":
            # Trim the rest of the output
            conversation["conversations"] = conversation["conversations"][:1]
            conversations.append(conversation)

        if len(conversation) >= max:
            break

    with open("./small.json", "w") as f:
        data = json.dump(conversations, f, indent=4)


if __name__ == "__main__":
    main()
load_tests/load_test.py (new file, 103 lines)
@@ -0,0 +1,103 @@
import os
import time
import traceback

from benchmarks.engine import TGIDockerRunner
from benchmarks.k6 import K6Config, K6Benchmark, K6ConstantArrivalRateExecutor, K6ConstantVUsExecutor, ExecutorInputType
from loguru import logger
import pandas as pd
import GPUtil

from parse_load_test import TestType, parse_json_files, plot_metrics


def run_full_test(engine_name: str):
    vus_concurrences = list(range(0, 1024, 40))
    vus_concurrences[0] = 1
    vus_concurrences.append(1024)
    arrival_rates = list(range(0, 200, 10))
    arrival_rates[0] = 1
    arrival_rates.append(200)
    for input_type in [ExecutorInputType.SHAREGPT_CONVERSATIONS, ExecutorInputType.CONSTANT_TOKENS]:
        for c in arrival_rates:
            logger.info(f'Running k6 with constant arrival rate for {c} req/s with input type {input_type.value}')
            k6_executor = K6ConstantArrivalRateExecutor(2000, c, '60s', input_type)
            k6_config = K6Config(f'{engine_name}', k6_executor, input_num_tokens=200)
            benchmark = K6Benchmark(k6_config, f'results/{input_type.value}/')
            benchmark.run()
        for c in vus_concurrences:
            logger.info(f'Running k6 with constant VUs with concurrency {c} with input type {input_type.value}')
            k6_executor = K6ConstantVUsExecutor(c, '60s', input_type)
            k6_config = K6Config(f'{engine_name}', k6_executor, input_num_tokens=200)
            benchmark = K6Benchmark(k6_config, f'results/{input_type.value}/')
            benchmark.run()


def merge_previous_results(csv_path: str, df: pd.DataFrame, version_id: str) -> pd.DataFrame:
    if os.path.exists(csv_path):
        previous_df = pd.read_csv(csv_path)
        previous_df['name'] = previous_df['name'].str.replace('tgi', f'tgi_{version_id}')
        df = pd.concat([previous_df, df])
    return df


def main():
    model = 'Qwen/Qwen2-7B'
    runner = TGIDockerRunner(model)
    max_concurrent_requests = 8000
    # run TGI
    try:
        logger.info('Running TGI')
        runner.run([('max-concurrent-requests', max_concurrent_requests)])
        logger.info('TGI is running')
        run_full_test('tgi')
    except Exception as e:
        logger.error(f'Error: {e}')
        # print the stack trace
        print(traceback.format_exc())
    finally:
        runner.stop()
        time.sleep(5)

    for input_type in [ExecutorInputType.SHAREGPT_CONVERSATIONS, ExecutorInputType.CONSTANT_TOKENS]:
        for test_type in [TestType.CONSTANT_VUS, TestType.CONSTANT_ARRIVAL_RATE]:
            directory = os.path.join('results', input_type.value.lower(), test_type.value.lower())
            # check if directory exists
            if not os.path.exists(directory):
                logger.error(f'Directory {directory} does not exist')
                continue
            dfs = parse_json_files(directory, test_type)
            # create output directory if it does not exist
            os.makedirs('output', exist_ok=True)
            # save the data to a csv file
            path = os.path.join(os.getcwd(), 'output', f'{input_type.value.lower()}_{test_type.value.lower()}.csv')
            dfs.to_csv(path)
            # check if we have previous results CSV file by listing /tmp/artifacts/<input_type> directory,
            # merge them if they exist
            prev_root = '/tmp/artifacts'
            try:
                if os.path.exists(prev_root):
                    directories = [item for item in os.listdir(prev_root) if
                                   os.path.isdir(os.path.join(prev_root, item))]
                    for d in directories:
                        for f in os.listdir(f'{prev_root}/{d}'):
                            if f.endswith(f'{input_type.value.lower()}_{test_type.value.lower()}.csv'):
                                csv_path = os.path.join('/tmp/artifacts', d, f)
                                # only keep short commit hash
                                d = d[:7]
                                dfs = merge_previous_results(csv_path, dfs, d)
            except Exception as e:
                logger.error(f'Error while merging previous results, skipping: {e}')
            plot_metrics(f'{model} {get_gpu_names()}', dfs, test_type,
                         f'output/{input_type.value.lower()}_{test_type.value.lower()}')


def get_gpu_names() -> str:
    gpus = GPUtil.getGPUs()
    if len(gpus) == 0:
        return ''
    return f'{len(gpus)}x{gpus[0].name if gpus else "No GPU available"}'


if __name__ == '__main__':
    main()
Deleted file (27 lines removed)
@@ -1,27 +0,0 @@
import json
import datasets
import tqdm


def main():
    dataset = datasets.load_dataset("Open-Orca/OpenOrca", split="train")
    # Select only the first 2k conversations that start with a human.
    max = min(2000, len(dataset))
    conversations = []
    for item in tqdm.tqdm(dataset, total=max):
        conversation = {
            "conversations": [
                {"from": "human", "value": item["question"]},
            ],
            "id": item["id"],
        }
        conversations.append(conversation)
        if len(conversations) >= max:
            break

    with open("./small.json", "w") as f:
        data = json.dump(conversations, f, indent=4)


if __name__ == "__main__":
    main()
load_tests/parse_load_test.py (new file, 115 lines)
@@ -0,0 +1,115 @@
import json
import os
from enum import Enum
import pandas as pd
from loguru import logger

from matplotlib import pyplot as plt
import scienceplots

plt.style.use('science')


class TestType(Enum):
    CONSTANT_VUS = "constant_vus"
    CONSTANT_ARRIVAL_RATE = "constant_arrival_rate"


def parse_json_files(directory: str, test_type: TestType) -> pd.DataFrame:
    metrics_to_keep = {'inter_token_latency': {'y': 'Time (ms)'}, 'end_to_end_latency': {'y': 'Time (ms)'},
                       'time_to_first_token': {'y': 'Time (ms)'}, 'tokens_throughput': {'y': 'Tokens/s'},
                       'tokens_received': {'y': 'Count'}}
    df = pd.DataFrame()
    for file in os.listdir(directory):
        if file.endswith("summary.json"):
            filepath = os.path.join(directory, file)
            with open(filepath, 'r') as f:
                data = json.load(f)
                if test_type == TestType.CONSTANT_VUS:
                    entry = {
                        "vus": data['k6_config']['vus'],
                        "duration": data['k6_config']['duration']
                    }
                elif test_type == TestType.CONSTANT_ARRIVAL_RATE:
                    entry = {
                        'pre_allocated_vus': data['k6_config']['pre_allocated_vus'],
                        'rate': data['k6_config']['rate'],
                        'duration': data['k6_config']['duration']
                    }
                entry['test_duration'] = data['state']['testRunDurationMs'] / 1000.
                entry['requests_ok'] = data['root_group']['checks'][0]['passes']
                entry['requests_fail'] = data['root_group']['checks'][0]['fails']
                entry['dropped_iterations'] = data['metrics']['dropped_iterations']['values'][
                    'count'] if 'dropped_iterations' in data['metrics'] else 0
                # add up requests_fail and dropped_iterations to get total dropped requests
                entry['dropped_requests'] = entry['requests_fail'] + entry['dropped_iterations']
                entry['error_rate'] = entry['dropped_requests'] / (
                        entry['requests_ok'] + entry['dropped_requests']) * 100.0
                entry['name'] = data['k6_config']['name']
                for metric, values in sorted(data['metrics'].items()):
                    if metric in metrics_to_keep:
                        for value_key, value in values['values'].items():
                            if value_key == 'p(90)' or value_key == 'count':  # Only keep p(90) values if trend
                                entry[metric] = value
                if 'tokens_throughput' in entry and 'test_duration' in entry:
                    entry['tokens_throughput'] = entry['tokens_throughput'] / (entry['test_duration'])
                if 'inter_token_latency' in entry:
                    entry['inter_token_latency'] = entry['inter_token_latency'] / 1000.
                df = pd.concat([df, pd.DataFrame(entry, index=[0])])
    return df


def plot_metrics(model_name: str, df: pd.DataFrame, test_type: TestType, save_name: str):
    vus_param = ''
    if test_type == TestType.CONSTANT_VUS:
        vus_param = 'vus'
    else:
        vus_param = 'rate'
    fig, axs = plt.subplots(3, 2, figsize=(15, 20))
    fig.tight_layout(pad=6.0)
    fig.subplots_adjust(hspace=0.2, wspace=0.2, bottom=0.15, top=0.92)

    names = sorted(df['name'].unique())
    metrics = {'inter_token_latency': {'y': 'Time (ms)'}, 'time_to_first_token': {'y': 'Time (ms)'},
               'end_to_end_latency': {'y': 'Time (ms)'}, 'tokens_throughput': {'y': 'Tokens/s'},
               'requests_ok': {'y': 'Count'}, 'error_rate': {'y': 'Count'}}
    titles = ['Inter Token Latency P90 (lower is better)', 'TTFT P90 (lower is better)',
              'End to End Latency P90 (lower is better)', 'Request Output Throughput P90 (higher is better)',
              'Successful requests (higher is better)', 'Error rate (lower is better)']
    labels = ['Time (ms)', 'Time (ms)', 'Time (ms)', 'Tokens/s', 'Count', '%']
    colors = ['#FF9D00', '#2F5BA1']
    # Plot each metric in its respective subplot
    for ax, metric, title, label in zip(axs.flatten(), metrics, titles, labels):
        for i, name in enumerate(names):
            df_sorted = df[df['name'] == name].sort_values(by=vus_param)
            ax.plot(df_sorted[vus_param], df_sorted[metric], marker='o', label=f"{name}", color=colors[i])
        ax.set_title(title)
        ax.tick_params(axis='x', rotation=0)
        ax.set_ylabel(label)
        if test_type == TestType.CONSTANT_VUS:
            ax.set_xlabel('VUS')
        else:
            ax.set_xlabel('Requests/s')
        # Add grid lines for better readability
        ax.grid(True, which='both', axis='y', linestyle='--', linewidth=0.5)
        ax.set_axisbelow(True)  # Ensure grid lines are below the bars
        ax.legend(title='Engine', loc='upper right')

    # show title on top of the figure
    if test_type == TestType.CONSTANT_VUS:
        plt.suptitle(f'Constant VUs Load Test\n{model_name}', fontsize=16)
    elif test_type == TestType.CONSTANT_ARRIVAL_RATE:
        plt.suptitle(f'Constant Arrival Rate Load Test\n{model_name}', fontsize=16)
    logger.info(f"Saving plot to {save_name}.png")
    plt.savefig(f"{save_name}.png")


def main():
    for test_type in [TestType.CONSTANT_VUS, TestType.CONSTANT_ARRIVAL_RATE]:
        directory = f"results/{test_type.value.lower()}"
        dfs = parse_json_files(directory, test_type)
        plot_metrics(dfs, test_type, test_type.value.lower())


if __name__ == "__main__":
    main()
load_tests/poetry.lock (generated, new file, 1561 lines): file diff suppressed because it is too large
load_tests/pyproject.toml (new file, 23 lines)
@@ -0,0 +1,23 @@
[tool.poetry]
name = "tgi-benchmarks"
version = "0.1.0"
description = ""
authors = ["Hugo Larcher <hugo.larcher@huggingface.co>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.11"
numpy = "<2.0"
pandas = "^2.2.2"
scienceplots = "^2.1.1"
docker = "^7.1.0"
loguru = "^0.7.2"
psutil = "^6.0.0"
jinja2 = "^3.1.4"
transformers = "^4.42.3"
gputil = "^1.4.0"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"