fix: Update runners group

Hugo Larcher 2024-09-30 18:00:54 +02:00
parent fc7dcb0ba6
commit 2980720af4
2 changed files with 62 additions and 30 deletions

@@ -21,7 +21,7 @@ jobs:
       group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
     runs-on:
-      group: aws-g6-12xlarge-plus-priv
+      group: aws-g6-12xl-plus-priv-cache
     env:
       DOCKER_VOLUME: /cache
     steps:
@@ -41,8 +41,10 @@ jobs:
       - name: Run bench test
         run: |
+          export PATH="$HOME/.local/bin:$PATH"
           cd load_tests
-          python benchmarks.py
+          poetry install
+          poetry run python benchmarks.py
        shell: bash
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN_BENCHMARK }}
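The step now drives the benchmark through Poetry instead of the system interpreter; the PATH export assumes Poetry was installed into ~/.local/bin by an earlier step that is not part of this diff. Roughly the same sequence, expressed as a Python sketch rather than repository code:

import os
import subprocess

# Sketch of the updated CI step: put Poetry on PATH, install the load_tests
# project, then launch the benchmark driver through Poetry.
env = dict(os.environ)
env["PATH"] = os.path.expanduser("~/.local/bin") + os.pathsep + env["PATH"]
subprocess.run(["poetry", "install"], cwd="load_tests", env=env, check=True)
subprocess.run(["poetry", "run", "python", "benchmarks.py"], cwd="load_tests", env=env, check=True)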

@@ -1,6 +1,7 @@
 import json
 import os
 import traceback
+from typing import Dict, Tuple, List
 
 import GPUtil
 import docker
@@ -13,7 +14,7 @@ class InferenceEngineRunner:
     def __init__(self, model: str):
         self.model = model
 
-    def run(self, parameters: list[tuple]):
+    def run(self, parameters: list[tuple], gpus: int = 0):
         NotImplementedError("This method should be implemented by the subclass")
 
     def stop(self):
@@ -32,7 +33,7 @@ class TGIDockerRunner(InferenceEngineRunner):
         self.image = image
         self.volumes = volumes
 
-    def run(self, parameters: list[tuple]):
+    def run(self, parameters: list[tuple], gpus: int = 0):
         params = f"--model-id {self.model} --port 8080"
         for p in parameters:
             params += f" --{p[0]} {str(p[1])}"
@@ -43,7 +44,10 @@ class TGIDockerRunner(InferenceEngineRunner):
         self.container = run_docker(self.image, params,
                                     "Connected",
                                     "ERROR",
-                                    volumes=volumes)
+                                    volumes=volumes,
+                                    gpus=gpus,
+                                    ports={"8080/tcp": 8080}
+                                    )
 
     def stop(self):
         if self.container:
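For orientation, this is how the updated signature ends up being called from main() further down in this diff (a usage sketch only; the model name and GPU count mirror the values used there):

# Usage sketch: start TGI for one model on a single GPU, then stop it.
runner = TGIDockerRunner("meta-llama/Llama-3.1-8B-Instruct")
try:
    runner.run([("max-concurrent-requests", 512)], gpus=1)
finally:
    runner.stop()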
@@ -53,15 +57,15 @@ class TGIDockerRunner(InferenceEngineRunner):
 class BenchmarkRunner:
     def __init__(self,
                  image: str = "ghcr.io/huggingface/text-generation-inference-benchmark:latest",
-                 volumes=None):
+                 volumes: List[Tuple[str, str]] = None):
         if volumes is None:
             volumes = []
         self.container = None
         self.image = image
         self.volumes = volumes
 
-    def run(self, parameters: list[tuple]):
-        params = ""
+    def run(self, parameters: list[tuple], network_mode):
+        params = "text-generation-inference-benchmark"
         for p in parameters:
             params += f" --{p[0]} {str(p[1])}" if p[1] is not None else f" --{p[0]}"
         logger.info(f"Running text-generation-inference-benchmarks with parameters: {params}")
@@ -70,8 +74,11 @@ class BenchmarkRunner:
             volumes[v[0]] = {"bind": v[1], "mode": "rw"}
         self.container = run_docker(self.image, params,
                                     "Benchmark finished",
-                                    "Error",
-                                    volumes=volumes)
+                                    "Fatal:",
+                                    volumes=volumes,
+                                    extra_env={"RUST_LOG": "text_generation_inference_benchmark=info",
+                                               "RUST_BACKTRACE": "full"},
+                                    network_mode=network_mode)
 
     def stop(self):
         if self.container:
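The new network_mode argument is what lets the benchmark reach TGI without any published ports: main() passes container:<tgi container id>, so the benchmark container joins the TGI container's network namespace and talks to the server on 127.0.0.1:8080. A minimal illustration of that Docker feature with the docker SDK (the images here are stand-ins, not from this repository):

import time
import docker

client = docker.from_env()
# Stand-in for the TGI container started by TGIDockerRunner.
server = client.containers.run("nginx:alpine", detach=True)
time.sleep(2)  # give the server a moment to come up
# The probe container shares the server's network namespace, so
# 127.0.0.1:80 inside it is the nginx server, with no ports published.
probe = client.containers.run("alpine", "wget -qO- http://127.0.0.1:80",
                              network_mode=f"container:{server.id}",
                              detach=True)
print(probe.wait(), probe.logs().decode())
server.stop()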
@@ -79,23 +86,31 @@ class BenchmarkRunner:
 def run_docker(image: str, args: str, success_sentinel: str,
-               error_sentinel: str, volumes=None, gpus: int = 0) -> Container:
+               error_sentinel: str, ports: Dict[str, int] = None, volumes=None, network_mode: str = "bridge",
+               gpus: int = 0, extra_env: Dict[str, str] = None) -> Container:
+    if ports is None:
+        ports = {}
     if volumes is None:
         volumes = {}
-    client = docker.from_env()
+    if extra_env is None:
+        extra_env = {}
+    client = docker.from_env(timeout=300)
 
     # retrieve the GPU devices from CUDA_VISIBLE_DEVICES
     devices = [f"{i}" for i in
                range(get_num_gpus())][:gpus]
+    environment = {"HF_TOKEN": os.environ.get("HF_TOKEN")}
+    environment.update(extra_env)
     container = client.containers.run(image, args,
                                       detach=True,
                                       device_requests=[
                                           docker.types.DeviceRequest(device_ids=devices,
-                                                                     capabilities=[['gpu']]) if gpus > 0 else None
-                                      ],
+                                                                     capabilities=[['gpu']])
+                                      ] if gpus > 0 else None,
                                       volumes=volumes,
                                       shm_size="1g",
-                                      ports={"8080/tcp": 8080},
-                                      environment={"HF_TOKEN": os.environ.get("HF_TOKEN")}, )
+                                      ports=ports,
+                                      network_mode=network_mode,
+                                      environment=environment, )
 
     for line in container.logs(stream=True):
         print(line.decode("utf-8"), end="")
         if success_sentinel.encode("utf-8") in line:
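run_docker() relies on get_num_gpus() and get_gpu_name(), which are used here and in main() but are not part of this diff. A plausible sketch of such helpers on top of the GPUtil import above; this is an assumption, not the repository's actual implementation:

import GPUtil

def get_num_gpus() -> int:
    # GPUs visible on the host (what the device_ids slice above is built from).
    return len(GPUtil.getGPUs())

def get_gpu_name() -> str:
    # Name of the first GPU, e.g. "NVIDIA L4"; empty string on CPU-only hosts.
    gpus = GPUtil.getGPUs()
    return gpus[0].name if gpus else ""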
@@ -145,6 +160,8 @@ def build_df(model: str, data_files: dict[str, str]) -> pd.DataFrame:
 
 def main():
     results_dir = 'results'
+    # get absolute path
+    results_dir = os.path.join(os.path.dirname(__file__), results_dir)
     logger.info('Starting benchmark')
     models = [
         ('meta-llama/Llama-3.1-8B-Instruct', 1),
@@ -152,15 +169,17 @@ def main():
         # ('mistralai/Mixtral-8x7B-Instruct-v0.1', 2),
     ]
     sha = os.environ.get('GITHUB_SHA')
-    # create results directory
-    os.makedirs(results_dir, exist_ok=True)
+    success = True
     for model in models:
         tgi_runner = TGIDockerRunner(model[0])
+        # create results directory
+        model_dir = os.path.join(results_dir, f'{model[0].replace("/", "_").replace(".", "_")}')
+        os.makedirs(model_dir, exist_ok=True)
         runner = BenchmarkRunner(
-            volumes=['results', '/opt/text-generation-inference-benchmark/results']
+            volumes=[(model_dir, '/opt/text-generation-inference-benchmark/results')]
         )
         try:
-            tgi_runner.run([('max-concurrent-requests', 512)])
+            tgi_runner.run([('max-concurrent-requests', 512)], gpus=model[1])
             logger.info(f'TGI started for model {model[0]}')
             parameters = [
                 ('tokenizer-name', model[0]),
@@ -171,27 +190,38 @@ def main():
                 ('benchmark-kind', 'rate'),
                 ('prompt-options', 'num_tokens=200,max_tokens=220,min_tokens=180,variance=10'),
                 ('decode-options', 'num_tokens=200,max_tokens=220,min_tokens=180,variance=10'),
-                ('extra-meta', f'engine=TGI,tp={model[1]},version={sha},gpu={get_gpu_name()}'),
-                ('--no-console', None)
+                ('extra-meta', f'"engine=TGI,tp={model[1]},version={sha},gpu={get_gpu_name()}"'),
+                ('no-console', None)
             ]
-            runner.run(parameters)
+            rates = [('rates', f'{r / 10.}') for r in list(range(8, 248, 8))]
+            parameters.extend(rates)
+            runner.run(parameters, f'container:{tgi_runner.container.id}')
         except Exception as e:
             logger.error(f'Error running benchmark for model {model[0]}: {e}')
             # print the stack trace
             print(traceback.format_exc())
+            success = False
         finally:
             tgi_runner.stop()
             runner.stop()
-    # list json files in results directory
-    data_files = {}
+    if not success:
+        logger.error('Some benchmarks failed')
+        exit(1)
     df = pd.DataFrame()
-    for filename in os.listdir(results_dir):
-        if filename.endswith('.json'):
-            data_files[filename.split('.')[-2]] = f'{results_dir}/{filename}'
-    df = pd.concat([df, build_df(results_dir.split('/')[-1], data_files)])
+    # list recursively directories
+    directories = [f'{results_dir}/{d}' for d in os.listdir(results_dir) if os.path.isdir(f'{results_dir}/{d}')]
+    logger.info(f'Found result directories: {directories}')
+    for directory in directories:
+        data_files = {}
+        for filename in os.listdir(directory):
+            if filename.endswith('.json'):
+                data_files[filename.split('.')[-2]] = f'{directory}/{filename}'
+        logger.info(f'Processing directory {directory}')
+        df = pd.concat([df, build_df(directory.split('/')[-1], data_files)])
     df['device'] = get_gpu_name()
     df['error_rate'] = df['failed_requests'] / (df['failed_requests'] + df['successful_requests']) * 100.0
-    df.to_parquet('s3://text-generation-inference-ci/benchmarks/ci/')
+    df.to_parquet(f's3://text-generation-inference-ci/benchmarks/ci/{sha}.parquet')
 
 
 if __name__ == "__main__":
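Two small details of the new main() are easy to check in isolation: the rates sweep expands to 30 constant-rate stages from 0.8 to 24.0 requests per second in steps of 0.8, and the per-model results directory is the model id with '/' and '.' replaced by '_'. A quick sanity check:

import os

rates = [('rates', f'{r / 10.}') for r in list(range(8, 248, 8))]
print(len(rates))            # 30
print(rates[0], rates[-1])   # ('rates', '0.8') ('rates', '24.0')

model = 'meta-llama/Llama-3.1-8B-Instruct'
model_dir = os.path.join('results', f'{model.replace("/", "_").replace(".", "_")}')
print(model_dir)             # results/meta-llama_Llama-3_1-8B-Instruct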