Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-09-10 20:04:52 +00:00
Adding small benchmark script.

parent bfa070611d
commit d93e2ab5b6
load_tests/common.js (new file, 64 lines)
@@ -0,0 +1,64 @@
import { check, randomSeed } from 'k6';
import http from 'k6/http';
import { Trend, Counter } from 'k6/metrics';
import { randomItem } from 'https://jslib.k6.io/k6-utils/1.2.0/index.js';

// Fixed seed so every run samples the same sequence of prompts.
const seed = 0;

const host = __ENV.HOST || '127.0.0.1:8000';
const timePerToken = new Trend('time_per_token', true);
const throughput = new Counter('tokens_per_s');

randomSeed(seed);
// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
const shareGPT = JSON.parse(open("small.json"))


export function get_options(reference_latency_ms) {
    return {
        thresholds: {
            // Any failed request fails the whole run.
            http_req_failed: ['rate==0'],
            // Abort if the median per-token latency exceeds 3x the engine's reference latency.
            time_per_token: [{
                threshold: `p(50)<${3 * reference_latency_ms}`,
                abortOnFail: true,
                delayAbortEval: '10s'
            }],
        },
        scenarios: {
            load_test: {
                // 10 new requests per second for 60s, independent of response times.
                executor: 'constant-arrival-rate',
                duration: '60s',
                preAllocatedVUs: 100,
                rate: 10,
                timeUnit: '1s',
            },
        },
    };
}


export function run(host, generate_payload, max_new_tokens) {
    const headers = {'Content-Type': 'application/json'};
    const query = randomItem(shareGPT);
    const payload = JSON.stringify(generate_payload(query));
    const res = http.post(`http://${host}/generate`, payload, {
        headers,
    });
    // Client errors (e.g. validation failures) are skipped rather than measured.
    if (res.status >= 400 && res.status < 500) {
        return;
    }

    check(res, {
        'Post status is 200': (r) => r.status === 200,
    });
    const n_tokens = max_new_tokens;
    const timings = res.timings.duration;

    if (res.status === 200) {
        // Approximate per-token latency: total request duration over the requested token count.
        const latency_ms_per_token = timings / n_tokens;
        timePerToken.add(latency_ms_per_token);
        const latency_in_s = latency_ms_per_token / 1000;
        const individual_throughput = 1 / latency_in_s;
        throughput.add(individual_throughput);
    }
}
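A note on the data file: run() samples prompts with randomItem(shareGPT), and both adapter scripts below read gpt["conversations"][0]["value"], so small.json is presumably a trimmed-down ShareGPT-style export. A minimal sketch of the expected record shape, inferred from that field access (the prompt text and the "from" key are invented for illustration):

// Hypothetical shape of one record in load_tests/small.json (values invented);
// the file itself would be a JSON array of such records.
const exampleRecord = {
    "conversations": [
        { "from": "human", "value": "What is the capital of France?" }
    ]
};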
load_tests/tgi.js (new file, 17 lines)
@@ -0,0 +1,17 @@
import { get_options, run } from "./common.js";

const reference_latency_ms = 30;
const host = __ENV.HOST || '127.0.0.1:8000';
const max_new_tokens = 50;


// TGI /generate schema: prompt under "inputs", generation settings under "parameters".
function generate_payload(gpt) {
    const input = gpt["conversations"][0]["value"];
    return {"inputs": input, "parameters": {"max_new_tokens": max_new_tokens, "temperature": 0.5}}
}

export const options = get_options(reference_latency_ms);

export default function() {
    run(host, generate_payload, max_new_tokens);
}
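Assuming k6 is installed and small.json is present alongside the scripts, this adapter would presumably be launched as "k6 run load_tests/tgi.js", or as "k6 run -e HOST=127.0.0.1:3000 load_tests/tgi.js" to target a TGI server listening somewhere other than the 127.0.0.1:8000 default (the scripts read the HOST environment variable via __ENV.HOST).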
load_tests/vllm.js (new file, 17 lines)
@@ -0,0 +1,17 @@
import { get_options, run } from "./common.js";

const reference_latency_ms = 22;
const host = __ENV.HOST || '127.0.0.1:8000';
const max_new_tokens = 50;


// vLLM api_server schema: flat fields; "ignore_eos" keeps generation from stopping early at EOS.
function generate_payload(gpt) {
    const input = gpt["conversations"][0]["value"];
    return {"prompt": input, "temperature": 0.5, "ignore_eos": true}
}

export const options = get_options(reference_latency_ms);

export default function() {
    run(host, generate_payload, max_new_tokens);
}
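The two adapters differ only in payload schema and in reference_latency_ms (30 ms for TGI, 22 ms for vLLM), which get_options turns into abort thresholds of p(50) < 90 ms and p(50) < 66 ms respectively. To make the metric math in run() concrete, a worked example with assumed numbers:

// Worked example of run()'s metric math (request numbers assumed, not measured):
const duration_ms = 2000;                               // res.timings.duration for one request
const n_tokens = 50;                                    // max_new_tokens in both adapters
const latency_ms_per_token = duration_ms / n_tokens;    // 40 ms/token -> time_per_token Trend
const tokens_per_s = 1 / (latency_ms_per_token / 1000); // 25 tokens/s -> tokens_per_s Counter
// 40 ms/token would pass both thresholds: TGI's 90 ms bound and vLLM's 66 ms bound.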