Updating the benchmarks so everyone uses openai compat layer.

Nicolas Patry 2024-04-23 21:07:36 +00:00
parent ed72e92126
commit ba33c66b5b
3 changed files with 29 additions and 51 deletions

common.js (the shared k6 script, imported as "./common.js" by the deleted per-backend scripts)

@@ -5,32 +5,35 @@ import { randomItem } from 'https://jslib.k6.io/k6-utils/1.2.0/index.js';
 const seed = 0;
-const host = __ENV.HOST || '127.0.0.1:8000';
+const host = __ENV.HOST;
+const model_id = __ENV.MODEL_ID;
 const timePerToken = new Trend('time_per_token', true);
 const tokens = new Counter('tokens');
 const new_tokens = new Counter('new_tokens');
 const input_tokens = new Counter('input_tokens');
+const max_new_tokens = 50;
+const reference_latency_ms = 32;
 randomSeed(seed);
 // const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
 const shareGPT = JSON.parse(open("small.json"))
-export function get_options(reference_latency_ms){
+export function get_options() {
     return {
         thresholds: {
             http_req_failed: ['rate==0'],
-            time_per_token: [{
-                threshold: `p(50)<${5 * reference_latency_ms}`,
-                abortOnFail: true,
-                delayAbortEval: '10s'
-            }],
+            // time_per_token: [{
+            //     threshold: `p(50)<${5 * reference_latency_ms}`,
+            //     abortOnFail: true,
+            //     delayAbortEval: '10s'
+            // }],
         },
         scenarios: {
             load_test: {
                 executor: 'constant-arrival-rate',
                 duration: '60s',
-                preAllocatedVUs: 10,
+                preAllocatedVUs: 1000,
                 rate: 10,
                 timeUnit: '1s',
             },
@@ -38,12 +41,16 @@ export function get_options(reference_latency_ms){
     };
 }
+function generate_payload(gpt, max_new_tokens) {
+    const input = gpt["conversations"][0]["value"];
+    return { "messages": [{ "role": "user", "content": input }], "temperature": 0.5, "ignore_eos": true, "model": `${model_id}`, "max_tokens": max_new_tokens }
+}
-export function run(host, generate_payload, max_new_tokens) {
+export function run() {
     const headers = { 'Content-Type': 'application/json' };
     const query = randomItem(shareGPT);
-    const payload = JSON.stringify(generate_payload(query));
-    const res = http.post(`http://${host}/generate`, payload, {
+    const payload = JSON.stringify(generate_payload(query, max_new_tokens));
+    const res = http.post(`http://${host}/v1/chat/completions`, payload, {
         headers,
     });
     if (res.status >= 400 && res.status < 500) {
@@ -58,14 +65,19 @@ export function run(host, generate_payload, max_new_tokens) {
     if (res.status === 200) {
         const body = res.json();
-        const n_tokens = body.details.tokens.length;
+        const n_tokens = body.usage.completion_tokens;
         const latency_ms_per_token = duration / n_tokens;
         timePerToken.add(latency_ms_per_token);
-        const latency_in_s = latency_ms_per_token / 1000;
-        const individual_throughput = 1 / latency_in_s;
-        const _input_tokens = body.details.prefill.length;
+        const _input_tokens = body.usage.prompt_tokens;
         tokens.add(n_tokens + _input_tokens);
         input_tokens.add(_input_tokens);
         new_tokens.add(n_tokens);
-        tokens.add(n_tokens + _input_tokens);
     }
 }
+export const options = get_options();
+export default function() {
+    run();
+}
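
For reference, the token counts the updated script reads now come from the OpenAI-compatible usage object instead of TGI's details object. A minimal sketch of the response fragment the script depends on (illustrative values, not output captured from this commit):

// Illustrative /v1/chat/completions response fragment; common.js only touches `usage`.
const body = {
    choices: [{ index: 0, message: { role: "assistant", content: "..." }, finish_reason: "length" }],
    usage: {
        prompt_tokens: 128,      // feeds the input_tokens counter
        completion_tokens: 50,   // feeds new_tokens and the time_per_token trend
        total_tokens: 178,
    },
};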

Deleted file (per-backend script that built the native TGI "inputs"/"parameters" payload)

@ -1,17 +0,0 @@
import { get_options, run } from "./common.js";
const reference_latency_ms = 70;
const host = __ENV.HOST || '127.0.0.1:8000';
const max_new_tokens = 50;
function generate_payload(gpt){
const input = gpt["conversations"][0]["value"];
return {"inputs": input, "parameters": {"max_new_tokens": max_new_tokens, "decoder_input_details": true}}
}
export const options = get_options(reference_latency_ms);
export default function(){
run(host, generate_payload, max_new_tokens);
}

Deleted file (per-backend script that built a vLLM-style "prompt" payload)

@ -1,17 +0,0 @@
import { get_options, run } from "./common.js";
const reference_latency_ms = 22;
const host = __ENV.HOST || '127.0.0.1:8000';
const max_new_tokens = 50;
function generate_payload(gpt){
const input = gpt["conversations"][0]["value"];
return {"prompt": input, "temperature": 0.5, "ignore_eos": true}
}
export const options = get_options(reference_latency_ms);
export default function(){
run(host, generate_payload, max_new_tokens);
}
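
With the per-backend scripts removed, common.js is the single entry point: it builds the chat-completions payload itself and no longer falls back to a hard-coded host, so the target server and model must be supplied through k6 environment variables. A plausible invocation (host, port, and model id are placeholders, not taken from this commit):

k6 run -e HOST=127.0.0.1:8080 -e MODEL_ID=my-org/my-model common.js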