diff --git a/load_tests/common.js b/load_tests/common.js
index 2d34e3a1..80728214 100644
--- a/load_tests/common.js
+++ b/load_tests/common.js
@@ -1,9 +1,7 @@
-import { check, randomSeed } from 'k6';
+import { check } from 'k6';
+import { scenario } from 'k6/execution';
 import http from 'k6/http';
 import { Trend, Counter } from 'k6/metrics';
-import { randomItem } from 'https://jslib.k6.io/k6-utils/1.2.0/index.js';
-
-const seed = 0;
 
 const host = __ENV.HOST;
 const model_id = __ENV.MODEL_ID;
@@ -12,9 +10,7 @@ const tokens = new Counter('tokens');
 const new_tokens = new Counter('new_tokens');
 const input_tokens = new Counter('input_tokens');
 const max_new_tokens = 50;
-const reference_latency_ms = 32;
 
-randomSeed(seed);
 // const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
 const shareGPT = JSON.parse(open("small.json"))
 
@@ -30,25 +26,47 @@ export function get_options() {
             // }],
         },
         scenarios: {
-            load_test: {
+            single_user: {
                 executor: 'constant-arrival-rate',
                 duration: '60s',
-                preAllocatedVUs: 1000,
-                rate: 10,
+                preAllocatedVUs: 1,
+                rate: 1,
                 timeUnit: '1s',
             },
+            // load_test: {
+            //     executor: 'constant-arrival-rate',
+            //     duration: '60s',
+            //     preAllocatedVUs: 100,
+            //     rate: 1,
+            //     timeUnit: '1s',
+            // },
+            // breakpoint: {
+            //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
+            //     preAllocatedVUs: 1000,
+            //     stages: [
+            //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
+            //     ],
+            // },
+            // throughput: {
+            //     executor: 'shared-iterations',
+            //     vus: 100,
+            //     iterations: 200,
+            //     maxDuration: '40s',
+            // },
         },
     };
 }
 
 function generate_payload(gpt, max_new_tokens) {
     const input = gpt["conversations"][0]["value"];
-    return { "messages": [{ "role": "user", "content": input }], "temperature": 0.5, "ignore_eos": true, "model": `${model_id}`, "max_tokens": max_new_tokens }
+    return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
 }
 
-export function run() {
+export const options = get_options();
+
+export default function run() {
     const headers = { 'Content-Type': 'application/json' };
-    const query = randomItem(shareGPT);
+    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
     const payload = JSON.stringify(generate_payload(query, max_new_tokens));
     const res = http.post(`http://${host}/v1/chat/completions`, payload, {
         headers,
@@ -59,25 +77,18 @@
     check(res, {
-        'Post status is 200': (r) => res.status === 200,
+        'Post status is 200': (res) => res.status === 200,
     });
 
     const duration = res.timings.duration;
 
     if (res.status === 200) {
         const body = res.json();
-        const n_tokens = body.usage.completion_tokens;
-        const latency_ms_per_token = duration / n_tokens;
+        const completion_tokens = body.usage.completion_tokens;
+        const latency_ms_per_token = duration / completion_tokens;
         timePerToken.add(latency_ms_per_token);
 
-        const _input_tokens = body.usage.prompt_tokens;
-        tokens.add(n_tokens + _input_tokens);
-        input_tokens.add(_input_tokens);
-        new_tokens.add(n_tokens);
-        tokens.add(n_tokens + _input_tokens);
+        const prompt_tokens = body.usage.prompt_tokens;
+        input_tokens.add(prompt_tokens);
+        new_tokens.add(completion_tokens);
+        tokens.add(completion_tokens + prompt_tokens);
     }
 }
-
-export const options = get_options();
-
-export default function() {
-    run();
-}
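
A minimal invocation sketch for the patched script, assuming k6 is installed and small.json sits next to common.js; the host and model id below are placeholder values, not part of the patch:

    k6 run -e HOST=127.0.0.1:8080 -e MODEL_ID=my-org/my-model load_tests/common.js

k6's -e flag populates __ENV, so HOST must be a bare host:port (the script prepends http:// itself) and MODEL_ID is forwarded as the model field of each chat-completion request. Because the script now exports options and a default function, k6 run picks up the single_user scenario directly with no wrapper needed.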