Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-09-11 20:34:54 +00:00.
Revamp slightly.

commit 12b3765896
parent ba33c66b5b
@@ -1,9 +1,7 @@
-import { check, randomSeed } from 'k6';
+import { check } from 'k6';
+import { scenario } from 'k6/execution';
 import http from 'k6/http';
 import { Trend, Counter } from 'k6/metrics';
-import { randomItem } from 'https://jslib.k6.io/k6-utils/1.2.0/index.js';
-
-const seed = 0;
 
 const host = __ENV.HOST;
 const model_id = __ENV.MODEL_ID;
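The import swap is the visible edge of the main behavioral change: instead of seeding k6's PRNG and picking prompts with randomItem, the script now drives prompt selection from the scenario object in k6/execution. A minimal sketch of that pattern, with a stand-in prompts array in place of the shareGPT data:

import { scenario } from 'k6/execution';

// scenario.iterationInTest is a counter unique across all VUs in the running
// scenario, so indexing with it replays the dataset in a fixed, reproducible
// order rather than sampling at random.
const prompts = ['prompt A', 'prompt B', 'prompt C']; // stand-in dataset

export default function () {
    const prompt = prompts[scenario.iterationInTest % prompts.length];
    // ...build and POST the request for `prompt` here
}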
@@ -12,9 +10,7 @@ const tokens = new Counter('tokens');
 const new_tokens = new Counter('new_tokens');
 const input_tokens = new Counter('input_tokens');
 const max_new_tokens = 50;
-const reference_latency_ms = 32;
 
-randomSeed(seed);
 // const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
 const shareGPT = JSON.parse(open("small.json"))
 
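This hunk drops the now-unused reference_latency_ms constant and the randomSeed(seed) call that deterministic selection makes redundant. The dataset loading is untouched: generate_payload() below reads gpt["conversations"][0]["value"], so each entry of small.json must carry at least that ShareGPT-style shape. A hypothetical minimal record (illustrative only, not taken from the actual small.json):

// Hypothetical shape of one small.json record; only conversations[0].value
// is consumed by generate_payload(). The "from" field follows ShareGPT
// convention and is an assumption here.
const exampleRecord = {
    "conversations": [
        { "from": "human", "value": "What is the capital of France?" },
    ],
};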
@@ -30,25 +26,47 @@ export function get_options() {
             // }],
         },
         scenarios: {
-            load_test: {
+            single_user: {
                 executor: 'constant-arrival-rate',
                 duration: '60s',
-                preAllocatedVUs: 1000,
-                rate: 10,
+                preAllocatedVUs: 1,
+                rate: 1,
                 timeUnit: '1s',
             },
+            // load_test: {
+            //     executor: 'constant-arrival-rate',
+            //     duration: '60s',
+            //     preAllocatedVUs: 100,
+            //     rate: 1,
+            //     timeUnit: '1s',
+            // },
+            // breakpoint: {
+            //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
+            //     preAllocatedVUs: 1000,
+            //     stages: [
+            //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
+            //     ],
+            // },
+            // throughput: {
+            //     executor: 'shared-iterations',
+            //     vus: 100,
+            //     iterations: 200,
+            //     maxDuration: '40s',
+            // },
         },
     };
 }
 
 function generate_payload(gpt, max_new_tokens) {
     const input = gpt["conversations"][0]["value"];
-    return { "messages": [{ "role": "user", "content": input }], "temperature": 0.5, "ignore_eos": true, "model": `${model_id}`, "max_tokens": max_new_tokens }
+    return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
 }
 
-export function run() {
+export const options = get_options();
+
+export default function run() {
     const headers = { 'Content-Type': 'application/json' };
-    const query = randomItem(shareGPT);
+    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
     const payload = JSON.stringify(generate_payload(query, max_new_tokens));
     const res = http.post(`http://${host}/v1/chat/completions`, payload, {
         headers,
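The active profile is renamed from load_test to single_user and scaled down to a constant-arrival-rate scenario firing 1 request per second for 60 s from a single pre-allocated VU, so the default run becomes a latency baseline rather than a stress test. The heavier profiles remain in the file as comments; re-enabling one means uncommenting it inside scenarios. For instance, the breakpoint profile would read as below (values copied from the commented block above, not a tuned configuration):

breakpoint: {
    executor: 'ramping-arrival-rate', // keeps raising the arrival rate even if the system slows down
    preAllocatedVUs: 1000,
    stages: [
        { duration: '60s', target: 100 }, // ramp from 0 to 100 iterations per second
    ],
},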
@@ -59,25 +77,18 @@ export function run() {
 
 
     check(res, {
-        'Post status is 200': (r) => res.status === 200,
+        'Post status is 200': (res) => res.status === 200,
     });
     const duration = res.timings.duration;
 
     if (res.status === 200) {
         const body = res.json();
-        const n_tokens = body.usage.completion_tokens;
-        const latency_ms_per_token = duration / n_tokens;
+        const completion_tokens = body.usage.completion_tokens;
+        const latency_ms_per_token = duration / completion_tokens;
         timePerToken.add(latency_ms_per_token);
-        const _input_tokens = body.usage.prompt_tokens;
-        tokens.add(n_tokens + _input_tokens);
-        input_tokens.add(_input_tokens);
-        new_tokens.add(n_tokens);
-        tokens.add(n_tokens + _input_tokens);
+        const prompt_tokens = body.usage.prompt_tokens;
+        input_tokens.add(prompt_tokens);
+        new_tokens.add(completion_tokens);
+        tokens.add(completion_tokens + prompt_tokens);
     }
 }
-
-export const options = get_options();
-
-export default function() {
-    run();
-}
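Beyond renaming the locals to match the OpenAI-style usage fields, this hunk fixes a real accounting bug: the old code called tokens.add(n_tokens + _input_tokens) twice per request, so the total-token counter reported double the true figure (a response with 100 prompt tokens and 50 completion tokens added 300 instead of 150). With run() promoted to the default export and options exported at module level, the standalone wrapper at the bottom goes away. Since the commit also removed the reference_latency_ms constant that the commented-out threshold depended on, one way to restore a pass/fail gate on the per-token Trend might be (a sketch, with 40 ms as an arbitrary assumed target):

export const options = {
    thresholds: {
        http_req_failed: ['rate==0'],  // any transport-level failure fails the run
        time_per_token: ['p(50)<40'],  // median latency per generated token under 40 ms
    },
};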