Commit a6506a51b6 (parent d5d168a4d2)
Author: Xuan Son Nguyen
Date: 2024-08-14 10:36:17 +02:00

@@ -49,8 +49,8 @@ export function get_options() {
         // },
         throughput: {
             executor: 'shared-iterations',
-            vus: 100,
-            iterations: 500,
+            vus: 16,
+            iterations: 200,
             maxDuration: '400s',
         },
     },
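
(Annotation, not part of the diff: with the 'shared-iterations' executor, all VUs draw from a single shared pool of iterations, so the run now issues 200 total requests from 16 concurrent VUs instead of 500 from 100, stopping early if 400 s elapse. A minimal self-contained k6 sketch of the same scenario shape, with a placeholder target URL:)

import http from 'k6/http';

// Same executor configuration as the patched script, in isolation.
export const options = {
    scenarios: {
        throughput: {
            executor: 'shared-iterations', // one iteration pool shared by all VUs
            vus: 16,                       // concurrent virtual users
            iterations: 200,               // total iterations across all VUs
            maxDuration: '400s',           // hard stop for the whole scenario
        },
    },
};

export default function () {
    http.get('https://example.com/'); // placeholder request
}

(Run it with: k6 run script.js)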
@@ -59,9 +59,12 @@ export function get_options() {

 function generate_payload(gpt, max_new_tokens) {
     const input = gpt["conversations"][0]["value"];
-    return { "messages": [
-        { "role": "user", "content": input.substring(0, 5000) }
-    ], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
+    return {
+        "prompt": `<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n${input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`,
+        "max_tokens": max_new_tokens,
+        "temperature": 0,
+        "model": `${model_id}`,
+    }
 }

 export const options = get_options();
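
(Annotation: the request body moves from a chat-style messages array to a raw prompt with the Llama 3 chat template applied client-side. An illustrative serialized payload, assuming hypothetical values model_id = "llama" and max_new_tokens = 128, and a ShareGPT user turn of "Hello!":)

{
    "prompt": "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
    "max_tokens": 128,
    "temperature": 0,
    "model": "llama"
}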
@@ -70,7 +73,7 @@ export default function run() {
     const headers = { 'Content-Type': 'application/json' };
     const query = shareGPT[scenario.iterationInTest % shareGPT.length];
     const payload = JSON.stringify(generate_payload(query, max_new_tokens));
-    const res = http.post(`https://${host}/v1/chat/completions`, payload, {
+    const res = http.post(`https://${host}/v1/completions`, payload, {
         headers,
     });
     // if (res.status >= 400 && res.status < 500) {
@@ -90,6 +93,7 @@ export default function run() {
     if (res.status === 200) {
         const body = res.json();
+        if (body.usage) {
         const completion_tokens = body.usage.completion_tokens;
         const latency_ms_per_token = duration / completion_tokens;
         timePerToken.add(latency_ms_per_token);
@@ -98,6 +102,17 @@ export default function run() {
         new_tokens.add(completion_tokens);
         tokens.add(completion_tokens + prompt_tokens);
     }
+        if (body.tokens_predicted) {
+            // llama.cpp specific
+            const completion_tokens = body.tokens_predicted;
+            const latency_ms_per_token = duration / completion_tokens;
+            timePerToken.add(latency_ms_per_token);
+            const prompt_tokens = body.tokens_evaluated;
+            input_tokens.add(prompt_tokens);
+            new_tokens.add(completion_tokens);
+            tokens.add(completion_tokens + prompt_tokens);
+        }
+    }

     sleep(1);
 }
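
(Annotation: the metrics block now accepts two response shapes. OpenAI-compatible servers report token counts under body.usage, while llama.cpp's native completion response carries top-level tokens_evaluated (prompt tokens) and tokens_predicted (generated tokens). A hypothetical helper, not part of the script, sketching the same normalization:)

// Hypothetical helper illustrating the dual handling above; returns
// { prompt_tokens, completion_tokens } from either response shape,
// or null when the body carries no token counts.
function extract_token_counts(body) {
    if (body.usage) {
        // OpenAI-compatible shape: { usage: { prompt_tokens, completion_tokens } }
        return {
            prompt_tokens: body.usage.prompt_tokens,
            completion_tokens: body.usage.completion_tokens,
        };
    }
    if (body.tokens_predicted) {
        // llama.cpp native shape: top-level token counters
        return {
            prompt_tokens: body.tokens_evaluated,
            completion_tokens: body.tokens_predicted,
        };
    }
    return null;
}

(With counts in hand, the script derives milliseconds per generated token as duration / completion_tokens and feeds the per-token, input, output, and total token metrics.)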