mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-10 20:04:52 +00:00
Adding small benchmark script.
This commit is contained in:
parent
bfa070611d
commit
d93e2ab5b6
64
load_tests/common.js
Normal file
64
load_tests/common.js
Normal file
@ -0,0 +1,64 @@
|
||||
import { check, randomSeed } from 'k6';
|
||||
import http from 'k6/http';
|
||||
import { Trend, Counter } from 'k6/metrics';
|
||||
import { randomItem } from 'https://jslib.k6.io/k6-utils/1.2.0/index.js';
|
||||
|
||||
const seed = 0;
|
||||
|
||||
const host = __ENV.HOST || '127.0.0.1:8000';
|
||||
const timePerToken = new Trend('time_per_token', true);
|
||||
const throughput = new Counter('tokens_per_s');
|
||||
|
||||
randomSeed(seed);
|
||||
// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
|
||||
const shareGPT = JSON.parse(open("small.json"))
|
||||
|
||||
|
||||
export function get_options(reference_latency_ms){
|
||||
return {
|
||||
thresholds: {
|
||||
http_req_failed: ['rate==0'],
|
||||
time_per_token: [{
|
||||
threshold: `p(50)<${3 * reference_latency_ms}`,
|
||||
abortOnFail: true,
|
||||
delayAbortEval: '10s'
|
||||
}],
|
||||
},
|
||||
scenarios: {
|
||||
load_test: {
|
||||
executor: 'constant-arrival-rate',
|
||||
duration: '60s',
|
||||
preAllocatedVUs: 100,
|
||||
rate: 10,
|
||||
timeUnit: '1s',
|
||||
},
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
export function run(host, generate_payload, max_new_tokens) {
|
||||
const headers = {'Content-Type': 'application/json'};
|
||||
const query = randomItem(shareGPT);
|
||||
const payload = JSON.stringify(generate_payload(query));
|
||||
const res = http.post(`http://${host}/generate`, payload, {
|
||||
headers,
|
||||
});
|
||||
if(res.status >= 400 && res.status < 500){
|
||||
return;
|
||||
}
|
||||
|
||||
check(res, {
|
||||
'Post status is 200': (r) => res.status === 200,
|
||||
});
|
||||
const n_tokens = max_new_tokens;
|
||||
const timings = res.timings.duration;
|
||||
|
||||
if (res.status === 200) {
|
||||
const latency_ms_per_token = timings / n_tokens;
|
||||
timePerToken.add(latency_ms_per_token);
|
||||
const latency_in_s = latency_ms_per_token / 1000;
|
||||
const individual_throughput = 1 / latency_in_s;
|
||||
throughput.add(individual_throughput);
|
||||
}
|
||||
}
|
17
load_tests/tgi.js
Normal file
17
load_tests/tgi.js
Normal file
@ -0,0 +1,17 @@
|
||||
import { get_options, run } from "./common.js";
|
||||
|
||||
const reference_latency_ms = 30;
|
||||
const host = __ENV.HOST || '127.0.0.1:8000';
|
||||
const max_new_tokens = 50;
|
||||
|
||||
|
||||
function generate_payload(gpt){
|
||||
const input = gpt["conversations"][0]["value"];
|
||||
return {"inputs": input, "parameters": {"max_new_tokens": max_new_tokens, "temperature" : 0.5}}
|
||||
}
|
||||
|
||||
export const options = get_options(reference_latency_ms);
|
||||
|
||||
export default function(){
|
||||
run(host, generate_payload, max_new_tokens);
|
||||
}
|
17
load_tests/vllm.js
Normal file
17
load_tests/vllm.js
Normal file
@ -0,0 +1,17 @@
|
||||
import { get_options, run } from "./common.js";
|
||||
|
||||
const reference_latency_ms = 22;
|
||||
const host = __ENV.HOST || '127.0.0.1:8000';
|
||||
const max_new_tokens = 50;
|
||||
|
||||
|
||||
function generate_payload(gpt){
|
||||
const input = gpt["conversations"][0]["value"];
|
||||
return {"prompt": input, "temperature": 0.5, "ignore_eos": true}
|
||||
}
|
||||
|
||||
export const options = get_options(reference_latency_ms);
|
||||
|
||||
export default function(){
|
||||
run(host, generate_payload, max_new_tokens);
|
||||
}
|
Loading…
Reference in New Issue
Block a user