From fccf5edf45836491d8cdd9e2c98d5cde9bae76ab Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Thu, 25 Apr 2024 15:42:17 +0200
Subject: [PATCH] Updating the benchmarks so everyone uses openai compat layer. (#1800)

# What does this PR do?

Fixes # (issue)

## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.
---
 load_tests/common.js | 83 ++++++++++++++++++++++++++++----------
 load_tests/tgi.js    | 17 ---------
 load_tests/vllm.js   | 17 ---------
 3 files changed, 53 insertions(+), 64 deletions(-)
 delete mode 100644 load_tests/tgi.js
 delete mode 100644 load_tests/vllm.js

diff --git a/load_tests/common.js b/load_tests/common.js
index 06d2506f4..80728214d 100644
--- a/load_tests/common.js
+++ b/load_tests/common.js
@@ -1,71 +1,94 @@
-import { check, randomSeed } from 'k6';
+import { check } from 'k6';
+import { scenario } from 'k6/execution';
 import http from 'k6/http';
 import { Trend, Counter } from 'k6/metrics';
-import { randomItem } from 'https://jslib.k6.io/k6-utils/1.2.0/index.js';
 
-const seed = 0;
-
-const host = __ENV.HOST || '127.0.0.1:8000';
+const host = __ENV.HOST;
+const model_id = __ENV.MODEL_ID;
 const timePerToken = new Trend('time_per_token', true);
 const tokens = new Counter('tokens');
 const new_tokens = new Counter('new_tokens');
 const input_tokens = new Counter('input_tokens');
+const max_new_tokens = 50;
 
-randomSeed(seed);
 // const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
 const shareGPT = JSON.parse(open("small.json"))
 
-export function get_options(reference_latency_ms){
+export function get_options() {
     return {
         thresholds: {
             http_req_failed: ['rate==0'],
-            time_per_token: [{
-                threshold: `p(50)<${5 * reference_latency_ms}`,
-                abortOnFail: true,
-                delayAbortEval: '10s'
-            }],
+            // time_per_token: [{
+            //     threshold: `p(50)<${5 * reference_latency_ms}`,
+            //     abortOnFail: true,
+            //     delayAbortEval: '10s'
+            // }],
         },
         scenarios: {
-            load_test: {
+            single_user: {
                 executor: 'constant-arrival-rate',
                 duration: '60s',
-                preAllocatedVUs: 10,
-                rate: 10,
+                preAllocatedVUs: 1,
+                rate: 1,
                 timeUnit: '1s',
             },
+            // load_test: {
+            //     executor: 'constant-arrival-rate',
+            //     duration: '60s',
+            //     preAllocatedVUs: 100,
+            //     rate: 1,
+            //     timeUnit: '1s',
+            // },
+            // breakpoint: {
+            //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
+            //     preAllocatedVUs: 1000,
+            //     stages: [
+            //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
+            //     ],
+            // },
+            // throughput: {
+            //     executor: 'shared-iterations',
+            //     vus: 100,
+            //     iterations: 200,
+            //     maxDuration: '40s',
+            // },
         },
     };
 }
 
+function generate_payload(gpt, max_new_tokens) {
+    const input = gpt["conversations"][0]["value"];
+    return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
+}
 
-export function run(host, generate_payload, max_new_tokens) {
-    const headers = {'Content-Type': 'application/json'};
-    const query = randomItem(shareGPT);
-    const payload = JSON.stringify(generate_payload(query));
-    const res = http.post(`http://${host}/generate`, payload, {
+export const options = get_options();
+
+export default function run() {
+    const headers = { 'Content-Type': 'application/json' };
+    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
+    const payload = JSON.stringify(generate_payload(query, max_new_tokens));
+    const res = http.post(`http://${host}/v1/chat/completions`, payload, {
         headers,
     });
-    if(res.status >= 400 && res.status < 500){
+    if (res.status >= 400 && res.status < 500) {
         return;
     }
 
 
     check(res, {
-        'Post status is 200': (r) => res.status === 200,
+        'Post status is 200': (res) => res.status === 200,
     });
     const duration = res.timings.duration;
 
     if (res.status === 200) {
         const body = res.json();
-        const n_tokens = body.details.tokens.length;
-        const latency_ms_per_token = duration / n_tokens;
+        const completion_tokens = body.usage.completion_tokens;
+        const latency_ms_per_token = duration / completion_tokens;
         timePerToken.add(latency_ms_per_token);
-        const latency_in_s = latency_ms_per_token / 1000;
-        const individual_throughput = 1 / latency_in_s;
-        const _input_tokens = body.details.prefill.length;
-        tokens.add(n_tokens + _input_tokens);
-        input_tokens.add(_input_tokens);
-        new_tokens.add(n_tokens);
+        const prompt_tokens = body.usage.prompt_tokens;
+        input_tokens.add(prompt_tokens);
+        new_tokens.add(completion_tokens);
+        tokens.add(completion_tokens + prompt_tokens);
     }
 }
diff --git a/load_tests/tgi.js b/load_tests/tgi.js
deleted file mode 100644
index 6c559a9f2..000000000
--- a/load_tests/tgi.js
+++ /dev/null
@@ -1,17 +0,0 @@
-import { get_options, run } from "./common.js";
-
-const reference_latency_ms = 70;
-const host = __ENV.HOST || '127.0.0.1:8000';
-const max_new_tokens = 50;
-
-
-function generate_payload(gpt){
-    const input = gpt["conversations"][0]["value"];
-    return {"inputs": input, "parameters": {"max_new_tokens": max_new_tokens, "decoder_input_details": true}}
-}
-
-export const options = get_options(reference_latency_ms);
-
-export default function(){
-    run(host, generate_payload, max_new_tokens);
-}
diff --git a/load_tests/vllm.js b/load_tests/vllm.js
deleted file mode 100644
index 1edc039a9..000000000
--- a/load_tests/vllm.js
+++ /dev/null
@@ -1,17 +0,0 @@
-import { get_options, run } from "./common.js";
-
-const reference_latency_ms = 22;
-const host = __ENV.HOST || '127.0.0.1:8000';
-const max_new_tokens = 50;
-
-
-function generate_payload(gpt){
-    const input = gpt["conversations"][0]["value"];
-    return {"prompt": input, "temperature": 0.5, "ignore_eos": true}
-}
-
-export const options = get_options(reference_latency_ms);
-
-export default function(){
-    run(host, generate_payload, max_new_tokens);
-}
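
For reviewers who want to exercise the unified script locally, here is a minimal sketch of an invocation. It assumes a server is already listening on the OpenAI-compatible `/v1/chat/completions` route and that `load_tests/small.json` (the ShareGPT sample the script `open()`s) is present next to the script; the host, port, and model id below are placeholders rather than values specified by this PR, and the exact k6 command line is an assumption about how these load tests are run.

```bash
# Hypothetical invocation; HOST and MODEL_ID are read via __ENV in common.js.
# The address and model id are examples only.
k6 run -e HOST=127.0.0.1:8080 -e MODEL_ID=HuggingFaceH4/zephyr-7b-beta load_tests/common.js
```

With the per-backend `tgi.js`/`vllm.js` wrappers removed, the same command should work against any server exposing the OpenAI-compatible chat completions endpoint, since the script now derives all its metrics from the standard `usage.prompt_tokens` and `usage.completion_tokens` fields of the response.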