Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-04-19 22:02:06 +00:00
* Making prefix/flashinfer the default and testing the full release tests.
* Include flashinfer in the docker.
* Using prebuilt.
* Allowing window_left_size (dummy version).
* Disabling flashinfer/prefix caching on odd head_dim
* Disable prefix caching for lora.
* More specific codes.
* Update lock
* Updating integration tests with new values with FI/FD. Remove paged as a default too, and using FD everywhere.
* Update cargo lock ?
* Upgrade to 1.80 because of bitstream...
* Everywhere 1.80
* Forgot last default place.
* Apply suggestions from code review
  Co-authored-by: drbh <david.richard.holtz@gmail.com>
* Updated flake lock
* Tmp
* Upgrade resolution system for fewer errors in resolution.
* Remove lambda for cleaner function.
* Handling debugger.
* Override the env in server tests.
* Is this enough to make it work?
* This seems to be working.
* Downgrade some logs.
* Fixing the default for vlm.
* Don't enable prefix caching on VLM just yet.
* Change `add_special_tokens` in order to have the correct tokens for chat input and not (since it's super important with the prefixing now)
* Fixing prefix caching for flashdecoding.
* Update all models.
* Fixed flashinfer version.
* add_special_tokens is internal only
* Fixing seqlen with the new vlms.
* Fixing the issue with `add_special_tokens` not being passed around.
* Fixing the test.
* Removing encoder_decoder (seq2seq).
* Update the chat test.
* Fixing the batching tokenization in flash causal lm.
* Truncating left for radix purposes.
* Oops this doesn't belong here.
* Put back default pure shell.
* Update server tests
  - Default to throughput test in k6
  - Use TGI_WIGGLE_ROOM to adjust wiggle room
* Only n_heads / process_group.size() are necessary.
* Revert the integration tests change (seems linked to head_size modification).
* Adding error message when assert is violated.
* Fixing the free algorithm to handle times where the common prefix is smaller.
* Apply suggestions from code review
  Co-authored-by: OlivierDehaene <olivier@huggingface.co>
* Update server/text_generation_server/layers/attention/common.py
  Co-authored-by: OlivierDehaene <olivier@huggingface.co>
* Fix disabling prefix caching
  - Fix windowing checks.
* Revert the Cohere tokenizer change (for now using a revision instead).
* Fmt.
---------
Co-authored-by: drbh <david.richard.holtz@gmail.com>
Co-authored-by: OlivierDehaene <olivier@huggingface.co>
95 lines · 3.1 KiB · JavaScript
import { check } from 'k6';
import { scenario } from 'k6/execution';
import http from 'k6/http';
import { Trend, Counter } from 'k6/metrics';
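
// The target host and model are read from k6 environment variables, typically
// passed on the command line with k6's `-e` flag (e.g. `-e HOST=... -e MODEL_ID=...`).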
const host = __ENV.HOST;
const model_id = __ENV.MODEL_ID;
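
// Custom metrics: a Trend for per-token latency and Counters for prompt,
// completion, and total tokens, all reported in the end-of-test summary.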
const timePerToken = new Trend('time_per_token', true);
const tokens = new Counter('tokens');
const new_tokens = new Counter('new_tokens');
const input_tokens = new Counter('input_tokens');
const max_new_tokens = 50;
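
// Prompts come from a ShareGPT-style JSON dump; the full dataset is kept
// commented out, presumably in favour of a smaller local file for quicker runs.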
// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
const shareGPT = JSON.parse(open("small.json"));
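
// k6 options: fail the run on any failed HTTP request and drive load with the
// `throughput` scenario; the latency threshold and the alternative scenarios
// are left commented out as optional profiles.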
export function get_options() {
  return {
    thresholds: {
      http_req_failed: ['rate==0'],
      // time_per_token: [{
      //   threshold: `p(50)<${5 * reference_latency_ms}`,
      //   abortOnFail: true,
      //   delayAbortEval: '10s'
      // }],
    },
    scenarios: {
      // single_user: {
      //   executor: 'constant-arrival-rate',
      //   duration: '60s',
      //   preAllocatedVUs: 1,
      //   rate: 20,
      //   timeUnit: '1s',
      // },
      // load_test: {
      //   executor: 'constant-arrival-rate',
      //   duration: '60s',
      //   preAllocatedVUs: 100,
      //   rate: 1,
      //   timeUnit: '1s',
      // },
      // breakpoint: {
      //   executor: 'ramping-arrival-rate', // Assure load increase if the system slows
      //   preAllocatedVUs: 300,
      //   stages: [
      //     { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
      //   ],
      // },
      throughput: {
        executor: 'shared-iterations',
        vus: 100,
        iterations: 200,
        maxDuration: '40s',
      },
    },
  };
}
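
// Build an OpenAI-compatible chat completion request from one ShareGPT
// conversation, using its first turn as the user message and a fixed
// generation budget.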
function generate_payload(gpt, max_new_tokens) {
  const input = gpt["conversations"][0]["value"];
  return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens };
}

export const options = get_options();
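
// Each iteration picks a prompt deterministically from the dataset, POSTs it to
// the chat completions endpoint, and records latency and token metrics.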
export default function run() {
  const headers = { 'Content-Type': 'application/json' };
  const query = shareGPT[scenario.iterationInTest % shareGPT.length];
  const payload = JSON.stringify(generate_payload(query, max_new_tokens));
  const res = http.post(`http://${host}/v1/chat/completions`, payload, {
    headers,
  });
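  // Client errors (4xx) are not scored: skip the check and token accounting.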
  if (res.status >= 400 && res.status < 500) {
    return;
  }

  check(res, {
    'Post status is 200': (res) => res.status === 200,
  });
  const duration = res.timings.duration;
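
  // Derive per-token latency from the total request time and the completion
  // token count reported in the OpenAI-style `usage` block, then update the
  // token counters.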
  if (res.status === 200) {
    const body = res.json();
    const completion_tokens = body.usage.completion_tokens;
    const latency_ms_per_token = duration / completion_tokens;
    timePerToken.add(latency_ms_per_token);
    const prompt_tokens = body.usage.prompt_tokens;
    input_tokens.add(prompt_tokens);
    new_tokens.add(completion_tokens);
    tokens.add(completion_tokens + prompt_tokens);
  }
}