Mirror of https://github.com/huggingface/text-generation-inference.git
Commit 3a79fbc63e ("Updated."), parent d2b42f6883
@@ -7,7 +7,9 @@ const seed = 0;
 
 const host = __ENV.HOST || '127.0.0.1:8000';
 const timePerToken = new Trend('time_per_token', true);
-const throughput = new Counter('tokens_per_s');
+const tokens = new Counter('tokens');
+const new_tokens = new Counter('new_tokens');
+const input_tokens = new Counter('input_tokens');
 
 randomSeed(seed);
 // const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
@@ -19,7 +21,7 @@ export function get_options(reference_latency_ms){
         thresholds: {
             http_req_failed: ['rate==0'],
             time_per_token: [{
-                threshold: `p(50)<${3 * reference_latency_ms}`,
+                threshold: `p(50)<${5 * reference_latency_ms}`,
                 abortOnFail: true,
                 delayAbortEval: '10s'
             }],
@@ -28,7 +30,7 @@ export function get_options(reference_latency_ms){
         load_test: {
             executor: 'constant-arrival-rate',
             duration: '60s',
-            preAllocatedVUs: 100,
+            preAllocatedVUs: 10,
             rate: 10,
             timeUnit: '1s',
         },
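A note on the scenario above: with the constant-arrival-rate executor, the test offers 10 requests per second for 60 s, i.e. roughly 10 x 60 = 600 requests in total. Lowering preAllocatedVUs from 100 to 10 caps concurrency at 10 in-flight requests; since no maxVUs is set, k6 drops an iteration (reported under dropped_iterations) whenever no VU is free instead of queueing it, so the change also bounds how much simultaneous load the server sees.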
@@ -48,17 +50,22 @@ export function run(host, generate_payload, max_new_tokens) {
         return;
     }
 
 
     check(res, {
         'Post status is 200': (r) => res.status === 200,
     });
-    const n_tokens = max_new_tokens;
-    const timings = res.timings.duration;
+    const duration = res.timings.duration;
 
     if (res.status === 200) {
-        const latency_ms_per_token = timings / n_tokens;
+        const body = res.json();
+        const n_tokens = body.details.tokens.length;
+        const latency_ms_per_token = duration / n_tokens;
         timePerToken.add(latency_ms_per_token);
         const latency_in_s = latency_ms_per_token / 1000;
         const individual_throughput = 1 / latency_in_s;
-        throughput.add(individual_throughput);
+        const _input_tokens = body.details.prefill.length;
+        tokens.add(n_tokens + _input_tokens);
+        input_tokens.add(_input_tokens);
+        new_tokens.add(n_tokens);
     }
 }

@@ -1,13 +1,13 @@
 import { get_options, run } from "./common.js";
 
-const reference_latency_ms = 30;
+const reference_latency_ms = 70;
 const host = __ENV.HOST || '127.0.0.1:8000';
 const max_new_tokens = 50;
 
 
 function generate_payload(gpt){
     const input = gpt["conversations"][0]["value"];
-    return {"inputs": input, "parameters": {"max_new_tokens": max_new_tokens, "temperature" : 0.5}}
+    return {"inputs": input, "parameters": {"max_new_tokens": max_new_tokens, "decoder_input_details": true}}
 }
 
 export const options = get_options(reference_latency_ms);

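Below is a rough single-request sketch (not part of the diff) of what the updated k6 scripts now measure, written in Python with the requests library. The URL matches the scripts' default host of 127.0.0.1:8000, the prompt is an arbitrary stand-in for a ShareGPT conversation turn, and it assumes the server's /generate response exposes details.prefill and details.tokens when decoder_input_details is set, which is exactly what the updated run() above reads. Note also that with reference_latency_ms raised to 70, the p(50) time_per_token threshold from get_options works out to 5 x 70 = 350 ms per generated token.

    import time

    import requests

    payload = {
        # Arbitrary prompt standing in for a ShareGPT conversation turn.
        "inputs": "What is Deep Learning?",
        "parameters": {"max_new_tokens": 50, "decoder_input_details": True},
    }

    start = time.perf_counter()
    res = requests.post("http://127.0.0.1:8000/generate", json=payload, timeout=1200)
    duration_ms = (time.perf_counter() - start) * 1000
    res.raise_for_status()

    body = res.json()
    new_tokens = len(body["details"]["tokens"])     # generated tokens
    input_tokens = len(body["details"]["prefill"])  # prompt (prefill) tokens
    print(f"{duration_ms / new_tokens:.1f} ms/token, "
          f"{input_tokens} prompt + {new_tokens} generated tokens")
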
@@ -820,10 +820,20 @@ class FlashCausalLM(Model):
         else:
             next_token_logits = out
 
+        # import datetime
+        # from loguru import logger
 
+        # start = datetime.datetime.now()
         next_input_ids, next_token_logprobs, logprobs, accepted_ids, speculative_ids = batch.next_token_chooser(
             batch.all_input_ids_tensor[:, : batch.max_seqlen], next_token_logits, get_speculate(), batch.speculative_ids, speculative_logits
         )
+        # took = datetime.datetime.now() - start
+        # logger.info(f"Next token chooser {batch.all_input_ids_tensor.shape} took {took}")
+        # if batch.all_input_ids_tensor.shape[1] < 2000 and took > datetime.timedelta(milliseconds=5):
+        #     next_input_ids, next_token_logprobs, logprobs, accepted_ids, speculative_ids = batch.next_token_chooser(
+        #         batch.all_input_ids_tensor[:, : batch.max_seqlen], next_token_logits, get_speculate(), batch.speculative_ids, speculative_logits, verbose=True
+        #     )
+        # import ipdb;ipdb.set_trace()
 
         batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens(
             batch.top_n_tokens, batch.top_n_tokens_tensor, logprobs
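The commented-out lines in this last hunk are an ad-hoc way of timing the next_token_chooser call and logging it only when it is slow. Not part of the commit, but the same pattern can be written as a small reusable helper; the sketch below assumes loguru (already used by the TGI server), and the helper name log_if_slow is hypothetical.

    import datetime
    from contextlib import contextmanager

    from loguru import logger


    @contextmanager
    def log_if_slow(label, threshold=datetime.timedelta(milliseconds=5)):
        # Time the wrapped block and log it only when it exceeds the threshold.
        start = datetime.datetime.now()
        yield
        took = datetime.datetime.now() - start
        if took > threshold:
            logger.info(f"{label} took {took}")


    # Hypothetical usage, mirroring the commented-out debug block:
    # with log_if_slow(f"Next token chooser {batch.all_input_ids_tensor.shape}"):
    #     next_input_ids, ... = batch.next_token_chooser(...)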