mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-10 20:04:52 +00:00
pad to block size
This commit is contained in:
parent
086d0c2252
commit
d2e3843588
@ -187,10 +187,17 @@ impl State {
|
|||||||
max_input_length = max_input_length.max(entry.request.input_length);
|
max_input_length = max_input_length.max(entry.request.input_length);
|
||||||
prefill_tokens = (batch_requests.len() + 1) as u32 * max_input_length
|
prefill_tokens = (batch_requests.len() + 1) as u32 * max_input_length
|
||||||
} else {
|
} else {
|
||||||
prefill_tokens += entry.request.input_length;
|
// pad to block size
|
||||||
|
prefill_tokens += ((entry.request.input_length + 16 - 1) / 16) * 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if self.requires_padding {
|
||||||
decode_tokens += entry.request.stopping_parameters.max_new_tokens;
|
decode_tokens += entry.request.stopping_parameters.max_new_tokens;
|
||||||
|
} else {
|
||||||
|
// pad to block size
|
||||||
|
decode_tokens +=
|
||||||
|
((entry.request.stopping_parameters.max_new_tokens + 16 - 1) / 16) * 16;
|
||||||
|
}
|
||||||
|
|
||||||
if prefill_tokens > prefill_token_budget
|
if prefill_tokens > prefill_token_budget
|
||||||
|| (prefill_tokens + decode_tokens) > token_budget
|
|| (prefill_tokens + decode_tokens) > token_budget
|
||||||
|
Loading…
Reference in New Issue
Block a user