mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-10 20:04:52 +00:00
pad to block size
This commit is contained in:
parent
086d0c2252
commit
d2e3843588
@@ -187,10 +187,17 @@ impl State {
|
||||
max_input_length = max_input_length.max(entry.request.input_length);
|
||||
prefill_tokens = (batch_requests.len() + 1) as u32 * max_input_length
|
||||
} else {
|
||||
prefill_tokens += entry.request.input_length;
|
||||
// pad to block size
|
||||
prefill_tokens += ((entry.request.input_length + 16 - 1) / 16) * 16;
|
||||
}
|
||||
|
||||
if self.requires_padding {
|
||||
decode_tokens += entry.request.stopping_parameters.max_new_tokens;
|
||||
} else {
|
||||
// pad to block size
|
||||
decode_tokens +=
|
||||
((entry.request.stopping_parameters.max_new_tokens + 16 - 1) / 16) * 16;
|
||||
}
|
||||
|
||||
if prefill_tokens > prefill_token_budget
|
||||
|| (prefill_tokens + decode_tokens) > token_budget
|
||||
|
Loading…
Reference in New Issue
Block a user