pad to block size

This commit is contained in:
OlivierDehaene 2023-07-18 12:04:38 +02:00
parent 086d0c2252
commit d2e3843588

View File

@@ -187,10 +187,17 @@ impl State {
max_input_length = max_input_length.max(entry.request.input_length); max_input_length = max_input_length.max(entry.request.input_length);
prefill_tokens = (batch_requests.len() + 1) as u32 * max_input_length prefill_tokens = (batch_requests.len() + 1) as u32 * max_input_length
} else { } else {
prefill_tokens += entry.request.input_length; // pad to block size
prefill_tokens += ((entry.request.input_length + 16 - 1) / 16) * 16;
} }
if self.requires_padding {
decode_tokens += entry.request.stopping_parameters.max_new_tokens; decode_tokens += entry.request.stopping_parameters.max_new_tokens;
} else {
// pad to block size
decode_tokens +=
((entry.request.stopping_parameters.max_new_tokens + 16 - 1) / 16) * 16;
}
if prefill_tokens > prefill_token_budget if prefill_tokens > prefill_token_budget
|| (prefill_tokens + decode_tokens) > token_budget || (prefill_tokens + decode_tokens) > token_budget