With this change, bucketing/padding of input is applied to health check. (#245)

This commit is contained in:
srajabos 2024-11-18 16:38:30 -05:00 committed by GitHub
parent 56c3eb4adb
commit d49ce00f40
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -465,6 +465,8 @@ class CausalLMBatch(Batch):
requests = [CausalLMRequest.from_pb(idx, req, tokenizer) for idx, req in enumerate(pb.requests)] requests = [CausalLMRequest.from_pb(idx, req, tokenizer) for idx, req in enumerate(pb.requests)]
max_input_length = max(r.data.truncate for r in requests) max_input_length = max(r.data.truncate for r in requests)
if max_input_length < PAD_SEQUENCE_TO_MULTIPLE_OF:
max_input_length = PAD_SEQUENCE_TO_MULTIPLE_OF
max_new_tokens = max(r.stopping_criteria.max_new_tokens for r in requests) max_new_tokens = max(r.stopping_criteria.max_new_tokens for r in requests)
# TODO: Add support for sparse batches # TODO: Add support for sparse batches