mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-24 08:22:07 +00:00
With this change, bucketing/padding of input is applied to health check. (#245)
This commit is contained in:
parent
56c3eb4adb
commit
d49ce00f40
@@ -465,6 +465,8 @@ class CausalLMBatch(Batch):
         requests = [CausalLMRequest.from_pb(idx, req, tokenizer) for idx, req in enumerate(pb.requests)]

         max_input_length = max(r.data.truncate for r in requests)
+        if max_input_length < PAD_SEQUENCE_TO_MULTIPLE_OF:
+            max_input_length = PAD_SEQUENCE_TO_MULTIPLE_OF
         max_new_tokens = max(r.stopping_criteria.max_new_tokens for r in requests)

         # TODO: Add support for sparse batches
Loading…
Reference in New Issue
Block a user