From 0ca54b55f84e2fcb5b05bb887d736da1ebe76343 Mon Sep 17 00:00:00 2001
From: bkowalskiINTEL
Date: Tue, 16 Jul 2024 14:53:24 +0200
Subject: [PATCH] Do not schedule decode if max_new_tokens is equal to 1 (#183)

Co-authored-by: Bartosz Kowalski
---
 server/text_generation_server/models/causal_lm.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
index 32c13daf..012f6249 100644
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -985,6 +985,10 @@ class CausalLM(Model):
                 batch.past_key_values,
                 bypass_hpu_graph=prefill and self.limit_hpu_graph if self.enable_hpu_graph else None,
             )
+        elif all([req.stopping_criteria.max_new_tokens == 1 for req in batch.requests]):
+            # Don't schedule next forward if max_new_tokens for all requests equals 1
+            # - we've already generated the first and only needed token in the prefill phase
+            pass
         else:
             token_idx = torch.tensor(batch.attention_mask.shape[-1] - batch.right_padding).to(self.device)
             input_ids = torch.index_select(batch.input_ids, 1, token_idx - 1)