diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py index 796f8cd3..8ec6aca8 100644 --- a/server/text_generation_server/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -602,9 +602,12 @@ class CausalLM(Model): dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): + if speculator: raise RuntimeError("Speculator decoding is not enabled for AutoModel") + self.prev_bs = 0 + # Create tokenizer tokenizer = AutoTokenizer.from_pretrained( model_id, @@ -965,6 +968,9 @@ class CausalLM(Model): batch = batch.__class__.recombine([batch], self.tokenizer.pad_token_id) scenario = 'PREFILL' if prefill else 'GENERATE' + if self.enable_hpu_graph and self.limit_hpu_graph and round_up(batch.batch_size, BATCH_BUCKET_SIZE) != self.prev_bs: + self.model.clear_cache() + self.prev_bs = round_up(batch.batch_size, BATCH_BUCKET_SIZE) dbg_trace( scenario, f'bs:{batch.batch_size} num_reqs:{len(batch.requests)} seq_len:{batch.seq_length} padding:{batch.right_padding}') assert batch.right_padding > 0, 'No more room for next token!' @@ -981,6 +987,10 @@ class CausalLM(Model): batch.past_key_values, bypass_hpu_graph=prefill and self.limit_hpu_graph if self.enable_hpu_graph else None, ) + elif all([req.stopping_criteria.max_new_tokens == 1 for req in batch.requests]): + # Don't schedule next forward if max_new_tokens for all requests equals 1 + # - we've already generated the first and only needed token in the prefill phase + pass else: token_idx = torch.tensor(batch.attention_mask.shape[-1] - batch.right_padding).to(self.device) input_ids = torch.index_select(batch.input_ids, 1, token_idx - 1)