Merge branch 'habana-main' into v2.0.4

Commit d3155d6f41 by yuanwu2017, 2024-07-17 13:45:15 +08:00, committed by GitHub


@@ -602,9 +602,12 @@ class CausalLM(Model):
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
         if speculator:
             raise RuntimeError("Speculator decoding is not enabled for AutoModel")
+        self.prev_bs = 0
         # Create tokenizer
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,
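
The `self.prev_bs = 0` added here seeds the bucketed-batch-size tracking consumed by the cache check in the next hunk; starting at 0 means the first forward never matches and therefore always clears the graph cache. A minimal sketch of the bucketing helper that check relies on (the real `round_up` and `BATCH_BUCKET_SIZE` live elsewhere in the Habana fork, so this implementation and bucket value are assumptions for illustration):

```python
BATCH_BUCKET_SIZE = 8  # hypothetical bucket size; the server configures the real one

def round_up(value: int, multiple: int) -> int:
    """Assumed behavior: round value up to the nearest multiple of `multiple`."""
    return (value + multiple - 1) // multiple * multiple

assert round_up(1, BATCH_BUCKET_SIZE) == 8   # batch sizes 1..8 share one bucket
assert round_up(9, BATCH_BUCKET_SIZE) == 16  # 9..16 land in the next bucket
```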
@@ -965,6 +968,9 @@ class CausalLM(Model):
         batch = batch.__class__.recombine([batch], self.tokenizer.pad_token_id)
         scenario = 'PREFILL' if prefill else 'GENERATE'
+        if self.enable_hpu_graph and self.limit_hpu_graph and round_up(batch.batch_size, BATCH_BUCKET_SIZE) != self.prev_bs:
+            self.model.clear_cache()
+            self.prev_bs = round_up(batch.batch_size, BATCH_BUCKET_SIZE)
         dbg_trace(
             scenario, f'bs:{batch.batch_size} num_reqs:{len(batch.requests)} seq_len:{batch.seq_length} padding:{batch.right_padding}')
         assert batch.right_padding > 0, 'No more room for next token!'
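
To make the intent of the new guard concrete: HPU graphs are recorded per bucketed batch size, so the cache is dropped only when the bucket actually changes, not on every batch-size fluctuation. A hedged, self-contained sketch of that invalidation pattern follows; `GraphCacheStub` is a stand-in for the HPU-graph-wrapped model, and `round_up` is the assumed helper from above.

```python
def round_up(value: int, multiple: int) -> int:
    return (value + multiple - 1) // multiple * multiple

class GraphCacheStub:
    # Stand-in for the wrapped model; only clear_cache() matters for this sketch.
    def clear_cache(self) -> None:
        print("dropping cached HPU graphs")

model = GraphCacheStub()
prev_bs = 0  # same sentinel as self.prev_bs = 0 in __init__
for batch_size in (1, 3, 9, 12, 17):
    bucketed = round_up(batch_size, 8)
    if bucketed != prev_bs:  # only bucket transitions pay the recapture cost
        model.clear_cache()
        prev_bs = bucketed
# Buckets come out as 8, 8, 16, 16, 24: three clears for five batches.
```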
@@ -981,6 +987,10 @@ class CausalLM(Model):
                 batch.past_key_values,
                 bypass_hpu_graph=prefill and self.limit_hpu_graph if self.enable_hpu_graph else None,
             )
+        elif all([req.stopping_criteria.max_new_tokens == 1 for req in batch.requests]):
+            # Don't schedule next forward if max_new_tokens for all requests equals 1
+            # - we've already generated the first and only needed token in the prefill phase
+            pass
         else:
             token_idx = torch.tensor(batch.attention_mask.shape[-1] - batch.right_padding).to(self.device)
             input_ids = torch.index_select(batch.input_ids, 1, token_idx - 1)
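
The new `elif` skips scheduling a decode forward when every request in the batch asked for exactly one new token, since prefill already produced it. A minimal sketch of that short-circuit, assuming only the `stopping_criteria.max_new_tokens` field visible in the diff (the dataclasses are simplified stand-ins for the server's request types):

```python
from dataclasses import dataclass

@dataclass
class StoppingCriteria:
    max_new_tokens: int

@dataclass
class Request:
    stopping_criteria: StoppingCriteria

def decode_forward_needed(requests: list[Request]) -> bool:
    # Prefill already emitted one token per request, so another forward
    # is useful only if some request may generate more than one token.
    return any(r.stopping_criteria.max_new_tokens > 1 for r in requests)

assert not decode_forward_needed([Request(StoppingCriteria(1))])
assert decode_forward_needed([Request(StoppingCriteria(1)), Request(StoppingCriteria(8))])
```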