Merge branch 'habana-main' into v2.0.4

Commit d3155d6f41 by yuanwu2017, 2024-07-17 13:45:15 +08:00, committed by GitHub


@@ -602,9 +602,12 @@ class CausalLM(Model):
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
         if speculator:
             raise RuntimeError("Speculator decoding is not enabled for AutoModel")
+        self.prev_bs = 0
         # Create tokenizer
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,
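
The `self.prev_bs = 0` added here seeds the bucketed-batch-size tracking consumed by the cache check in the next hunk; starting at 0 means the first forward never matches and therefore always clears the graph cache. A minimal sketch of the bucketing helper that check relies on (the real `round_up` and `BATCH_BUCKET_SIZE` live elsewhere in the Habana fork, so this implementation and bucket value are assumptions for illustration):

```python
BATCH_BUCKET_SIZE = 8  # hypothetical bucket size; the server configures the real one

def round_up(value: int, multiple: int) -> int:
    """Assumed behavior: round value up to the nearest multiple of `multiple`."""
    return (value + multiple - 1) // multiple * multiple

assert round_up(1, BATCH_BUCKET_SIZE) == 8   # batch sizes 1..8 share one bucket
assert round_up(9, BATCH_BUCKET_SIZE) == 16  # 9..16 land in the next bucket
```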
@@ -965,6 +968,9 @@ class CausalLM(Model):
         batch = batch.__class__.recombine([batch], self.tokenizer.pad_token_id)
         scenario = 'PREFILL' if prefill else 'GENERATE'
+        if self.enable_hpu_graph and self.limit_hpu_graph and round_up(batch.batch_size, BATCH_BUCKET_SIZE) != self.prev_bs:
+            self.model.clear_cache()
+            self.prev_bs = round_up(batch.batch_size, BATCH_BUCKET_SIZE)
         dbg_trace(
             scenario, f'bs:{batch.batch_size} num_reqs:{len(batch.requests)} seq_len:{batch.seq_length} padding:{batch.right_padding}')
         assert batch.right_padding > 0, 'No more room for next token!'
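
To make the intent of the new guard concrete: HPU graphs are recorded per bucketed batch size, so the cache is dropped only when the bucket actually changes, not on every batch-size fluctuation. A hedged, self-contained sketch of that invalidation pattern follows; `GraphCacheStub` is a stand-in for the HPU-graph-wrapped model, and `round_up` is the assumed helper from above.

```python
def round_up(value: int, multiple: int) -> int:
    return (value + multiple - 1) // multiple * multiple

class GraphCacheStub:
    # Stand-in for the wrapped model; only clear_cache() matters for this sketch.
    def clear_cache(self) -> None:
        print("dropping cached HPU graphs")

model = GraphCacheStub()
prev_bs = 0  # same sentinel as self.prev_bs = 0 in __init__
for batch_size in (1, 3, 9, 12, 17):
    bucketed = round_up(batch_size, 8)
    if bucketed != prev_bs:  # only bucket transitions pay the recapture cost
        model.clear_cache()
        prev_bs = bucketed
# Buckets come out as 8, 8, 16, 16, 24: three clears for five batches.
```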
@@ -981,6 +987,10 @@ class CausalLM(Model):
                 batch.past_key_values,
                 bypass_hpu_graph=prefill and self.limit_hpu_graph if self.enable_hpu_graph else None,
             )
+        elif all([req.stopping_criteria.max_new_tokens == 1 for req in batch.requests]):
+            # Don't schedule next forward if max_new_tokens for all requests equals 1
+            # - we've already generated the first and only needed token in the prefill phase
+            pass
         else:
             token_idx = torch.tensor(batch.attention_mask.shape[-1] - batch.right_padding).to(self.device)
             input_ids = torch.index_select(batch.input_ids, 1, token_idx - 1)
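
The new `elif` skips scheduling a decode forward when every request in the batch asked for exactly one new token, since prefill already produced it. A minimal sketch of that short-circuit, assuming only the `stopping_criteria.max_new_tokens` field visible in the diff (the dataclasses are simplified stand-ins for the server's request types):

```python
from dataclasses import dataclass

@dataclass
class StoppingCriteria:
    max_new_tokens: int

@dataclass
class Request:
    stopping_criteria: StoppingCriteria

def decode_forward_needed(requests: list[Request]) -> bool:
    # Prefill already emitted one token per request, so another forward
    # is useful only if some request may generate more than one token.
    return any(r.stopping_criteria.max_new_tokens > 1 for r in requests)

assert not decode_forward_needed([Request(StoppingCriteria(1))])
assert decode_forward_needed([Request(StoppingCriteria(1)), Request(StoppingCriteria(8))])
```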