Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-06-09 10:52:07 +00:00)

Commit d3155d6f41: Merge branch 'habana-main' into v2.0.4
```diff
@@ -602,9 +602,12 @@ class CausalLM(Model):
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):

         if speculator:
             raise RuntimeError("Speculator decoding is not enabled for AutoModel")
+
+        self.prev_bs = 0
+
         # Create tokenizer
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,
```
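For context, `prev_bs` caches the batch-size bucket used for the previous forward pass; the second hunk below compares it against `round_up(batch.batch_size, BATCH_BUCKET_SIZE)`. A minimal sketch of that bucketing arithmetic, assuming `round_up` rounds up to the next multiple (the real helper and the actual value of `BATCH_BUCKET_SIZE` live in the Habana backend and are not shown in this diff):

```python
# Sketch of the bucketing arithmetic assumed by this diff; the real
# round_up helper and BATCH_BUCKET_SIZE come from the Habana backend.
BATCH_BUCKET_SIZE = 8  # illustrative value only


def round_up(value: int, multiple: int) -> int:
    """Round value up to the nearest multiple of `multiple`."""
    return (value + multiple - 1) // multiple * multiple


assert round_up(5, BATCH_BUCKET_SIZE) == 8   # batches of 1..8 share one bucket
assert round_up(8, BATCH_BUCKET_SIZE) == 8
assert round_up(9, BATCH_BUCKET_SIZE) == 16  # crossing a bucket boundary
```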
```diff
@@ -965,6 +968,9 @@ class CausalLM(Model):
         batch = batch.__class__.recombine([batch], self.tokenizer.pad_token_id)

         scenario = 'PREFILL' if prefill else 'GENERATE'
+        if self.enable_hpu_graph and self.limit_hpu_graph and round_up(batch.batch_size, BATCH_BUCKET_SIZE) != self.prev_bs:
+            self.model.clear_cache()
+            self.prev_bs = round_up(batch.batch_size, BATCH_BUCKET_SIZE)
         dbg_trace(
             scenario, f'bs:{batch.batch_size} num_reqs:{len(batch.requests)} seq_len:{batch.seq_length} padding:{batch.right_padding}')
         assert batch.right_padding > 0, 'No more room for next token!'
```
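The three added lines implement a cache-invalidation rule: HPU graphs are recorded per input shape, and with `limit_hpu_graph` set only the current batch-size bucket is kept, so moving to a different bucket must drop the stale graphs. A standalone sketch of the same rule, reusing the `round_up` sketch above and assuming a model object that exposes `clear_cache()` as in the diff; the function name here is hypothetical:

```python
def maybe_clear_graph_cache(model, prev_bs: int, batch_size: int,
                            enable_hpu_graph: bool, limit_hpu_graph: bool) -> int:
    """Clear cached HPU graphs when the batch-size bucket changes.

    Returns the new bucket, which the caller stores back as prev_bs.
    """
    new_bs = round_up(batch_size, BATCH_BUCKET_SIZE)
    if enable_hpu_graph and limit_hpu_graph and new_bs != prev_bs:
        model.clear_cache()  # graphs captured for the old bucket are now stale
    return new_bs
```

Comparing bucketed values rather than raw batch sizes means that batch-size changes within the same bucket keep the captured graphs.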
```diff
@@ -981,6 +987,10 @@ class CausalLM(Model):
                 batch.past_key_values,
                 bypass_hpu_graph=prefill and self.limit_hpu_graph if self.enable_hpu_graph else None,
             )
+        elif all([req.stopping_criteria.max_new_tokens == 1 for req in batch.requests]):
+            # Don't schedule next forward if max_new_tokens for all requests equals 1
+            # - we've already generated the first and only needed token in the prefill phase
+            pass
         else:
             token_idx = torch.tensor(batch.attention_mask.shape[-1] - batch.right_padding).to(self.device)
             input_ids = torch.index_select(batch.input_ids, 1, token_idx - 1)
```
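The `else` branch selects the most recently generated token under right padding: the last real position is `attention_mask.shape[-1] - right_padding - 1`. A toy illustration of that arithmetic with made-up tensors:

```python
import torch

# Toy tensors: 3 real tokens followed by 2 pad positions (right padding).
input_ids = torch.tensor([[11, 12, 13, 0, 0]])
attention_mask = torch.tensor([[1, 1, 1, 0, 0]])
right_padding = 2

# Mirrors the diff: token_idx points one past the last real token.
token_idx = torch.tensor(attention_mask.shape[-1] - right_padding)
last_token = torch.index_select(input_ids, 1, token_idx - 1)
print(last_token)  # tensor([[13]]) -- fed as input_ids for the next decode step
```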