prefill bypass graph
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
This commit is contained in:
parent 6b21985c95
commit 5ec7f15d0c
@@ -1785,7 +1785,7 @@ class FlashCausalLM(Model):
 
         kwargs = {}
         if htorch.utils.internal.is_lazy():
-            kwargs["bypass_hpu_graphs"] = False
+            kwargs["bypass_hpu_graphs"] = batch.prefilling
 
         logits, speculative_logits = self.model.forward(
             input_ids=input_ids,
@@ -455,7 +455,7 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
 
         kwargs = {}
         if htorch.utils.internal.is_lazy():
-            kwargs["bypass_hpu_graphs"] = False
+            kwargs["bypass_hpu_graphs"] = batch.prefilling
         if batch.prefill_cache_indices is not None:
             slots_pad = torch.zeros_like(input_ids)
             slots_pad[batch.prefill_cache_indices] = slots
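For context: in lazy mode the Gaudi backend runs the model through an HPU graph wrapper, and the `bypass_hpu_graphs` keyword that this diff feeds through `kwargs` tells the wrapped forward whether to skip graph capture/replay for that call. Switching the value from `False` to `batch.prefilling` makes prefill passes (whose input shapes vary from batch to batch) bypass the graph, while decode passes keep replaying the cached graph. Below is a minimal, self-contained sketch of that pattern; the `TinyModel` module, the tensor shapes, and the `run_forward` helper are illustrative assumptions rather than TGI code, and running it requires a Gaudi/HPU environment with `habana_frameworks` installed.

# Sketch only: illustrates the bypass_hpu_graphs pattern this commit changes.
import torch
import habana_frameworks.torch as htorch
from habana_frameworks.torch.hpu.graphs import wrap_in_hpu_graph


class TinyModel(torch.nn.Module):
    """Stand-in for the real language model (illustrative assumption)."""

    def __init__(self) -> None:
        super().__init__()
        self.proj = torch.nn.Linear(64, 64)

    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
        return self.proj(hidden)


model = TinyModel().to("hpu")
if htorch.utils.internal.is_lazy():
    # Wrap once; decode steps with stable shapes benefit from graph replay.
    model = wrap_in_hpu_graph(model)


def run_forward(hidden: torch.Tensor, prefilling: bool) -> torch.Tensor:
    kwargs = {}
    if htorch.utils.internal.is_lazy():
        # Mirrors the change in the diff above: bypass HPU graphs during
        # prefill (variable shapes), keep graph replay for decode (fixed
        # shapes). Assumes the wrapped forward accepts bypass_hpu_graphs,
        # which is what the diff relies on when calling self.model.forward.
        kwargs["bypass_hpu_graphs"] = prefilling
    return model(hidden, **kwargs)


# Example usage: the prefill call bypasses the graph, the decode call replays it.
prefill_out = run_forward(torch.randn(4, 64, device="hpu"), prefilling=True)
decode_out = run_forward(torch.randn(1, 64, device="hpu"), prefilling=False)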