prefill bypass graph
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
This commit is contained in:
parent 6b21985c95
commit 5ec7f15d0c
@@ -1785,7 +1785,7 @@ class FlashCausalLM(Model):
 
         kwargs = {}
         if htorch.utils.internal.is_lazy():
-            kwargs["bypass_hpu_graphs"] = False
+            kwargs["bypass_hpu_graphs"] = batch.prefilling
 
         logits, speculative_logits = self.model.forward(
             input_ids=input_ids,
@@ -455,7 +455,7 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):
 
         kwargs = {}
         if htorch.utils.internal.is_lazy():
-            kwargs["bypass_hpu_graphs"] = False
+            kwargs["bypass_hpu_graphs"] = batch.prefilling
         if batch.prefill_cache_indices is not None:
             slots_pad = torch.zeros_like(input_ids)
             slots_pad[batch.prefill_cache_indices] = slots
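For context: in lazy mode the Gaudi backend runs the model through an HPU graph wrapper, and the `bypass_hpu_graphs` keyword that this diff feeds through `kwargs` tells the wrapped forward whether to skip graph capture/replay for that call. Switching the value from `False` to `batch.prefilling` makes prefill passes (whose input shapes vary from batch to batch) bypass the graph, while decode passes keep replaying the cached graph. Below is a minimal, self-contained sketch of that pattern; the `TinyModel` module, the tensor shapes, and the `run_forward` helper are illustrative assumptions rather than TGI code, and running it requires a Gaudi/HPU environment with `habana_frameworks` installed.

# Sketch only: illustrates the bypass_hpu_graphs pattern this commit changes.
import torch
import habana_frameworks.torch as htorch
from habana_frameworks.torch.hpu.graphs import wrap_in_hpu_graph


class TinyModel(torch.nn.Module):
    """Stand-in for the real language model (illustrative assumption)."""

    def __init__(self) -> None:
        super().__init__()
        self.proj = torch.nn.Linear(64, 64)

    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
        return self.proj(hidden)


model = TinyModel().to("hpu")
if htorch.utils.internal.is_lazy():
    # Wrap once; decode steps with stable shapes benefit from graph replay.
    model = wrap_in_hpu_graph(model)


def run_forward(hidden: torch.Tensor, prefilling: bool) -> torch.Tensor:
    kwargs = {}
    if htorch.utils.internal.is_lazy():
        # Mirrors the change in the diff above: bypass HPU graphs during
        # prefill (variable shapes), keep graph replay for decode (fixed
        # shapes). Assumes the wrapped forward accepts bypass_hpu_graphs,
        # which is what the diff relies on when calling self.model.forward.
        kwargs["bypass_hpu_graphs"] = prefilling
    return model(hidden, **kwargs)


# Example usage: the prefill call bypasses the graph, the decode call replays it.
prefill_out = run_forward(torch.randn(4, 64, device="hpu"), prefilling=True)
decode_out = run_forward(torch.randn(1, 64, device="hpu"), prefilling=False)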