From 5ec7f15d0c61ea80eba606cfa18af3db0555cb1a Mon Sep 17 00:00:00 2001
From: "Wang, Yi A"
Date: Tue, 15 Apr 2025 00:27:07 -0700
Subject: [PATCH] prefill bypass graph

Signed-off-by: Wang, Yi A
---
 .../server/text_generation_server/models/flash_causal_lm.py  | 2 +-
 .../server/text_generation_server/models/mllama_causal_lm.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
index 5c7b8bc01..8a5668a5d 100644
--- a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
@@ -1785,7 +1785,7 @@ class FlashCausalLM(Model):

         kwargs = {}
         if htorch.utils.internal.is_lazy():
-            kwargs["bypass_hpu_graphs"] = False
+            kwargs["bypass_hpu_graphs"] = batch.prefilling

         logits, speculative_logits = self.model.forward(
             input_ids=input_ids,
diff --git a/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py b/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
index 6a0661851..c1ea36f29 100644
--- a/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
@@ -455,7 +455,7 @@ class FlashMllamaCausalLM(FlashVlmCausalLM):

         kwargs = {}
         if htorch.utils.internal.is_lazy():
-            kwargs["bypass_hpu_graphs"] = False
+            kwargs["bypass_hpu_graphs"] = batch.prefilling

         if batch.prefill_cache_indices is not None:
             slots_pad = torch.zeros_like(input_ids)
             slots_pad[batch.prefill_cache_indices] = slots
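
The one-line change in both files means that, in HPU lazy mode, the forward pass no longer always goes through the captured HPU graph: graph capture/replay is bypassed whenever the batch is prefilling (variable input shapes) and kept for decode (static shapes, where replay pays off). The sketch below is a minimal, hypothetical illustration of that pattern, not TGI code: TinyModel and forward_step are invented for this example; htorch.utils.internal.is_lazy() appears verbatim in the patched hunks, while htorch.hpu.wrap_in_hpu_graph is the standard habana_frameworks wrapper assumed to be applied to the model elsewhere.

# Hypothetical sketch of the prefill-bypass pattern; assumes a Gaudi device with
# habana_frameworks installed. Not the actual TGI code path.
import torch
import habana_frameworks.torch as htorch


class TinyModel(torch.nn.Module):
    """Invented stand-in for the served model."""

    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(16, 16)

    def forward(self, hidden_states):
        return self.proj(hidden_states)


model = TinyModel().to("hpu")
if htorch.utils.internal.is_lazy():
    # Wrapping caches a replayable HPU graph keyed on input shapes; replay is only
    # cheap when the same shapes recur, which holds for decode but not for prefill.
    model = htorch.hpu.wrap_in_hpu_graph(model)


def forward_step(model, hidden_states, prefilling: bool):
    kwargs = {}
    if htorch.utils.internal.is_lazy():
        # Mirrors the patch: bypass HPU graphs for prefill (variable shapes),
        # keep graph replay for decode (static shapes).
        kwargs["bypass_hpu_graphs"] = prefilling
    return model(hidden_states, **kwargs)


# Prefill pass: sequence shapes vary per request, so skip graph capture/replay.
prefill_out = forward_step(model, torch.randn(4, 16, device="hpu"), prefilling=True)
# Decode pass: static shapes, replay the cached HPU graph.
decode_out = forward_step(model, torch.randn(4, 16, device="hpu"), prefilling=False)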