Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-10 11:54:52 +00:00).
If LIMIT_HPU_GRAPH is set, all prefill batches will bypass the HPU graph.
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
This commit is contained in:
parent
ff5bc1bbd1
commit
316cb087f3
@ -1446,7 +1446,7 @@ class FlashCausalLM(Model):
|
||||
os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() == "true"
|
||||
)
|
||||
self.limit_hpu_graph = (
|
||||
os.environ.get("LIMIT_HPU_GRAPH", "true").lower() == "true"
|
||||
os.environ.get("LIMIT_HPU_GRAPH", "false").lower() == "true"
|
||||
)
|
||||
self.max_seq_len_to_capture = 8192
|
||||
super().__init__(
|
||||
@ -1596,8 +1596,9 @@ class FlashCausalLM(Model):
|
||||
|
||||
def bypass_hpu_graphs(self, prefill, max_seq_len_to_capture):
    """Decide whether this forward pass should skip the captured HPU graph.

    Reconstructed from a mangled diff view: the span contained both the
    pre-change body (``return False``) and the post-change body; this is the
    post-change version (hunk ``@ -1596,8 +1596,9`` adds one net line, and the
    commit message says "if limit is set, all prefill will bypass graph").

    Args:
        prefill: Truthy when the current batch is a prefill step.
        max_seq_len_to_capture: Sequence length of the current batch, compared
            against ``self.max_seq_len_to_capture`` (set to 8192 in
            ``__init__`` per the first hunk).

    Returns:
        Truthy when the HPU graph should be bypassed for this step.
    """
    if self.limit_hpu_graph:
        # LIMIT_HPU_GRAPH is enabled: every prefill bypasses the graph.
        return prefill
    else:
        # Otherwise only bypass for prefills whose sequence length exceeds
        # the capture limit (graphs were never captured that large).
        return prefill and max_seq_len_to_capture > self.max_seq_len_to_capture
|
||||
|
||||
def warmup_hpu_graph(self, batch):
|
||||
warmup_times = 3
|
||||
|
Loading…
Reference in New Issue
Block a user