From 316cb087f327ee37267c390934bb7a9c5378ecf8 Mon Sep 17 00:00:00 2001
From: "Wang, Yi A"
Date: Wed, 7 May 2025 06:07:20 -0700
Subject: [PATCH] if limit is set, all prefill will bypass graph

Signed-off-by: Wang, Yi A
---
 .../server/text_generation_server/models/flash_causal_lm.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
index 5aadff0d..cafaae23 100644
--- a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
@@ -1446,7 +1446,7 @@ class FlashCausalLM(Model):
             os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() == "true"
         )
         self.limit_hpu_graph = (
-            os.environ.get("LIMIT_HPU_GRAPH", "true").lower() == "true"
+            os.environ.get("LIMIT_HPU_GRAPH", "false").lower() == "true"
        )
         self.max_seq_len_to_capture = 8192
         super().__init__(
@@ -1596,8 +1596,9 @@ class FlashCausalLM(Model):
 
     def bypass_hpu_graphs(self, prefill, max_seq_len_to_capture):
         if self.limit_hpu_graph:
+            return prefill
+        else:
             return prefill and max_seq_len_to_capture > self.max_seq_len_to_capture
-        return False
 
     def warmup_hpu_graph(self, batch):
         warmup_times = 3
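
For context, here is a minimal standalone sketch of the decision logic this patch installs: with LIMIT_HPU_GRAPH=true every prefill bypasses the captured HPU graph, while the previous length-based check (sequence longer than the 8192-token capture limit) now applies only when the limit is unset. Note the patch also flips the LIMIT_HPU_GRAPH default from "true" to "false", so out of the box short prefills keep using the graph. The HpuGraphPolicy wrapper class and the __main__ harness below are hypothetical scaffolding for illustration; only the method body mirrors the diff.

# Illustration only, not part of the patch.
import os


class HpuGraphPolicy:
    def __init__(self):
        # After this patch, LIMIT_HPU_GRAPH defaults to "false".
        self.limit_hpu_graph = (
            os.environ.get("LIMIT_HPU_GRAPH", "false").lower() == "true"
        )
        self.max_seq_len_to_capture = 8192

    def bypass_hpu_graphs(self, prefill, max_seq_len_to_capture):
        if self.limit_hpu_graph:
            # Limit set: every prefill skips the captured HPU graph.
            return prefill
        else:
            # Limit unset: a prefill skips the graph only when the sequence
            # exceeds the captured length (8192 tokens).
            return prefill and max_seq_len_to_capture > self.max_seq_len_to_capture


if __name__ == "__main__":
    policy = HpuGraphPolicy()
    # Decode steps (prefill=False) never bypass the graph in either mode.
    print(policy.bypass_hpu_graphs(prefill=False, max_seq_len_to_capture=4096))
    # With LIMIT_HPU_GRAPH unset, a short prefill still uses the graph...
    print(policy.bypass_hpu_graphs(prefill=True, max_seq_len_to_capture=4096))
    # ...while a long prefill bypasses it.
    print(policy.bypass_hpu_graphs(prefill=True, max_seq_len_to_capture=16384))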