From 6bbe24d9743e4a2c7e8a02890cd3aef9cea08c1d Mon Sep 17 00:00:00 2001
From: "Wang, Yi A"
Date: Mon, 17 Mar 2025 01:36:49 -0700
Subject: [PATCH] use tensor cache in hpu graph to avoid replay issue

Signed-off-by: Wang, Yi A
---
 .../server/text_generation_server/models/flash_causal_lm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
index 27e1c6724..3a0dc15e0 100644
--- a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
@@ -1398,7 +1398,7 @@ class FlashCausalLM(Model):
 
         self.kv_cache_dtype = dtype if kv_cache_dtype is None else kv_cache_dtype
         if htorch.utils.internal.is_lazy():
-            htorch.hpu.wrap_in_hpu_graph(model, disable_tensor_cache=True)
+            htorch.hpu.wrap_in_hpu_graph(model, disable_tensor_cache=False)
         environment.set_model_config(self.config)
         self.use_contiguous_pa = (
             os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() == "true"
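
Note (illustrative sketch, not part of the patch): the change flips disable_tensor_cache
when wrapping the model in an HPU graph under lazy mode, so graph replays reuse the
graph's cached tensors instead of hitting the replay issue named in the subject. In the
snippet below, only wrap_in_hpu_graph, is_lazy, and the disable_tensor_cache flag come
from the patched code; TinyModel and the sample input are hypothetical placeholders,
and running it assumes habana_frameworks.torch plus Gaudi hardware.

    import torch
    import habana_frameworks.torch as htorch  # HPU graph and lazy-mode utilities

    class TinyModel(torch.nn.Module):  # hypothetical stand-in for the real TGI model
        def forward(self, x):
            return x * 2

    model = TinyModel().to("hpu")

    if htorch.utils.internal.is_lazy():
        # The patched call: disable_tensor_cache=False keeps the graph's tensor
        # cache enabled across replays.
        model = htorch.hpu.wrap_in_hpu_graph(model, disable_tensor_cache=False)

    out = model(torch.ones(4, device="hpu"))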