use tensor cache in hpu graph to avoid replay issue

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Author: Wang, Yi A
Date:   2025-03-17 01:36:49 -07:00
parent a07e7437b6
commit 6bbe24d974

@@ -1398,7 +1398,7 @@ class FlashCausalLM(Model):
         self.kv_cache_dtype = dtype if kv_cache_dtype is None else kv_cache_dtype
         if htorch.utils.internal.is_lazy():
-            htorch.hpu.wrap_in_hpu_graph(model, disable_tensor_cache=True)
+            htorch.hpu.wrap_in_hpu_graph(model, disable_tensor_cache=False)
         environment.set_model_config(self.config)
         self.use_contiguous_pa = (
             os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() == "true"
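
For context, a minimal sketch of the pattern this one-line change touches: wrapping a module in a Habana HPU graph under lazy mode. Only htorch.utils.internal.is_lazy and htorch.hpu.wrap_in_hpu_graph come from the diff itself; ToyModel, the tensor shapes, and the surrounding driver code are hypothetical illustration. Per the commit title, keeping disable_tensor_cache=False (tensor cache enabled) is what avoids the replay issue.

    # Hedged sketch, not this repo's code: ToyModel and shapes are assumptions.
    import torch
    import habana_frameworks.torch as htorch  # provides htorch.hpu / htorch.utils

    class ToyModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(16, 16)

        def forward(self, x):
            return self.linear(x)

    model = ToyModel().to("hpu")

    if htorch.utils.internal.is_lazy():
        # disable_tensor_cache=False keeps the wrapper's tensor cache enabled;
        # disable_tensor_cache=True (the setting this commit backs out of) is
        # intended to save memory by not caching tensors, which per the commit
        # title led to a replay issue here.
        model = htorch.hpu.wrap_in_hpu_graph(model, disable_tensor_cache=False)

    # The first forward pass records the graph; later calls with the same
    # shapes replay it, so tensors the replay depends on must remain valid.
    out = model(torch.randn(4, 16, device="hpu"))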