Mirror of https://github.com/huggingface/text-generation-inference.git
Use tensor cache in HPU graph to avoid replay issue
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Parent: a07e7437b6
Commit: 6bbe24d974
@@ -1398,7 +1398,7 @@ class FlashCausalLM(Model):
         self.kv_cache_dtype = dtype if kv_cache_dtype is None else kv_cache_dtype

         if htorch.utils.internal.is_lazy():
-            htorch.hpu.wrap_in_hpu_graph(model, disable_tensor_cache=True)
+            htorch.hpu.wrap_in_hpu_graph(model, disable_tensor_cache=False)
         environment.set_model_config(self.config)
         self.use_contiguous_pa = (
             os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() == "true"
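For context, here is a minimal sketch of the wrapping pattern this commit changes, assuming a Gaudi/HPU host with habana_frameworks.torch installed and imported as htorch, as in the diff. The toy Linear module and input tensor below are illustrative; only wrap_in_hpu_graph, is_lazy, and the disable_tensor_cache argument come from the change itself. Passing disable_tensor_cache=False keeps the HPU graph's tensor cache enabled, so tensors captured when the graph is recorded stay valid when the graph is replayed, which is what the commit relies on to avoid the replay issue.

import torch
import habana_frameworks.torch as htorch

# Illustrative module only; the real code wraps the loaded language model.
model = torch.nn.Linear(16, 16).to("hpu")

if htorch.utils.internal.is_lazy():
    # disable_tensor_cache=False leaves the tensor cache on, so replays of the
    # recorded HPU graph reuse cached tensors instead of stale references.
    model = htorch.hpu.wrap_in_hpu_graph(model, disable_tensor_cache=False)

x = torch.randn(4, 16, device="hpu")
with torch.no_grad():
    y = model(x)          # first call records the HPU graph, later calls replay it
htorch.hpu.synchronize()  # flush lazy-mode execution before reading results
print(y.shape)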