Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-10 11:54:52 +00:00).
If LIMIT_HPU_GRAPH is set, all prefill batches will bypass the HPU graph.
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
This commit is contained in:
parent
ff5bc1bbd1
commit
316cb087f3
@ -1446,7 +1446,7 @@ class FlashCausalLM(Model):
|
||||
os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() == "true"
|
||||
)
|
||||
self.limit_hpu_graph = (
|
||||
os.environ.get("LIMIT_HPU_GRAPH", "true").lower() == "true"
|
||||
os.environ.get("LIMIT_HPU_GRAPH", "false").lower() == "true"
|
||||
)
|
||||
self.max_seq_len_to_capture = 8192
|
||||
super().__init__(
|
||||
@ -1596,8 +1596,9 @@ class FlashCausalLM(Model):
|
||||
|
||||
def bypass_hpu_graphs(self, prefill, max_seq_len_to_capture):
    """Decide whether this forward pass should skip the captured HPU graph.

    Reconstructed from a mangled diff view: the span contained both the
    pre-change body (``return False``) and the post-change body; this is the
    post-change version (hunk ``@ -1596,8 +1596,9`` adds one net line, and the
    commit message says "if limit is set, all prefill will bypass graph").

    Args:
        prefill: Truthy when the current batch is a prefill step.
        max_seq_len_to_capture: Sequence length of the current batch, compared
            against ``self.max_seq_len_to_capture`` (set to 8192 in
            ``__init__`` per the first hunk).

    Returns:
        Truthy when the HPU graph should be bypassed for this step.
    """
    if self.limit_hpu_graph:
        # LIMIT_HPU_GRAPH is enabled: every prefill bypasses the graph.
        return prefill
    else:
        # Otherwise only bypass for prefills whose sequence length exceeds
        # the capture limit (graphs were never captured that large).
        return prefill and max_seq_len_to_capture > self.max_seq_len_to_capture
|
||||
|
||||
def warmup_hpu_graph(self, batch):
|
||||
warmup_times = 3
|
||||
|
Loading…
Reference in New Issue
Block a user