Mirror of https://github.com/huggingface/text-generation-inference.git
If the limit is set, all prefill batches will bypass the HPU graph.

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
commit 316cb087f3 (parent ff5bc1bbd1)
@@ -1446,7 +1446,7 @@ class FlashCausalLM(Model):
             os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() == "true"
         )
         self.limit_hpu_graph = (
-            os.environ.get("LIMIT_HPU_GRAPH", "true").lower() == "true"
+            os.environ.get("LIMIT_HPU_GRAPH", "false").lower() == "true"
         )
         self.max_seq_len_to_capture = 8192
         super().__init__(
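With this hunk the limit becomes opt-in: the LIMIT_HPU_GRAPH default flips from "true" to "false". A minimal standalone sketch of how the flag parsing reads after the change (a reproduction for illustration, not the file itself):

import os

# After this commit the limit is opt-in: unset, or any value other than
# "true" (case-insensitive), leaves it disabled.
limit_hpu_graph = os.environ.get("LIMIT_HPU_GRAPH", "false").lower() == "true"
print(limit_hpu_graph)  # False unless LIMIT_HPU_GRAPH=true is exported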
@@ -1596,8 +1596,9 @@ class FlashCausalLM(Model):
 
     def bypass_hpu_graphs(self, prefill, max_seq_len_to_capture):
         if self.limit_hpu_graph:
-            return prefill and max_seq_len_to_capture > self.max_seq_len_to_capture
-        return False
+            return prefill
+        else:
+            return prefill and max_seq_len_to_capture > self.max_seq_len_to_capture
 
     def warmup_hpu_graph(self, batch):
         warmup_times = 3
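After this hunk, limit mode bypasses the HPU graph for every prefill batch, while the default mode bypasses only prefill batches whose length exceeds max_seq_len_to_capture (8192 above). A standalone sketch of the new decision logic, rewritten as a free function for illustration (the parameter names limit_hpu_graph and seq_len are illustrative, not from the file):

def bypass_hpu_graphs(limit_hpu_graph: bool, prefill: bool, seq_len: int,
                      max_seq_len_to_capture: int = 8192) -> bool:
    # Limit mode: every prefill batch skips the captured HPU graph.
    if limit_hpu_graph:
        return prefill
    # Default mode: only prefill longer than the capture limit skips it.
    return prefill and seq_len > max_seq_len_to_capture

# Decode batches (prefill=False) never bypass in either mode.
assert bypass_hpu_graphs(True, False, 4096) is False
# Limit set: all prefill bypasses the graph, regardless of length.
assert bypass_hpu_graphs(True, True, 4096) is True
# Limit unset: short prefill uses the graph, long prefill bypasses it.
assert bypass_hpu_graphs(False, True, 4096) is False
assert bypass_hpu_graphs(False, True, 16384) is True

Note that the diff's argument named max_seq_len_to_capture is compared against self.max_seq_len_to_capture, so despite sharing the attribute's name it carries the current batch's sequence length.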