mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-23 16:02:10 +00:00
Doesn't run the prefill warmup when limit_hpu_graph=true
Signed-off-by: yuanwu <yuan.wu@intel.com>
This commit is contained in:
parent
4586325a34
commit
0228bd0260
@ -1202,7 +1202,6 @@ class CausalLM(Model):
|
|||||||
decode_batch_size_list.append(max_decode_batch_size)
|
decode_batch_size_list.append(max_decode_batch_size)
|
||||||
decode_batch_size_list.sort(reverse=True)
|
decode_batch_size_list.sort(reverse=True)
|
||||||
|
|
||||||
self.limit_hpu_graph = True
|
|
||||||
try:
|
try:
|
||||||
for batch_size in decode_batch_size_list:
|
for batch_size in decode_batch_size_list:
|
||||||
batches= []
|
batches= []
|
||||||
@ -1234,11 +1233,12 @@ class CausalLM(Model):
|
|||||||
f"Memory stats: {mem_stats} "
|
f"Memory stats: {mem_stats} "
|
||||||
)
|
)
|
||||||
|
|
||||||
|
limit_hpu_graph = os.getenv("LIMIT_HPU_GRAPH", "false").lower() == "true"
|
||||||
|
if limit_hpu_graph == False:
|
||||||
# Warmup prefill batch_size
|
# Warmup prefill batch_size
|
||||||
max_input_length = request.max_input_length
|
max_input_length = request.max_input_length
|
||||||
prefill_batch_size_list = []
|
prefill_batch_size_list = []
|
||||||
prefill_seqlen_list = []
|
prefill_seqlen_list = []
|
||||||
#Prefill and decode warmup
|
|
||||||
try:
|
try:
|
||||||
for batch_size in range(max_prefill_batch_size, 0, -PREFILL_BATCH_BUCKET_SIZE):
|
for batch_size in range(max_prefill_batch_size, 0, -PREFILL_BATCH_BUCKET_SIZE):
|
||||||
prefill_batch_size_list.append(batch_size)
|
prefill_batch_size_list.append(batch_size)
|
||||||
@ -1257,8 +1257,6 @@ class CausalLM(Model):
|
|||||||
)
|
)
|
||||||
prefill_batch_size_list.sort()
|
prefill_batch_size_list.sort()
|
||||||
prefill_seqlen_list.sort()
|
prefill_seqlen_list.sort()
|
||||||
limit_hpu_graph = os.getenv("LIMIT_HPU_GRAPH", "false").lower() == "true"
|
|
||||||
if limit_hpu_graph == False:
|
|
||||||
mem_stats = get_hpu_memory_stats(self.device)
|
mem_stats = get_hpu_memory_stats(self.device)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"\nFollowing prefill and decode warmup successfully.\n"
|
f"\nFollowing prefill and decode warmup successfully.\n"
|
||||||
|
Loading…
Reference in New Issue
Block a user