Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-09 03:14:53 +00:00)
Make style

commit afbebe6990
parent 2b2b4a814d
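The diff below is style-only: each logger.info call that fits within the formatter's line limit is joined onto one line, and each call that exceeds it is split across several lines; the logged messages themselves are unchanged. A minimal sketch (assuming the repo's `make style` target runs black with its default 88-character line length, and using simplified stand-in code rather than the real module) reproduces both transformations:

# Minimal sketch, not part of the commit: assumes `make style` runs black with the
# default 88-character line length, and uses stand-in code to show why one
# logger.info(...) call is joined onto a single line while the other is split.
import black

SRC = r'''
def warmup(decode_batch_size_list, mem_stats, logger):
    logger.info(
        f"Prefill warmup successful.\n"
        f"Memory stats: {mem_stats} "
    )
    try:
        for batch_size in decode_batch_size_list:
            logger.info(f"Decode warmup for `batch_size={batch_size}`, this may take a while...")
    except Exception:
        pass
'''

# black joins the first call (it fits within 88 columns) and splits the second (it does not).
print(black.format_str(SRC, mode=black.Mode(line_length=88)))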
@@ -1380,10 +1380,7 @@ class CausalLM(Model):
         prefill_seqlen_list.sort()
         prefill_batch_size_list.sort()
         mem_stats = get_hpu_memory_stats(self.device)
-        logger.info(
-            f"Prefill warmup successful.\n"
-            f"Memory stats: {mem_stats} "
-        )
+        logger.info(f"Prefill warmup successful.\n" f"Memory stats: {mem_stats} ")
 
         max_decode_batch_size = math.floor(MAX_BATCH_TOTAL_TOKENS / MAX_TOTAL_TOKENS)
         max_exp = math.ceil(math.log(max_decode_batch_size, BATCH_SIZE_EXPONENT_BASE))
@@ -1395,7 +1392,9 @@ class CausalLM(Model):
 
         try:
             for batch_size in decode_batch_size_list:
-                logger.info(f"Decode warmup for `batch_size={batch_size}`, this may take a while...")
+                logger.info(
+                    f"Decode warmup for `batch_size={batch_size}`, this may take a while..."
+                )
                 batches = []
                 iters = math.floor(batch_size / max_prefill_batch_size)
                 for i in range(iters):
@@ -1428,10 +1427,7 @@ class CausalLM(Model):
         decode_batch_size_list.sort()
         max_supported_total_tokens = MAX_TOTAL_TOKENS * decode_batch_size_list[-1]
         mem_stats = get_hpu_memory_stats(self.device)
-        logger.info(
-            f"Decode warmup successful.\n"
-            f"Memory stats: {mem_stats} "
-        )
+        logger.info(f"Decode warmup successful.\n" f"Memory stats: {mem_stats} ")
 
         max_input_tokens = max_input_tokens
         max_total_tokens = MAX_TOTAL_TOKENS
@@ -1539,10 +1539,7 @@ class VlmCausalLM(Model):
         )
 
         mem_stats = get_hpu_memory_stats(self.device)
-        logger.info(
-            f"Prefill warmup successful.\n"
-            f"Memory stats: {mem_stats} "
-        )
+        logger.info(f"Prefill warmup successful.\n" f"Memory stats: {mem_stats} ")
 
         max_decode_batch_size = MAX_BATCH_SIZE
         batch_size = max_prefill_batch_size * 2
@@ -1556,7 +1553,9 @@ class VlmCausalLM(Model):
         batches = []
         while batch_size <= max_decode_batch_size:
             for i in range(int(batch_size / max_prefill_batch_size)):
-                logger.info(f"Decode warmup for `batch_size={batch_size}`, this may take a while...")
+                logger.info(
+                    f"Decode warmup for `batch_size={batch_size}`, this may take a while..."
+                )
                 batch = self.generate_warmup_batch(
                     request,
                     PREFILL_WARMUP_SEQLEN_LIST[0] - 1,
@@ -1599,10 +1598,7 @@ class VlmCausalLM(Model):
         )
 
         mem_stats = get_hpu_memory_stats(self.device)
-        logger.info(
-            f"Decode warmup successful.\n"
-            f"Memory stats: {mem_stats}"
-        )
+        logger.info(f"Decode warmup successful.\n" f"Memory stats: {mem_stats}")
 
         max_supported_total_tokens = MAX_BATCH_SIZE * MAX_TOTAL_TOKENS
         max_input_tokens = max_input_tokens
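The reformatting does not change behavior: adjacent f-string literals are concatenated at compile time, so the joined and multi-line forms of each logger.info call emit exactly the same message. A quick standalone check (using a hypothetical mem_stats value in place of get_hpu_memory_stats(self.device)):

# Quick check, not part of the commit: adjacent f-string literals are concatenated
# at compile time, so both layouts build the same log message.
mem_stats = {"memory_allocated (GB)": 1.2}  # hypothetical stand-in value

single_line = f"Prefill warmup successful.\n" f"Memory stats: {mem_stats} "
multi_line = (
    f"Prefill warmup successful.\n"
    f"Memory stats: {mem_stats} "
)
assert single_line == multi_line
print(single_line)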