Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-19 22:02:06 +00:00)
fix: also show total memory after full warmup
This commit is contained in:
parent
8b4cd2a9fc
commit
e152cb022b
@@ -1386,7 +1386,8 @@ class FlashCausalLM(Model):
                 total_cuda_graph_memory = free_memory_post_alloc - last_available_memory
                 log_master(
                     logger.info,
-                    f"Total memory used for CUDA graphs: {total_cuda_graph_memory/1024/1024:.2f} MB",
+                    f"Total memory used for CUDA graphs: {total_cuda_graph_memory/1024/1024:.2f} MB"
+                    f"\nTotal memory available: {last_available_memory/1024/1024:.2f} MB",
                 )
             except torch.cuda.OutOfMemoryError:
                 logger.exception("Decode cuda graph warmup failed")
|
Loading…
Reference in New Issue
Block a user