diff --git a/examples/nvme/perf/perf.c b/examples/nvme/perf/perf.c
index 59c458920..1c5b25046 100644
--- a/examples/nvme/perf/perf.c
+++ b/examples/nvme/perf/perf.c
@@ -136,6 +136,11 @@ struct ns_worker_stats {
 	uint64_t		total_tsc;
 	uint64_t		min_tsc;
 	uint64_t		max_tsc;
+	uint64_t		last_tsc;
+	uint64_t		busy_tsc;
+	uint64_t		idle_tsc;
+	uint64_t		last_busy_tsc;
+	uint64_t		last_idle_tsc;
 };
 
 struct ns_worker_ctx {
@@ -203,7 +208,7 @@ struct ns_fn_table {
 	int	(*submit_io)(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
 			     struct ns_entry *entry, uint64_t offset_in_ios);
 
-	void	(*check_io)(struct ns_worker_ctx *ns_ctx);
+	int64_t	(*check_io)(struct ns_worker_ctx *ns_ctx);
 
 	void	(*verify_io)(struct perf_task *task, struct ns_entry *entry);
 
@@ -231,6 +236,8 @@ static pthread_barrier_t g_worker_sync_barrier;
 
 static uint64_t g_tsc_rate;
 
+static bool g_monitor_perf_cores = false;
+
 static uint32_t g_io_align = 0x200;
 static bool g_io_align_specified;
 static uint32_t g_io_size_bytes;
@@ -447,10 +454,10 @@ uring_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
 	return 0;
 }
 
-static void
+static int64_t
 uring_check_io(struct ns_worker_ctx *ns_ctx)
 {
-	int i, count, to_complete, to_submit, ret = 0;
+	int i, to_complete, to_submit, count = 0, ret = 0;
 	struct perf_task *task;
 
 	to_submit = ns_ctx->u.uring.io_pending;
@@ -460,7 +467,7 @@ uring_check_io(struct ns_worker_ctx *ns_ctx)
 		 * It will automatically call spdk_io_uring_enter appropriately. */
 		ret = io_uring_submit(&ns_ctx->u.uring.ring);
 		if (ret < 0) {
-			return;
+			return -1;
 		}
 		ns_ctx->u.uring.io_pending = 0;
 		ns_ctx->u.uring.io_inflight += to_submit;
@@ -481,6 +488,7 @@ uring_check_io(struct ns_worker_ctx *ns_ctx)
 			task_complete(task);
 		}
 	}
+	return count;
 }
 
 static void
@@ -580,7 +588,7 @@ aio_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
 	}
 }
 
-static void
+static int64_t
 aio_check_io(struct ns_worker_ctx *ns_ctx)
 {
 	int count, i;
@@ -598,6 +606,7 @@ aio_check_io(struct ns_worker_ctx *ns_ctx)
 	for (i = 0; i < count; i++) {
 		task_complete(ns_ctx->u.aio.events[i].data);
 	}
+	return count;
 }
 
 static void
@@ -884,7 +893,7 @@ perf_disconnect_cb(struct spdk_nvme_qpair *qpair, void *ctx)
 
 }
 
-static void
+static int64_t
 nvme_check_io(struct ns_worker_ctx *ns_ctx)
 {
 	int64_t rc;
@@ -894,6 +903,7 @@ nvme_check_io(struct ns_worker_ctx *ns_ctx)
 		fprintf(stderr, "NVMe io qpair process completion error\n");
 		exit(1);
 	}
+	return rc;
 }
 
 static void
@@ -1404,6 +1414,11 @@ print_periodic_performance(bool warmup)
 	double mb_this_second;
 	struct worker_thread *worker;
 	struct ns_worker_ctx *ns_ctx;
+	uint64_t busy_tsc;
+	uint64_t idle_tsc;
+	uint64_t core_busy_tsc = 0;
+	uint64_t core_idle_tsc = 0;
+	double core_busy_perc = 0;
 
 	if (!isatty(STDOUT_FILENO)) {
 		/* Don't print periodic stats if output is not going
@@ -1411,17 +1426,34 @@ print_periodic_performance(bool warmup)
 		 */
 		return;
 	}
-
 	io_this_second = 0;
 	TAILQ_FOREACH(worker, &g_workers, link) {
+		busy_tsc = 0;
+		idle_tsc = 0;
 		TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
 			io_this_second += ns_ctx->stats.io_completed - ns_ctx->stats.last_io_completed;
 			ns_ctx->stats.last_io_completed = ns_ctx->stats.io_completed;
+
+			if (g_monitor_perf_cores) {
+				busy_tsc += ns_ctx->stats.busy_tsc - ns_ctx->stats.last_busy_tsc;
+				idle_tsc += ns_ctx->stats.idle_tsc - ns_ctx->stats.last_idle_tsc;
+				ns_ctx->stats.last_busy_tsc = ns_ctx->stats.busy_tsc;
+				ns_ctx->stats.last_idle_tsc = ns_ctx->stats.idle_tsc;
+			}
+		}
+		if (g_monitor_perf_cores) {
+			core_busy_tsc += busy_tsc;
+			core_idle_tsc += idle_tsc;
+			core_busy_perc += (double)core_busy_tsc / (core_idle_tsc + core_busy_tsc) * 100;
 		}
 	}
-
 	mb_this_second = (double)io_this_second * g_io_size_bytes / (1024 * 1024);
-	printf("%s%9ju IOPS, %8.2f MiB/s\r", warmup ? "[warmup] " : "", io_this_second, mb_this_second);
+
+	printf("%s%9ju IOPS, %8.2f MiB/s", warmup ? "[warmup] " : "", io_this_second, mb_this_second);
+	if (g_monitor_perf_cores) {
+		printf("%3d Core(s): %6.2f%% Busy", g_num_workers, core_busy_perc);
+	}
+	printf("\r");
 	fflush(stdout);
 }
 
@@ -1434,6 +1466,8 @@ work_fn(void *arg)
 	uint32_t unfinished_ns_ctx;
 	bool warmup = false;
 	int rc;
+	int64_t check_rc;
+	uint64_t check_now;
 
 	/* Allocate queue pairs for each namespace. */
 	TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
@@ -1473,7 +1507,15 @@ work_fn(void *arg)
 		 * to replace each I/O that is completed.
 		 */
 		TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
-			ns_ctx->entry->fn_table->check_io(ns_ctx);
+			check_now = spdk_get_ticks();
+			check_rc = ns_ctx->entry->fn_table->check_io(ns_ctx);
+
+			if (check_rc > 0) {
+				ns_ctx->stats.busy_tsc += check_now - ns_ctx->stats.last_tsc;
+			} else {
+				ns_ctx->stats.idle_tsc += check_now - ns_ctx->stats.last_tsc;
+			}
+			ns_ctx->stats.last_tsc = check_now;
 		}
 
 		tsc_current = spdk_get_ticks();
@@ -1590,6 +1632,7 @@ static void usage(char *program_name)
 	printf("\t[-A IO buffer alignment. Must be power of 2 and not less than cache line (%u)]\n",
 	       SPDK_CACHE_LINE_SIZE);
 	printf("\t[-S set the default sock impl, e.g. \"posix\"]\n");
+	printf("\t[-m display real-time overall cpu usage on used cores]\n");
 #ifdef SPDK_CONFIG_URING
 	printf("\t[-R enable using liburing to drive kernel devices (Default: libaio)]\n");
 #endif
@@ -2022,7 +2065,7 @@ parse_args(int argc, char **argv)
 	int rc;
 
 	while ((op = getopt(argc, argv,
-			    "a:b:c:e:gi:lo:q:r:k:s:t:w:z:A:C:DGHILM:NO:P:Q:RS:T:U:VZ:")) != -1) {
+			    "a:b:c:e:gi:lmo:q:r:k:s:t:w:z:A:C:DGHILM:NO:P:Q:RS:T:U:VZ:")) != -1) {
 		switch (op) {
 		case 'a':
 		case 'A':
@@ -2117,6 +2160,9 @@ parse_args(int argc, char **argv)
 		case 'l':
 			g_latency_ssd_tracking_enable = true;
 			break;
+		case 'm':
+			g_monitor_perf_cores = true;
+			break;
 		case 'r':
 			if (add_trid(optarg)) {
 				usage(argv[0]);