nvme_perf: rate limit I/O error messages

The nvme perf tool can be useful to generate I/O for stress conditions. But if we use it for situations where we expect a high rate of failures, the excessive spew can significantly clutter the log. So add a new -Q option (meaning "quiet") which will rate limit these error messages. Signed-off-by: Jim Harris <james.r.harris@intel.com> Change-Id: Ia8ab2e1ea1cfab9f43d87bcabe8f3f7589b77cda Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/6077 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com> Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com>
2021-01-25 16:10:05 +00:00 · 2021-01-25 16:10:05 +00:00 · d1c17064e7
commit d1c17064e7
parent 9765e956b9
1 changed files with 45 additions and 6 deletions
--- a/examples/nvme/perf/perf.c
+++ b/examples/nvme/perf/perf.c
@ -261,6 +261,29 @@ static bool g_mix_specified;
 static bool g_exit;
 /* Default to 10 seconds for the keep alive value. This value is arbitrary. */
 static uint32_t g_keep_alive_timeout_in_ms = 10000;
+static uint32_t g_quiet_count = 1;
+
+/* When user specifies -Q, some error messages are rate limited.  When rate
+ * limited, we only print the error message every g_quiet_count times the
+ * error occurs.
+ *
+ * Note: the __count is not thread safe, meaning the rate limiting will not
+ * be exact when running perf with multiple thread with lots of errors.
+ * Thread-local __count would mean rate-limiting per thread which doesn't
+ * seem as useful.
+ */
+#define RATELIMIT_LOG(...) \
+	{								\
+		static uint64_t __count = 0;				\
+		if ((__count % g_quiet_count) == 0) {			\
+			if (__count > 0 && g_quiet_count > 1) {		\
+				fprintf(stderr, "Message suppressed %" PRIu32 " times: ",	\
+					g_quiet_count - 1);		\
+			}						\
+			fprintf(stderr, __VA_ARGS__);			\
+		}							\
+		__count++;						\
+	}

 static const char *g_core_mask;

@ -1260,7 +1283,7 @@ submit_single_io(struct perf_task *task)
 	rc = entry->fn_table->submit_io(task, ns_ctx, entry, offset_in_ios);

 	if (spdk_unlikely(rc != 0)) {
-		fprintf(stderr, "starting I/O failed\n");
+		RATELIMIT_LOG("starting I/O failed\n");
 	} else {
 		ns_ctx->current_queue_depth++;
 	}
@ -1317,11 +1340,11 @@ io_complete(void *ctx, const struct spdk_nvme_cpl *cpl)

 	if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) {
 		if (task->is_read) {
-			fprintf(stderr, "Read completed with error (sct=%d, sc=%d)\n",
-				cpl->status.sct, cpl->status.sc);
+			RATELIMIT_LOG("Read completed with error (sct=%d, sc=%d)\n",
+				      cpl->status.sct, cpl->status.sc);
 		} else {
-			fprintf(stderr, "Write completed with error (sct=%d, sc=%d)\n",
-				cpl->status.sct, cpl->status.sc);
+			RATELIMIT_LOG("Write completed with error (sct=%d, sc=%d)\n",
+				      cpl->status.sct, cpl->status.sc);
 		}
 	}

@ -1551,6 +1574,7 @@ static void usage(char *program_name)
 	printf("\t[-C max completions per poll]\n");
 	printf("\t\t(default: 0 - unlimited)\n");
 	printf("\t[-i shared memory group ID]\n");
+	printf("\t[-Q log I/O errors every N times (default: 1)\n");
 	printf("\t");
 	spdk_log_usage(stdout, "-T");
 	printf("\t[-V enable VMD enumeration]\n");
@ -1990,7 +2014,8 @@ parse_args(int argc, char **argv)
 	long int val;
 	int rc;

-	while ((op = getopt(argc, argv, "a:b:c:e:gi:lo:q:r:k:s:t:w:z:A:C:DGHILM:NO:P:RS:T:U:VZ:")) != -1) {
+	while ((op = getopt(argc, argv,
+			    "a:b:c:e:gi:lo:q:r:k:s:t:w:z:A:C:DGHILM:NO:P:Q:RS:T:U:VZ:")) != -1) {
 		switch (op) {
 		case 'a':
 		case 'A':
@ -2004,6 +2029,7 @@ parse_args(int argc, char **argv)
 		case 's':
 		case 't':
 		case 'M':
+		case 'Q':
 		case 'U':
 			val = spdk_strtol(optarg, 10);
 			if (val < 0) {
@ -2045,6 +2071,9 @@ parse_args(int argc, char **argv)
 				g_rw_percentage = val;
 				g_mix_specified = true;
 				break;
+			case 'Q':
+				g_quiet_count = val;
+				break;
 			case 'U':
 				g_nr_unused_io_queues = val;
 				break;
@ -2187,6 +2216,11 @@ parse_args(int argc, char **argv)
 		usage(argv[0]);
 		return 1;
 	}
+	if (!g_quiet_count) {
+		fprintf(stderr, "-Q value must be greater than 0\n");
+		usage(argv[0]);
+		return 1;
+	}

 	if (strncmp(g_workload_type, "rand", 4) == 0) {
 		g_is_random = 1;
@ -2571,6 +2605,11 @@ int main(int argc, char **argv)
 		goto cleanup;
 	}

+	if (g_num_workers > 1 && g_quiet_count > 1) {
+		fprintf(stderr, "Error message rate-limiting enabled across multiple threads.\n");
+		fprintf(stderr, "Error suppression count may not be exact.\n");
+	}
+
 	rc = pthread_create(&thread_id, NULL, &nvme_poll_ctrlrs, NULL);
 	if (rc != 0) {
 		fprintf(stderr, "Unable to spawn a thread to poll admin queues.\n");