nvme_perf: rate limit I/O error messages

The nvme perf tool can be useful to generate I/O for
stress conditions.  But if we use it for situations
where we expect a high rate of failures, the excessive
spew can significantly clutter the log.

So add a new -Q option (meaning "quiet") which will
rate limit these error messages.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: Ia8ab2e1ea1cfab9f43d87bcabe8f3f7589b77cda
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/6077
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com>
This commit is contained in:
Jim Harris 2021-01-25 16:10:05 +00:00 committed by Tomasz Zawadzki
parent 9765e956b9
commit d1c17064e7

View File

@ -261,6 +261,29 @@ static bool g_mix_specified;
static bool g_exit;
/* Default to 10 seconds for the keep alive value. This value is arbitrary. */
static uint32_t g_keep_alive_timeout_in_ms = 10000;
static uint32_t g_quiet_count = 1;
/* When user specifies -Q, some error messages are rate limited. When rate
* limited, we only print the error message every g_quiet_count times the
* error occurs.
*
* Note: the __count is not thread safe, meaning the rate limiting will not
* be exact when running perf with multiple thread with lots of errors.
* Thread-local __count would mean rate-limiting per thread which doesn't
* seem as useful.
*/
#define RATELIMIT_LOG(...) \
{ \
static uint64_t __count = 0; \
if ((__count % g_quiet_count) == 0) { \
if (__count > 0 && g_quiet_count > 1) { \
fprintf(stderr, "Message suppressed %" PRIu32 " times: ", \
g_quiet_count - 1); \
} \
fprintf(stderr, __VA_ARGS__); \
} \
__count++; \
}
static const char *g_core_mask;
@ -1260,7 +1283,7 @@ submit_single_io(struct perf_task *task)
rc = entry->fn_table->submit_io(task, ns_ctx, entry, offset_in_ios);
if (spdk_unlikely(rc != 0)) {
fprintf(stderr, "starting I/O failed\n");
RATELIMIT_LOG("starting I/O failed\n");
} else {
ns_ctx->current_queue_depth++;
}
@ -1317,11 +1340,11 @@ io_complete(void *ctx, const struct spdk_nvme_cpl *cpl)
if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) {
if (task->is_read) {
fprintf(stderr, "Read completed with error (sct=%d, sc=%d)\n",
cpl->status.sct, cpl->status.sc);
RATELIMIT_LOG("Read completed with error (sct=%d, sc=%d)\n",
cpl->status.sct, cpl->status.sc);
} else {
fprintf(stderr, "Write completed with error (sct=%d, sc=%d)\n",
cpl->status.sct, cpl->status.sc);
RATELIMIT_LOG("Write completed with error (sct=%d, sc=%d)\n",
cpl->status.sct, cpl->status.sc);
}
}
@ -1551,6 +1574,7 @@ static void usage(char *program_name)
printf("\t[-C max completions per poll]\n");
printf("\t\t(default: 0 - unlimited)\n");
printf("\t[-i shared memory group ID]\n");
printf("\t[-Q log I/O errors every N times (default: 1)\n");
printf("\t");
spdk_log_usage(stdout, "-T");
printf("\t[-V enable VMD enumeration]\n");
@ -1990,7 +2014,8 @@ parse_args(int argc, char **argv)
long int val;
int rc;
while ((op = getopt(argc, argv, "a:b:c:e:gi:lo:q:r:k:s:t:w:z:A:C:DGHILM:NO:P:RS:T:U:VZ:")) != -1) {
while ((op = getopt(argc, argv,
"a:b:c:e:gi:lo:q:r:k:s:t:w:z:A:C:DGHILM:NO:P:Q:RS:T:U:VZ:")) != -1) {
switch (op) {
case 'a':
case 'A':
@ -2004,6 +2029,7 @@ parse_args(int argc, char **argv)
case 's':
case 't':
case 'M':
case 'Q':
case 'U':
val = spdk_strtol(optarg, 10);
if (val < 0) {
@ -2045,6 +2071,9 @@ parse_args(int argc, char **argv)
g_rw_percentage = val;
g_mix_specified = true;
break;
case 'Q':
g_quiet_count = val;
break;
case 'U':
g_nr_unused_io_queues = val;
break;
@ -2187,6 +2216,11 @@ parse_args(int argc, char **argv)
usage(argv[0]);
return 1;
}
if (!g_quiet_count) {
fprintf(stderr, "-Q value must be greater than 0\n");
usage(argv[0]);
return 1;
}
if (strncmp(g_workload_type, "rand", 4) == 0) {
g_is_random = 1;
@ -2571,6 +2605,11 @@ int main(int argc, char **argv)
goto cleanup;
}
if (g_num_workers > 1 && g_quiet_count > 1) {
fprintf(stderr, "Error message rate-limiting enabled across multiple threads.\n");
fprintf(stderr, "Error suppression count may not be exact.\n");
}
rc = pthread_create(&thread_id, NULL, &nvme_poll_ctrlrs, NULL);
if (rc != 0) {
fprintf(stderr, "Unable to spawn a thread to poll admin queues.\n");