nvme_perf: rate limit I/O error messages
The nvme perf tool can be useful to generate I/O for stress conditions. But if we use it for situations where we expect a high rate of failures, the excessive spew can significantly clutter the log. So add a new -Q option (meaning "quiet") which will rate limit these error messages. Signed-off-by: Jim Harris <james.r.harris@intel.com> Change-Id: Ia8ab2e1ea1cfab9f43d87bcabe8f3f7589b77cda Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/6077 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com> Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com>
This commit is contained in:
parent
9765e956b9
commit
d1c17064e7
@ -261,6 +261,29 @@ static bool g_mix_specified;
|
||||
static bool g_exit;
|
||||
/* Default to 10 seconds for the keep alive value. This value is arbitrary. */
|
||||
static uint32_t g_keep_alive_timeout_in_ms = 10000;
|
||||
static uint32_t g_quiet_count = 1;
|
||||
|
||||
/* When user specifies -Q, some error messages are rate limited. When rate
|
||||
* limited, we only print the error message every g_quiet_count times the
|
||||
* error occurs.
|
||||
*
|
||||
* Note: the __count is not thread safe, meaning the rate limiting will not
|
||||
* be exact when running perf with multiple thread with lots of errors.
|
||||
* Thread-local __count would mean rate-limiting per thread which doesn't
|
||||
* seem as useful.
|
||||
*/
|
||||
#define RATELIMIT_LOG(...) \
|
||||
{ \
|
||||
static uint64_t __count = 0; \
|
||||
if ((__count % g_quiet_count) == 0) { \
|
||||
if (__count > 0 && g_quiet_count > 1) { \
|
||||
fprintf(stderr, "Message suppressed %" PRIu32 " times: ", \
|
||||
g_quiet_count - 1); \
|
||||
} \
|
||||
fprintf(stderr, __VA_ARGS__); \
|
||||
} \
|
||||
__count++; \
|
||||
}
|
||||
|
||||
static const char *g_core_mask;
|
||||
|
||||
@ -1260,7 +1283,7 @@ submit_single_io(struct perf_task *task)
|
||||
rc = entry->fn_table->submit_io(task, ns_ctx, entry, offset_in_ios);
|
||||
|
||||
if (spdk_unlikely(rc != 0)) {
|
||||
fprintf(stderr, "starting I/O failed\n");
|
||||
RATELIMIT_LOG("starting I/O failed\n");
|
||||
} else {
|
||||
ns_ctx->current_queue_depth++;
|
||||
}
|
||||
@ -1317,11 +1340,11 @@ io_complete(void *ctx, const struct spdk_nvme_cpl *cpl)
|
||||
|
||||
if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) {
|
||||
if (task->is_read) {
|
||||
fprintf(stderr, "Read completed with error (sct=%d, sc=%d)\n",
|
||||
cpl->status.sct, cpl->status.sc);
|
||||
RATELIMIT_LOG("Read completed with error (sct=%d, sc=%d)\n",
|
||||
cpl->status.sct, cpl->status.sc);
|
||||
} else {
|
||||
fprintf(stderr, "Write completed with error (sct=%d, sc=%d)\n",
|
||||
cpl->status.sct, cpl->status.sc);
|
||||
RATELIMIT_LOG("Write completed with error (sct=%d, sc=%d)\n",
|
||||
cpl->status.sct, cpl->status.sc);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1551,6 +1574,7 @@ static void usage(char *program_name)
|
||||
printf("\t[-C max completions per poll]\n");
|
||||
printf("\t\t(default: 0 - unlimited)\n");
|
||||
printf("\t[-i shared memory group ID]\n");
|
||||
printf("\t[-Q log I/O errors every N times (default: 1)\n");
|
||||
printf("\t");
|
||||
spdk_log_usage(stdout, "-T");
|
||||
printf("\t[-V enable VMD enumeration]\n");
|
||||
@ -1990,7 +2014,8 @@ parse_args(int argc, char **argv)
|
||||
long int val;
|
||||
int rc;
|
||||
|
||||
while ((op = getopt(argc, argv, "a:b:c:e:gi:lo:q:r:k:s:t:w:z:A:C:DGHILM:NO:P:RS:T:U:VZ:")) != -1) {
|
||||
while ((op = getopt(argc, argv,
|
||||
"a:b:c:e:gi:lo:q:r:k:s:t:w:z:A:C:DGHILM:NO:P:Q:RS:T:U:VZ:")) != -1) {
|
||||
switch (op) {
|
||||
case 'a':
|
||||
case 'A':
|
||||
@ -2004,6 +2029,7 @@ parse_args(int argc, char **argv)
|
||||
case 's':
|
||||
case 't':
|
||||
case 'M':
|
||||
case 'Q':
|
||||
case 'U':
|
||||
val = spdk_strtol(optarg, 10);
|
||||
if (val < 0) {
|
||||
@ -2045,6 +2071,9 @@ parse_args(int argc, char **argv)
|
||||
g_rw_percentage = val;
|
||||
g_mix_specified = true;
|
||||
break;
|
||||
case 'Q':
|
||||
g_quiet_count = val;
|
||||
break;
|
||||
case 'U':
|
||||
g_nr_unused_io_queues = val;
|
||||
break;
|
||||
@ -2187,6 +2216,11 @@ parse_args(int argc, char **argv)
|
||||
usage(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
if (!g_quiet_count) {
|
||||
fprintf(stderr, "-Q value must be greater than 0\n");
|
||||
usage(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (strncmp(g_workload_type, "rand", 4) == 0) {
|
||||
g_is_random = 1;
|
||||
@ -2571,6 +2605,11 @@ int main(int argc, char **argv)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (g_num_workers > 1 && g_quiet_count > 1) {
|
||||
fprintf(stderr, "Error message rate-limiting enabled across multiple threads.\n");
|
||||
fprintf(stderr, "Error suppression count may not be exact.\n");
|
||||
}
|
||||
|
||||
rc = pthread_create(&thread_id, NULL, &nvme_poll_ctrlrs, NULL);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "Unable to spawn a thread to poll admin queues.\n");
|
||||
|
Loading…
Reference in New Issue
Block a user