Per Intel policy, include the file commit date using the git command below. The policy does not apply to non-Intel (C) notices.

    git log --follow -C90% --format=%ad --date default <file> | tail -1

and then pull just the 4-digit year from the result. Intel copyrights were not added to files where Intel either had no contribution or the contribution lacked substance (i.e., license header updates, formatting changes, etc.). The contribution date used "--follow -C95%" to get the most accurate date.

Note that several files in this patch didn't end the license/(c) block with a blank comment line, so one was added, as the vast majority of files do have this last blank line. It is simply there for consistency.

Signed-off-by: paul luse <paul.e.luse@intel.com>
Change-Id: Id5b7ce4f658fe87132f14139ead58d6e285c04d4
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15192
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Community-CI: Mellanox Build Bot
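For illustration, the year extraction can be done in one pipeline. A minimal sketch, assuming the default %ad output format ("Thu Jan 1 00:00:00 2019 -0700"), where the year is the fifth whitespace-separated field:

    git log --follow -C90% --format=%ad --date=default <file> | tail -1 | awk '{print $5}'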

/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2019 Intel Corporation. All rights reserved.
 * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
 *
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/env.h"
#include "spdk/nvme.h"
#include "spdk/queue.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/log.h"
#include "spdk/likely.h"

struct ctrlr_entry {
	struct spdk_nvme_ctrlr *ctrlr;
	struct spdk_nvme_transport_id failover_trid;
	enum spdk_nvme_transport_type trtype;
	TAILQ_ENTRY(ctrlr_entry) link;
	char name[1024];
	int num_resets;
};

struct ns_entry {
	struct spdk_nvme_ctrlr *ctrlr;
	struct spdk_nvme_ns *ns;

	TAILQ_ENTRY(ns_entry) link;
	uint32_t io_size_blocks;
	uint32_t num_io_requests;
	uint64_t size_in_ios;
	uint32_t block_size;
	uint32_t io_flags;
	char name[1024];
};

struct ns_worker_ctx {
	struct ns_entry *entry;
	uint64_t io_completed;
	uint64_t current_queue_depth;
	uint64_t offset_in_ios;
	bool is_draining;

	int num_qpairs;
	struct spdk_nvme_qpair **qpair;
	int last_qpair;

	TAILQ_ENTRY(ns_worker_ctx) link;
};

struct perf_task {
	struct ns_worker_ctx *ns_ctx;
	struct iovec iov;
	bool is_read;
};

struct worker_thread {
	TAILQ_HEAD(, ns_worker_ctx) ns_ctx;
	TAILQ_ENTRY(worker_thread) link;
	unsigned lcore;
};

/* For basic reset handling. */
static int g_max_ctrlr_resets = 15;

static TAILQ_HEAD(, ctrlr_entry) g_controllers = TAILQ_HEAD_INITIALIZER(g_controllers);
static TAILQ_HEAD(, ns_entry) g_namespaces = TAILQ_HEAD_INITIALIZER(g_namespaces);
static int g_num_namespaces = 0;
static TAILQ_HEAD(, worker_thread) g_workers = TAILQ_HEAD_INITIALIZER(g_workers);
static int g_num_workers = 0;

static uint64_t g_tsc_rate;

static uint32_t g_io_align = 0x200;
static uint32_t g_io_size_bytes;
static uint32_t g_max_io_size_blocks;
static int g_rw_percentage;
static int g_is_random;
static int g_queue_depth;
static int g_time_in_sec;
static uint32_t g_max_completions;
static int g_dpdk_mem;
static bool g_warn;
static uint32_t g_keep_alive_timeout_in_ms = 0;
static uint8_t g_transport_retry_count = 4;
static uint8_t g_transport_ack_timeout = 0; /* disabled */
static bool g_dpdk_mem_single_seg = false;

static const char *g_core_mask;

struct trid_entry {
	struct spdk_nvme_transport_id trid;
	struct spdk_nvme_transport_id failover_trid;
	TAILQ_ENTRY(trid_entry) tailq;
};

static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list);

static inline void task_complete(struct perf_task *task);
static void submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth);

static void io_complete(void *ctx, const struct spdk_nvme_cpl *cpl);

static void
nvme_setup_payload(struct perf_task *task)
{
	/* Maximum extended LBA format size among all active namespaces;
	 * it equals g_io_size_bytes for namespaces without metadata.
	 */
	task->iov.iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL);
	task->iov.iov_len = g_io_size_bytes;
	if (task->iov.iov_base == NULL) {
		fprintf(stderr, "task->buf spdk_dma_zmalloc failed\n");
		exit(1);
	}
}
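
/*
 * Submit one read or write at the given namespace offset, selecting the
 * context's qpairs in round-robin order.
 */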
static int
nvme_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
	       struct ns_entry *entry, uint64_t offset_in_ios)
{
	uint64_t lba;
	int qp_num;

	lba = offset_in_ios * entry->io_size_blocks;

	qp_num = ns_ctx->last_qpair;
	ns_ctx->last_qpair++;
	if (ns_ctx->last_qpair == ns_ctx->num_qpairs) {
		ns_ctx->last_qpair = 0;
	}

	if (task->is_read) {
		return spdk_nvme_ns_cmd_read(entry->ns, ns_ctx->qpair[qp_num],
					     task->iov.iov_base, lba,
					     entry->io_size_blocks, io_complete,
					     task, entry->io_flags);
	}

	return spdk_nvme_ns_cmd_write(entry->ns, ns_ctx->qpair[qp_num],
				      task->iov.iov_base, lba,
				      entry->io_size_blocks, io_complete,
				      task, entry->io_flags);
}
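
/*
 * Poll every qpair for completions. A qpair that returns -ENXIO has failed
 * at the transport level; try to reconnect it here, or defer to the
 * controller reset logic when the whole controller has failed.
 */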
static void
nvme_check_io(struct ns_worker_ctx *ns_ctx)
{
	int i, rc;

	for (i = 0; i < ns_ctx->num_qpairs; i++) {
		rc = spdk_nvme_qpair_process_completions(ns_ctx->qpair[i], g_max_completions);
		/* The transport-level qpair has failed; we need to reconnect it. */
		if (spdk_unlikely(rc == -ENXIO)) {
			rc = spdk_nvme_ctrlr_reconnect_io_qpair(ns_ctx->qpair[i]);
			/* Successful reconnect */
			if (rc == 0) {
				continue;
			} else if (rc == -ENXIO) {
				/* The controller itself has failed. Defer to it to restore the qpair. */
				continue;
			} else {
				/*
				 * We were unable to restore the qpair on this attempt. We don't
				 * really know why. For naive handling, just keep trying.
				 * TODO: add a retry limit, and destroy the qpair after x iterations.
				 */
				fprintf(stderr, "qpair failed and we were unable to recover it.\n");
			}
		} else if (spdk_unlikely(rc < 0)) {
			fprintf(stderr, "Received an unknown error processing completions.\n");
			exit(1);
		}
	}
}

/*
 * TODO: If a controller has multiple namespaces, they could all use the same queue.
 * For now, give each namespace/thread combination its own queue.
 */
static int
nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
{
	struct spdk_nvme_io_qpair_opts opts;
	struct ns_entry *entry = ns_ctx->entry;
	int i;

	ns_ctx->num_qpairs = 1;
	ns_ctx->qpair = calloc(ns_ctx->num_qpairs, sizeof(struct spdk_nvme_qpair *));
	if (!ns_ctx->qpair) {
		return -1;
	}

	spdk_nvme_ctrlr_get_default_io_qpair_opts(entry->ctrlr, &opts, sizeof(opts));
	if (opts.io_queue_requests < entry->num_io_requests) {
		opts.io_queue_requests = entry->num_io_requests;
	}

	for (i = 0; i < ns_ctx->num_qpairs; i++) {
		ns_ctx->qpair[i] = spdk_nvme_ctrlr_alloc_io_qpair(entry->ctrlr, &opts,
				   sizeof(opts));
		if (!ns_ctx->qpair[i]) {
			printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n");
			return -1;
		}
	}

	return 0;
}
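
/* Free the I/O qpairs allocated in nvme_init_ns_worker_ctx(). */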
static void
nvme_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
{
	int i;

	for (i = 0; i < ns_ctx->num_qpairs; i++) {
		spdk_nvme_ctrlr_free_io_qpair(ns_ctx->qpair[i]);
	}

	free(ns_ctx->qpair);
}
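
/* Build a human-readable name for a controller from its transport ID. */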
static void
build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr)
{
	const struct spdk_nvme_transport_id *trid;

	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);

	switch (trid->trtype) {
	case SPDK_NVME_TRANSPORT_RDMA:
		snprintf(name, length, "RDMA (addr:%s subnqn:%s)", trid->traddr, trid->subnqn);
		break;
	case SPDK_NVME_TRANSPORT_TCP:
		snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn);
		break;
	case SPDK_NVME_TRANSPORT_VFIOUSER:
		snprintf(name, length, "VFIOUSER (%s)", trid->traddr);
		break;
	case SPDK_NVME_TRANSPORT_CUSTOM:
		snprintf(name, length, "CUSTOM (%s)", trid->traddr);
		break;
	default:
		fprintf(stderr, "Unknown transport type %d\n", trid->trtype);
		break;
	}
}
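
/*
 * Validate that the namespace is active and large enough for the requested
 * I/O size, then add it to the global namespace list.
 */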
static void
register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
{
	struct ns_entry *entry;
	const struct spdk_nvme_ctrlr_data *cdata;
	uint32_t max_xfer_size, entries, sector_size;
	uint64_t ns_size;
	struct spdk_nvme_io_qpair_opts opts;

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);

	if (!spdk_nvme_ns_is_active(ns)) {
		printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n",
		       cdata->mn, cdata->sn,
		       spdk_nvme_ns_get_id(ns));
		g_warn = true;
		return;
	}

	ns_size = spdk_nvme_ns_get_size(ns);
	sector_size = spdk_nvme_ns_get_sector_size(ns);

	if (ns_size < g_io_size_bytes || sector_size > g_io_size_bytes) {
		printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid "
		       "ns size %" PRIu64 " / block size %u for I/O size %u\n",
		       cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns),
		       ns_size, spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes);
		g_warn = true;
		return;
	}

	max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
	/* The NVMe driver may add additional entries based on
	 * stripe size and maximum transfer size; we assume
	 * one more entry is used for the stripe.
	 */
	entries = (g_io_size_bytes - 1) / max_xfer_size + 2;
	if ((g_queue_depth * entries) > opts.io_queue_size) {
		printf("controller IO queue size %u is less than required\n",
		       opts.io_queue_size);
		printf("Consider using a lower queue depth or smaller IO size, because "
		       "IO requests may be queued at the NVMe driver.\n");
		g_warn = true;
	}
	/* For requests that have child requests, the parent request itself
	 * also occupies one entry.
	 */
	entries += 1;

	entry = calloc(1, sizeof(struct ns_entry));
	if (entry == NULL) {
		perror("ns_entry malloc");
		exit(1);
	}

	entry->ctrlr = ctrlr;
	entry->ns = ns;
	entry->num_io_requests = g_queue_depth * entries;

	entry->size_in_ios = ns_size / g_io_size_bytes;
	entry->io_size_blocks = g_io_size_bytes / sector_size;

	entry->block_size = spdk_nvme_ns_get_sector_size(ns);

	if (g_max_io_size_blocks < entry->io_size_blocks) {
		g_max_io_size_blocks = entry->io_size_blocks;
	}

	build_nvme_name(entry->name, sizeof(entry->name), ctrlr);

	g_num_namespaces++;
	TAILQ_INSERT_TAIL(&g_namespaces, entry, link);
}

static void
unregister_namespaces(void)
{
	struct ns_entry *entry, *tmp;

	TAILQ_FOREACH_SAFE(entry, &g_namespaces, link, tmp) {
		TAILQ_REMOVE(&g_namespaces, entry, link);
		free(entry);
	}
}
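
/*
 * Record the controller, set up its failover transport ID, and register all
 * of its active namespaces.
 */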
static void
register_ctrlr(struct spdk_nvme_ctrlr *ctrlr, struct trid_entry *trid_entry)
{
	struct spdk_nvme_ns *ns;
	struct ctrlr_entry *entry = calloc(1, sizeof(struct ctrlr_entry));
	const struct spdk_nvme_transport_id *ctrlr_trid;
	uint32_t nsid;

	if (entry == NULL) {
		perror("ctrlr_entry malloc");
		exit(1);
	}

	ctrlr_trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
	assert(ctrlr_trid != NULL);

	/* Each controller needs a unique failover trid. */
	entry->failover_trid = trid_entry->failover_trid;

	/*
	 * Users are allowed to leave the trid subnqn blank or specify a discovery controller subnqn.
	 * In those cases, the controller subnqn will not equal the trid_entry subnqn and, by association,
	 * the failover_trid subnqn.
	 * When we do failover, we want to reconnect to the same nqn, so explicitly set the failover nqn to
	 * the ctrlr nqn here.
	 */
	snprintf(entry->failover_trid.subnqn, SPDK_NVMF_NQN_MAX_LEN + 1, "%s", ctrlr_trid->subnqn);

	build_nvme_name(entry->name, sizeof(entry->name), ctrlr);

	entry->ctrlr = ctrlr;
	entry->trtype = trid_entry->trid.trtype;
	TAILQ_INSERT_TAIL(&g_controllers, entry, link);

	for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
	     nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) {
		ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
		if (ns == NULL) {
			continue;
		}
		register_ns(ctrlr, ns);
	}
}

static __thread unsigned int seed = 0;
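
/*
 * Pick the next offset (random or sequential), decide between read and
 * write based on g_rw_percentage, and submit the I/O.
 */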
static inline void
submit_single_io(struct perf_task *task)
{
	uint64_t offset_in_ios;
	int rc;
	struct ns_worker_ctx *ns_ctx = task->ns_ctx;
	struct ns_entry *entry = ns_ctx->entry;

	if (g_is_random) {
		offset_in_ios = rand_r(&seed) % entry->size_in_ios;
	} else {
		offset_in_ios = ns_ctx->offset_in_ios++;
		if (ns_ctx->offset_in_ios == entry->size_in_ios) {
			ns_ctx->offset_in_ios = 0;
		}
	}

	if ((g_rw_percentage == 100) ||
	    (g_rw_percentage != 0 && ((rand_r(&seed) % 100) < g_rw_percentage))) {
		task->is_read = true;
	} else {
		task->is_read = false;
	}

	rc = nvme_submit_io(task, ns_ctx, entry, offset_in_ios);

	if (spdk_unlikely(rc != 0)) {
		fprintf(stderr, "starting I/O failed\n");
	} else {
		ns_ctx->current_queue_depth++;
	}
}

static inline void
task_complete(struct perf_task *task)
{
	struct ns_worker_ctx *ns_ctx;

	ns_ctx = task->ns_ctx;
	ns_ctx->current_queue_depth--;
	ns_ctx->io_completed++;

	/*
	 * is_draining indicates when time has expired for the test run
	 * and we are just waiting for the previously submitted I/O
	 * to complete. In this case, do not submit a new I/O to replace
	 * the one just completed.
	 */
	if (spdk_unlikely(ns_ctx->is_draining)) {
		spdk_dma_free(task->iov.iov_base);
		free(task);
	} else {
		submit_single_io(task);
	}
}

static void
io_complete(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	struct perf_task *task = ctx;

	if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) {
		fprintf(stderr, "%s completed with error (sct=%d, sc=%d)\n",
			task->is_read ? "Read" : "Write",
			cpl->status.sct, cpl->status.sc);
	}

	task_complete(task);
}

static void
check_io(struct ns_worker_ctx *ns_ctx)
{
	nvme_check_io(ns_ctx);
}

static struct perf_task *
allocate_task(struct ns_worker_ctx *ns_ctx, int queue_depth)
{
	struct perf_task *task;

	task = calloc(1, sizeof(*task));
	if (task == NULL) {
		fprintf(stderr, "Out of memory allocating tasks\n");
		exit(1);
	}

	nvme_setup_payload(task);

	task->ns_ctx = ns_ctx;

	return task;
}

static void
submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth)
{
	struct perf_task *task;

	while (queue_depth-- > 0) {
		task = allocate_task(ns_ctx, queue_depth);
		submit_single_io(task);
	}
}
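
/*
 * Per-core worker: allocate qpairs for each assigned namespace, submit the
 * initial I/O at full queue depth, poll for completions until the test time
 * expires, then drain outstanding I/O.
 */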
static int
work_fn(void *arg)
{
	uint64_t tsc_end;
	struct worker_thread *worker = (struct worker_thread *)arg;
	struct ns_worker_ctx *ns_ctx = NULL;
	uint32_t unfinished_ns_ctx;

	printf("Starting thread on core %u\n", worker->lcore);

	/* Allocate queue pairs for each namespace. */
	TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
		if (nvme_init_ns_worker_ctx(ns_ctx) != 0) {
			printf("ERROR: init_ns_worker_ctx() failed\n");
			return 1;
		}
	}

	tsc_end = spdk_get_ticks() + g_time_in_sec * g_tsc_rate;

	/* Submit initial I/O for each namespace. */
	TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
		submit_io(ns_ctx, g_queue_depth);
	}

	while (1) {
		/*
		 * Check for completed I/O for each controller. A new
		 * I/O will be submitted in the io_complete callback
		 * to replace each I/O that is completed.
		 */
		TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
			check_io(ns_ctx);
		}

		if (spdk_get_ticks() > tsc_end) {
			break;
		}
	}

	/* Drain the I/O of each ns_ctx in round-robin order for fairness. */
	do {
		unfinished_ns_ctx = 0;
		TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
			/* Entered only on the first pass for each ns_ctx. */
			if (!ns_ctx->is_draining) {
				ns_ctx->is_draining = true;
			}

			if (ns_ctx->current_queue_depth > 0) {
				check_io(ns_ctx);
				if (ns_ctx->current_queue_depth == 0) {
					nvme_cleanup_ns_worker_ctx(ns_ctx);
				} else {
					unfinished_ns_ctx++;
				}
			}
		}
	} while (unfinished_ns_ctx > 0);

	return 0;
}

static void
usage(char *program_name)
{
	printf("%s options", program_name);
	printf("\n");
	printf("\t[-q io depth]\n");
	printf("\t[-o io size in bytes]\n");
	printf("\t[-w io pattern type, must be one of\n");
	printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n");
	printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n");
	printf("\t[-t time in seconds]\n");
	printf("\t[-c core mask for I/O submission/completion.]\n");
	printf("\t\t(default: 1)\n");
	printf("\t[-r Transport ID for NVMeoF]\n");
	printf("\t Format: 'key:value [key:value] ...'\n");
	printf("\t Keys:\n");
	printf("\t  trtype      Transport type (e.g. RDMA)\n");
	printf("\t  adrfam      Address family (e.g. IPv4, IPv6)\n");
	printf("\t  traddr      Transport address (e.g. 192.168.100.8 for RDMA)\n");
	printf("\t  trsvcid     Transport service identifier (e.g. 4420)\n");
	printf("\t  subnqn      Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN);
	printf("\t  alt_traddr  (Optional) Alternative Transport address for failover.\n");
	printf("\t Example: -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n");
	printf("\t[-k keep alive timeout period in milliseconds]\n");
	printf("\t[-s DPDK huge memory size in MB.]\n");
	printf("\t[-m max completions per poll]\n");
	printf("\t\t(default: 0 - unlimited)\n");
	printf("\t[-i shared memory group ID]\n");
	printf("\t[-A transport ACK timeout]\n");
	printf("\t[-R transport retry count]\n");
	printf("\t");
	spdk_log_usage(stdout, "-T");
#ifdef DEBUG
	printf("\t[-G enable debug logging]\n");
#else
	printf("\t[-G enable debug logging (flag disabled, must reconfigure with --enable-debug)]\n");
#endif
}

static void
unregister_trids(void)
{
	struct trid_entry *trid_entry, *tmp;

	TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) {
		TAILQ_REMOVE(&g_trid_list, trid_entry, tailq);
		free(trid_entry);
	}
}
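
/*
 * Parse a transport ID string into a trid_entry, including the optional
 * alt_traddr key, which supplies the failover address.
 */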
static int
add_trid(const char *trid_str)
{
	struct trid_entry *trid_entry;
	struct spdk_nvme_transport_id *trid;
	char *alt_traddr;
	int len;

	trid_entry = calloc(1, sizeof(*trid_entry));
	if (trid_entry == NULL) {
		return -1;
	}

	trid = &trid_entry->trid;
	snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);

	if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) {
		fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str);
		free(trid_entry);
		return 1;
	}

	trid_entry->failover_trid = trid_entry->trid;

	alt_traddr = strcasestr(trid_str, "alt_traddr:");
	if (alt_traddr) {
		alt_traddr += strlen("alt_traddr:");
		len = strcspn(alt_traddr, " \t\n");
		if (len > SPDK_NVMF_TRADDR_MAX_LEN) {
			fprintf(stderr, "The failover traddr %s is too long.\n", alt_traddr);
			free(trid_entry);
			return -1;
		}
		snprintf(trid_entry->failover_trid.traddr, SPDK_NVMF_TRADDR_MAX_LEN + 1, "%s", alt_traddr);
	}

	TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq);
	return 0;
}
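
/*
 * Parse and validate the command-line options, translating the workload
 * type into g_rw_percentage and g_is_random.
 */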
static int
parse_args(int argc, char **argv)
{
	struct trid_entry *trid_entry, *trid_entry_tmp;
	const char *workload_type;
	int op;
	bool mix_specified = false;
	long int val;
	int rc;

	/* Default values */
	g_queue_depth = 0;
	g_io_size_bytes = 0;
	workload_type = NULL;
	g_time_in_sec = 0;
	g_rw_percentage = -1;
	g_core_mask = NULL;
	g_max_completions = 0;

	while ((op = getopt(argc, argv, "c:gm:o:q:r:k:s:t:w:A:GM:R:T:")) != -1) {
		switch (op) {
		case 'm':
		case 'o':
		case 'q':
		case 'k':
		case 's':
		case 't':
		case 'A':
		case 'M':
		case 'R':
			val = spdk_strtol(optarg, 10);
			if (val < 0) {
				fprintf(stderr, "Converting a string to integer failed\n");
				return val;
			}
			switch (op) {
			case 'm':
				g_max_completions = val;
				break;
			case 'o':
				g_io_size_bytes = val;
				break;
			case 'q':
				g_queue_depth = val;
				break;
			case 'k':
				g_keep_alive_timeout_in_ms = val;
				break;
			case 's':
				g_dpdk_mem = val;
				break;
			case 't':
				g_time_in_sec = val;
				break;
			case 'A':
				g_transport_ack_timeout = val;
				break;
			case 'M':
				g_rw_percentage = val;
				mix_specified = true;
				break;
			case 'R':
				g_transport_retry_count = val;
				break;
			}
			break;
		case 'c':
			g_core_mask = optarg;
			break;
		case 'g':
			g_dpdk_mem_single_seg = true;
			break;
		case 'r':
			if (add_trid(optarg)) {
				usage(argv[0]);
				return 1;
			}
			break;
		case 'w':
			workload_type = optarg;
			break;
		case 'G':
#ifndef DEBUG
			fprintf(stderr, "%s must be configured with --enable-debug for -G flag\n",
				argv[0]);
			usage(argv[0]);
			return 1;
#else
			spdk_log_set_flag("nvme");
			spdk_log_set_print_level(SPDK_LOG_DEBUG);
			break;
#endif
		case 'T':
			rc = spdk_log_set_flag(optarg);
			if (rc < 0) {
				fprintf(stderr, "unknown flag\n");
				usage(argv[0]);
				exit(EXIT_FAILURE);
			}
#ifdef DEBUG
			spdk_log_set_print_level(SPDK_LOG_DEBUG);
#endif
			break;
		default:
			usage(argv[0]);
			return 1;
		}
	}

	if (!g_queue_depth) {
		usage(argv[0]);
		return 1;
	}
	if (!g_io_size_bytes) {
		usage(argv[0]);
		return 1;
	}
	if (!workload_type) {
		usage(argv[0]);
		return 1;
	}
	if (!g_time_in_sec) {
		usage(argv[0]);
		return 1;
	}

	if (strcmp(workload_type, "read") &&
	    strcmp(workload_type, "write") &&
	    strcmp(workload_type, "randread") &&
	    strcmp(workload_type, "randwrite") &&
	    strcmp(workload_type, "rw") &&
	    strcmp(workload_type, "randrw")) {
		fprintf(stderr,
			"io pattern type must be one of\n"
			"(read, write, randread, randwrite, rw, randrw)\n");
		return 1;
	}

	if (!strcmp(workload_type, "read") ||
	    !strcmp(workload_type, "randread")) {
		g_rw_percentage = 100;
	}

	if (!strcmp(workload_type, "write") ||
	    !strcmp(workload_type, "randwrite")) {
		g_rw_percentage = 0;
	}

	if (!strcmp(workload_type, "read") ||
	    !strcmp(workload_type, "randread") ||
	    !strcmp(workload_type, "write") ||
	    !strcmp(workload_type, "randwrite")) {
		if (mix_specified) {
			fprintf(stderr, "Ignoring -M option... Please use -M option"
				" only when using rw or randrw.\n");
		}
	}

	if (!strcmp(workload_type, "rw") ||
	    !strcmp(workload_type, "randrw")) {
		if (g_rw_percentage < 0 || g_rw_percentage > 100) {
			fprintf(stderr,
				"-M must be set to a value from 0 to 100 "
				"for rw or randrw.\n");
			return 1;
		}
	}

	if (!strcmp(workload_type, "read") ||
	    !strcmp(workload_type, "write") ||
	    !strcmp(workload_type, "rw")) {
		g_is_random = 0;
	} else {
		g_is_random = 1;
	}

	if (TAILQ_EMPTY(&g_trid_list)) {
		fprintf(stderr, "You must specify at least one fabrics TRID.\n");
		return -1;
	}

	/* Check whether there is a local PCIe trid and fail if so. */
	TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, trid_entry_tmp) {
		if (trid_entry->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
			fprintf(stderr, "This application was not intended to be run on PCIe controllers.\n");
			return 1;
		}
	}

	return 0;
}

static int
register_workers(void)
{
	uint32_t i;
	struct worker_thread *worker;

	SPDK_ENV_FOREACH_CORE(i) {
		worker = calloc(1, sizeof(*worker));
		if (worker == NULL) {
			fprintf(stderr, "Unable to allocate worker\n");
			return -1;
		}

		TAILQ_INIT(&worker->ns_ctx);
		worker->lcore = i;
		TAILQ_INSERT_TAIL(&g_workers, worker, link);
		g_num_workers++;
	}

	return 0;
}

static void
unregister_workers(void)
{
	struct worker_thread *worker, *tmp_worker;
	struct ns_worker_ctx *ns_ctx, *tmp_ns_ctx;

	/* Free namespace context and worker thread */
	TAILQ_FOREACH_SAFE(worker, &g_workers, link, tmp_worker) {
		TAILQ_REMOVE(&g_workers, worker, link);
		TAILQ_FOREACH_SAFE(ns_ctx, &worker->ns_ctx, link, tmp_ns_ctx) {
			TAILQ_REMOVE(&worker->ns_ctx, ns_ctx, link);
			free(ns_ctx);
		}

		free(worker);
	}
}
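
/*
 * Called for each discovered controller before attaching; tune the
 * controller options (queue size, keep alive, transport retry/ACK timeout)
 * and return true to attach.
 */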
static bool
probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
	 struct spdk_nvme_ctrlr_opts *opts)
{
	/* These should have been weeded out earlier. */
	assert(trid->trtype != SPDK_NVME_TRANSPORT_PCIE);

	printf("Attaching to NVMe over Fabrics controller at %s:%s: %s\n",
	       trid->traddr, trid->trsvcid,
	       trid->subnqn);

	/* Set io_queue_size to UINT16_MAX; the NVMe driver
	 * will then reduce this to MQES to maximize
	 * the io_queue_size as much as possible.
	 */
	opts->io_queue_size = UINT16_MAX;

	opts->keep_alive_timeout_ms = spdk_max(opts->keep_alive_timeout_ms,
					       g_keep_alive_timeout_in_ms);

	opts->transport_retry_count = g_transport_retry_count;
	opts->transport_ack_timeout = g_transport_ack_timeout;

	return true;
}

static void
attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
{
	struct trid_entry *trid_entry = cb_ctx;

	printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n",
	       trid->traddr, trid->trsvcid,
	       trid->subnqn);

	register_ctrlr(ctrlr, trid_entry);
}

static int
register_controllers(void)
{
	struct trid_entry *trid_entry;

	printf("Initializing NVMe Controllers\n");

	TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) {
		if (spdk_nvme_probe(&trid_entry->trid, trid_entry, probe_cb, attach_cb, NULL) != 0) {
			fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n",
				trid_entry->trid.traddr);
			return -1;
		}
	}

	return 0;
}

static void
unregister_controllers(void)
{
	struct ctrlr_entry *entry, *tmp;
	struct spdk_nvme_detach_ctx *detach_ctx = NULL;

	TAILQ_FOREACH_SAFE(entry, &g_controllers, link, tmp) {
		TAILQ_REMOVE(&g_controllers, entry, link);
		spdk_nvme_detach_async(entry->ctrlr, &detach_ctx);
		free(entry);
	}

	if (detach_ctx) {
		spdk_nvme_detach_poll(detach_ctx);
	}
}
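
/*
 * Assign namespaces to workers in round-robin order, wrapping around either
 * list as needed so that every namespace and every worker is used.
 */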
static int
associate_workers_with_ns(void)
{
	struct ns_entry *entry = TAILQ_FIRST(&g_namespaces);
	struct worker_thread *worker = TAILQ_FIRST(&g_workers);
	struct ns_worker_ctx *ns_ctx;
	int i, count;

	count = g_num_namespaces > g_num_workers ? g_num_namespaces : g_num_workers;

	for (i = 0; i < count; i++) {
		if (entry == NULL) {
			break;
		}

		ns_ctx = calloc(1, sizeof(struct ns_worker_ctx));
		if (!ns_ctx) {
			return -1;
		}

		printf("Associating %s with lcore %d\n", entry->name, worker->lcore);
		ns_ctx->entry = entry;

		TAILQ_INSERT_TAIL(&worker->ns_ctx, ns_ctx, link);

		worker = TAILQ_NEXT(worker, link);
		if (worker == NULL) {
			worker = TAILQ_FIRST(&g_workers);
		}

		entry = TAILQ_NEXT(entry, link);
		if (entry == NULL) {
			entry = TAILQ_FIRST(&g_namespaces);
		}
	}

	return 0;
}
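
/*
 * Background thread that polls the admin queue of every controller. On
 * -ENXIO the controller has failed at the transport level: switch to the
 * failover trid (if it differs from the current trid) and reset the
 * controller, giving up after g_max_ctrlr_resets failed attempts.
 */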
static void *
nvme_poll_ctrlrs(void *arg)
{
	struct ctrlr_entry *entry;
	const struct spdk_nvme_transport_id *old_trid;
	int oldstate;
	int rc;

	spdk_unaffinitize_thread();

	while (true) {
		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate);

		TAILQ_FOREACH(entry, &g_controllers, link) {
			rc = spdk_nvme_ctrlr_process_admin_completions(entry->ctrlr);
			/* This controller has encountered a failure at the transport level. Reset it. */
			if (rc == -ENXIO) {
				if (entry->num_resets == 0) {
					old_trid = spdk_nvme_ctrlr_get_transport_id(entry->ctrlr);
					fprintf(stderr, "A controller has encountered a failure and is being reset.\n");
					if (spdk_nvme_transport_id_compare(old_trid, &entry->failover_trid)) {
						fprintf(stderr, "Resorting to new failover address %s\n", entry->failover_trid.traddr);
						spdk_nvme_ctrlr_fail(entry->ctrlr);
						rc = spdk_nvme_ctrlr_set_trid(entry->ctrlr, &entry->failover_trid);
						if (rc != 0) {
							fprintf(stderr, "Unable to fail over to backup trid.\n");
						}
					}
				}

				rc = spdk_nvme_ctrlr_reset(entry->ctrlr);
				if (rc != 0) {
					entry->num_resets++;
					fprintf(stderr, "Unable to reset the controller.\n");

					if (entry->num_resets > g_max_ctrlr_resets) {
						fprintf(stderr, "Controller cannot be recovered. Exiting.\n");
						exit(1);
					}
				} else {
					fprintf(stderr, "Controller properly reset.\n");
				}
			}
		}

		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate);

		/* This is a pthread cancellation point and cannot be removed. */
		sleep(1);
	}

	return NULL;
}

int
main(int argc, char **argv)
{
	int rc;
	struct worker_thread *worker, *main_worker;
	unsigned main_core;
	struct spdk_env_opts opts;
	pthread_t thread_id = 0;

	rc = parse_args(argc, argv);
	if (rc != 0) {
		return rc;
	}

	spdk_env_opts_init(&opts);
	opts.name = "reconnect";
	if (g_core_mask) {
		opts.core_mask = g_core_mask;
	}

	if (g_dpdk_mem) {
		opts.mem_size = g_dpdk_mem;
	}
	opts.hugepage_single_segments = g_dpdk_mem_single_seg;
	if (spdk_env_init(&opts) < 0) {
		fprintf(stderr, "Unable to initialize SPDK env\n");
		unregister_trids();
		return 1;
	}

	g_tsc_rate = spdk_get_ticks_hz();

	if (register_workers() != 0) {
		rc = 1;
		goto cleanup;
	}

	if (register_controllers() != 0) {
		rc = 1;
		goto cleanup;
	}

	if (g_warn) {
		printf("WARNING: Some requested NVMe devices were skipped\n");
	}

	if (g_num_namespaces == 0) {
		fprintf(stderr, "No valid NVMe controllers found\n");
		rc = 1;
		goto cleanup;
	}

	rc = pthread_create(&thread_id, NULL, &nvme_poll_ctrlrs, NULL);
	if (rc != 0) {
		fprintf(stderr, "Unable to spawn a thread to poll admin queues.\n");
		goto cleanup;
	}

	if (associate_workers_with_ns() != 0) {
		rc = 1;
		goto cleanup;
	}

	printf("Initialization complete. Launching workers.\n");

	/* Launch all of the secondary workers */
	main_core = spdk_env_get_current_core();
	main_worker = NULL;
	TAILQ_FOREACH(worker, &g_workers, link) {
		if (worker->lcore != main_core) {
			spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker);
		} else {
			assert(main_worker == NULL);
			main_worker = worker;
		}
	}

	assert(main_worker != NULL);
	rc = work_fn(main_worker);

	spdk_env_thread_wait_all();

cleanup:
	if (thread_id && pthread_cancel(thread_id) == 0) {
		pthread_join(thread_id, NULL);
	}
	unregister_trids();
	unregister_namespaces();
	unregister_controllers();
	unregister_workers();

	spdk_env_fini();

	if (rc != 0) {
		fprintf(stderr, "%s: errors occurred\n", argv[0]);
		/*
		 * Return a generic error to the caller. This allows us to
		 * distinguish between a failure in the script and something
		 * like a segfault or an invalid access which causes the program
		 * to crash.
		 */
		rc = 1;
	}

	return rc;
}