diff --git a/lib/nvme/nvme_rdma.c b/lib/nvme/nvme_rdma.c
index de1b37156..6c6f840af 100644
--- a/lib/nvme/nvme_rdma.c
+++ b/lib/nvme/nvme_rdma.c
@@ -63,9 +63,12 @@
 #define NVME_RDMA_DEFAULT_TX_SGE		2
 #define NVME_RDMA_DEFAULT_RX_SGE		1
 
-
 /* Max number of NVMe-oF SGL descriptors supported by the host */
 #define NVME_RDMA_MAX_SGL_DESCRIPTORS	16
+
+/* number of STAILQ entries for holding pending RDMA CM events. */
+#define NVME_RDMA_NUM_CM_EVENTS		256
+
 struct spdk_nvmf_cmd {
 	struct spdk_nvme_cmd cmd;
 	struct spdk_nvme_sgl_descriptor sgl[NVME_RDMA_MAX_SGL_DESCRIPTORS];
@@ -81,6 +84,12 @@ struct spdk_nvme_rdma_mr_map {
 	LIST_ENTRY(spdk_nvme_rdma_mr_map)	link;
 };
 
+/* STAILQ wrapper for cm events. */
+struct nvme_rdma_cm_event_entry {
+	struct rdma_cm_event			*evt;
+	STAILQ_ENTRY(nvme_rdma_cm_event_entry)	link;
+};
+
 /* NVMe RDMA transport extensions for spdk_nvme_ctrlr */
 struct nvme_rdma_ctrlr {
 	struct spdk_nvme_ctrlr			ctrlr;
@@ -88,6 +97,14 @@ struct nvme_rdma_ctrlr {
 	struct ibv_pd				*pd;
 
 	uint16_t				max_sge;
+
+	struct rdma_event_channel		*cm_channel;
+
+	STAILQ_HEAD(, nvme_rdma_cm_event_entry)	pending_cm_events;
+
+	STAILQ_HEAD(, nvme_rdma_cm_event_entry)	free_cm_events;
+
+	struct nvme_rdma_cm_event_entry		*cm_events;
 };
 
 /* NVMe RDMA qpair extensions for spdk_nvme_qpair */
@@ -130,9 +147,7 @@ struct nvme_rdma_qpair {
 	TAILQ_HEAD(, spdk_nvme_rdma_req)	outstanding_reqs;
 
 	/* Placed at the end of the struct since it is not used frequently */
-	struct rdma_event_channel		*cm_channel;
-
-	struct rdma_cm_event			*evt;
+	struct rdma_cm_event			*evt;
 };
 
 struct spdk_nvme_rdma_req {
@@ -282,39 +297,92 @@ nvme_rdma_qpair_process_cm_event(struct nvme_rdma_qpair *rqpair)
 	return rc;
 }
 
+/*
+ * This function must be called under the nvme controller's lock
+ * because it touches global controller variables. The lock is taken
+ * by the generic transport code before invoking a few of the functions
+ * in this file: nvme_rdma_ctrlr_connect_qpair, nvme_rdma_ctrlr_delete_io_qpair,
+ * and conditionally nvme_rdma_qpair_process_completions when it is calling
+ * completions on the admin qpair. When adding a new call to this function, please
+ * verify that it is in a situation where it falls under the lock.
+ */
+static int
+nvme_rdma_poll_events(struct nvme_rdma_ctrlr *rctrlr)
+{
+	struct nvme_rdma_cm_event_entry	*entry, *tmp;
+	struct nvme_rdma_qpair		*event_qpair;
+	struct rdma_cm_event		*event;
+	struct rdma_event_channel	*channel = rctrlr->cm_channel;
+
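+	/*
+	 * First pass: redeliver events that were parked on a previous poll
+	 * because their target qpair still held an unprocessed event. Any
+	 * entry that is handed off goes back on the free list.
+	 */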
+	STAILQ_FOREACH_SAFE(entry, &rctrlr->pending_cm_events, link, tmp) {
+		event_qpair = nvme_rdma_qpair(entry->evt->id->context);
+		if (event_qpair->evt == NULL) {
+			event_qpair->evt = entry->evt;
+			STAILQ_REMOVE(&rctrlr->pending_cm_events, entry, nvme_rdma_cm_event_entry, link);
+			STAILQ_INSERT_HEAD(&rctrlr->free_cm_events, entry, link);
+		}
+	}
+
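+	/*
+	 * Second pass: drain the non-blocking event channel. An event is
+	 * handed straight to its qpair if that qpair is idle; otherwise it
+	 * is parked on the pending list, bounded by the preallocated pool
+	 * of entries.
+	 */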
+	while (rdma_get_cm_event(channel, &event) == 0) {
+		event_qpair = nvme_rdma_qpair(event->id->context);
+		if (event_qpair->evt == NULL) {
+			event_qpair->evt = event;
+		} else {
+			assert(rctrlr == nvme_rdma_ctrlr(event_qpair->qpair.ctrlr));
+			entry = STAILQ_FIRST(&rctrlr->free_cm_events);
+			if (entry == NULL) {
+				rdma_ack_cm_event(event);
+				return -ENOMEM;
+			}
+			STAILQ_REMOVE(&rctrlr->free_cm_events, entry, nvme_rdma_cm_event_entry, link);
+			entry->evt = event;
+			STAILQ_INSERT_TAIL(&rctrlr->pending_cm_events, entry, link);
+		}
+	}
+
+	if (errno == EAGAIN || errno == EWOULDBLOCK) {
+		return 0;
+	} else {
+		return errno;
+	}
+}
+
 static int
 nvme_rdma_process_event(struct nvme_rdma_qpair *rqpair,
 			struct rdma_event_channel *channel,
 			enum rdma_cm_event_type evt)
 {
-	struct rdma_cm_event	*event;
-	int			rc = 0;
+	struct nvme_rdma_ctrlr	*rctrlr;
+	int	rc = 0, rc2;
 
 	if (rqpair->evt != NULL) {
-		return -EINVAL;
-	}
-
-	while (rdma_get_cm_event(channel, &event) != 0) {
-		if (errno == EAGAIN || errno == EWOULDBLOCK) {
-			continue;
+		rc = nvme_rdma_qpair_process_cm_event(rqpair);
+		if (rc) {
+			return rc;
 		}
-
-		SPDK_ERRLOG("Failed to get event from CM event channel. Error %d (%s)\n",
-			    errno, spdk_strerror(errno));
-		return -errno;
 	}
 
-	if (event->event != evt) {
+	rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr);
+	assert(rctrlr != NULL);
+	while (!rqpair->evt && !rc) {
+		rc = nvme_rdma_poll_events(rctrlr);
+	}
+
+	if (rc) {
+		return rc;
+	}
+
+	if (rqpair->evt->event != evt) {
 		SPDK_ERRLOG("Expected %s but received %s (%d) from CM event channel (status = %d)\n",
 			    nvme_rdma_cm_event_str_get(evt),
-			    nvme_rdma_cm_event_str_get(event->event), event->event,
-			    event->status);
+			    nvme_rdma_cm_event_str_get(rqpair->evt->event), rqpair->evt->event,
+			    rqpair->evt->status);
 		rc = -EBADMSG;
 	}
 
-	rqpair->evt = event;
-	rc |= nvme_rdma_qpair_process_cm_event(rqpair);
-	return rc;
+	rc2 = nvme_rdma_qpair_process_cm_event(rqpair);
+	/* bad message takes precedence over the other error codes from processing the event. */
+	return rc == 0 ? rc2 : rc;
 }
 
 static int
@@ -652,6 +720,7 @@ nvme_rdma_connect(struct nvme_rdma_qpair *rqpair)
 	struct ibv_device_attr attr;
 	int ret;
 	struct spdk_nvme_ctrlr *ctrlr;
+	struct nvme_rdma_ctrlr *rctrlr;
 
 	ret = ibv_query_device(rqpair->cm_id->verbs, &attr);
 	if (ret != 0) {
@@ -665,6 +734,8 @@ nvme_rdma_connect(struct nvme_rdma_qpair *rqpair)
 	if (!ctrlr) {
 		return -1;
 	}
+	rctrlr = nvme_rdma_ctrlr(ctrlr);
+	assert(rctrlr != NULL);
 
 	request_data.qid = rqpair->qpair.id;
 	request_data.hrqsize = rqpair->num_entries;
@@ -682,7 +753,7 @@ nvme_rdma_connect(struct nvme_rdma_qpair *rqpair)
 		return ret;
 	}
 
-	ret = nvme_rdma_process_event(rqpair, rqpair->cm_channel, RDMA_CM_EVENT_ESTABLISHED);
+	ret = nvme_rdma_process_event(rqpair, rctrlr->cm_channel, RDMA_CM_EVENT_ESTABLISHED);
 	if (ret) {
 		SPDK_ERRLOG("RDMA connect error\n");
 		return -1;
 	}
@@ -850,22 +921,12 @@ nvme_rdma_qpair_connect(struct nvme_rdma_qpair *rqpair)
 	bool src_addr_specified;
 	int rc;
 	struct spdk_nvme_ctrlr *ctrlr;
+	struct nvme_rdma_ctrlr *rctrlr;
 	int family;
-	int flag;
-
-	rqpair->cm_channel = rdma_create_event_channel();
-	if (rqpair->cm_channel == NULL) {
-		SPDK_ERRLOG("rdma_create_event_channel() failed\n");
-		return -1;
-	}
-
-	flag = fcntl(rqpair->cm_channel->fd, F_GETFL);
-	if (fcntl(rqpair->cm_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) {
-		SPDK_ERRLOG("Cannot set event channel to non blocking\n");
-		return -1;
-	}
 
 	ctrlr = rqpair->qpair.ctrlr;
+	rctrlr = nvme_rdma_ctrlr(ctrlr);
+	assert(rctrlr != NULL);
 
 	switch (ctrlr->trid.adrfam) {
 	case SPDK_NVMF_ADRFAM_IPV4:
@@ -902,7 +963,7 @@ nvme_rdma_qpair_connect(struct nvme_rdma_qpair *rqpair)
 		src_addr_specified = false;
 	}
 
-	rc = rdma_create_id(rqpair->cm_channel, &rqpair->cm_id, rqpair, RDMA_PS_TCP);
+	rc = rdma_create_id(rctrlr->cm_channel, &rqpair->cm_id, rqpair, RDMA_PS_TCP);
 	if (rc < 0) {
 		SPDK_ERRLOG("rdma_create_id() failed\n");
 		return -1;
 	}
@@ -910,7 +971,7 @@ nvme_rdma_qpair_connect(struct nvme_rdma_qpair *rqpair)
 	rc = nvme_rdma_resolve_addr(rqpair, src_addr_specified ?
 				    (struct sockaddr *)&src_addr : NULL,
-				    (struct sockaddr *)&dst_addr, rqpair->cm_channel);
+				    (struct sockaddr *)&dst_addr, rctrlr->cm_channel);
 	if (rc < 0) {
 		SPDK_ERRLOG("nvme_rdma_resolve_addr() failed\n");
 		return -1;
 	}
@@ -1409,11 +1470,6 @@ nvme_rdma_qpair_disconnect(struct spdk_nvme_qpair *qpair)
 		ibv_destroy_cq(rqpair->cq);
 		rqpair->cq = NULL;
 	}
-
-	if (rqpair->cm_channel) {
-		rdma_destroy_event_channel(rqpair->cm_channel);
-		rqpair->cm_channel = NULL;
-	}
 }
 
 static int
@@ -1528,7 +1584,7 @@ struct spdk_nvme_ctrlr *nvme_rdma_ctrlr_construct(const struct spdk_nvme_transpo
 	union spdk_nvme_vs_register vs;
 	struct ibv_context **contexts;
 	struct ibv_device_attr dev_attr;
-	int i, rc;
+	int i, flag, rc;
 
 	rctrlr = calloc(1, sizeof(struct nvme_rdma_ctrlr));
 	if (rctrlr == NULL) {
@@ -1570,6 +1626,33 @@ struct spdk_nvme_ctrlr *nvme_rdma_ctrlr_construct(const struct spdk_nvme_transpo
 		return NULL;
 	}
 
+	STAILQ_INIT(&rctrlr->pending_cm_events);
+	STAILQ_INIT(&rctrlr->free_cm_events);
+	rctrlr->cm_events = calloc(NVME_RDMA_NUM_CM_EVENTS, sizeof(*rctrlr->cm_events));
+	if (rctrlr->cm_events == NULL) {
+		SPDK_ERRLOG("unable to allocate buffers to hold CM events.\n");
+		nvme_rdma_ctrlr_destruct(&rctrlr->ctrlr);
+		return NULL;
+	}
+
+	for (i = 0; i < NVME_RDMA_NUM_CM_EVENTS; i++) {
+		STAILQ_INSERT_TAIL(&rctrlr->free_cm_events, &rctrlr->cm_events[i], link);
+	}
+
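+	/*
+	 * All qpairs of this controller now share one CM event channel in
+	 * place of the old per-qpair channels. It is made non-blocking just
+	 * below so that nvme_rdma_poll_events() can drain it without ever
+	 * stalling the caller.
+	 */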
+	rctrlr->cm_channel = rdma_create_event_channel();
+	if (rctrlr->cm_channel == NULL) {
+		SPDK_ERRLOG("rdma_create_event_channel() failed\n");
+		nvme_rdma_ctrlr_destruct(&rctrlr->ctrlr);
+		return NULL;
+	}
+
+	flag = fcntl(rctrlr->cm_channel->fd, F_GETFL);
+	if (fcntl(rctrlr->cm_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) {
+		SPDK_ERRLOG("Cannot set event channel to non blocking\n");
+		nvme_rdma_ctrlr_destruct(&rctrlr->ctrlr);
+		return NULL;
+	}
+
 	rctrlr->ctrlr.adminq = nvme_rdma_ctrlr_create_qpair(&rctrlr->ctrlr, 0,
 			       SPDK_NVMF_MIN_ADMIN_QUEUE_ENTRIES, 0, SPDK_NVMF_MIN_ADMIN_QUEUE_ENTRIES);
 	if (!rctrlr->ctrlr.adminq) {
@@ -1606,11 +1689,25 @@ int
 nvme_rdma_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
 {
 	struct nvme_rdma_ctrlr *rctrlr = nvme_rdma_ctrlr(ctrlr);
+	struct nvme_rdma_cm_event_entry *entry;
 
 	if (ctrlr->adminq) {
 		nvme_rdma_qpair_destroy(ctrlr->adminq);
 	}
 
+	STAILQ_FOREACH(entry, &rctrlr->pending_cm_events, link) {
+		rdma_ack_cm_event(entry->evt);
+	}
+
+	STAILQ_INIT(&rctrlr->free_cm_events);
+	STAILQ_INIT(&rctrlr->pending_cm_events);
+	free(rctrlr->cm_events);
+
+	if (rctrlr->cm_channel) {
+		rdma_destroy_event_channel(rctrlr->cm_channel);
+		rctrlr->cm_channel = NULL;
+	}
+
 	nvme_ctrlr_destruct_finish(ctrlr);
 
 	free(rctrlr);
@@ -1780,6 +1877,7 @@ nvme_rdma_qpair_process_completions(struct spdk_nvme_qpair *qpair,
 	uint32_t reaped;
 	struct ibv_cq *cq;
 	struct spdk_nvme_rdma_req *rdma_req;
+	struct nvme_rdma_ctrlr *rctrlr;
 
 	if (max_completions == 0) {
 		max_completions = rqpair->num_entries;
@@ -1787,6 +1885,12 @@ nvme_rdma_qpair_process_completions(struct spdk_nvme_qpair *qpair,
 		max_completions = spdk_min(max_completions, rqpair->num_entries);
 	}
 
+	if (nvme_qpair_is_admin_queue(&rqpair->qpair)) {
+		rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr);
+		nvme_rdma_poll_events(rctrlr);
+	}
+	nvme_rdma_qpair_process_cm_event(rqpair);
+
 	cq = rqpair->cq;
 
 	reaped = 0;
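
The following is a minimal standalone sketch (not part of the patch above) of the free-list/pending-list pattern that nvme_rdma_poll_events() introduces. The names toy_entry, park_event, and take_event are invented for this illustration; the real code parks struct rdma_cm_event pointers and dispatches each one to the qpair recorded in event->id->context. Preallocating the entry pool keeps the poll path allocation-free: when the pool runs dry, the patch acks and drops the new event with -ENOMEM rather than blocking.

#include <stdio.h>
#include <sys/queue.h>

#define NUM_ENTRIES 4

struct toy_entry {
	int evt;				/* stands in for a struct rdma_cm_event pointer */
	STAILQ_ENTRY(toy_entry) link;
};

static STAILQ_HEAD(, toy_entry) free_list = STAILQ_HEAD_INITIALIZER(free_list);
static STAILQ_HEAD(, toy_entry) pending_list = STAILQ_HEAD_INITIALIZER(pending_list);
static struct toy_entry pool[NUM_ENTRIES];

/* Park one event; fails when the fixed pool is exhausted, mirroring
 * the -ENOMEM path in nvme_rdma_poll_events(). */
static int
park_event(int evt)
{
	struct toy_entry *entry = STAILQ_FIRST(&free_list);

	if (entry == NULL) {
		return -1;
	}
	STAILQ_REMOVE(&free_list, entry, toy_entry, link);
	entry->evt = evt;
	STAILQ_INSERT_TAIL(&pending_list, entry, link);
	return 0;
}

/* Hand back the oldest parked event and recycle its entry. */
static int
take_event(int *evt)
{
	struct toy_entry *entry = STAILQ_FIRST(&pending_list);

	if (entry == NULL) {
		return -1;
	}
	STAILQ_REMOVE(&pending_list, entry, toy_entry, link);
	*evt = entry->evt;
	STAILQ_INSERT_HEAD(&free_list, entry, link);
	return 0;
}

int
main(void)
{
	int i, evt;

	/* Seed the free list from the preallocated pool, as the patch
	 * does in nvme_rdma_ctrlr_construct(). */
	for (i = 0; i < NUM_ENTRIES; i++) {
		STAILQ_INSERT_TAIL(&free_list, &pool[i], link);
	}

	for (i = 0; i < NUM_ENTRIES + 1; i++) {
		if (park_event(i) != 0) {
			printf("pool exhausted at event %d\n", i);
		}
	}
	while (take_event(&evt) == 0) {
		printf("dispatched event %d\n", evt);
	}
	return 0;
}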