diff --git a/include/spdk/nvme.h b/include/spdk/nvme.h
index 99faf785b..a7433b1ca 100644
--- a/include/spdk/nvme.h
+++ b/include/spdk/nvme.h
@@ -235,6 +235,21 @@ bool spdk_nvme_ctrlr_is_discovery(struct spdk_nvme_ctrlr *ctrlr);
 void spdk_nvme_ctrlr_get_default_ctrlr_opts(struct spdk_nvme_ctrlr_opts *opts,
 		size_t opts_size);
 
+/**
+ * Reason for qpair disconnect at the transport layer.
+ *
+ * NONE implies that the qpair is still connected while UNKNOWN means that the
+ * qpair is disconnected, but the cause was not apparent.
+ */
+enum spdk_nvme_qp_failure_reason {
+	SPDK_NVME_QPAIR_FAILURE_NONE = 0,
+	SPDK_NVME_QPAIR_FAILURE_LOCAL,
+	SPDK_NVME_QPAIR_FAILURE_REMOTE,
+	SPDK_NVME_QPAIR_FAILURE_UNKNOWN,
+};
+
+typedef enum spdk_nvme_qp_failure_reason spdk_nvme_qp_failure_reason;
+
 /**
  * NVMe library transports
  *
@@ -1106,6 +1121,16 @@ struct spdk_nvme_qpair *spdk_nvme_ctrlr_alloc_io_qpair(struct spdk_nvme_ctrlr *c
  */
 int spdk_nvme_ctrlr_reconnect_io_qpair(struct spdk_nvme_qpair *qpair);
 
+/**
+ * Returns the reason the admin qpair for a given controller is disconnected.
+ *
+ * \param ctrlr The controller to check.
+ *
+ * \return a valid spdk_nvme_qp_failure_reason.
+ */
+spdk_nvme_qp_failure_reason spdk_nvme_ctrlr_get_admin_qp_failure_reason(
+		struct spdk_nvme_ctrlr *ctrlr);
+
 /**
  * Free an I/O queue pair that was allocated by spdk_nvme_ctrlr_alloc_io_qpair().
  *
@@ -1252,6 +1277,15 @@ int spdk_nvme_ctrlr_cmd_io_raw_with_md(struct spdk_nvme_ctrlr *ctrlr,
 int32_t spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair,
 		uint32_t max_completions);
 
+/**
+ * Returns the reason the qpair is disconnected.
+ *
+ * \param qpair The qpair to check.
+ *
+ * \return a valid spdk_nvme_qp_failure_reason.
+ */
+spdk_nvme_qp_failure_reason spdk_nvme_qpair_get_failure_reason(struct spdk_nvme_qpair *qpair);
+
 /**
  * Send the given admin command to the NVMe controller.
  *
diff --git a/lib/nvme/nvme_ctrlr.c b/lib/nvme/nvme_ctrlr.c
index 4cac0e0dc..69285938a 100644
--- a/lib/nvme/nvme_ctrlr.c
+++ b/lib/nvme/nvme_ctrlr.c
@@ -422,6 +422,7 @@ spdk_nvme_ctrlr_reconnect_io_qpair(struct spdk_nvme_qpair *qpair)
 		rc = -EAGAIN;
 		goto out;
 	}
+	qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_NONE;
 	nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED);
 
 out:
@@ -429,6 +430,12 @@ out:
 	return rc;
 }
 
+spdk_nvme_qp_failure_reason
+spdk_nvme_ctrlr_get_admin_qp_failure_reason(struct spdk_nvme_ctrlr *ctrlr)
+{
+	return ctrlr->adminq->transport_failure_reason;
+}
+
 /*
  * This internal function will attempt to take the controller
  * lock before calling disconnect on a controller qpair.
@@ -1076,11 +1083,13 @@ spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr)
 
 	/* Disable all queues before disabling the controller hardware. */
 	TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) {
+		qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL;
 		nvme_qpair_set_state(qpair, NVME_QPAIR_DISABLED);
 	}
 	nvme_qpair_set_state(ctrlr->adminq, NVME_QPAIR_DISABLED);
 	nvme_qpair_complete_error_reqs(ctrlr->adminq);
 	nvme_transport_qpair_abort_reqs(ctrlr->adminq, 0 /* retry */);
+	ctrlr->adminq->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL;
 	nvme_transport_ctrlr_disconnect_qpair(ctrlr, ctrlr->adminq);
 	if (nvme_transport_ctrlr_connect_qpair(ctrlr, ctrlr->adminq) != 0) {
 		SPDK_ERRLOG("Controller reinitialization failed.\n");
@@ -1088,6 +1097,7 @@ spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr)
 		rc = -1;
 		goto out;
 	}
+	ctrlr->adminq->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_NONE;
 	nvme_qpair_set_state(ctrlr->adminq, NVME_QPAIR_CONNECTED);
 
 	/* Doorbell buffer config is invalid during reset */
@@ -1116,10 +1126,12 @@ spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr)
 		/* Reinitialize qpairs */
 		TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) {
 			if (nvme_transport_ctrlr_connect_qpair(ctrlr, qpair) != 0) {
+				qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL;
 				nvme_qpair_set_state(qpair, NVME_QPAIR_DISABLED);
 				rc = -1;
 				continue;
 			}
+			qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_NONE;
 			nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED);
 		}
 	}
diff --git a/lib/nvme/nvme_internal.h b/lib/nvme/nvme_internal.h
index f71510f3a..fbb95f4e2 100644
--- a/lib/nvme/nvme_internal.h
+++ b/lib/nvme/nvme_internal.h
@@ -381,6 +381,8 @@ struct spdk_nvme_qpair {
 	struct spdk_nvme_ctrlr_process	*active_proc;
 
 	void				*req_buf;
+
+	uint8_t				transport_failure_reason: 2;
 };
 
 struct spdk_nvme_ns {
diff --git a/lib/nvme/nvme_qpair.c b/lib/nvme/nvme_qpair.c
index ff268b11e..552da6561 100644
--- a/lib/nvme/nvme_qpair.c
+++ b/lib/nvme/nvme_qpair.c
@@ -506,6 +506,12 @@ spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_
 	return ret;
 }
 
+spdk_nvme_qp_failure_reason
+spdk_nvme_qpair_get_failure_reason(struct spdk_nvme_qpair *qpair)
+{
+	return qpair->transport_failure_reason;
+}
+
 int
 nvme_qpair_init(struct spdk_nvme_qpair *qpair, uint16_t id,
 		struct spdk_nvme_ctrlr *ctrlr,
diff --git a/lib/nvme/nvme_rdma.c b/lib/nvme/nvme_rdma.c
index 11b6b4994..b09233161 100644
--- a/lib/nvme/nvme_rdma.c
+++ b/lib/nvme/nvme_rdma.c
@@ -287,13 +287,18 @@ nvme_rdma_qpair_process_cm_event(struct nvme_rdma_qpair *rqpair)
 		}
 		break;
 	case RDMA_CM_EVENT_DISCONNECTED:
+		rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_REMOTE;
+		nvme_qpair_set_state(&rqpair->qpair, NVME_QPAIR_DISABLED);
+		break;
 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL;
 		nvme_qpair_set_state(&rqpair->qpair, NVME_QPAIR_DISABLED);
 		break;
 	case RDMA_CM_EVENT_MULTICAST_JOIN:
 	case RDMA_CM_EVENT_MULTICAST_ERROR:
 		break;
 	case RDMA_CM_EVENT_ADDR_CHANGE:
+		rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL;
 		nvme_qpair_set_state(&rqpair->qpair, NVME_QPAIR_DISABLED);
 		break;
 	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
@@ -1060,6 +1065,7 @@ nvme_rdma_qpair_connect(struct nvme_rdma_qpair *rqpair)
 
 	rc = nvme_fabric_qpair_connect(&rqpair->qpair, rqpair->num_entries);
 	if (rc < 0) {
+		rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN;
 		nvme_qpair_set_state(&rqpair->qpair, NVME_QPAIR_DISABLED);
 		SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n");
 		return -1;
@@ -1876,7 +1882,7 @@ nvme_rdma_qpair_process_completions(struct spdk_nvme_qpair *qpair,
 {
 	struct nvme_rdma_qpair		*rqpair = nvme_rdma_qpair(qpair);
 	struct ibv_wc			wc[MAX_COMPLETIONS_PER_POLL];
-	int				i, rc, batch_size;
+	int				i, rc = 0, batch_size;
 	uint32_t			reaped;
 	struct ibv_cq			*cq;
 	struct spdk_nvme_rdma_req	*rdma_req;
@@ -1967,6 +1973,12 @@ fail:
 	 * we can call nvme_rdma_qpair_disconnect. For other qpairs we need
	 * to call the generic function which will take the lock for us.
 	 */
+	if (rc == IBV_WC_RETRY_EXC_ERR) {
+		qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_REMOTE;
+	} else if (qpair->transport_failure_reason == SPDK_NVME_QPAIR_FAILURE_NONE) {
+		qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN;
+	}
+
 	if (nvme_qpair_is_admin_queue(qpair)) {
 		nvme_rdma_qpair_disconnect(qpair);
 	} else {
diff --git a/lib/nvme/nvme_tcp.c b/lib/nvme/nvme_tcp.c
index 207451b96..84d68fd1a 100644
--- a/lib/nvme/nvme_tcp.c
+++ b/lib/nvme/nvme_tcp.c
@@ -1511,6 +1511,8 @@ fail:
 	 * we can call nvme_tcp_qpair_disconnect. For other qpairs we need
 	 * to call the generic function which will take the lock for us.
 	 */
+	qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN;
+
 	if (nvme_qpair_is_admin_queue(qpair)) {
 		nvme_tcp_qpair_disconnect(qpair);
 	} else {
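
Usage note for reviewers: a minimal sketch of how a polling application might consume the new getters. The wrapper name poll_and_recover() and the recovery policy are illustrative assumptions, not part of this patch; spdk_nvme_qpair_process_completions(), spdk_nvme_ctrlr_reconnect_io_qpair(), and spdk_nvme_ctrlr_reset() are existing public API.

#include "spdk/nvme.h"

/* Hypothetical recovery wrapper -- the name and the policy below are
 * illustrative only, not part of this patch. */
static int
poll_and_recover(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	int32_t rc = spdk_nvme_qpair_process_completions(qpair, 0 /* no limit */);

	if (rc >= 0) {
		return 0;	/* rc completions processed; qpair is healthy. */
	}

	switch (spdk_nvme_qpair_get_failure_reason(qpair)) {
	case SPDK_NVME_QPAIR_FAILURE_NONE:
		return 0;	/* Still connected; nothing to recover. */
	case SPDK_NVME_QPAIR_FAILURE_REMOTE:
	case SPDK_NVME_QPAIR_FAILURE_UNKNOWN:
		/* Remote or unexplained disconnect: retry just this qpair. */
		return spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
	case SPDK_NVME_QPAIR_FAILURE_LOCAL:
		/* Local fault: if the admin queue is also down, reset the
		 * whole controller rather than one qpair at a time. */
		if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(ctrlr) !=
		    SPDK_NVME_QPAIR_FAILURE_NONE) {
			return spdk_nvme_ctrlr_reset(ctrlr);
		}
		return spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
	}

	return -1;
}

Both getters are plain reads of the cached transport_failure_reason field set by the transport, so they are cheap enough to call on every poll and never touch the transport themselves.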