From ec6de131f7b18dbc8bb6f038171c704ea8fe05d9 Mon Sep 17 00:00:00 2001 From: Seth Howell Date: Tue, 29 Oct 2019 14:09:36 -0700 Subject: [PATCH] nvme: don't disconnect qpairs from admin thread. Disconnecting qpairs from the admin thread during a reset led to an inevitable race with the data thread. QP related memory is freed during the disconnect and cannot be touched from the other threads. The only way to fix this is to force the qpair disconnect onto the data thread. This requires a small change in the way that resets are handled for pcie. Please see the code in reset.c for that change. fixes: bb01a089 Signed-off-by: Seth Howell Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/472749 (master) (cherry picked from commit 13f30a254e655c9174995f7178d3440e049aa9db) Change-Id: I8a39e444c7cbbe85fafca42ffd040e929721ce95 Signed-off-by: Tomasz Zawadzki Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/472960 Reviewed-by: Jim Harris Reviewed-by: Ben Walker Reviewed-by: Seth Howell Tested-by: SPDK CI Jenkins --- include/spdk/nvme.h | 2 ++ lib/nvme/nvme_ctrlr.c | 17 +++++++++++++++-- lib/nvme/nvme_qpair.c | 2 +- test/unit/lib/nvme/nvme_qpair.c/nvme_qpair_ut.c | 6 +++--- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/include/spdk/nvme.h b/include/spdk/nvme.h index 13982ae14..3ab81878b 100644 --- a/include/spdk/nvme.h +++ b/include/spdk/nvme.h @@ -1091,6 +1091,8 @@ struct spdk_nvme_qpair *spdk_nvme_ctrlr_alloc_io_qpair(struct spdk_nvme_ctrlr *c * This function is intended to be called on qpairs that have already been connected, * but have since entered a failed state as indicated by a return value of -ENXIO from * either spdk_nvme_qpair_process_completions or one of the spdk_nvme_ns_cmd_* functions. + * This function must be called from the same thread as spdk_nvme_qpair_process_completions + * and the spdk_nvme_ns_cmd_* functions. * * \param qpair The qpair to reconnect. * diff --git a/lib/nvme/nvme_ctrlr.c b/lib/nvme/nvme_ctrlr.c index 0780093ba..e89c1c2d0 100644 --- a/lib/nvme/nvme_ctrlr.c +++ b/lib/nvme/nvme_ctrlr.c @@ -412,13 +412,18 @@ spdk_nvme_ctrlr_reconnect_io_qpair(struct spdk_nvme_qpair *qpair) goto out; } + /* We have to confirm that any old memory is cleaned up. */ + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + rc = nvme_transport_ctrlr_connect_qpair(ctrlr, qpair); if (rc) { nvme_qpair_set_state(qpair, NVME_QPAIR_DISABLED); + qpair->transport_qp_is_failed = true; rc = -EAGAIN; goto out; } nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED); + qpair->transport_qp_is_failed = false; out: nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); @@ -1073,7 +1078,7 @@ spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr) /* Disable all queues before disabling the controller hardware. */ TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) { nvme_qpair_set_state(qpair, NVME_QPAIR_DISABLED); - nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + qpair->transport_qp_is_failed = true; } nvme_qpair_set_state(ctrlr->adminq, NVME_QPAIR_DISABLED); nvme_qpair_complete_error_reqs(ctrlr->adminq); @@ -1102,7 +1107,14 @@ spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr) } } - if (rc == 0) { + /* + * For PCIe controllers, the memory locations of the tranpsort qpair + * don't change when the controller is reset. They simply need to be + * re-enabled with admin commands to the controller. For fabric + * controllers we need to disconnect and reconnect the qpair on its + * own thread outside of the context of the reset. + */ + if (rc == 0 && ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { /* Reinitialize qpairs */ TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) { if (nvme_transport_ctrlr_connect_qpair(ctrlr, qpair) != 0) { @@ -1111,6 +1123,7 @@ spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr) continue; } nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED); + qpair->transport_qp_is_failed = false; } } diff --git a/lib/nvme/nvme_qpair.c b/lib/nvme/nvme_qpair.c index 778f9d4db..63508c03f 100644 --- a/lib/nvme/nvme_qpair.c +++ b/lib/nvme/nvme_qpair.c @@ -458,7 +458,7 @@ spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_ * qpair is not enabled, likely because a controller reset is * in progress. */ - return 0; + return -ENXIO; } /* error injection for those queued error requests */ diff --git a/test/unit/lib/nvme/nvme_qpair.c/nvme_qpair_ut.c b/test/unit/lib/nvme/nvme_qpair.c/nvme_qpair_ut.c index 665fd1f38..e1be64a7c 100644 --- a/test/unit/lib/nvme/nvme_qpair.c/nvme_qpair_ut.c +++ b/test/unit/lib/nvme/nvme_qpair.c/nvme_qpair_ut.c @@ -243,18 +243,18 @@ static void test_nvme_qpair_process_completions(void) ctrlr.is_removed = false; ctrlr.is_resetting = true; rc = spdk_nvme_qpair_process_completions(&qpair, 0); - CU_ASSERT(rc == 0); + CU_ASSERT(rc == -ENXIO); CU_ASSERT(g_called_transport_process_completions == false); /* We also need to make sure we didn't abort the requests. */ CU_ASSERT(!STAILQ_EMPTY(&qpair.queued_req)); CU_ASSERT(g_num_cb_passed == 0); CU_ASSERT(g_num_cb_failed == 0); - /* The case where we aren't resetting, but are enablign the qpair is the same as above. */ + /* The case where we aren't resetting, but are enabling the qpair is the same as above. */ ctrlr.is_resetting = false; qpair.state = NVME_QPAIR_ENABLING; rc = spdk_nvme_qpair_process_completions(&qpair, 0); - CU_ASSERT(rc == 0); + CU_ASSERT(rc == -ENXIO); CU_ASSERT(g_called_transport_process_completions == false); CU_ASSERT(!STAILQ_EMPTY(&qpair.queued_req)); CU_ASSERT(g_num_cb_passed == 0);