bdev/nvme: Fail reset sequence immediately if ctrlr is already removed.

After a controller was hot-removed, if a reset sequence started to
the controller, spdk_nvme_ctrlr_disconnect() failed and caused core
dump in debug mode.

When implemented, how to cause the failure and how to process the
failure were not clear. Hence assert was added to detect the
failure.

We know how we cause the failure now. Let's handle the failure
appropriately.

If spdk_nvme_ctrlr_disconnect() fails, we are on the nvme_ctrlr->thread.
Hence call bdev_nvme_reset_complete() with failure immediately.

Even if spdk_nvme_ctrlr_disconnect() completes synchronously, the
completion callback is executed asynchronously when polling an adminq.

Hence set the completion callback only if spdk_nvme_ctrlr_disconnect()
succeeds.

Fixes issue #2632

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I11f61853aba9eca2515592f964a291e59def7247
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/13892
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
This commit is contained in:
Shuhei Matsumoto 2022-08-08 12:19:44 +09:00 committed by Tomasz Zawadzki
parent 6439abc0d8
commit 2826ef13ce
2 changed files with 47 additions and 8 deletions

View File

@ -1533,10 +1533,21 @@ bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
}
}
static void bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success);
static void
nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn)
{
int rc __attribute__((unused));
int rc;
rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
if (rc != 0) {
/* Disconnect fails if ctrlr is already resetting or removed. In this case,
* fail the reset sequence immediately.
*/
bdev_nvme_reset_complete(nvme_ctrlr, false);
return;
}
/* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq.
* Set callback here to execute the specified operation after ctrlr is really disconnected.
@ -1544,13 +1555,6 @@ nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb
assert(nvme_ctrlr->disconnected_cb == NULL);
nvme_ctrlr->disconnected_cb = cb_fn;
/* Disconnect fails if ctrlr is already resetting or removed. Both cases are
* not possible. Reset is controlled and the callback to hot remove is called
* when ctrlr is hot removed.
*/
rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
assert(rc == 0);
/* During disconnection, reduce the period to poll adminq more often. */
bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0);
}

View File

@ -233,6 +233,7 @@ struct spdk_nvme_ctrlr {
bool attached;
bool is_failed;
bool fail_reset;
bool is_removed;
struct spdk_nvme_transport_id trid;
TAILQ_HEAD(, spdk_nvme_qpair) active_io_qpairs;
TAILQ_ENTRY(spdk_nvme_ctrlr) tailq;
@ -764,6 +765,10 @@ spdk_nvme_ctrlr_reconnect_async(struct spdk_nvme_ctrlr *ctrlr)
int
spdk_nvme_ctrlr_disconnect(struct spdk_nvme_ctrlr *ctrlr)
{
if (ctrlr->is_removed) {
return -ENXIO;
}
ctrlr->adminq.is_connected = false;
ctrlr->is_failed = false;
@ -1303,6 +1308,17 @@ test_create_ctrlr(void)
CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == NULL);
}
static void
ut_check_hotplug_on_reset(void *cb_arg, bool success)
{
bool *detect_remove = cb_arg;
CU_ASSERT(success == false);
SPDK_CU_ASSERT_FATAL(detect_remove != NULL);
*detect_remove = true;
}
static void
test_reset_ctrlr(void)
{
@ -1312,6 +1328,7 @@ test_reset_ctrlr(void)
struct nvme_path_id *curr_trid;
struct spdk_io_channel *ch1, *ch2;
struct nvme_ctrlr_channel *ctrlr_ch1, *ctrlr_ch2;
bool detect_remove;
int rc;
ut_init_trid(&trid);
@ -1406,6 +1423,24 @@ test_reset_ctrlr(void)
CU_ASSERT(nvme_ctrlr->resetting == false);
CU_ASSERT(curr_trid->is_failed == false);
/* Case 4: ctrlr is already removed. */
ctrlr.is_removed = true;
rc = bdev_nvme_reset(nvme_ctrlr);
CU_ASSERT(rc == 0);
detect_remove = false;
nvme_ctrlr->reset_cb_fn = ut_check_hotplug_on_reset;
nvme_ctrlr->reset_cb_arg = &detect_remove;
poll_threads();
CU_ASSERT(nvme_ctrlr->reset_cb_fn == NULL);
CU_ASSERT(nvme_ctrlr->reset_cb_arg == NULL);
CU_ASSERT(detect_remove == true);
ctrlr.is_removed = false;
spdk_put_io_channel(ch2);
set_thread(0);