From 521a9bb22c87d4eef4d30cdfe4e6355ff74ad484 Mon Sep 17 00:00:00 2001 From: Shuhei Matsumoto Date: Thu, 30 Dec 2021 16:45:06 +0900 Subject: [PATCH] bdev/nvme: Fix race between failover and add secondary trid We sort secondary trids to avoid using disconnected trids for failover. However the sort had a bug. This bug was found by running test/nvmf/host/multipath.sh in a loop. Verify the fix by adding unit test. Fixes #2300 Signed-off-by: Shuhei Matsumoto Change-Id: I22b0ede4d2ef98b786c3e0d1f5337a2d568ba56d Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/10921 Community-CI: Broadcom CI Community-CI: Mellanox Build Bot Tested-by: SPDK CI Jenkins Reviewed-by: Aleksey Marchuk Reviewed-by: Ben Walker Reviewed-by: Changpeng Liu --- module/bdev/nvme/bdev_nvme.c | 2 +- .../lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c | 107 ++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) diff --git a/module/bdev/nvme/bdev_nvme.c b/module/bdev/nvme/bdev_nvme.c index 2533278ca..bfba51488 100644 --- a/module/bdev/nvme/bdev_nvme.c +++ b/module/bdev/nvme/bdev_nvme.c @@ -3588,7 +3588,7 @@ _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, new_trid->is_failed = false; TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { - if (tmp_trid->is_failed) { + if (tmp_trid->is_failed && tmp_trid != nvme_ctrlr->active_path_id) { TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); return 0; } diff --git a/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c b/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c index 8380954ba..b5defb3f5 100644 --- a/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c +++ b/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c @@ -1529,6 +1529,112 @@ test_failover_ctrlr(void) CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == NULL); } +/* We had a bug when running test/nvmf/host/multipath.sh. The bug was the following. + * + * A nvme_ctrlr had trid1 and trid2 first. trid1 was active. 
A connection to trid1 was + * disconnected and reset ctrlr failed repeatedly before starting failover from trid1 + * to trid2. While processing the failed reset, trid3 was added. trid1 should + * have been active, i.e., the head of the list until the failover completed. + * However trid3 was inserted at the head of the list by mistake. + * + * I/O qpairs have smaller polling period than admin qpair. When a connection is + * broken, I/O qpair may detect the error earlier than admin qpair. I/O qpair error + * invokes reset ctrlr and admin qpair error invokes failover ctrlr. Hence reset ctrlr + * may be executed repeatedly before failover is executed. Hence this bug is real. + * + * The following test verifies the fix. + */ +static void +test_race_between_failover_and_add_secondary_trid(void) +{ + struct spdk_nvme_transport_id trid1 = {}, trid2 = {}, trid3 = {}; + struct spdk_nvme_ctrlr ctrlr = {}; + struct nvme_ctrlr *nvme_ctrlr = NULL; + struct nvme_path_id *path_id1, *path_id2, *path_id3; + struct spdk_io_channel *ch1, *ch2; + int rc; + + ut_init_trid(&trid1); + ut_init_trid2(&trid2); + ut_init_trid3(&trid3); + TAILQ_INIT(&ctrlr.active_io_qpairs); + + set_thread(0); + + rc = nvme_ctrlr_create(&ctrlr, "nvme0", &trid1, NULL); + CU_ASSERT(rc == 0); + + nvme_ctrlr = nvme_ctrlr_get_by_name("nvme0"); + SPDK_CU_ASSERT_FATAL(nvme_ctrlr != NULL); + + ch1 = spdk_get_io_channel(nvme_ctrlr); + SPDK_CU_ASSERT_FATAL(ch1 != NULL); + + set_thread(1); + + ch2 = spdk_get_io_channel(nvme_ctrlr); + SPDK_CU_ASSERT_FATAL(ch2 != NULL); + + set_thread(0); + + rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, &ctrlr, &trid2); + CU_ASSERT(rc == 0); + + path_id1 = TAILQ_FIRST(&nvme_ctrlr->trids); + SPDK_CU_ASSERT_FATAL(path_id1 != NULL); + CU_ASSERT(path_id1 == nvme_ctrlr->active_path_id); + CU_ASSERT(spdk_nvme_transport_id_compare(&path_id1->trid, &trid1) == 0); + path_id2 = TAILQ_NEXT(path_id1, link); + SPDK_CU_ASSERT_FATAL(path_id2 != NULL); + 
CU_ASSERT(spdk_nvme_transport_id_compare(&path_id2->trid, &trid2) == 0); + + ctrlr.fail_reset = true; + + rc = bdev_nvme_reset(nvme_ctrlr); + CU_ASSERT(rc == 0); + + poll_threads(); + + CU_ASSERT(path_id1->is_failed == true); + CU_ASSERT(path_id1 == nvme_ctrlr->active_path_id); + + rc = bdev_nvme_reset(nvme_ctrlr); + CU_ASSERT(rc == 0); + + rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, &ctrlr, &trid3); + CU_ASSERT(rc == 0); + + CU_ASSERT(path_id1 == TAILQ_FIRST(&nvme_ctrlr->trids)); + CU_ASSERT(path_id1 == nvme_ctrlr->active_path_id); + CU_ASSERT(spdk_nvme_transport_id_compare(&path_id1->trid, &trid1) == 0); + CU_ASSERT(path_id2 == TAILQ_NEXT(path_id1, link)); + CU_ASSERT(spdk_nvme_transport_id_compare(&path_id2->trid, &trid2) == 0); + path_id3 = TAILQ_NEXT(path_id2, link); + SPDK_CU_ASSERT_FATAL(path_id3 != NULL); + CU_ASSERT(spdk_nvme_transport_id_compare(&path_id3->trid, &trid3) == 0); + + poll_threads(); + + spdk_put_io_channel(ch1); + + set_thread(1); + + spdk_put_io_channel(ch2); + + poll_threads(); + + set_thread(0); + + rc = bdev_nvme_delete("nvme0", &g_any_path); + CU_ASSERT(rc == 0); + + poll_threads(); + spdk_delay_us(1000); + poll_threads(); + + CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == NULL); +} + static void attach_ctrlr_done(void *cb_ctx, size_t bdev_count, int rc) { @@ -5065,6 +5171,7 @@ main(int argc, const char **argv) CU_ADD_TEST(suite, test_reset_ctrlr); CU_ADD_TEST(suite, test_race_between_reset_and_destruct_ctrlr); CU_ADD_TEST(suite, test_failover_ctrlr); + CU_ADD_TEST(suite, test_race_between_failover_and_add_secondary_trid); CU_ADD_TEST(suite, test_pending_reset); CU_ADD_TEST(suite, test_attach_ctrlr); CU_ADD_TEST(suite, test_aer_cb);