From 13ca6e52d3c7d413a244c67cfbb419e55f3293da Mon Sep 17 00:00:00 2001
From: Shuhei Matsumoto
Date: Thu, 7 Apr 2022 17:29:05 +0900
Subject: [PATCH] bdev/nvme: Handle ANA transition (change or inaccessible
 state) correctly

Previously, if a namespace was in the ANA inaccessible state, I/O was
queued indefinitely. Fix this issue according to the NVMe specification.

Add a temporary poller anatt_timer and a flag ana_transition_timedout
to each nvme_ns. Start anatt_timer when the nvme_ns enters ANA
transition. If anatt_timer expires, set ana_transition_timedout to
true. Cancel anatt_timer or clear ana_transition_timedout when the
nvme_ns exits ANA transition.

any_io_path_may_become_available() ignores an io_path whose
ana_transition_timedout is true.

Add a unit test case to verify these additions.

Signed-off-by: Shuhei Matsumoto
Change-Id: Ic76933242046b3e8e553de88221b943ad097c91c
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12194
Tested-by: SPDK CI Jenkins
Community-CI: Broadcom CI
Reviewed-by: Aleksey Marchuk
Reviewed-by: Jim Harris
Reviewed-by: Monica Kenguva
---
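
Note (not part of the commit): below is a minimal, self-contained sketch of
the one-shot ANATT timer pattern this patch applies in
_nvme_ns_set_ana_state(), assuming an already-attached spdk_nvme_ctrlr and a
running SPDK thread. struct ana_timeout_ctx, start_anatt_timer() and
anatt_expired() are hypothetical names used only for illustration; they are
not bdev_nvme or SPDK APIs.

/* Sketch only: arm a one-shot poller for ANATT seconds and record expiry. */
#include "spdk/nvme.h"
#include "spdk/thread.h"
#include "spdk/util.h"

struct ana_timeout_ctx {
	struct spdk_poller	*timer;		/* analogous to nvme_ns->anatt_timer */
	bool			timedout;	/* analogous to nvme_ns->ana_transition_timedout */
};

static int
anatt_expired(void *arg)
{
	struct ana_timeout_ctx *ctx = arg;

	/* One-shot timer: unregister ourselves, then record the timeout. */
	spdk_poller_unregister(&ctx->timer);
	ctx->timedout = true;

	return SPDK_POLLER_BUSY;
}

static void
start_anatt_timer(struct ana_timeout_ctx *ctx, struct spdk_nvme_ctrlr *ctrlr)
{
	const struct spdk_nvme_ctrlr_data *cdata = spdk_nvme_ctrlr_get_data(ctrlr);

	if (ctx->timer != NULL) {
		/* Timer already armed; keep the earlier deadline. */
		return;
	}

	/* cdata->anatt is the ANA Transition Time reported in seconds. */
	ctx->timer = SPDK_POLLER_REGISTER(anatt_expired, ctx,
					  (uint64_t)cdata->anatt * SPDK_SEC_TO_USEC);
}
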
 module/bdev/nvme/bdev_nvme.c                  | 41 +++++++++++++
 module/bdev/nvme/bdev_nvme.h                  |  2 +
 .../lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c  | 58 +++++++++++++++++++
 3 files changed, 101 insertions(+)

diff --git a/module/bdev/nvme/bdev_nvme.c b/module/bdev/nvme/bdev_nvme.c
index 48d7fb5d5..a0ffd5e1b 100644
--- a/module/bdev/nvme/bdev_nvme.c
+++ b/module/bdev/nvme/bdev_nvme.c
@@ -875,6 +875,10 @@ any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
 	struct nvme_io_path *io_path;
 
 	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
+		if (io_path->nvme_ns->ana_transition_timedout) {
+			continue;
+		}
+
 		if (nvme_io_path_is_connected(io_path) ||
 		    !nvme_io_path_is_failed(io_path)) {
 			return true;
@@ -2624,13 +2628,48 @@ bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
 	return rc;
 }
 
+static int
+nvme_ns_ana_transition_timedout(void *ctx)
+{
+	struct nvme_ns *nvme_ns = ctx;
+
+	spdk_poller_unregister(&nvme_ns->anatt_timer);
+	nvme_ns->ana_transition_timedout = true;
+
+	return SPDK_POLLER_BUSY;
+}
+
 static void
 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns,
 		       const struct spdk_nvme_ana_group_descriptor *desc)
 {
+	const struct spdk_nvme_ctrlr_data *cdata;
+
 	nvme_ns->ana_group_id = desc->ana_group_id;
 	nvme_ns->ana_state = desc->ana_state;
 	nvme_ns->ana_state_updating = false;
+
+	switch (nvme_ns->ana_state) {
+	case SPDK_NVME_ANA_OPTIMIZED_STATE:
+	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
+		nvme_ns->ana_transition_timedout = false;
+		spdk_poller_unregister(&nvme_ns->anatt_timer);
+		break;
+
+	case SPDK_NVME_ANA_INACCESSIBLE_STATE:
+	case SPDK_NVME_ANA_CHANGE_STATE:
+		if (nvme_ns->anatt_timer != NULL) {
+			break;
+		}
+
+		cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr);
+		nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout,
+				       nvme_ns,
+				       cdata->anatt * SPDK_SEC_TO_USEC);
+		break;
+	default:
+		break;
+	}
 }
 
 static int
@@ -3114,6 +3153,8 @@ nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *n
 {
 	struct nvme_bdev *bdev;
 
+	spdk_poller_unregister(&nvme_ns->anatt_timer);
+
 	bdev = nvme_ns->bdev;
 	if (bdev != NULL) {
 		pthread_mutex_lock(&bdev->mutex);
diff --git a/module/bdev/nvme/bdev_nvme.h b/module/bdev/nvme/bdev_nvme.h
index 124b5e629..5ee33f56f 100644
--- a/module/bdev/nvme/bdev_nvme.h
+++ b/module/bdev/nvme/bdev_nvme.h
@@ -85,6 +85,8 @@ struct nvme_ns {
 	uint32_t			ana_group_id;
 	enum spdk_nvme_ana_state	ana_state;
 	bool				ana_state_updating;
+	bool				ana_transition_timedout;
+	struct spdk_poller		*anatt_timer;
 	struct nvme_async_probe_ctx	*probe_ctx;
 	TAILQ_ENTRY(nvme_ns)		tailq;
 	RB_ENTRY(nvme_ns)		node;
diff --git a/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c b/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c
index 565f35ecf..883ea2956 100644
--- a/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c
+++ b/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c
@@ -5930,6 +5930,63 @@ test_nvme_ns_cmp(void)
 	CU_ASSERT(nvme_ns_cmp(&nvme_ns2, &nvme_ns1) > 0);
 }
 
+static void
+test_ana_transition(void)
+{
+	struct spdk_nvme_ctrlr ctrlr = { .cdata.anatt = 10, };
+	struct nvme_ctrlr nvme_ctrlr = { .ctrlr = &ctrlr, };
+	struct nvme_ns nvme_ns = { .ctrlr = &nvme_ctrlr, };
+	struct spdk_nvme_ana_group_descriptor desc = { .ana_group_id = 1, };
+
+	/* case 1: ANA transition timedout is canceled. */
+	nvme_ns.ana_state = SPDK_NVME_ANA_CHANGE_STATE;
+	nvme_ns.ana_transition_timedout = true;
+
+	desc.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
+
+	_nvme_ns_set_ana_state(&nvme_ns, &desc);
+
+	CU_ASSERT(nvme_ns.ana_transition_timedout == false);
+	CU_ASSERT(nvme_ns.ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE);
+
+	/* case 2: ANATT timer is kept. */
+	nvme_ns.ana_state = SPDK_NVME_ANA_CHANGE_STATE;
+	nvme_ns.anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout,
+			      &nvme_ns,
+			      ctrlr.cdata.anatt * SPDK_SEC_TO_USEC);
+
+	desc.ana_state = SPDK_NVME_ANA_INACCESSIBLE_STATE;
+
+	_nvme_ns_set_ana_state(&nvme_ns, &desc);
+
+	CU_ASSERT(nvme_ns.anatt_timer != NULL);
+	CU_ASSERT(nvme_ns.ana_state == SPDK_NVME_ANA_INACCESSIBLE_STATE);
+
+	/* case 3: ANATT timer is stopped. */
+	desc.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
+
+	_nvme_ns_set_ana_state(&nvme_ns, &desc);
+
+	CU_ASSERT(nvme_ns.anatt_timer == NULL);
+	CU_ASSERT(nvme_ns.ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE);
+
+	/* ANATT timer is started. */
+	desc.ana_state = SPDK_NVME_ANA_CHANGE_STATE;
+
+	_nvme_ns_set_ana_state(&nvme_ns, &desc);
+
+	CU_ASSERT(nvme_ns.anatt_timer != NULL);
+	CU_ASSERT(nvme_ns.ana_state == SPDK_NVME_ANA_CHANGE_STATE);
+
+	/* ANATT timer is expired. */
+	spdk_delay_us(ctrlr.cdata.anatt * SPDK_SEC_TO_USEC);
+
+	poll_threads();
+
+	CU_ASSERT(nvme_ns.anatt_timer == NULL);
+	CU_ASSERT(nvme_ns.ana_transition_timedout == true);
+}
+
 int
 main(int argc, const char **argv)
 {
@@ -5978,6 +6035,7 @@ main(int argc, const char **argv)
 	CU_ADD_TEST(suite, test_retry_failover_ctrlr);
 	CU_ADD_TEST(suite, test_fail_path);
 	CU_ADD_TEST(suite, test_nvme_ns_cmp);
+	CU_ADD_TEST(suite, test_ana_transition);
 
 	CU_basic_set_mode(CU_BRM_VERBOSE);