diff --git a/CHANGELOG.md b/CHANGELOG.md index 95672f833..5d98e47a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,9 @@ when in multipath mode. This RPC does not support NVMe bdevs in failover mode. A new RPC `bdev_nvme_set_multipath_policy` was added to set multipath policy of a NVMe bdev in multipath mode. +A new option `disable_auto_failback` was added to the `bdev_nvme_set_options` RPC to disable +automatic failback. + ### idxd A new parameter `flags` was added to all low level submission and preparation diff --git a/doc/jsonrpc.md b/doc/jsonrpc.md index 55ea6680c..4c791d0b8 100644 --- a/doc/jsonrpc.md +++ b/doc/jsonrpc.md @@ -2963,6 +2963,7 @@ transport_ack_timeout | Optional | number | Time to wait ack until ret ctrlr_loss_timeout_sec | Optional | number | Time to wait until ctrlr is reconnected before deleting ctrlr. -1 means infinite reconnects. 0 means no reconnect. reconnect_delay_sec | Optional | number | Time to delay a reconnect trial. 0 means no reconnect. fast_io_fail_timeout_sec | Optional | number | Time to wait until ctrlr is reconnected before failing I/O to ctrlr. 0 means no such timeout. +disable_auto_failback | Optional | boolean | Disable automatic failback. The RPC bdev_nvme_set_preferred_path can be used to do manual failback. #### Example diff --git a/doc/nvme_multipath.md b/doc/nvme_multipath.md index f0eb14b03..ebdddd472 100644 --- a/doc/nvme_multipath.md +++ b/doc/nvme_multipath.md @@ -58,7 +58,10 @@ For active-passive policy, each I/O channel for an NVMe bdev has a cache to stor I/O path which is connected and optimal from ANA and use it for I/O submission. Some users may want to specify the preferred I/O path manually. They can dynamically set the preferred I/O path using the `bdev_nvme_set_preferred_path` RPC. Such assignment is realized naturally by moving the -I/O path to the head of the I/O path list. +I/O path to the head of the I/O path list. By default, if the preferred I/O path is restored, +failback to it is done automatically. The automatic failback can be disabled by a global option +`disable_auto_failback`. In this case, the `bdev_nvme_set_preferred_path` RPC can be used +to do manual failback. The active-active policy uses the round-robin algorithm and submits an I/O to each I/O path in circular order. diff --git a/module/bdev/nvme/bdev_nvme.c b/module/bdev/nvme/bdev_nvme.c index 660d0f30f..729b48f89 100644 --- a/module/bdev/nvme/bdev_nvme.c +++ b/module/bdev/nvme/bdev_nvme.c @@ -145,6 +145,7 @@ static struct spdk_bdev_nvme_opts g_opts = { .ctrlr_loss_timeout_sec = 0, .reconnect_delay_sec = 0, .fast_io_fail_timeout_sec = 0, + .disable_auto_failback = false, }; #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL @@ -1415,7 +1416,9 @@ bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) nvme_qpair->qpair = qpair; - _bdev_nvme_clear_io_path_cache(nvme_qpair); + if (!g_opts.disable_auto_failback) { + _bdev_nvme_clear_io_path_cache(nvme_qpair); + } return 0; @@ -3601,14 +3604,19 @@ _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) prev = io_path; } - if (io_path != NULL && prev != NULL) { - STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); - STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); + if (io_path != NULL) { + if (prev != NULL) { + STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); + STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); + } /* We can set io_path to nbdev_ch->current_io_path directly here. * However, it needs to be conditional. To simplify the code, * just clear nbdev_ch->current_io_path and let find_io_path() * fill it. + * + * Automatic failback may be disabled. Hence even if the io_path is + * already at the head, clear nbdev_ch->current_io_path. */ nbdev_ch->current_io_path = NULL; } diff --git a/module/bdev/nvme/bdev_nvme.h b/module/bdev/nvme/bdev_nvme.h index 69b1f4109..e84abcea0 100644 --- a/module/bdev/nvme/bdev_nvme.h +++ b/module/bdev/nvme/bdev_nvme.h @@ -276,6 +276,7 @@ struct spdk_bdev_nvme_opts { int32_t ctrlr_loss_timeout_sec; uint32_t reconnect_delay_sec; uint32_t fast_io_fail_timeout_sec; + bool disable_auto_failback; }; struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch); diff --git a/module/bdev/nvme/bdev_nvme_rpc.c b/module/bdev/nvme/bdev_nvme_rpc.c index 64d4db230..80aac95b6 100644 --- a/module/bdev/nvme/bdev_nvme_rpc.c +++ b/module/bdev/nvme/bdev_nvme_rpc.c @@ -95,6 +95,7 @@ static const struct spdk_json_object_decoder rpc_bdev_nvme_options_decoders[] = {"ctrlr_loss_timeout_sec", offsetof(struct spdk_bdev_nvme_opts, ctrlr_loss_timeout_sec), spdk_json_decode_int32, true}, {"reconnect_delay_sec", offsetof(struct spdk_bdev_nvme_opts, reconnect_delay_sec), spdk_json_decode_uint32, true}, {"fast_io_fail_timeout_sec", offsetof(struct spdk_bdev_nvme_opts, fast_io_fail_timeout_sec), spdk_json_decode_uint32, true}, + {"disable_auto_failback", offsetof(struct spdk_bdev_nvme_opts, disable_auto_failback), spdk_json_decode_bool, true}, }; static void diff --git a/python/spdk/rpc/bdev.py b/python/spdk/rpc/bdev.py index c7edba412..4b344ec61 100644 --- a/python/spdk/rpc/bdev.py +++ b/python/spdk/rpc/bdev.py @@ -464,7 +464,7 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo nvme_adminq_poll_period_us=None, nvme_ioq_poll_period_us=None, io_queue_requests=None, delay_cmd_submit=None, transport_retry_count=None, bdev_retry_count=None, transport_ack_timeout=None, ctrlr_loss_timeout_sec=None, reconnect_delay_sec=None, - fast_io_fail_timeout_sec=None): + fast_io_fail_timeout_sec=None, disable_auto_failback=None): """Set options for the bdev nvme. This is startup command. Args: @@ -500,6 +500,8 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo If fast_io_fail_timeout_sec is not zero, it has to be not less than reconnect_delay_sec and less than ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1. This can be overridden by bdev_nvme_attach_controller. (optional) + disable_auto_failback: Disable automatic failback. bdev_nvme_set_preferred_path can be used to do manual failback. + By default, immediately failback to the preferred I/O path if it is restored. (optional) """ params = {} @@ -562,6 +564,9 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo if fast_io_fail_timeout_sec is not None: params['fast_io_fail_timeout_sec'] = fast_io_fail_timeout_sec + if disable_auto_failback is not None: + params['disable_auto_failback'] = disable_auto_failback + return client.call('bdev_nvme_set_options', params) diff --git a/scripts/rpc.py b/scripts/rpc.py index 39164ed89..64c5feb21 100755 --- a/scripts/rpc.py +++ b/scripts/rpc.py @@ -509,7 +509,8 @@ if __name__ == "__main__": transport_ack_timeout=args.transport_ack_timeout, ctrlr_loss_timeout_sec=args.ctrlr_loss_timeout_sec, reconnect_delay_sec=args.reconnect_delay_sec, - fast_io_fail_timeout_sec=args.fast_io_fail_timeout_sec) + fast_io_fail_timeout_sec=args.fast_io_fail_timeout_sec, + disable_auto_failback=args.disable_auto_failback) p = subparsers.add_parser('bdev_nvme_set_options', aliases=['set_bdev_nvme_options'], help='Set options for the bdev nvme type. This is startup command.') @@ -570,6 +571,10 @@ if __name__ == "__main__": less than ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1. This can be overridden by bdev_nvme_attach_controller.""", type=int) + p.add_argument('-f', '--disable-auto-failback', + help="""Disable automatic failback. bdev_nvme_set_preferred_path can be used to do manual failback. + By default, immediately failback to the preferred I/O path if it restored.""", + action='store_true') p.set_defaults(func=bdev_nvme_set_options) diff --git a/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c b/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c index 82b0fe916..afd12e756 100644 --- a/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c +++ b/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c @@ -6252,6 +6252,153 @@ test_find_next_io_path(void) CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path2); } +static void +test_disable_auto_failback(void) +{ + struct nvme_path_id path1 = {}, path2 = {}; + struct nvme_ctrlr_opts opts = {}; + struct spdk_nvme_ctrlr *ctrlr1, *ctrlr2; + struct nvme_bdev_ctrlr *nbdev_ctrlr; + struct nvme_ctrlr *nvme_ctrlr1; + const int STRING_SIZE = 32; + const char *attached_names[STRING_SIZE]; + struct nvme_bdev *bdev; + struct spdk_io_channel *ch; + struct nvme_bdev_channel *nbdev_ch; + struct nvme_io_path *io_path; + struct spdk_uuid uuid1 = { .u.raw = { 0x1 } }; + const struct spdk_nvme_ctrlr_data *cdata; + bool done; + int rc; + + memset(attached_names, 0, sizeof(char *) * STRING_SIZE); + ut_init_trid(&path1.trid); + ut_init_trid2(&path2.trid); + g_ut_attach_ctrlr_status = 0; + g_ut_attach_bdev_count = 1; + + g_opts.disable_auto_failback = true; + + opts.ctrlr_loss_timeout_sec = -1; + opts.reconnect_delay_sec = 1; + + set_thread(0); + + ctrlr1 = ut_attach_ctrlr(&path1.trid, 1, true, true); + SPDK_CU_ASSERT_FATAL(ctrlr1 != NULL); + + ctrlr1->ns[0].uuid = &uuid1; + + rc = bdev_nvme_create(&path1.trid, "nvme0", attached_names, STRING_SIZE, + attach_ctrlr_done, NULL, NULL, &opts, true); + CU_ASSERT(rc == 0); + + spdk_delay_us(1000); + poll_threads(); + spdk_delay_us(g_opts.nvme_adminq_poll_period_us); + poll_threads(); + + ctrlr2 = ut_attach_ctrlr(&path2.trid, 1, true, true); + SPDK_CU_ASSERT_FATAL(ctrlr2 != NULL); + + ctrlr2->ns[0].uuid = &uuid1; + + rc = bdev_nvme_create(&path2.trid, "nvme0", attached_names, STRING_SIZE, + attach_ctrlr_done, NULL, NULL, &opts, true); + CU_ASSERT(rc == 0); + + spdk_delay_us(1000); + poll_threads(); + spdk_delay_us(g_opts.nvme_adminq_poll_period_us); + poll_threads(); + + nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name("nvme0"); + SPDK_CU_ASSERT_FATAL(nbdev_ctrlr != NULL); + + bdev = nvme_bdev_ctrlr_get_bdev(nbdev_ctrlr, 1); + SPDK_CU_ASSERT_FATAL(bdev != NULL); + + nvme_ctrlr1 = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, &path1.trid); + SPDK_CU_ASSERT_FATAL(nvme_ctrlr1 != NULL); + + /* ctrlr1 was added first. Hence io_path to ctrlr1 should be preferred. */ + + ch = spdk_get_io_channel(bdev); + SPDK_CU_ASSERT_FATAL(ch != NULL); + nbdev_ch = spdk_io_channel_get_ctx(ch); + + io_path = bdev_nvme_find_io_path(nbdev_ch); + SPDK_CU_ASSERT_FATAL(io_path != NULL); + + CU_ASSERT(io_path->nvme_ns->ctrlr->ctrlr == ctrlr1); + + /* If resetting ctrlr1 failed, io_path to ctrlr2 should be used. */ + ctrlr1->fail_reset = true; + ctrlr1->is_failed = true; + + bdev_nvme_reset(nvme_ctrlr1); + + poll_threads(); + spdk_delay_us(g_opts.nvme_adminq_poll_period_us); + poll_threads(); + spdk_delay_us(g_opts.nvme_adminq_poll_period_us); + poll_threads(); + + CU_ASSERT(ctrlr1->adminq.is_connected == false); + + io_path = bdev_nvme_find_io_path(nbdev_ch); + SPDK_CU_ASSERT_FATAL(io_path != NULL); + + CU_ASSERT(io_path->nvme_ns->ctrlr->ctrlr == ctrlr2); + + /* After a second, ctrlr1 is recovered. However, automatic failback is disabled. + * Hence, io_path to ctrlr2 should still be used. + */ + ctrlr1->fail_reset = false; + + spdk_delay_us(SPDK_SEC_TO_USEC); + poll_threads(); + + CU_ASSERT(ctrlr1->adminq.is_connected == true); + + io_path = bdev_nvme_find_io_path(nbdev_ch); + SPDK_CU_ASSERT_FATAL(io_path != NULL); + + CU_ASSERT(io_path->nvme_ns->ctrlr->ctrlr == ctrlr2); + + /* Set io_path to ctrlr1 to preferred explicitly. Then io_path to ctrlr1 should + * be used again. + */ + + cdata = spdk_nvme_ctrlr_get_data(ctrlr1); + done = false; + + bdev_nvme_set_preferred_path(bdev->disk.name, cdata->cntlid, _set_preferred_path_cb, &done); + + poll_threads(); + CU_ASSERT(done == true); + + io_path = bdev_nvme_find_io_path(nbdev_ch); + SPDK_CU_ASSERT_FATAL(io_path != NULL); + + CU_ASSERT(io_path->nvme_ns->ctrlr->ctrlr == ctrlr1); + + spdk_put_io_channel(ch); + + poll_threads(); + + rc = bdev_nvme_delete("nvme0", &g_any_path); + CU_ASSERT(rc == 0); + + poll_threads(); + spdk_delay_us(1000); + poll_threads(); + + CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == NULL); + + g_opts.disable_auto_failback = false; +} + int main(int argc, const char **argv) { @@ -6303,6 +6450,7 @@ main(int argc, const char **argv) CU_ADD_TEST(suite, test_ana_transition); CU_ADD_TEST(suite, test_set_preferred_path); CU_ADD_TEST(suite, test_find_next_io_path); + CU_ADD_TEST(suite, test_disable_auto_failback); CU_basic_set_mode(CU_BRM_VERBOSE);