bdev/nvme: Disable automatic failback in multipath mode

By default, failback to the preferred I/O path is done automatically
if it is restored. Some users may want to keep using the backup I/O
path even if the preferred I/O path is restored. In this case,
bdev_nvme_set_preferred_path can be used to do manual failback.

We may be able to clear/fill I/O path cache more strictly but it will
be complicated and have bugs. This patch does the minimal change,
just skips an apparent case.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I78fe5faee6ff04e88ae3d7c6be6da1c20637c912
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12431
Community-CI: Mellanox Build Bot
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
This commit is contained in:
Shuhei Matsumoto 2022-05-02 12:07:02 +09:00 committed by Tomasz Zawadzki
parent b0ab552452
commit 00d46b80b2
9 changed files with 182 additions and 7 deletions

View File

@ -29,6 +29,9 @@ when in multipath mode. This RPC does not support NVMe bdevs in failover mode.
A new RPC `bdev_nvme_set_multipath_policy` was added to set multipath policy of a NVMe bdev A new RPC `bdev_nvme_set_multipath_policy` was added to set multipath policy of a NVMe bdev
in multipath mode. in multipath mode.
A new option `disable_auto_failback` was added to the `bdev_nvme_set_options` RPC to disable
automatic failback.
### idxd ### idxd
A new parameter `flags` was added to all low level submission and preparation A new parameter `flags` was added to all low level submission and preparation

View File

@ -2963,6 +2963,7 @@ transport_ack_timeout | Optional | number | Time to wait ack until ret
ctrlr_loss_timeout_sec | Optional | number | Time to wait until ctrlr is reconnected before deleting ctrlr. -1 means infinite reconnects. 0 means no reconnect. ctrlr_loss_timeout_sec | Optional | number | Time to wait until ctrlr is reconnected before deleting ctrlr. -1 means infinite reconnects. 0 means no reconnect.
reconnect_delay_sec | Optional | number | Time to delay a reconnect trial. 0 means no reconnect. reconnect_delay_sec | Optional | number | Time to delay a reconnect trial. 0 means no reconnect.
fast_io_fail_timeout_sec | Optional | number | Time to wait until ctrlr is reconnected before failing I/O to ctrlr. 0 means no such timeout. fast_io_fail_timeout_sec | Optional | number | Time to wait until ctrlr is reconnected before failing I/O to ctrlr. 0 means no such timeout.
disable_auto_failback | Optional | boolean | Disable automatic failback. The RPC bdev_nvme_set_preferred_path can be used to do manual failback.
#### Example #### Example

View File

@ -58,7 +58,10 @@ For active-passive policy, each I/O channel for an NVMe bdev has a cache to stor
I/O path which is connected and optimal from ANA and use it for I/O submission. Some users may want I/O path which is connected and optimal from ANA and use it for I/O submission. Some users may want
to specify the preferred I/O path manually. They can dynamically set the preferred I/O path using to specify the preferred I/O path manually. They can dynamically set the preferred I/O path using
the `bdev_nvme_set_preferred_path` RPC. Such assignment is realized naturally by moving the the `bdev_nvme_set_preferred_path` RPC. Such assignment is realized naturally by moving the
I/O path to the head of the I/O path list. I/O path to the head of the I/O path list. By default, if the preferred I/O path is restored,
failback to it is done automatically. The automatic failback can be disabled by a global option
`disable_auto_failback`. In this case, the `bdev_nvme_set_preferred_path` RPC can be used
to do manual failback.
The active-active policy uses the round-robin algorithm and submits an I/O to each I/O path in The active-active policy uses the round-robin algorithm and submits an I/O to each I/O path in
circular order. circular order.

View File

@ -145,6 +145,7 @@ static struct spdk_bdev_nvme_opts g_opts = {
.ctrlr_loss_timeout_sec = 0, .ctrlr_loss_timeout_sec = 0,
.reconnect_delay_sec = 0, .reconnect_delay_sec = 0,
.fast_io_fail_timeout_sec = 0, .fast_io_fail_timeout_sec = 0,
.disable_auto_failback = false,
}; };
#define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL
@ -1415,7 +1416,9 @@ bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
nvme_qpair->qpair = qpair; nvme_qpair->qpair = qpair;
if (!g_opts.disable_auto_failback) {
_bdev_nvme_clear_io_path_cache(nvme_qpair); _bdev_nvme_clear_io_path_cache(nvme_qpair);
}
return 0; return 0;
@ -3601,14 +3604,19 @@ _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i)
prev = io_path; prev = io_path;
} }
if (io_path != NULL && prev != NULL) { if (io_path != NULL) {
if (prev != NULL) {
STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq);
STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq);
}
/* We can set io_path to nbdev_ch->current_io_path directly here. /* We can set io_path to nbdev_ch->current_io_path directly here.
* However, it needs to be conditional. To simplify the code, * However, it needs to be conditional. To simplify the code,
* just clear nbdev_ch->current_io_path and let find_io_path() * just clear nbdev_ch->current_io_path and let find_io_path()
* fill it. * fill it.
*
* Automatic failback may be disabled. Hence even if the io_path is
* already at the head, clear nbdev_ch->current_io_path.
*/ */
nbdev_ch->current_io_path = NULL; nbdev_ch->current_io_path = NULL;
} }

View File

@ -276,6 +276,7 @@ struct spdk_bdev_nvme_opts {
int32_t ctrlr_loss_timeout_sec; int32_t ctrlr_loss_timeout_sec;
uint32_t reconnect_delay_sec; uint32_t reconnect_delay_sec;
uint32_t fast_io_fail_timeout_sec; uint32_t fast_io_fail_timeout_sec;
bool disable_auto_failback;
}; };
struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch); struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch);

View File

@ -95,6 +95,7 @@ static const struct spdk_json_object_decoder rpc_bdev_nvme_options_decoders[] =
{"ctrlr_loss_timeout_sec", offsetof(struct spdk_bdev_nvme_opts, ctrlr_loss_timeout_sec), spdk_json_decode_int32, true}, {"ctrlr_loss_timeout_sec", offsetof(struct spdk_bdev_nvme_opts, ctrlr_loss_timeout_sec), spdk_json_decode_int32, true},
{"reconnect_delay_sec", offsetof(struct spdk_bdev_nvme_opts, reconnect_delay_sec), spdk_json_decode_uint32, true}, {"reconnect_delay_sec", offsetof(struct spdk_bdev_nvme_opts, reconnect_delay_sec), spdk_json_decode_uint32, true},
{"fast_io_fail_timeout_sec", offsetof(struct spdk_bdev_nvme_opts, fast_io_fail_timeout_sec), spdk_json_decode_uint32, true}, {"fast_io_fail_timeout_sec", offsetof(struct spdk_bdev_nvme_opts, fast_io_fail_timeout_sec), spdk_json_decode_uint32, true},
{"disable_auto_failback", offsetof(struct spdk_bdev_nvme_opts, disable_auto_failback), spdk_json_decode_bool, true},
}; };
static void static void

View File

@ -464,7 +464,7 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
nvme_adminq_poll_period_us=None, nvme_ioq_poll_period_us=None, io_queue_requests=None, nvme_adminq_poll_period_us=None, nvme_ioq_poll_period_us=None, io_queue_requests=None,
delay_cmd_submit=None, transport_retry_count=None, bdev_retry_count=None, delay_cmd_submit=None, transport_retry_count=None, bdev_retry_count=None,
transport_ack_timeout=None, ctrlr_loss_timeout_sec=None, reconnect_delay_sec=None, transport_ack_timeout=None, ctrlr_loss_timeout_sec=None, reconnect_delay_sec=None,
fast_io_fail_timeout_sec=None): fast_io_fail_timeout_sec=None, disable_auto_failback=None):
"""Set options for the bdev nvme. This is startup command. """Set options for the bdev nvme. This is startup command.
Args: Args:
@ -500,6 +500,8 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
If fast_io_fail_timeout_sec is not zero, it has to be not less than reconnect_delay_sec and less than If fast_io_fail_timeout_sec is not zero, it has to be not less than reconnect_delay_sec and less than
ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1. ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1.
This can be overridden by bdev_nvme_attach_controller. (optional) This can be overridden by bdev_nvme_attach_controller. (optional)
disable_auto_failback: Disable automatic failback. bdev_nvme_set_preferred_path can be used to do manual failback.
By default, immediately failback to the preferred I/O path if it is restored. (optional)
""" """
params = {} params = {}
@ -562,6 +564,9 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
if fast_io_fail_timeout_sec is not None: if fast_io_fail_timeout_sec is not None:
params['fast_io_fail_timeout_sec'] = fast_io_fail_timeout_sec params['fast_io_fail_timeout_sec'] = fast_io_fail_timeout_sec
if disable_auto_failback is not None:
params['disable_auto_failback'] = disable_auto_failback
return client.call('bdev_nvme_set_options', params) return client.call('bdev_nvme_set_options', params)

View File

@ -509,7 +509,8 @@ if __name__ == "__main__":
transport_ack_timeout=args.transport_ack_timeout, transport_ack_timeout=args.transport_ack_timeout,
ctrlr_loss_timeout_sec=args.ctrlr_loss_timeout_sec, ctrlr_loss_timeout_sec=args.ctrlr_loss_timeout_sec,
reconnect_delay_sec=args.reconnect_delay_sec, reconnect_delay_sec=args.reconnect_delay_sec,
fast_io_fail_timeout_sec=args.fast_io_fail_timeout_sec) fast_io_fail_timeout_sec=args.fast_io_fail_timeout_sec,
disable_auto_failback=args.disable_auto_failback)
p = subparsers.add_parser('bdev_nvme_set_options', aliases=['set_bdev_nvme_options'], p = subparsers.add_parser('bdev_nvme_set_options', aliases=['set_bdev_nvme_options'],
help='Set options for the bdev nvme type. This is startup command.') help='Set options for the bdev nvme type. This is startup command.')
@ -570,6 +571,10 @@ if __name__ == "__main__":
less than ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1. less than ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1.
This can be overridden by bdev_nvme_attach_controller.""", This can be overridden by bdev_nvme_attach_controller.""",
type=int) type=int)
p.add_argument('-f', '--disable-auto-failback',
help="""Disable automatic failback. bdev_nvme_set_preferred_path can be used to do manual failback.
By default, immediately failback to the preferred I/O path if it restored.""",
action='store_true')
p.set_defaults(func=bdev_nvme_set_options) p.set_defaults(func=bdev_nvme_set_options)

View File

@ -6252,6 +6252,153 @@ test_find_next_io_path(void)
CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path2); CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path2);
} }
static void
test_disable_auto_failback(void)
{
struct nvme_path_id path1 = {}, path2 = {};
struct nvme_ctrlr_opts opts = {};
struct spdk_nvme_ctrlr *ctrlr1, *ctrlr2;
struct nvme_bdev_ctrlr *nbdev_ctrlr;
struct nvme_ctrlr *nvme_ctrlr1;
const int STRING_SIZE = 32;
const char *attached_names[STRING_SIZE];
struct nvme_bdev *bdev;
struct spdk_io_channel *ch;
struct nvme_bdev_channel *nbdev_ch;
struct nvme_io_path *io_path;
struct spdk_uuid uuid1 = { .u.raw = { 0x1 } };
const struct spdk_nvme_ctrlr_data *cdata;
bool done;
int rc;
memset(attached_names, 0, sizeof(char *) * STRING_SIZE);
ut_init_trid(&path1.trid);
ut_init_trid2(&path2.trid);
g_ut_attach_ctrlr_status = 0;
g_ut_attach_bdev_count = 1;
g_opts.disable_auto_failback = true;
opts.ctrlr_loss_timeout_sec = -1;
opts.reconnect_delay_sec = 1;
set_thread(0);
ctrlr1 = ut_attach_ctrlr(&path1.trid, 1, true, true);
SPDK_CU_ASSERT_FATAL(ctrlr1 != NULL);
ctrlr1->ns[0].uuid = &uuid1;
rc = bdev_nvme_create(&path1.trid, "nvme0", attached_names, STRING_SIZE,
attach_ctrlr_done, NULL, NULL, &opts, true);
CU_ASSERT(rc == 0);
spdk_delay_us(1000);
poll_threads();
spdk_delay_us(g_opts.nvme_adminq_poll_period_us);
poll_threads();
ctrlr2 = ut_attach_ctrlr(&path2.trid, 1, true, true);
SPDK_CU_ASSERT_FATAL(ctrlr2 != NULL);
ctrlr2->ns[0].uuid = &uuid1;
rc = bdev_nvme_create(&path2.trid, "nvme0", attached_names, STRING_SIZE,
attach_ctrlr_done, NULL, NULL, &opts, true);
CU_ASSERT(rc == 0);
spdk_delay_us(1000);
poll_threads();
spdk_delay_us(g_opts.nvme_adminq_poll_period_us);
poll_threads();
nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name("nvme0");
SPDK_CU_ASSERT_FATAL(nbdev_ctrlr != NULL);
bdev = nvme_bdev_ctrlr_get_bdev(nbdev_ctrlr, 1);
SPDK_CU_ASSERT_FATAL(bdev != NULL);
nvme_ctrlr1 = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, &path1.trid);
SPDK_CU_ASSERT_FATAL(nvme_ctrlr1 != NULL);
/* ctrlr1 was added first. Hence io_path to ctrlr1 should be preferred. */
ch = spdk_get_io_channel(bdev);
SPDK_CU_ASSERT_FATAL(ch != NULL);
nbdev_ch = spdk_io_channel_get_ctx(ch);
io_path = bdev_nvme_find_io_path(nbdev_ch);
SPDK_CU_ASSERT_FATAL(io_path != NULL);
CU_ASSERT(io_path->nvme_ns->ctrlr->ctrlr == ctrlr1);
/* If resetting ctrlr1 failed, io_path to ctrlr2 should be used. */
ctrlr1->fail_reset = true;
ctrlr1->is_failed = true;
bdev_nvme_reset(nvme_ctrlr1);
poll_threads();
spdk_delay_us(g_opts.nvme_adminq_poll_period_us);
poll_threads();
spdk_delay_us(g_opts.nvme_adminq_poll_period_us);
poll_threads();
CU_ASSERT(ctrlr1->adminq.is_connected == false);
io_path = bdev_nvme_find_io_path(nbdev_ch);
SPDK_CU_ASSERT_FATAL(io_path != NULL);
CU_ASSERT(io_path->nvme_ns->ctrlr->ctrlr == ctrlr2);
/* After a second, ctrlr1 is recovered. However, automatic failback is disabled.
* Hence, io_path to ctrlr2 should still be used.
*/
ctrlr1->fail_reset = false;
spdk_delay_us(SPDK_SEC_TO_USEC);
poll_threads();
CU_ASSERT(ctrlr1->adminq.is_connected == true);
io_path = bdev_nvme_find_io_path(nbdev_ch);
SPDK_CU_ASSERT_FATAL(io_path != NULL);
CU_ASSERT(io_path->nvme_ns->ctrlr->ctrlr == ctrlr2);
/* Set io_path to ctrlr1 to preferred explicitly. Then io_path to ctrlr1 should
* be used again.
*/
cdata = spdk_nvme_ctrlr_get_data(ctrlr1);
done = false;
bdev_nvme_set_preferred_path(bdev->disk.name, cdata->cntlid, _set_preferred_path_cb, &done);
poll_threads();
CU_ASSERT(done == true);
io_path = bdev_nvme_find_io_path(nbdev_ch);
SPDK_CU_ASSERT_FATAL(io_path != NULL);
CU_ASSERT(io_path->nvme_ns->ctrlr->ctrlr == ctrlr1);
spdk_put_io_channel(ch);
poll_threads();
rc = bdev_nvme_delete("nvme0", &g_any_path);
CU_ASSERT(rc == 0);
poll_threads();
spdk_delay_us(1000);
poll_threads();
CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == NULL);
g_opts.disable_auto_failback = false;
}
int int
main(int argc, const char **argv) main(int argc, const char **argv)
{ {
@ -6303,6 +6450,7 @@ main(int argc, const char **argv)
CU_ADD_TEST(suite, test_ana_transition); CU_ADD_TEST(suite, test_ana_transition);
CU_ADD_TEST(suite, test_set_preferred_path); CU_ADD_TEST(suite, test_set_preferred_path);
CU_ADD_TEST(suite, test_find_next_io_path); CU_ADD_TEST(suite, test_find_next_io_path);
CU_ADD_TEST(suite, test_disable_auto_failback);
CU_basic_set_mode(CU_BRM_VERBOSE); CU_basic_set_mode(CU_BRM_VERBOSE);