From 8f9b977504142c37ebf9717a9d83bc18a765f1dd Mon Sep 17 00:00:00 2001 From: Shuhei Matsumoto Date: Fri, 29 Apr 2022 14:37:35 +0900 Subject: [PATCH] bdev/nvme: Add active/active policy for multipath mode The NVMe bdev module supported active-passive policy for multipath mode first. By this patch, the NVMe bdev module supports active-active policy for multipath node next. Following the Linux kernel native NVMe multipath, the NVMe bdev module supports round robin algorithm for active-active policy. The multipath policy, active-passive or active-active, is managed per nvme_bdev. The multipath policy is copied to all corresponding nvme_bdev_channels. Different from active-passive, active-active caches even non_optimized path to provide load balance across multiple paths. Signed-off-by: Shuhei Matsumoto Change-Id: Ie18b24db60d3da1ce2f83725b6cd3079f628f95b Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12001 Community-CI: Mellanox Build Bot Community-CI: Broadcom CI Tested-by: SPDK CI Jenkins Reviewed-by: Ben Walker Reviewed-by: Aleksey Marchuk --- CHANGELOG.md | 3 + doc/jsonrpc.md | 37 +++++ module/bdev/nvme/bdev_nvme.c | 153 +++++++++++++++++- module/bdev/nvme/bdev_nvme.h | 37 ++++- module/bdev/nvme/bdev_nvme_rpc.c | 87 ++++++++++ python/spdk/rpc/bdev.py | 14 ++ scripts/rpc.py | 11 ++ .../lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c | 53 ++++++ 8 files changed, 386 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7bc94e52..6fcc59b9f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,9 @@ A new RPC `bdev_nvme_get_io_paths` was added to get all active I/O paths. A new RPC `bdev_nvme_set_preferred_path` was added to set preferred I/O path for an NVMe bdev when in multipath mode. This RPC does not support NVMe bdevs in failover mode. +A new RPC `bdev_nvme_set_multipath_policy` was added to set multipath policy of a NVMe bdev +in multipath mode. + ### idxd A new parameter `flags` was added to all low level submission and preparation diff --git a/doc/jsonrpc.md b/doc/jsonrpc.md index 472763c47..e5da55592 100644 --- a/doc/jsonrpc.md +++ b/doc/jsonrpc.md @@ -3433,6 +3433,43 @@ Example response: } ~~~ +### bdev_nvme_set_multipath_policy {#rpc_bdev_nvme_set_multipath_policy} + +Set multipath policy of the NVMe bdev in multipath mode. + +#### Parameters + +Name | Optional | Type | Description +----------------------- | -------- | ----------- | ----------- +name | Required | string | Name of the NVMe bdev +policy | Required | string | Multipath policy: active_active or active_passive + +#### Example + +Example request: + +~~~json +{ + "jsonrpc": "2.0", + "method": "bdev_nvme_set_multipath_policy", + "id": 1, + "params": { + "name": "Nvme0n1", + "policy": "active_passive" + } +} +~~~ + +Example response: + +~~~json +{ + "jsonrpc": "2.0", + "id": 1, + "result": true +} +~~~ + ### bdev_nvme_cuse_register {#rpc_bdev_nvme_cuse_register} Register CUSE device on NVMe controller. diff --git a/module/bdev/nvme/bdev_nvme.c b/module/bdev/nvme/bdev_nvme.c index 339e14263..d8afa1637 100644 --- a/module/bdev/nvme/bdev_nvme.c +++ b/module/bdev/nvme/bdev_nvme.c @@ -825,6 +825,55 @@ nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) return true; } +/* Simulate circular linked list. */ +static inline struct nvme_io_path * +nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) +{ + struct nvme_io_path *next_path; + + next_path = STAILQ_NEXT(prev_path, stailq); + if (next_path != NULL) { + return next_path; + } else { + return STAILQ_FIRST(&nbdev_ch->io_path_list); + } +} + +static struct nvme_io_path * +bdev_nvme_find_next_io_path(struct nvme_bdev_channel *nbdev_ch, + struct nvme_io_path *prev) +{ + struct nvme_io_path *io_path, *start, *non_optimized = NULL; + + start = nvme_io_path_get_next(nbdev_ch, prev); + + io_path = start; + do { + if (spdk_likely(nvme_io_path_is_connected(io_path) && + !io_path->nvme_ns->ana_state_updating)) { + switch (io_path->nvme_ns->ana_state) { + case SPDK_NVME_ANA_OPTIMIZED_STATE: + nbdev_ch->current_io_path = io_path; + return io_path; + case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: + if (non_optimized == NULL) { + non_optimized = io_path; + } + break; + default: + break; + } + } + io_path = nvme_io_path_get_next(nbdev_ch, io_path); + } while (io_path != start); + + /* We come here only if there is no optimized path. Cache even non_optimized + * path for load balance across multiple non_optimized paths. + */ + nbdev_ch->current_io_path = non_optimized; + return non_optimized; +} + static struct nvme_io_path * _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) { @@ -864,7 +913,11 @@ bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) return _bdev_nvme_find_io_path(nbdev_ch); } - return nbdev_ch->current_io_path; + if (spdk_likely(nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE)) { + return nbdev_ch->current_io_path; + } else { + return bdev_nvme_find_next_io_path(nbdev_ch, nbdev_ch->current_io_path); + } } /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, @@ -2600,6 +2653,20 @@ nvme_namespace_info_json(struct spdk_json_write_ctx *w, spdk_json_write_object_end(w); } +static const char * +nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) +{ + switch (nbdev->mp_policy) { + case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: + return "active_passive"; + case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: + return "active_active"; + default: + assert(false); + return "invalid"; + } +} + static int bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) { @@ -2612,6 +2679,7 @@ bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) nvme_namespace_info_json(w, nvme_ns); } spdk_json_write_array_end(w); + spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); pthread_mutex_unlock(&nvme_bdev->mutex); return 0; @@ -2884,6 +2952,7 @@ nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) } bdev->ref = 1; + bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; TAILQ_INIT(&bdev->nvme_ns_list); TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); bdev->opal = nvme_ctrlr->opal_dev != NULL; @@ -3635,6 +3704,88 @@ err_alloc: cb_fn(cb_arg, rc); } +struct bdev_nvme_set_multipath_policy_ctx { + struct spdk_bdev_desc *desc; + bdev_nvme_set_multipath_policy_cb cb_fn; + void *cb_arg; +}; + +static void +bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) +{ + struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + assert(ctx != NULL); + assert(ctx->desc != NULL); + assert(ctx->cb_fn != NULL); + + spdk_bdev_close(ctx->desc); + + ctx->cb_fn(ctx->cb_arg, status); + + free(ctx); +} + +static void +_bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); + struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); + + nbdev_ch->mp_policy = nbdev->mp_policy; + nbdev_ch->current_io_path = NULL; + + spdk_for_each_channel_continue(i, 0); +} + +void +bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, + bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) +{ + struct bdev_nvme_set_multipath_policy_ctx *ctx; + struct spdk_bdev *bdev; + struct nvme_bdev *nbdev; + int rc; + + assert(cb_fn != NULL); + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + SPDK_ERRLOG("Failed to alloc context.\n"); + rc = -ENOMEM; + goto err_alloc; + } + + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); + if (rc != 0) { + SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); + rc = -ENODEV; + goto err_open; + } + + bdev = spdk_bdev_desc_get_bdev(ctx->desc); + nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); + + pthread_mutex_lock(&nbdev->mutex); + nbdev->mp_policy = policy; + pthread_mutex_unlock(&nbdev->mutex); + + spdk_for_each_channel(nbdev, + _bdev_nvme_set_multipath_policy, + ctx, + bdev_nvme_set_multipath_policy_done); + return; + +err_open: + free(ctx); +err_alloc: + cb_fn(cb_arg, rc); +} + static void aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) { diff --git a/module/bdev/nvme/bdev_nvme.h b/module/bdev/nvme/bdev_nvme.h index def660a8b..9d64eb194 100644 --- a/module/bdev/nvme/bdev_nvme.h +++ b/module/bdev/nvme/bdev_nvme.h @@ -48,6 +48,11 @@ extern bool g_bdev_nvme_module_finish; #define NVME_MAX_CONTROLLERS 1024 +enum bdev_nvme_multipath_policy { + BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE, + BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE, +}; + typedef void (*spdk_bdev_create_nvme_fn)(void *ctx, size_t bdev_count, int rc); typedef void (*spdk_bdev_nvme_start_discovery_fn)(void *ctx); typedef void (*spdk_bdev_nvme_stop_discovery_fn)(void *ctx); @@ -166,14 +171,15 @@ struct nvme_bdev_ctrlr { }; struct nvme_bdev { - struct spdk_bdev disk; - uint32_t nsid; - struct nvme_bdev_ctrlr *nbdev_ctrlr; - pthread_mutex_t mutex; - int ref; - TAILQ_HEAD(, nvme_ns) nvme_ns_list; - bool opal; - TAILQ_ENTRY(nvme_bdev) tailq; + struct spdk_bdev disk; + uint32_t nsid; + struct nvme_bdev_ctrlr *nbdev_ctrlr; + pthread_mutex_t mutex; + int ref; + enum bdev_nvme_multipath_policy mp_policy; + TAILQ_HEAD(, nvme_ns) nvme_ns_list; + bool opal; + TAILQ_ENTRY(nvme_bdev) tailq; }; struct nvme_qpair { @@ -207,6 +213,7 @@ struct nvme_io_path { struct nvme_bdev_channel { struct nvme_io_path *current_io_path; + enum bdev_nvme_multipath_policy mp_policy; STAILQ_HEAD(, nvme_io_path) io_path_list; TAILQ_HEAD(retry_io_head, spdk_bdev_io) retry_io_list; struct spdk_poller *retry_io_poller; @@ -333,4 +340,18 @@ typedef void (*bdev_nvme_set_preferred_path_cb)(void *cb_arg, int rc); void bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg); +typedef void (*bdev_nvme_set_multipath_policy_cb)(void *cb_arg, int rc); + +/** + * Set multipath policy of the NVMe bdev. + * + * \param name NVMe bdev name + * \param policy Multipath policy (active-passive or active-active) + * \param cb_fn Function to be called back after completion. + */ +void bdev_nvme_set_multipath_policy(const char *name, + enum bdev_nvme_multipath_policy policy, + bdev_nvme_set_multipath_policy_cb cb_fn, + void *cb_arg); + #endif /* SPDK_BDEV_NVME_H */ diff --git a/module/bdev/nvme/bdev_nvme_rpc.c b/module/bdev/nvme/bdev_nvme_rpc.c index 074deb236..75bba0d7e 100644 --- a/module/bdev/nvme/bdev_nvme_rpc.c +++ b/module/bdev/nvme/bdev_nvme_rpc.c @@ -2196,3 +2196,90 @@ cleanup: } SPDK_RPC_REGISTER("bdev_nvme_set_preferred_path", rpc_bdev_nvme_set_preferred_path, SPDK_RPC_RUNTIME) + +struct rpc_set_multipath_policy { + char *name; + enum bdev_nvme_multipath_policy policy; +}; + +static void +free_rpc_set_multipath_policy(struct rpc_set_multipath_policy *req) +{ + free(req->name); +} + +static int +rpc_decode_mp_policy(const struct spdk_json_val *val, void *out) +{ + enum bdev_nvme_multipath_policy *policy = out; + + if (spdk_json_strequal(val, "active_passive") == true) { + *policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; + } else if (spdk_json_strequal(val, "active_active") == true) { + *policy = BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE; + } else { + SPDK_NOTICELOG("Invalid parameter value: policy\n"); + return -EINVAL; + } + + return 0; +} + +static const struct spdk_json_object_decoder rpc_set_multipath_policy_decoders[] = { + {"name", offsetof(struct rpc_set_multipath_policy, name), spdk_json_decode_string}, + {"policy", offsetof(struct rpc_set_multipath_policy, policy), rpc_decode_mp_policy}, +}; + +struct rpc_set_multipath_policy_ctx { + struct rpc_set_multipath_policy req; + struct spdk_jsonrpc_request *request; +}; + +static void +rpc_bdev_nvme_set_multipath_policy_done(void *cb_arg, int rc) +{ + struct rpc_set_multipath_policy_ctx *ctx = cb_arg; + + if (rc == 0) { + spdk_jsonrpc_send_bool_response(ctx->request, true); + } else { + spdk_jsonrpc_send_error_response(ctx->request, rc, spdk_strerror(-rc)); + } + + free_rpc_set_multipath_policy(&ctx->req); + free(ctx); +} + +static void +rpc_bdev_nvme_set_multipath_policy(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_set_multipath_policy_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); + return; + } + + if (spdk_json_decode_object(params, rpc_set_multipath_policy_decoders, + SPDK_COUNTOF(rpc_set_multipath_policy_decoders), + &ctx->req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + ctx->request = request; + + bdev_nvme_set_multipath_policy(ctx->req.name, ctx->req.policy, + rpc_bdev_nvme_set_multipath_policy_done, ctx); + return; + +cleanup: + free_rpc_set_multipath_policy(&ctx->req); + free(ctx); +} +SPDK_RPC_REGISTER("bdev_nvme_set_multipath_policy", rpc_bdev_nvme_set_multipath_policy, + SPDK_RPC_RUNTIME) diff --git a/python/spdk/rpc/bdev.py b/python/spdk/rpc/bdev.py index 21f2ef369..99104bf65 100644 --- a/python/spdk/rpc/bdev.py +++ b/python/spdk/rpc/bdev.py @@ -840,6 +840,20 @@ def bdev_nvme_set_preferred_path(client, name, cntlid): return client.call('bdev_nvme_set_preferred_path', params) +def bdev_nvme_set_multipath_policy(client, name, policy): + """Set multipath policy of the NVMe bdev + + Args: + name: NVMe bdev name + policy: Multipath policy (active_passive or active_active) + """ + + params = {'name': name, + 'policy': policy} + + return client.call('bdev_nvme_set_multipath_policy', params) + + def bdev_nvme_cuse_register(client, name): """Register CUSE devices on NVMe controller. diff --git a/scripts/rpc.py b/scripts/rpc.py index 5a864025a..f43b5c2b6 100755 --- a/scripts/rpc.py +++ b/scripts/rpc.py @@ -779,6 +779,17 @@ if __name__ == "__main__": p.add_argument('-c', '--cntlid', help='NVMe-oF controller ID', type=int, required=True) p.set_defaults(func=bdev_nvme_set_preferred_path) + def bdev_nvme_set_multipath_policy(args): + rpc.bdev.bdev_nvme_set_multipath_policy(args.client, + name=args.name, + policy=args.policy) + + p = subparsers.add_parser('bdev_nvme_set_multipath_policy', + help="""Set multipath policy of the NVMe bdev""") + p.add_argument('-b', '--name', help='Name of the NVMe bdev', required=True) + p.add_argument('-p', '--policy', help='Multipath policy (active_passive or active_active)', required=True) + p.set_defaults(func=bdev_nvme_set_multipath_policy) + def bdev_nvme_cuse_register(args): rpc.bdev.bdev_nvme_cuse_register(args.client, name=args.name) diff --git a/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c b/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c index 3d8b394bc..82b0fe916 100644 --- a/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c +++ b/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c @@ -6200,6 +6200,58 @@ test_set_preferred_path(void) CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == NULL); } +static void +test_find_next_io_path(void) +{ + struct nvme_bdev_channel nbdev_ch = { + .io_path_list = STAILQ_HEAD_INITIALIZER(nbdev_ch.io_path_list), + .mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE, + }; + struct spdk_nvme_qpair qpair1 = {}, qpair2 = {}, qpair3 = {}; + struct spdk_nvme_ctrlr ctrlr1 = {}, ctrlr2 = {}, ctrlr3 = {}; + struct nvme_ctrlr nvme_ctrlr1 = { .ctrlr = &ctrlr1, }; + struct nvme_ctrlr nvme_ctrlr2 = { .ctrlr = &ctrlr2, }; + struct nvme_ctrlr nvme_ctrlr3 = { .ctrlr = &ctrlr3, }; + struct nvme_ctrlr_channel ctrlr_ch1 = {}; + struct nvme_ctrlr_channel ctrlr_ch2 = {}; + struct nvme_ctrlr_channel ctrlr_ch3 = {}; + struct nvme_qpair nvme_qpair1 = { .ctrlr_ch = &ctrlr_ch1, .ctrlr = &nvme_ctrlr1, .qpair = &qpair1, }; + struct nvme_qpair nvme_qpair2 = { .ctrlr_ch = &ctrlr_ch2, .ctrlr = &nvme_ctrlr2, .qpair = &qpair2, }; + struct nvme_qpair nvme_qpair3 = { .ctrlr_ch = &ctrlr_ch3, .ctrlr = &nvme_ctrlr3, .qpair = &qpair3, }; + struct nvme_ns nvme_ns1 = {}, nvme_ns2 = {}, nvme_ns3 = {}; + struct nvme_io_path io_path1 = { .qpair = &nvme_qpair1, .nvme_ns = &nvme_ns1, }; + struct nvme_io_path io_path2 = { .qpair = &nvme_qpair2, .nvme_ns = &nvme_ns2, }; + struct nvme_io_path io_path3 = { .qpair = &nvme_qpair3, .nvme_ns = &nvme_ns3, }; + + STAILQ_INSERT_TAIL(&nbdev_ch.io_path_list, &io_path1, stailq); + STAILQ_INSERT_TAIL(&nbdev_ch.io_path_list, &io_path2, stailq); + STAILQ_INSERT_TAIL(&nbdev_ch.io_path_list, &io_path3, stailq); + + /* nbdev_ch->current_io_path is filled always when bdev_nvme_find_next_io_path() is called. */ + + nbdev_ch.current_io_path = &io_path2; + nvme_ns1.ana_state = SPDK_NVME_ANA_INACCESSIBLE_STATE; + nvme_ns2.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; + nvme_ns3.ana_state = SPDK_NVME_ANA_INACCESSIBLE_STATE; + CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path2); + + nvme_ns1.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE; + nvme_ns2.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; + nvme_ns3.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE; + CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path2); + + nvme_ns1.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; + nvme_ns2.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; + nvme_ns3.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE; + CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path1); + + nbdev_ch.current_io_path = &io_path3; + nvme_ns1.ana_state = SPDK_NVME_ANA_INACCESSIBLE_STATE; + nvme_ns2.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE; + nvme_ns3.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE; + CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path2); +} + int main(int argc, const char **argv) { @@ -6250,6 +6302,7 @@ main(int argc, const char **argv) CU_ADD_TEST(suite, test_nvme_ns_cmp); CU_ADD_TEST(suite, test_ana_transition); CU_ADD_TEST(suite, test_set_preferred_path); + CU_ADD_TEST(suite, test_find_next_io_path); CU_basic_set_mode(CU_BRM_VERBOSE);