bdev/nvme: Add active/active policy for multipath mode

The NVMe bdev module supported active-passive policy for multipath mode
first. By this patch, the NVMe bdev module supports active-active policy
for multipath mode next. Following the Linux kernel native NVMe multipath,
the NVMe bdev module supports round robin algorithm for active-active
policy.

The multipath policy, active-passive or active-active, is managed per
nvme_bdev. The multipath policy is copied to all corresponding
nvme_bdev_channels.

Different from active-passive, active-active caches even non_optimized
path to provide load balance across multiple paths.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: Ie18b24db60d3da1ce2f83725b6cd3079f628f95b
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12001
Community-CI: Mellanox Build Bot
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
This commit is contained in:
Shuhei Matsumoto 2022-04-29 14:37:35 +09:00 committed by Tomasz Zawadzki
parent 0869265d66
commit 8f9b977504
8 changed files with 386 additions and 9 deletions

View File

@ -26,6 +26,9 @@ A new RPC `bdev_nvme_get_io_paths` was added to get all active I/O paths.
A new RPC `bdev_nvme_set_preferred_path` was added to set preferred I/O path for an NVMe bdev A new RPC `bdev_nvme_set_preferred_path` was added to set preferred I/O path for an NVMe bdev
when in multipath mode. This RPC does not support NVMe bdevs in failover mode. when in multipath mode. This RPC does not support NVMe bdevs in failover mode.
A new RPC `bdev_nvme_set_multipath_policy` was added to set multipath policy of an NVMe bdev
in multipath mode.
### idxd ### idxd
A new parameter `flags` was added to all low level submission and preparation A new parameter `flags` was added to all low level submission and preparation

View File

@ -3433,6 +3433,43 @@ Example response:
} }
~~~ ~~~
### bdev_nvme_set_multipath_policy {#rpc_bdev_nvme_set_multipath_policy}
Set multipath policy of the NVMe bdev in multipath mode.
#### Parameters
Name | Optional | Type | Description
----------------------- | -------- | ----------- | -----------
name | Required | string | Name of the NVMe bdev
policy | Required | string | Multipath policy: active_active or active_passive
#### Example
Example request:
~~~json
{
"jsonrpc": "2.0",
"method": "bdev_nvme_set_multipath_policy",
"id": 1,
"params": {
"name": "Nvme0n1",
"policy": "active_passive"
}
}
~~~
Example response:
~~~json
{
"jsonrpc": "2.0",
"id": 1,
"result": true
}
~~~
### bdev_nvme_cuse_register {#rpc_bdev_nvme_cuse_register} ### bdev_nvme_cuse_register {#rpc_bdev_nvme_cuse_register}
Register CUSE device on NVMe controller. Register CUSE device on NVMe controller.

View File

@ -825,6 +825,55 @@ nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
return true; return true;
} }
/* Treat the singly-linked io_path list as if it were circular: the successor
 * of the tail element is the head of the list.
 */
static inline struct nvme_io_path *
nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
{
	struct nvme_io_path *succ = STAILQ_NEXT(prev_path, stailq);

	return (succ != NULL) ? succ : STAILQ_FIRST(&nbdev_ch->io_path_list);
}
/* Round-robin path selection for the active-active policy.
 *
 * Walk the (conceptually circular) io_path list exactly once, starting at the
 * path after 'prev'. The first connected, non-updating OPTIMIZED path wins and
 * is cached in nbdev_ch->current_io_path. If none is optimized, fall back to
 * the first usable NON_OPTIMIZED path encountered (or NULL if there is none);
 * the fallback is cached too, so load still rotates across multiple
 * non-optimized paths.
 */
static struct nvme_io_path *
bdev_nvme_find_next_io_path(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_io_path *prev)
{
	struct nvme_io_path *candidate, *first, *fallback = NULL;

	first = nvme_io_path_get_next(nbdev_ch, prev);
	candidate = first;

	do {
		if (spdk_likely(nvme_io_path_is_connected(candidate) &&
				!candidate->nvme_ns->ana_state_updating)) {
			if (candidate->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) {
				nbdev_ch->current_io_path = candidate;
				return candidate;
			} else if (candidate->nvme_ns->ana_state == SPDK_NVME_ANA_NON_OPTIMIZED_STATE &&
				   fallback == NULL) {
				fallback = candidate;
			}
		}

		candidate = nvme_io_path_get_next(nbdev_ch, candidate);
	} while (candidate != first);

	/* We come here only if there is no optimized path. Cache even the
	 * non-optimized path for load balance across multiple non-optimized
	 * paths.
	 */
	nbdev_ch->current_io_path = fallback;

	return fallback;
}
static struct nvme_io_path * static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{ {
@ -864,7 +913,11 @@ bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
return _bdev_nvme_find_io_path(nbdev_ch); return _bdev_nvme_find_io_path(nbdev_ch);
} }
if (spdk_likely(nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE)) {
return nbdev_ch->current_io_path; return nbdev_ch->current_io_path;
} else {
return bdev_nvme_find_next_io_path(nbdev_ch, nbdev_ch->current_io_path);
}
} }
/* Return true if there is any io_path whose qpair is active or ctrlr is not failed, /* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
@ -2600,6 +2653,20 @@ nvme_namespace_info_json(struct spdk_json_write_ctx *w,
spdk_json_write_object_end(w); spdk_json_write_object_end(w);
} }
/* Map the bdev's multipath policy enum to the string form used by the RPC
 * interface ("active_passive" / "active_active").
 */
static const char *
nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev)
{
	if (nbdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
		return "active_passive";
	}
	if (nbdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
		return "active_active";
	}

	assert(false);
	return "invalid";
}
static int static int
bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{ {
@ -2612,6 +2679,7 @@ bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
nvme_namespace_info_json(w, nvme_ns); nvme_namespace_info_json(w, nvme_ns);
} }
spdk_json_write_array_end(w); spdk_json_write_array_end(w);
spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev));
pthread_mutex_unlock(&nvme_bdev->mutex); pthread_mutex_unlock(&nvme_bdev->mutex);
return 0; return 0;
@ -2884,6 +2952,7 @@ nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
} }
bdev->ref = 1; bdev->ref = 1;
bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE;
TAILQ_INIT(&bdev->nvme_ns_list); TAILQ_INIT(&bdev->nvme_ns_list);
TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
bdev->opal = nvme_ctrlr->opal_dev != NULL; bdev->opal = nvme_ctrlr->opal_dev != NULL;
@ -3635,6 +3704,88 @@ err_alloc:
cb_fn(cb_arg, rc); cb_fn(cb_arg, rc);
} }
/* Context carried through the spdk_for_each_channel() iteration started by
 * bdev_nvme_set_multipath_policy().
 */
struct bdev_nvme_set_multipath_policy_ctx {
	struct spdk_bdev_desc *desc;	/* Open descriptor keeping the bdev alive during the iteration. */
	bdev_nvme_set_multipath_policy_cb cb_fn;	/* User completion callback. */
	void *cb_arg;	/* Opaque argument passed to cb_fn. */
};
/* spdk_for_each_channel() completion: every channel has picked up the new
 * policy. Release the bdev descriptor, invoke the user callback with the
 * iteration status and free the context.
 */
static void
bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status)
{
	struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	assert(ctx != NULL);
	assert(ctx->desc != NULL);
	assert(ctx->cb_fn != NULL);

	spdk_bdev_close(ctx->desc);

	ctx->cb_fn(ctx->cb_arg, status);

	free(ctx);
}
/* Per-channel step of bdev_nvme_set_multipath_policy(): copy the bdev-level
 * policy into this channel and drop its cached I/O path so the next I/O
 * re-selects a path under the new policy.
 */
static void
_bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(ch);

	nbdev_ch->current_io_path = NULL;
	nbdev_ch->mp_policy = nbdev->mp_policy;

	spdk_for_each_channel_continue(i, 0);
}
/*
 * Set the multipath policy of the named NVMe bdev and propagate it to every
 * nvme_bdev_channel via spdk_for_each_channel(). cb_fn is always invoked:
 * with the iteration status on success, or with a negative errno on early
 * failure.
 */
void
bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy,
			       bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg)
{
	struct bdev_nvme_set_multipath_policy_ctx *ctx;
	struct spdk_bdev *bdev;
	struct nvme_bdev *nbdev;
	int rc;

	assert(cb_fn != NULL);

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		SPDK_ERRLOG("Failed to alloc context.\n");
		rc = -ENOMEM;
		goto err_alloc;
	}

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	/* Open the bdev to hold a reference on it for the whole channel iteration. */
	rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc);
	if (rc != 0) {
		/* NOTE(review): the original errno from spdk_bdev_open_ext() is
		 * collapsed to -ENODEV here, so callers cannot distinguish
		 * failure causes — confirm this is intentional.
		 */
		SPDK_ERRLOG("bdev %s is not registered in this module.\n", name);
		rc = -ENODEV;
		goto err_open;
	}

	bdev = spdk_bdev_desc_get_bdev(ctx->desc);
	/* NOTE(review): there is no check that this bdev belongs to the NVMe
	 * bdev module before SPDK_CONTAINEROF() — opening a non-NVMe bdev by
	 * name would containerof into unrelated memory. Verify callers can
	 * only reach here with NVMe bdev names, or add a module check.
	 */
	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);

	/* Update the bdev-level policy under the mutex; each channel copies it
	 * in _bdev_nvme_set_multipath_policy() during the iteration below.
	 */
	pthread_mutex_lock(&nbdev->mutex);
	nbdev->mp_policy = policy;
	pthread_mutex_unlock(&nbdev->mutex);

	spdk_for_each_channel(nbdev,
			      _bdev_nvme_set_multipath_policy,
			      ctx,
			      bdev_nvme_set_multipath_policy_done);
	return;

err_open:
	free(ctx);
err_alloc:
	cb_fn(cb_arg, rc);
}
static void static void
aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
{ {

View File

@ -48,6 +48,11 @@ extern bool g_bdev_nvme_module_finish;
#define NVME_MAX_CONTROLLERS 1024 #define NVME_MAX_CONTROLLERS 1024
/* Multipath I/O path selection policy. Managed per nvme_bdev and copied into
 * each nvme_bdev_channel.
 */
enum bdev_nvme_multipath_policy {
	BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE,	/* All I/O goes to a single cached path. */
	BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,	/* Round-robin across usable paths. */
};
typedef void (*spdk_bdev_create_nvme_fn)(void *ctx, size_t bdev_count, int rc); typedef void (*spdk_bdev_create_nvme_fn)(void *ctx, size_t bdev_count, int rc);
typedef void (*spdk_bdev_nvme_start_discovery_fn)(void *ctx); typedef void (*spdk_bdev_nvme_start_discovery_fn)(void *ctx);
typedef void (*spdk_bdev_nvme_stop_discovery_fn)(void *ctx); typedef void (*spdk_bdev_nvme_stop_discovery_fn)(void *ctx);
@ -171,6 +176,7 @@ struct nvme_bdev {
struct nvme_bdev_ctrlr *nbdev_ctrlr; struct nvme_bdev_ctrlr *nbdev_ctrlr;
pthread_mutex_t mutex; pthread_mutex_t mutex;
int ref; int ref;
enum bdev_nvme_multipath_policy mp_policy;
TAILQ_HEAD(, nvme_ns) nvme_ns_list; TAILQ_HEAD(, nvme_ns) nvme_ns_list;
bool opal; bool opal;
TAILQ_ENTRY(nvme_bdev) tailq; TAILQ_ENTRY(nvme_bdev) tailq;
@ -207,6 +213,7 @@ struct nvme_io_path {
struct nvme_bdev_channel { struct nvme_bdev_channel {
struct nvme_io_path *current_io_path; struct nvme_io_path *current_io_path;
enum bdev_nvme_multipath_policy mp_policy;
STAILQ_HEAD(, nvme_io_path) io_path_list; STAILQ_HEAD(, nvme_io_path) io_path_list;
TAILQ_HEAD(retry_io_head, spdk_bdev_io) retry_io_list; TAILQ_HEAD(retry_io_head, spdk_bdev_io) retry_io_list;
struct spdk_poller *retry_io_poller; struct spdk_poller *retry_io_poller;
@ -333,4 +340,18 @@ typedef void (*bdev_nvme_set_preferred_path_cb)(void *cb_arg, int rc);
void bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, void bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid,
bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg); bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg);
typedef void (*bdev_nvme_set_multipath_policy_cb)(void *cb_arg, int rc);

/**
 * Set multipath policy of the NVMe bdev.
 *
 * The policy is applied asynchronously to every channel; cb_fn is always
 * invoked, with rc == 0 on success or a negative errno on failure.
 *
 * \param name NVMe bdev name
 * \param policy Multipath policy (active-passive or active-active)
 * \param cb_fn Function to be called back after completion.
 * \param cb_arg Argument passed to cb_fn.
 */
void bdev_nvme_set_multipath_policy(const char *name,
				    enum bdev_nvme_multipath_policy policy,
				    bdev_nvme_set_multipath_policy_cb cb_fn,
				    void *cb_arg);
#endif /* SPDK_BDEV_NVME_H */ #endif /* SPDK_BDEV_NVME_H */

View File

@ -2196,3 +2196,90 @@ cleanup:
} }
SPDK_RPC_REGISTER("bdev_nvme_set_preferred_path", rpc_bdev_nvme_set_preferred_path, SPDK_RPC_REGISTER("bdev_nvme_set_preferred_path", rpc_bdev_nvme_set_preferred_path,
SPDK_RPC_RUNTIME) SPDK_RPC_RUNTIME)
/* Decoded parameters of the bdev_nvme_set_multipath_policy RPC. */
struct rpc_set_multipath_policy {
	char *name;	/* NVMe bdev name; heap-allocated by spdk_json_decode_string. */
	enum bdev_nvme_multipath_policy policy;
};
/* Release memory owned by a decoded rpc_set_multipath_policy request. */
static void
free_rpc_set_multipath_policy(struct rpc_set_multipath_policy *req)
{
	free(req->name);
}
/* JSON decoder for the "policy" parameter: translate the policy name string
 * into the corresponding enum value. Returns -EINVAL for unknown strings.
 */
static int
rpc_decode_mp_policy(const struct spdk_json_val *val, void *out)
{
	enum bdev_nvme_multipath_policy *policy = out;

	if (spdk_json_strequal(val, "active_passive")) {
		*policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE;
		return 0;
	}

	if (spdk_json_strequal(val, "active_active")) {
		*policy = BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE;
		return 0;
	}

	SPDK_NOTICELOG("Invalid parameter value: policy\n");

	return -EINVAL;
}
/* JSON object decoders for the bdev_nvme_set_multipath_policy RPC parameters;
 * both fields are required.
 */
static const struct spdk_json_object_decoder rpc_set_multipath_policy_decoders[] = {
	{"name", offsetof(struct rpc_set_multipath_policy, name), spdk_json_decode_string},
	{"policy", offsetof(struct rpc_set_multipath_policy, policy), rpc_decode_mp_policy},
};
/* Per-request context: keeps the JSON-RPC request alive until the
 * asynchronous policy update completes.
 */
struct rpc_set_multipath_policy_ctx {
	struct rpc_set_multipath_policy req;	/* Decoded parameters. */
	struct spdk_jsonrpc_request *request;	/* Request to answer in the completion callback. */
};
/* Completion of bdev_nvme_set_multipath_policy(): answer the JSON-RPC request
 * (bool true on success, error response otherwise) and free the request
 * context.
 */
static void
rpc_bdev_nvme_set_multipath_policy_done(void *cb_arg, int rc)
{
	struct rpc_set_multipath_policy_ctx *ctx = cb_arg;

	if (rc != 0) {
		spdk_jsonrpc_send_error_response(ctx->request, rc, spdk_strerror(-rc));
	} else {
		spdk_jsonrpc_send_bool_response(ctx->request, true);
	}

	free_rpc_set_multipath_policy(&ctx->req);
	free(ctx);
}
/* Handler for the bdev_nvme_set_multipath_policy RPC: decode the parameters
 * and start the asynchronous policy update. The response is sent from
 * rpc_bdev_nvme_set_multipath_policy_done().
 */
static void
rpc_bdev_nvme_set_multipath_policy(struct spdk_jsonrpc_request *request,
				   const struct spdk_json_val *params)
{
	struct rpc_set_multipath_policy_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM));
		return;
	}

	ctx->request = request;

	if (spdk_json_decode_object(params, rpc_set_multipath_policy_decoders,
				    SPDK_COUNTOF(rpc_set_multipath_policy_decoders),
				    &ctx->req) != 0) {
		SPDK_ERRLOG("spdk_json_decode_object failed\n");
		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
						 "spdk_json_decode_object failed");
		free_rpc_set_multipath_policy(&ctx->req);
		free(ctx);
		return;
	}

	bdev_nvme_set_multipath_policy(ctx->req.name, ctx->req.policy,
				       rpc_bdev_nvme_set_multipath_policy_done, ctx);
}
SPDK_RPC_REGISTER("bdev_nvme_set_multipath_policy", rpc_bdev_nvme_set_multipath_policy,
SPDK_RPC_RUNTIME)

View File

@ -840,6 +840,20 @@ def bdev_nvme_set_preferred_path(client, name, cntlid):
return client.call('bdev_nvme_set_preferred_path', params) return client.call('bdev_nvme_set_preferred_path', params)
def bdev_nvme_set_multipath_policy(client, name, policy):
    """Set multipath policy of the NVMe bdev

    Args:
        name: NVMe bdev name
        policy: Multipath policy (active_passive or active_active)
    """
    return client.call('bdev_nvme_set_multipath_policy',
                       {'name': name, 'policy': policy})
def bdev_nvme_cuse_register(client, name): def bdev_nvme_cuse_register(client, name):
"""Register CUSE devices on NVMe controller. """Register CUSE devices on NVMe controller.

View File

@ -779,6 +779,17 @@ if __name__ == "__main__":
p.add_argument('-c', '--cntlid', help='NVMe-oF controller ID', type=int, required=True) p.add_argument('-c', '--cntlid', help='NVMe-oF controller ID', type=int, required=True)
p.set_defaults(func=bdev_nvme_set_preferred_path) p.set_defaults(func=bdev_nvme_set_preferred_path)
def bdev_nvme_set_multipath_policy(args):
    # Forward the parsed CLI arguments to the JSON-RPC client helper.
    rpc.bdev.bdev_nvme_set_multipath_policy(args.client,
                                            name=args.name,
                                            policy=args.policy)

    # Register the 'bdev_nvme_set_multipath_policy' subcommand and its options.
    p = subparsers.add_parser('bdev_nvme_set_multipath_policy',
                              help="""Set multipath policy of the NVMe bdev""")
    p.add_argument('-b', '--name', help='Name of the NVMe bdev', required=True)
    p.add_argument('-p', '--policy', help='Multipath policy (active_passive or active_active)', required=True)
    p.set_defaults(func=bdev_nvme_set_multipath_policy)
def bdev_nvme_cuse_register(args): def bdev_nvme_cuse_register(args):
rpc.bdev.bdev_nvme_cuse_register(args.client, rpc.bdev.bdev_nvme_cuse_register(args.client,
name=args.name) name=args.name)

View File

@ -6200,6 +6200,58 @@ test_set_preferred_path(void)
CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == NULL); CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == NULL);
} }
/* Unit test for round-robin path selection under the active-active policy.
 * Builds a channel with three io_paths and verifies which path
 * bdev_nvme_find_io_path() picks for various ANA-state combinations and
 * starting (cached) paths.
 */
static void
test_find_next_io_path(void)
{
	struct nvme_bdev_channel nbdev_ch = {
		.io_path_list = STAILQ_HEAD_INITIALIZER(nbdev_ch.io_path_list),
		.mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,
	};
	struct spdk_nvme_qpair qpair1 = {}, qpair2 = {}, qpair3 = {};
	struct spdk_nvme_ctrlr ctrlr1 = {}, ctrlr2 = {}, ctrlr3 = {};
	struct nvme_ctrlr nvme_ctrlr1 = { .ctrlr = &ctrlr1, };
	struct nvme_ctrlr nvme_ctrlr2 = { .ctrlr = &ctrlr2, };
	struct nvme_ctrlr nvme_ctrlr3 = { .ctrlr = &ctrlr3, };
	struct nvme_ctrlr_channel ctrlr_ch1 = {};
	struct nvme_ctrlr_channel ctrlr_ch2 = {};
	struct nvme_ctrlr_channel ctrlr_ch3 = {};
	struct nvme_qpair nvme_qpair1 = { .ctrlr_ch = &ctrlr_ch1, .ctrlr = &nvme_ctrlr1, .qpair = &qpair1, };
	struct nvme_qpair nvme_qpair2 = { .ctrlr_ch = &ctrlr_ch2, .ctrlr = &nvme_ctrlr2, .qpair = &qpair2, };
	struct nvme_qpair nvme_qpair3 = { .ctrlr_ch = &ctrlr_ch3, .ctrlr = &nvme_ctrlr3, .qpair = &qpair3, };
	struct nvme_ns nvme_ns1 = {}, nvme_ns2 = {}, nvme_ns3 = {};
	struct nvme_io_path io_path1 = { .qpair = &nvme_qpair1, .nvme_ns = &nvme_ns1, };
	struct nvme_io_path io_path2 = { .qpair = &nvme_qpair2, .nvme_ns = &nvme_ns2, };
	struct nvme_io_path io_path3 = { .qpair = &nvme_qpair3, .nvme_ns = &nvme_ns3, };

	STAILQ_INSERT_TAIL(&nbdev_ch.io_path_list, &io_path1, stailq);
	STAILQ_INSERT_TAIL(&nbdev_ch.io_path_list, &io_path2, stailq);
	STAILQ_INSERT_TAIL(&nbdev_ch.io_path_list, &io_path3, stailq);

	/* nbdev_ch->current_io_path is filled always when bdev_nvme_find_next_io_path() is called. */

	nbdev_ch.current_io_path = &io_path2;

	/* Only path 2 is optimized: the walk starts at path 3, wraps past
	 * path 1 (inaccessible) and lands back on path 2.
	 */
	nvme_ns1.ana_state = SPDK_NVME_ANA_INACCESSIBLE_STATE;
	nvme_ns2.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
	nvme_ns3.ana_state = SPDK_NVME_ANA_INACCESSIBLE_STATE;
	CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path2);

	/* Path 2 is still the only optimized path even though paths 1 and 3
	 * are now usable (non-optimized): optimized still wins.
	 */
	nvme_ns1.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE;
	nvme_ns2.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
	nvme_ns3.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE;
	CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path2);

	/* Paths 1 and 2 are optimized: starting after path 2 (i.e. at path 3),
	 * the first optimized path reached is path 1 — round-robin advances.
	 */
	nvme_ns1.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
	nvme_ns2.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
	nvme_ns3.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE;
	CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path1);

	/* No optimized path at all: starting after path 3 (i.e. at path 1,
	 * inaccessible), the first non-optimized path is path 2 — the
	 * non-optimized fallback is returned and cached.
	 */
	nbdev_ch.current_io_path = &io_path3;

	nvme_ns1.ana_state = SPDK_NVME_ANA_INACCESSIBLE_STATE;
	nvme_ns2.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE;
	nvme_ns3.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE;
	CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path2);
}
int int
main(int argc, const char **argv) main(int argc, const char **argv)
{ {
@ -6250,6 +6302,7 @@ main(int argc, const char **argv)
CU_ADD_TEST(suite, test_nvme_ns_cmp); CU_ADD_TEST(suite, test_nvme_ns_cmp);
CU_ADD_TEST(suite, test_ana_transition); CU_ADD_TEST(suite, test_ana_transition);
CU_ADD_TEST(suite, test_set_preferred_path); CU_ADD_TEST(suite, test_set_preferred_path);
CU_ADD_TEST(suite, test_find_next_io_path);
CU_basic_set_mode(CU_BRM_VERBOSE); CU_basic_set_mode(CU_BRM_VERBOSE);