bdev/nvme: Add active/active policy for multipath mode

The NVMe bdev module has so far supported only the active-passive policy
for multipath mode. This patch adds support for the active-active policy
for multipath mode. Following the Linux kernel native NVMe multipath, the
NVMe bdev module uses the round-robin algorithm for the active-active
policy.

The multipath policy, active-passive or active-active, is managed per
nvme_bdev and is copied to all corresponding nvme_bdev_channels.

Unlike active-passive, active-active caches even a non_optimized path to
provide load balancing across multiple paths.
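
For example, assuming a bdev named Nvme0n1, the policy can be switched at
runtime via the RPC script updated by this patch:

    ./scripts/rpc.py bdev_nvme_set_multipath_policy -b Nvme0n1 -p active_active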

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: Ie18b24db60d3da1ce2f83725b6cd3079f628f95b
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12001
Community-CI: Mellanox Build Bot
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Shuhei Matsumoto 2022-04-29 14:37:35 +09:00 committed by Tomasz Zawadzki
parent 0869265d66
commit 8f9b977504
8 changed files with 386 additions and 9 deletions

@@ -26,6 +26,9 @@ A new RPC `bdev_nvme_get_io_paths` was added to get all active I/O paths.
A new RPC `bdev_nvme_set_preferred_path` was added to set preferred I/O path for an NVMe bdev
when in multipath mode. This RPC does not support NVMe bdevs in failover mode.
A new RPC `bdev_nvme_set_multipath_policy` was added to set the multipath policy of an NVMe bdev
in multipath mode.
### idxd
A new parameter `flags` was added to all low level submission and preparation

@@ -3433,6 +3433,43 @@ Example response:
}
~~~
### bdev_nvme_set_multipath_policy {#rpc_bdev_nvme_set_multipath_policy}
Set the multipath policy of an NVMe bdev in multipath mode.
#### Parameters
Name | Optional | Type | Description
----------------------- | -------- | ----------- | -----------
name | Required | string | Name of the NVMe bdev
policy | Required | string | Multipath policy: active_active or active_passive
#### Example
Example request:
~~~json
{
"jsonrpc": "2.0",
"method": "bdev_nvme_set_multipath_policy",
"id": 1,
"params": {
"name": "Nvme0n1",
"policy": "active_passive"
}
}
~~~
Example response:
~~~json
{
"jsonrpc": "2.0",
"id": 1,
"result": true
}
~~~
### bdev_nvme_cuse_register {#rpc_bdev_nvme_cuse_register}
Register CUSE device on NVMe controller.

@@ -825,6 +825,55 @@ nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
return true;
}
/* Simulate circular linked list. */
static inline struct nvme_io_path *
nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
{
struct nvme_io_path *next_path;
next_path = STAILQ_NEXT(prev_path, stailq);
if (next_path != NULL) {
return next_path;
} else {
return STAILQ_FIRST(&nbdev_ch->io_path_list);
}
}
static struct nvme_io_path *
bdev_nvme_find_next_io_path(struct nvme_bdev_channel *nbdev_ch,
struct nvme_io_path *prev)
{
struct nvme_io_path *io_path, *start, *non_optimized = NULL;
start = nvme_io_path_get_next(nbdev_ch, prev);
io_path = start;
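/* Walk the list once, starting just after the previously used path. Return
* the first connected optimized path found, remembering the first connected
* non_optimized path as a fallback.
*/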
do {
if (spdk_likely(nvme_io_path_is_connected(io_path) &&
!io_path->nvme_ns->ana_state_updating)) {
switch (io_path->nvme_ns->ana_state) {
case SPDK_NVME_ANA_OPTIMIZED_STATE:
nbdev_ch->current_io_path = io_path;
return io_path;
case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
if (non_optimized == NULL) {
non_optimized = io_path;
}
break;
default:
break;
}
}
io_path = nvme_io_path_get_next(nbdev_ch, io_path);
} while (io_path != start);
/* We come here only if there is no optimized path. Cache even a non_optimized
* path to provide load balancing across multiple non_optimized paths.
*/
nbdev_ch->current_io_path = non_optimized;
return non_optimized;
}
static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
@@ -864,7 +913,11 @@ bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
return _bdev_nvme_find_io_path(nbdev_ch);
}
if (spdk_likely(nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE)) {
return nbdev_ch->current_io_path;
} else {
return bdev_nvme_find_next_io_path(nbdev_ch, nbdev_ch->current_io_path);
}
}
/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
@@ -2600,6 +2653,20 @@ nvme_namespace_info_json(struct spdk_json_write_ctx *w,
spdk_json_write_object_end(w);
}
static const char *
nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev)
{
switch (nbdev->mp_policy) {
case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE:
return "active_passive";
case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE:
return "active_active";
default:
assert(false);
return "invalid";
}
}
static int
bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
@@ -2612,6 +2679,7 @@ bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
nvme_namespace_info_json(w, nvme_ns);
}
spdk_json_write_array_end(w);
spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev));
pthread_mutex_unlock(&nvme_bdev->mutex);
return 0;
@@ -2884,6 +2952,7 @@ nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
}
bdev->ref = 1;
bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE;
TAILQ_INIT(&bdev->nvme_ns_list);
TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
bdev->opal = nvme_ctrlr->opal_dev != NULL;
@@ -3635,6 +3704,88 @@ err_alloc:
cb_fn(cb_arg, rc);
}
struct bdev_nvme_set_multipath_policy_ctx {
struct spdk_bdev_desc *desc;
bdev_nvme_set_multipath_policy_cb cb_fn;
void *cb_arg;
};
static void
bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status)
{
struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
assert(ctx != NULL);
assert(ctx->desc != NULL);
assert(ctx->cb_fn != NULL);
spdk_bdev_close(ctx->desc);
ctx->cb_fn(ctx->cb_arg, status);
free(ctx);
}
static void
_bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i)
{
struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch);
struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch);
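/* Copy the updated policy from the nvme_bdev and clear the cached I/O path
* so that the next I/O selects a path under the new policy.
*/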
nbdev_ch->mp_policy = nbdev->mp_policy;
nbdev_ch->current_io_path = NULL;
spdk_for_each_channel_continue(i, 0);
}
void
bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy,
bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg)
{
struct bdev_nvme_set_multipath_policy_ctx *ctx;
struct spdk_bdev *bdev;
struct nvme_bdev *nbdev;
int rc;
assert(cb_fn != NULL);
ctx = calloc(1, sizeof(*ctx));
if (ctx == NULL) {
SPDK_ERRLOG("Failed to alloc context.\n");
rc = -ENOMEM;
goto err_alloc;
}
ctx->cb_fn = cb_fn;
ctx->cb_arg = cb_arg;
rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc);
if (rc != 0) {
SPDK_ERRLOG("bdev %s is not registered in this module.\n", name);
rc = -ENODEV;
goto err_open;
}
bdev = spdk_bdev_desc_get_bdev(ctx->desc);
nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
pthread_mutex_lock(&nbdev->mutex);
nbdev->mp_policy = policy;
pthread_mutex_unlock(&nbdev->mutex);
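/* Propagate the new policy to all channels of the nvme_bdev. */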
spdk_for_each_channel(nbdev,
_bdev_nvme_set_multipath_policy,
ctx,
bdev_nvme_set_multipath_policy_done);
return;
err_open:
free(ctx);
err_alloc:
cb_fn(cb_arg, rc);
}
static void
aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
{

@@ -48,6 +48,11 @@ extern bool g_bdev_nvme_module_finish;
#define NVME_MAX_CONTROLLERS 1024
enum bdev_nvme_multipath_policy {
BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE,
BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,
};
typedef void (*spdk_bdev_create_nvme_fn)(void *ctx, size_t bdev_count, int rc);
typedef void (*spdk_bdev_nvme_start_discovery_fn)(void *ctx);
typedef void (*spdk_bdev_nvme_stop_discovery_fn)(void *ctx);
@@ -166,14 +171,15 @@ struct nvme_bdev_ctrlr {
};
struct nvme_bdev {
struct spdk_bdev disk;
uint32_t nsid;
struct nvme_bdev_ctrlr *nbdev_ctrlr;
pthread_mutex_t mutex;
int ref;
enum bdev_nvme_multipath_policy mp_policy;
TAILQ_HEAD(, nvme_ns) nvme_ns_list;
bool opal;
TAILQ_ENTRY(nvme_bdev) tailq;
};
struct nvme_qpair {
@@ -207,6 +213,7 @@ struct nvme_io_path {
struct nvme_bdev_channel {
struct nvme_io_path *current_io_path;
enum bdev_nvme_multipath_policy mp_policy;
STAILQ_HEAD(, nvme_io_path) io_path_list;
TAILQ_HEAD(retry_io_head, spdk_bdev_io) retry_io_list;
struct spdk_poller *retry_io_poller;
@@ -333,4 +340,18 @@ typedef void (*bdev_nvme_set_preferred_path_cb)(void *cb_arg, int rc);
void bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid,
bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg);
typedef void (*bdev_nvme_set_multipath_policy_cb)(void *cb_arg, int rc);
/**
* Set multipath policy of the NVMe bdev.
*
* \param name NVMe bdev name
* \param policy Multipath policy (active-passive or active-active)
* \param cb_fn Function to be called back after completion.
* \param cb_arg Argument for cb_fn.
*/
void bdev_nvme_set_multipath_policy(const char *name,
enum bdev_nvme_multipath_policy policy,
bdev_nvme_set_multipath_policy_cb cb_fn,
void *cb_arg);
#endif /* SPDK_BDEV_NVME_H */

@@ -2196,3 +2196,90 @@ cleanup:
}
SPDK_RPC_REGISTER("bdev_nvme_set_preferred_path", rpc_bdev_nvme_set_preferred_path,
SPDK_RPC_RUNTIME)
struct rpc_set_multipath_policy {
char *name;
enum bdev_nvme_multipath_policy policy;
};
static void
free_rpc_set_multipath_policy(struct rpc_set_multipath_policy *req)
{
free(req->name);
}
static int
rpc_decode_mp_policy(const struct spdk_json_val *val, void *out)
{
enum bdev_nvme_multipath_policy *policy = out;
if (spdk_json_strequal(val, "active_passive") == true) {
*policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE;
} else if (spdk_json_strequal(val, "active_active") == true) {
*policy = BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE;
} else {
SPDK_NOTICELOG("Invalid parameter value: policy\n");
return -EINVAL;
}
return 0;
}
static const struct spdk_json_object_decoder rpc_set_multipath_policy_decoders[] = {
{"name", offsetof(struct rpc_set_multipath_policy, name), spdk_json_decode_string},
{"policy", offsetof(struct rpc_set_multipath_policy, policy), rpc_decode_mp_policy},
};
struct rpc_set_multipath_policy_ctx {
struct rpc_set_multipath_policy req;
struct spdk_jsonrpc_request *request;
};
static void
rpc_bdev_nvme_set_multipath_policy_done(void *cb_arg, int rc)
{
struct rpc_set_multipath_policy_ctx *ctx = cb_arg;
if (rc == 0) {
spdk_jsonrpc_send_bool_response(ctx->request, true);
} else {
spdk_jsonrpc_send_error_response(ctx->request, rc, spdk_strerror(-rc));
}
free_rpc_set_multipath_policy(&ctx->req);
free(ctx);
}
static void
rpc_bdev_nvme_set_multipath_policy(struct spdk_jsonrpc_request *request,
const struct spdk_json_val *params)
{
struct rpc_set_multipath_policy_ctx *ctx;
ctx = calloc(1, sizeof(*ctx));
if (ctx == NULL) {
spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM));
return;
}
if (spdk_json_decode_object(params, rpc_set_multipath_policy_decoders,
SPDK_COUNTOF(rpc_set_multipath_policy_decoders),
&ctx->req)) {
SPDK_ERRLOG("spdk_json_decode_object failed\n");
spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
"spdk_json_decode_object failed");
goto cleanup;
}
ctx->request = request;
bdev_nvme_set_multipath_policy(ctx->req.name, ctx->req.policy,
rpc_bdev_nvme_set_multipath_policy_done, ctx);
return;
cleanup:
free_rpc_set_multipath_policy(&ctx->req);
free(ctx);
}
SPDK_RPC_REGISTER("bdev_nvme_set_multipath_policy", rpc_bdev_nvme_set_multipath_policy,
SPDK_RPC_RUNTIME)

@@ -840,6 +840,20 @@ def bdev_nvme_set_preferred_path(client, name, cntlid):
return client.call('bdev_nvme_set_preferred_path', params)
def bdev_nvme_set_multipath_policy(client, name, policy):
"""Set multipath policy of the NVMe bdev
Args:
name: NVMe bdev name
policy: Multipath policy (active_passive or active_active)
"""
params = {'name': name,
'policy': policy}
return client.call('bdev_nvme_set_multipath_policy', params)
def bdev_nvme_cuse_register(client, name):
"""Register CUSE devices on NVMe controller.

@@ -779,6 +779,17 @@ if __name__ == "__main__":
p.add_argument('-c', '--cntlid', help='NVMe-oF controller ID', type=int, required=True)
p.set_defaults(func=bdev_nvme_set_preferred_path)
def bdev_nvme_set_multipath_policy(args):
rpc.bdev.bdev_nvme_set_multipath_policy(args.client,
name=args.name,
policy=args.policy)
p = subparsers.add_parser('bdev_nvme_set_multipath_policy',
help="""Set multipath policy of the NVMe bdev""")
p.add_argument('-b', '--name', help='Name of the NVMe bdev', required=True)
p.add_argument('-p', '--policy', help='Multipath policy (active_passive or active_active)', required=True)
p.set_defaults(func=bdev_nvme_set_multipath_policy)
def bdev_nvme_cuse_register(args):
rpc.bdev.bdev_nvme_cuse_register(args.client,
name=args.name)

@@ -6200,6 +6200,58 @@ test_set_preferred_path(void)
CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == NULL);
}
static void
test_find_next_io_path(void)
{
struct nvme_bdev_channel nbdev_ch = {
.io_path_list = STAILQ_HEAD_INITIALIZER(nbdev_ch.io_path_list),
.mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,
};
struct spdk_nvme_qpair qpair1 = {}, qpair2 = {}, qpair3 = {};
struct spdk_nvme_ctrlr ctrlr1 = {}, ctrlr2 = {}, ctrlr3 = {};
struct nvme_ctrlr nvme_ctrlr1 = { .ctrlr = &ctrlr1, };
struct nvme_ctrlr nvme_ctrlr2 = { .ctrlr = &ctrlr2, };
struct nvme_ctrlr nvme_ctrlr3 = { .ctrlr = &ctrlr3, };
struct nvme_ctrlr_channel ctrlr_ch1 = {};
struct nvme_ctrlr_channel ctrlr_ch2 = {};
struct nvme_ctrlr_channel ctrlr_ch3 = {};
struct nvme_qpair nvme_qpair1 = { .ctrlr_ch = &ctrlr_ch1, .ctrlr = &nvme_ctrlr1, .qpair = &qpair1, };
struct nvme_qpair nvme_qpair2 = { .ctrlr_ch = &ctrlr_ch2, .ctrlr = &nvme_ctrlr2, .qpair = &qpair2, };
struct nvme_qpair nvme_qpair3 = { .ctrlr_ch = &ctrlr_ch3, .ctrlr = &nvme_ctrlr3, .qpair = &qpair3, };
struct nvme_ns nvme_ns1 = {}, nvme_ns2 = {}, nvme_ns3 = {};
struct nvme_io_path io_path1 = { .qpair = &nvme_qpair1, .nvme_ns = &nvme_ns1, };
struct nvme_io_path io_path2 = { .qpair = &nvme_qpair2, .nvme_ns = &nvme_ns2, };
struct nvme_io_path io_path3 = { .qpair = &nvme_qpair3, .nvme_ns = &nvme_ns3, };
STAILQ_INSERT_TAIL(&nbdev_ch.io_path_list, &io_path1, stailq);
STAILQ_INSERT_TAIL(&nbdev_ch.io_path_list, &io_path2, stailq);
STAILQ_INSERT_TAIL(&nbdev_ch.io_path_list, &io_path3, stailq);
/* nbdev_ch->current_io_path is always set when bdev_nvme_find_next_io_path() is called. */
nbdev_ch.current_io_path = &io_path2;
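/* io_path2 is the only optimized path, so round robin keeps selecting it. */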
nvme_ns1.ana_state = SPDK_NVME_ANA_INACCESSIBLE_STATE;
nvme_ns2.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
nvme_ns3.ana_state = SPDK_NVME_ANA_INACCESSIBLE_STATE;
CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path2);
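/* io_path2 is still the only optimized path and is selected again. */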
nvme_ns1.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE;
nvme_ns2.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
nvme_ns3.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE;
CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path2);
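/* Two optimized paths exist. Starting after io_path2, the next optimized path is io_path1. */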
nvme_ns1.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
nvme_ns2.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
nvme_ns3.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE;
CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path1);
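/* There is no optimized path. io_path1 is inaccessible, so the first
* non_optimized path after io_path3, wrapping around, is io_path2. */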
nbdev_ch.current_io_path = &io_path3;
nvme_ns1.ana_state = SPDK_NVME_ANA_INACCESSIBLE_STATE;
nvme_ns2.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE;
nvme_ns3.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE;
CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path2);
}
int
main(int argc, const char **argv)
{
@@ -6250,6 +6302,7 @@ main(int argc, const char **argv)
CU_ADD_TEST(suite, test_nvme_ns_cmp);
CU_ADD_TEST(suite, test_ana_transition);
CU_ADD_TEST(suite, test_set_preferred_path);
CU_ADD_TEST(suite, test_find_next_io_path);
CU_basic_set_mode(CU_BRM_VERBOSE);