bdev/nvme: Set preferred I/O path in multipath mode

If we specify a preferred path manually for each NVMe bdev, we will
be able to realize a simple static load balancing and make the failover
more controllable in the multipath mode.

The idea is to move the I/O path to the specified NVMe-oF controller to
the head of the list and then clear the I/O path cache for each NVMe bdev
channel. We could store the chosen I/O path in the cache directly, but
that would have to be conditional and would make the code very complex.
Hence, let find_io_path() repopulate the cache instead.

However, an NVMe bdev channel may be acquired after setting the preferred
path. To cover such a case, sort the nvme_ns list of the NVMe bdev too.

This feature supports only multipath mode. The NVMe bdev module supports
failover mode too. However, to support the latter, the new RPC would need
to take a trid as a parameter, and the code and its usage would become very
complex. Add a note about this limitation.

To verify each case exactly, add unit tests.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: Ia51c74f530d6d7dc1f73d5b65f854967363e76b0
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12262
Community-CI: Mellanox Build Bot
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Reviewed-by: <tanl12@chinatelecom.cn>
Reviewed-by: GangCao <gang.cao@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
This commit is contained in:
Shuhei Matsumoto 2022-04-14 06:23:40 +09:00 committed by Tomasz Zawadzki
parent f0935084bd
commit 22b77a3c80
8 changed files with 487 additions and 0 deletions

View File

@ -23,6 +23,9 @@ safe for race conditions.
A new RPC `bdev_nvme_get_io_paths` was added to get all active I/O paths.
A new RPC `bdev_nvme_set_preferred_path` was added to set preferred I/O path for an NVMe bdev
when in multipath mode. This RPC does not support NVMe bdevs in failover mode.
### idxd
A new parameter `flags` was added to all low level submission and preparation

View File

@ -3394,6 +3394,45 @@ Example response:
}
~~~
### bdev_nvme_set_preferred_path {#rpc_bdev_nvme_set_preferred_path}
Set the preferred I/O path for an NVMe bdev in multipath mode.
NOTE: This RPC does not support NVMe bdevs in failover mode.
#### Parameters
Name | Optional | Type | Description
----------------------- | -------- | ----------- | -----------
name | Required | string | Name of the NVMe bdev
cntlid | Required | number | NVMe-oF controller ID
#### Example
Example request:
~~~json
{
"jsonrpc": "2.0",
"method": "bdev_nvme_set_preferred_path",
"id": 1,
"params": {
"name": "Nvme0n1",
"cntlid": 0
}
}
~~~
Example response:
~~~json
{
"jsonrpc": "2.0",
"id": 1,
"result": true
}
~~~
### bdev_nvme_cuse_register {#rpc_bdev_nvme_cuse_register}
Register CUSE device on NVMe controller.

View File

@ -3475,6 +3475,160 @@ nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
return rc;
}
/* No-op bdev event callback. bdev_nvme_set_preferred_path() opens the bdev
 * descriptor only to pin the bdev while iterating its channels, so bdev
 * events can be safely ignored.
 */
static void
dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
{
}
/* Context carried through the per-channel iteration started by
 * bdev_nvme_set_preferred_path().
 */
struct bdev_nvme_set_preferred_path_ctx {
	struct spdk_bdev_desc *desc;	/* Open descriptor keeping the bdev alive during iteration. */
	struct nvme_ns *nvme_ns;	/* Namespace reached through the preferred NVMe-oF controller. */
	bdev_nvme_set_preferred_path_cb cb_fn;	/* Completion callback supplied by the caller. */
	void *cb_arg;	/* Argument for cb_fn. */
};
/* Completion of the per-channel preferred-path update: release the bdev
 * descriptor, report the final status to the caller, and free the context.
 */
static void
bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status)
{
	struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	assert(ctx != NULL && ctx->desc != NULL && ctx->cb_fn != NULL);

	spdk_bdev_close(ctx->desc);
	ctx->cb_fn(ctx->cb_arg, status);
	free(ctx);
}
/* Per-channel step: move the I/O path that uses the preferred namespace to
 * the head of this channel's I/O path list and invalidate the cached path.
 */
static void
_bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i)
{
	struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch);
	struct nvme_io_path *cur, *before = NULL;

	/* Locate the matching I/O path, remembering the entry just before it
	 * because unlinking from a singly linked STAILQ needs the predecessor.
	 */
	STAILQ_FOREACH(cur, &nbdev_ch->io_path_list, stailq) {
		if (cur->nvme_ns == ctx->nvme_ns) {
			break;
		}
		before = cur;
	}

	/* Move it to the head unless it is missing or already first. */
	if (cur != NULL && before != NULL) {
		STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, before, stailq);
		STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, cur, stailq);

		/* Storing cur into nbdev_ch->current_io_path here would need
		 * extra conditions. Keep it simple: clear the cache and let
		 * find_io_path() repopulate it.
		 */
		nbdev_ch->current_io_path = NULL;
	}

	spdk_for_each_channel_continue(i, 0);
}
/* Find the namespace reached through the controller identified by cntlid and
 * move it to the head of the bdev's namespace list, so channels created later
 * also build their I/O path list with the preferred path first.
 * Returns the matched namespace, or NULL if cntlid matches no controller.
 */
static struct nvme_ns *
bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid)
{
	struct nvme_ns *ns;
	const struct spdk_nvme_ctrlr_data *cdata;

	TAILQ_FOREACH(ns, &nbdev->nvme_ns_list, tailq) {
		cdata = spdk_nvme_ctrlr_get_data(ns->ctrlr->ctrlr);

		if (cdata->cntlid == cntlid) {
			break;
		}
	}

	/* Relink only when found and not already at the head. */
	if (ns != NULL && ns != TAILQ_FIRST(&nbdev->nvme_ns_list)) {
		TAILQ_REMOVE(&nbdev->nvme_ns_list, ns, tailq);
		TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, ns, tailq);
	}

	return ns;
}
/* Set the I/O path to the NVMe-oF controller identified by cntlid as the
 * preferred path of the NVMe bdev identified by name.
 *
 * This function supports only multipath mode. There is only a single I/O path
 * for each NVMe-oF controller. Hence, just move the matched I/O path to the
 * head of the I/O path list for each NVMe bdev channel.
 *
 * An NVMe bdev channel may be acquired after completing this function. Move
 * the matched namespace to the head of the namespace list for the NVMe bdev
 * too, so later-created channels also start from the preferred path.
 *
 * cb_fn is always invoked, either on an error path here or from
 * bdev_nvme_set_preferred_path_done() after the channel iteration.
 */
void
bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid,
			     bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg)
{
	struct bdev_nvme_set_preferred_path_ctx *ctx;
	struct spdk_bdev *bdev;
	struct nvme_bdev *nbdev;
	int rc = 0;

	assert(cb_fn != NULL);

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		SPDK_ERRLOG("Failed to alloc context.\n");
		rc = -ENOMEM;
		goto err_alloc;
	}

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	/* Open the bdev read-only so it cannot be unregistered while the
	 * per-channel iteration is in flight.
	 */
	rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to open bdev %s.\n", name);
		goto err_open;
	}

	bdev = spdk_bdev_desc_get_bdev(ctx->desc);

	if (bdev->module != &nvme_if) {
		SPDK_ERRLOG("bdev %s is not registered in this module.\n", name);
		rc = -ENODEV;
		goto err_bdev;
	}

	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);

	pthread_mutex_lock(&nbdev->mutex);

	/* Reorder the namespace list first; this also validates cntlid. */
	ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid);
	if (ctx->nvme_ns == NULL) {
		pthread_mutex_unlock(&nbdev->mutex);

		SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid);
		rc = -ENODEV;
		goto err_bdev;
	}

	pthread_mutex_unlock(&nbdev->mutex);

	/* Update the I/O path list of every existing NVMe bdev channel. */
	spdk_for_each_channel(nbdev,
			      _bdev_nvme_set_preferred_path,
			      ctx,
			      bdev_nvme_set_preferred_path_done);
	return;

err_bdev:
	spdk_bdev_close(ctx->desc);
err_open:
	free(ctx);
err_alloc:
	cb_fn(cb_arg, rc);
}
static void
aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
{

View File

@ -318,4 +318,19 @@ int bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id);
*/
int bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg);
/* Completion callback for bdev_nvme_set_preferred_path(); rc is 0 on success
 * or a negative errno value on failure.
 */
typedef void (*bdev_nvme_set_preferred_path_cb)(void *cb_arg, int rc);

/**
 * Set the preferred I/O path for an NVMe bdev in multipath mode.
 *
 * NOTE: This function does not support NVMe bdevs in failover mode.
 *
 * \param name NVMe bdev name
 * \param cntlid NVMe-oF controller ID
 * \param cb_fn Function to be called back after completion.
 * \param cb_arg Argument for callback function.
 */
void bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid,
				  bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg);
#endif /* SPDK_BDEV_NVME_H */

View File

@ -2126,3 +2126,73 @@ rpc_bdev_nvme_get_io_paths(struct spdk_jsonrpc_request *request,
rpc_bdev_nvme_get_io_paths_done);
}
SPDK_RPC_REGISTER("bdev_nvme_get_io_paths", rpc_bdev_nvme_get_io_paths, SPDK_RPC_RUNTIME)
/* Decoded parameters of the bdev_nvme_set_preferred_path RPC. */
struct rpc_bdev_nvme_set_preferred_path {
	char *name;	/* NVMe bdev name (heap-allocated by the JSON decoder). */
	uint16_t cntlid;	/* NVMe-oF controller ID of the preferred path. */
};
/* Release the strings allocated while decoding the RPC parameters. */
static void
free_rpc_bdev_nvme_set_preferred_path(struct rpc_bdev_nvme_set_preferred_path *req)
{
	free(req->name);
}
/* JSON decoders for the bdev_nvme_set_preferred_path RPC; both parameters
 * are required.
 */
static const struct spdk_json_object_decoder rpc_bdev_nvme_set_preferred_path_decoders[] = {
	{"name", offsetof(struct rpc_bdev_nvme_set_preferred_path, name), spdk_json_decode_string},
	{"cntlid", offsetof(struct rpc_bdev_nvme_set_preferred_path, cntlid), spdk_json_decode_uint16},
};
/* Per-request context kept alive until the asynchronous operation completes. */
struct rpc_bdev_nvme_set_preferred_path_ctx {
	struct rpc_bdev_nvme_set_preferred_path req;	/* Decoded parameters. */
	struct spdk_jsonrpc_request *request;	/* Request to answer on completion. */
};
/* Completion callback: translate the operation result into a JSON-RPC
 * response and release the request context.
 */
static void
rpc_bdev_nvme_set_preferred_path_done(void *cb_arg, int rc)
{
	struct rpc_bdev_nvme_set_preferred_path_ctx *ctx = cb_arg;

	if (rc != 0) {
		spdk_jsonrpc_send_error_response(ctx->request, rc, spdk_strerror(-rc));
	} else {
		spdk_jsonrpc_send_bool_response(ctx->request, true);
	}

	free_rpc_bdev_nvme_set_preferred_path(&ctx->req);
	free(ctx);
}
/* Handler for the bdev_nvme_set_preferred_path RPC. Decodes the parameters
 * and kicks off the asynchronous path update; the JSON-RPC response is sent
 * from rpc_bdev_nvme_set_preferred_path_done().
 */
static void
rpc_bdev_nvme_set_preferred_path(struct spdk_jsonrpc_request *request,
				 const struct spdk_json_val *params)
{
	struct rpc_bdev_nvme_set_preferred_path_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM));
		return;
	}

	if (spdk_json_decode_object(params, rpc_bdev_nvme_set_preferred_path_decoders,
				    SPDK_COUNTOF(rpc_bdev_nvme_set_preferred_path_decoders),
				    &ctx->req)) {
		SPDK_ERRLOG("spdk_json_decode_object failed\n");
		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
						 "spdk_json_decode_object failed");
		goto cleanup;
	}

	ctx->request = request;

	/* ctx is freed by the completion callback. */
	bdev_nvme_set_preferred_path(ctx->req.name, ctx->req.cntlid,
				     rpc_bdev_nvme_set_preferred_path_done, ctx);
	return;

cleanup:
	free_rpc_bdev_nvme_set_preferred_path(&ctx->req);
	free(ctx);
}
SPDK_RPC_REGISTER("bdev_nvme_set_preferred_path", rpc_bdev_nvme_set_preferred_path,
		  SPDK_RPC_RUNTIME)

View File

@ -826,6 +826,20 @@ def bdev_nvme_get_io_paths(client, name):
return client.call('bdev_nvme_get_io_paths', params)
def bdev_nvme_set_preferred_path(client, name, cntlid):
    """Set the preferred I/O path for an NVMe bdev when in multipath mode.

    Args:
        name: NVMe bdev name
        cntlid: NVMe-oF controller ID
    """
    params = {
        'name': name,
        'cntlid': cntlid,
    }

    return client.call('bdev_nvme_set_preferred_path', params)
def bdev_nvme_cuse_register(client, name):
"""Register CUSE devices on NVMe controller.

View File

@ -768,6 +768,17 @@ if __name__ == "__main__":
p.add_argument('-n', '--name', help="Name of the NVMe bdev", required=False)
p.set_defaults(func=bdev_nvme_get_io_paths)
def bdev_nvme_set_preferred_path(args):
    # Forward the parsed CLI arguments to the bdev_nvme_set_preferred_path RPC.
    rpc.bdev.bdev_nvme_set_preferred_path(args.client,
                                          name=args.name,
                                          cntlid=args.cntlid)

# CLI wiring: subcommand, options, and dispatch for the new RPC.
p = subparsers.add_parser('bdev_nvme_set_preferred_path',
                          help="""Set the preferred I/O path for an NVMe bdev when in multipath mode""")
p.add_argument('-b', '--name', help='Name of the NVMe bdev', required=True)
p.add_argument('-c', '--cntlid', help='NVMe-oF controller ID', type=int, required=True)
p.set_defaults(func=bdev_nvme_set_preferred_path)
def bdev_nvme_cuse_register(args):
rpc.bdev.bdev_nvme_cuse_register(args.client,
name=args.name)

View File

@ -206,6 +206,8 @@ DEFINE_STUB_V(spdk_bdev_module_fini_done, (void));
DEFINE_STUB_V(spdk_bdev_module_list_add, (struct spdk_bdev_module *bdev_module));
DEFINE_STUB_V(spdk_bdev_close, (struct spdk_bdev_desc *desc));
DEFINE_STUB(spdk_opal_dev_construct, struct spdk_opal_dev *, (struct spdk_nvme_ctrlr *ctrlr), NULL);
DEFINE_STUB_V(spdk_opal_dev_destruct, (struct spdk_opal_dev *dev));
@ -308,6 +310,7 @@ static TAILQ_HEAD(, spdk_nvme_ctrlr) g_ut_attached_ctrlrs = TAILQ_HEAD_INITIALIZ
static int g_ut_attach_ctrlr_status;
static size_t g_ut_attach_bdev_count;
static int g_ut_register_bdev_status;
static struct spdk_bdev *g_ut_registered_bdev;
static uint16_t g_ut_cntlid;
static struct nvme_path_id g_any_path = {};
@ -1207,6 +1210,8 @@ spdk_nvme_poll_group_remove(struct spdk_nvme_poll_group *group,
/* Stub: record the most recently registered bdev so the
 * spdk_bdev_open_ext() stub can look it up by name.
 */
int
spdk_bdev_register(struct spdk_bdev *bdev)
{
	g_ut_registered_bdev = bdev;

	return g_ut_register_bdev_status;
}
@ -1216,11 +1221,37 @@ spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void
int rc;
rc = bdev->fn_table->destruct(bdev->ctxt);
if (bdev == g_ut_registered_bdev) {
g_ut_registered_bdev = NULL;
}
if (rc <= 0 && cb_fn != NULL) {
cb_fn(cb_arg, rc);
}
}
/* Stub: "open" succeeds only for the single bdev recorded by the
 * spdk_bdev_register() stub, and the descriptor is simply the bdev pointer.
 */
int
spdk_bdev_open_ext(const char *bdev_name, bool write,
		   spdk_bdev_event_cb_t event_cb, void *event_ctx,
		   struct spdk_bdev_desc **desc)
{
	struct spdk_bdev *bdev = g_ut_registered_bdev;

	if (bdev == NULL || strcmp(bdev->name, bdev_name) != 0) {
		return -ENODEV;
	}

	*desc = (struct spdk_bdev_desc *)bdev;

	return 0;
}
/* Stub: a descriptor in these tests is just the bdev pointer itself
 * (see the spdk_bdev_open_ext() stub), so convert it back.
 */
struct spdk_bdev *
spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
{
	return (struct spdk_bdev *)desc;
}
int
spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
{
@ -6020,6 +6051,155 @@ test_ana_transition(void)
CU_ASSERT(nvme_ns.ana_transition_timedout == true);
}
/* Completion callback for bdev_nvme_set_preferred_path() in tests: only
 * flags completion; the status rc is not checked by the callers here.
 */
static void
_set_preferred_path_cb(void *cb_arg, int rc)
{
	bool *done = cb_arg;

	*done = true;
}
/* Verify bdev_nvme_set_preferred_path() on a multipath NVMe bdev with three
 * I/O paths: the default ordering follows attach order, changing the
 * preferred path takes effect on an existing channel, and a channel acquired
 * after the change also starts from the preferred path.
 */
static void
test_set_preferred_path(void)
{
	struct nvme_path_id path1 = {}, path2 = {}, path3 = {};
	struct spdk_nvme_ctrlr *ctrlr1, *ctrlr2, *ctrlr3;
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	const int STRING_SIZE = 32;
	const char *attached_names[STRING_SIZE];
	struct nvme_bdev *bdev;
	struct spdk_io_channel *ch;
	struct nvme_bdev_channel *nbdev_ch;
	struct nvme_io_path *io_path;
	struct spdk_uuid uuid1 = { .u.raw = { 0x1 } };
	const struct spdk_nvme_ctrlr_data *cdata;
	bool done;
	int rc;

	memset(attached_names, 0, sizeof(char *) * STRING_SIZE);
	ut_init_trid(&path1.trid);
	ut_init_trid2(&path2.trid);
	ut_init_trid3(&path3.trid);
	g_ut_attach_ctrlr_status = 0;
	g_ut_attach_bdev_count = 1;

	set_thread(0);

	/* Attach three controllers whose namespaces share the same UUID so
	 * they aggregate into one multipath NVMe bdev named nvme0.
	 */
	ctrlr1 = ut_attach_ctrlr(&path1.trid, 1, true, true);
	SPDK_CU_ASSERT_FATAL(ctrlr1 != NULL);

	ctrlr1->ns[0].uuid = &uuid1;

	rc = bdev_nvme_create(&path1.trid, "nvme0", attached_names, STRING_SIZE,
			      attach_ctrlr_done, NULL, NULL, NULL, true);
	CU_ASSERT(rc == 0);

	spdk_delay_us(1000);
	poll_threads();
	spdk_delay_us(g_opts.nvme_adminq_poll_period_us);
	poll_threads();

	ctrlr2 = ut_attach_ctrlr(&path2.trid, 1, true, true);
	SPDK_CU_ASSERT_FATAL(ctrlr2 != NULL);

	ctrlr2->ns[0].uuid = &uuid1;

	rc = bdev_nvme_create(&path2.trid, "nvme0", attached_names, STRING_SIZE,
			      attach_ctrlr_done, NULL, NULL, NULL, true);
	CU_ASSERT(rc == 0);

	spdk_delay_us(1000);
	poll_threads();
	spdk_delay_us(g_opts.nvme_adminq_poll_period_us);
	poll_threads();

	ctrlr3 = ut_attach_ctrlr(&path3.trid, 1, true, true);
	SPDK_CU_ASSERT_FATAL(ctrlr3 != NULL);

	ctrlr3->ns[0].uuid = &uuid1;

	rc = bdev_nvme_create(&path3.trid, "nvme0", attached_names, STRING_SIZE,
			      attach_ctrlr_done, NULL, NULL, NULL, true);
	CU_ASSERT(rc == 0);

	spdk_delay_us(1000);
	poll_threads();
	spdk_delay_us(g_opts.nvme_adminq_poll_period_us);
	poll_threads();

	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name("nvme0");
	SPDK_CU_ASSERT_FATAL(nbdev_ctrlr != NULL);

	bdev = nvme_bdev_ctrlr_get_bdev(nbdev_ctrlr, 1);
	SPDK_CU_ASSERT_FATAL(bdev != NULL);

	/* ctrlr1 was added first. Hence io_path to ctrlr1 should be preferred. */

	ch = spdk_get_io_channel(bdev);
	SPDK_CU_ASSERT_FATAL(ch != NULL);
	nbdev_ch = spdk_io_channel_get_ctx(ch);

	io_path = bdev_nvme_find_io_path(nbdev_ch);
	SPDK_CU_ASSERT_FATAL(io_path != NULL);

	CU_ASSERT(io_path->nvme_ns->ctrlr->ctrlr == ctrlr1);

	/* If io_path to ctrlr2 is set to the preferred path dynamically, find_io_path()
	 * should return io_path to ctrlr2.
	 */

	cdata = spdk_nvme_ctrlr_get_data(ctrlr2);
	done = false;

	bdev_nvme_set_preferred_path(bdev->disk.name, cdata->cntlid, _set_preferred_path_cb, &done);

	poll_threads();
	CU_ASSERT(done == true);

	io_path = bdev_nvme_find_io_path(nbdev_ch);
	SPDK_CU_ASSERT_FATAL(io_path != NULL);

	CU_ASSERT(io_path->nvme_ns->ctrlr->ctrlr == ctrlr2);

	/* If io_path to ctrlr3 is set to the preferred path and then a new I/O channel is
	 * acquired, find_io_path() should return io_path to ctrlr3.
	 */

	spdk_put_io_channel(ch);

	poll_threads();

	cdata = spdk_nvme_ctrlr_get_data(ctrlr3);
	done = false;

	bdev_nvme_set_preferred_path(bdev->disk.name, cdata->cntlid, _set_preferred_path_cb, &done);

	poll_threads();
	CU_ASSERT(done == true);

	ch = spdk_get_io_channel(bdev);
	SPDK_CU_ASSERT_FATAL(ch != NULL);
	nbdev_ch = spdk_io_channel_get_ctx(ch);

	io_path = bdev_nvme_find_io_path(nbdev_ch);
	SPDK_CU_ASSERT_FATAL(io_path != NULL);

	CU_ASSERT(io_path->nvme_ns->ctrlr->ctrlr == ctrlr3);

	spdk_put_io_channel(ch);

	poll_threads();

	/* Tear down: delete all paths and confirm the controller is gone. */
	rc = bdev_nvme_delete("nvme0", &g_any_path);
	CU_ASSERT(rc == 0);

	poll_threads();
	spdk_delay_us(1000);
	poll_threads();

	CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == NULL);
}
int
main(int argc, const char **argv)
{
@ -6069,6 +6249,7 @@ main(int argc, const char **argv)
CU_ADD_TEST(suite, test_fail_path);
CU_ADD_TEST(suite, test_nvme_ns_cmp);
CU_ADD_TEST(suite, test_ana_transition);
CU_ADD_TEST(suite, test_set_preferred_path);
CU_basic_set_mode(CU_BRM_VERBOSE);