bdev/nvme: select io path according to outstanding io number

Support selecting the I/O path according to the number of outstanding
I/Os on each path in a channel. The selector is optional and can be
enabled by calling the RPC "bdev_nvme_set_multipath_policy -s
queue_depth"; a usage sketch follows the commit metadata below.

Change-Id: I82cdfbd69b3e105c973844c4f34dc98f0dca2faf
Signed-off-by: Richael Zhuang <richael.zhuang@arm.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14734
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Richael Zhuang 2022-09-20 15:12:47 +08:00 committed by Jim Harris
parent a8d21b9b55
commit 6aa4edc27d
8 changed files with 182 additions and 8 deletions
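Usage sketch (not part of this commit): assuming SPDK's Python RPC bindings are importable as spdk.rpc, the daemon is listening on the default /var/tmp/spdk.sock socket, and a multipath NVMe bdev named "Nvme0n1" already exists (the bdev name is purely illustrative), the new selector can be enabled through the bdev_nvme_set_multipath_policy() client helper updated in this change:

    # Usage sketch only: enable the queue_depth path selector on an existing
    # multipath NVMe bdev. The socket path is SPDK's default; the bdev name
    # "Nvme0n1" is an assumption for illustration.
    from spdk.rpc.client import JSONRPCClient
    from spdk.rpc import bdev as rpc_bdev

    client = JSONRPCClient("/var/tmp/spdk.sock")

    # The selector only takes effect with the active_active policy;
    # valid values are "round_robin" (the default) and "queue_depth".
    rpc_bdev.bdev_nvme_set_multipath_policy(client,
                                            name="Nvme0n1",
                                            policy="active_active",
                                            selector="queue_depth")

The same call is available from the command line as "rpc.py bdev_nvme_set_multipath_policy -b Nvme0n1 -p active_active -s queue_depth", matching the -b/-p/-s options added below (again, the bdev name is an assumption).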


@@ -91,6 +91,9 @@ Added spdk_rpc_set_allowlist to restrict allowed RPCs to the specified list.
Changed `bdev_raid_get_bdevs` RPC output format to include raid_bdev details.
Added `selector` parameter to `bdev_nvme_set_multipath_policy` RPC to set the path selector for multipath.
Options `round_robin` and `queue_depth` are available.
### bdevperf
Promoted the application to example to match similar programs: fio_plugin and perf.
@@ -150,6 +153,10 @@ a specified qpair.
Updated `bdev_nvme_set_options` RPC (and rpc.py) to support the new `transport_tos` parameter.
For the active-active policy of the multipath mode, in addition to the default round-robin path
selector, the minimum queue depth path selector was added. It selects an I/O path according to
the number of outstanding requests on each NVMe qpair.
## v22.09
### accel


@@ -4135,7 +4135,8 @@ Example response:
### bdev_nvme_set_multipath_policy {#rpc_bdev_nvme_set_multipath_policy}
Set multipath policy of the NVMe bdev in multipath mode.
Set multipath policy of the NVMe bdev in multipath mode or set multipath
selector for active-active multipath policy.
#### Parameters
@@ -4143,6 +4144,7 @@ Name | Optional | Type | Description
----------------------- | -------- | ----------- | -----------
name | Required | string | Name of the NVMe bdev
policy | Required | string | Multipath policy: active_active or active_passive
selector | Optional | string | Multipath selector: round_robin or queue_depth, used in active-active mode. Default is round_robin
#### Example


@@ -655,6 +655,7 @@ bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
pthread_mutex_lock(&nbdev->mutex);
nbdev_ch->mp_policy = nbdev->mp_policy;
nbdev_ch->mp_selector = nbdev->mp_selector;
TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
@@ -873,6 +874,51 @@ _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
return non_optimized;
}
static struct nvme_io_path *
_bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
{
struct nvme_io_path *io_path;
struct nvme_io_path *optimized = NULL, *non_optimized = NULL;
uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX;
uint32_t num_outstanding_reqs;
STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) {
/* The device is currently resetting. */
continue;
}
if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) {
continue;
}
num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
switch (io_path->nvme_ns->ana_state) {
case SPDK_NVME_ANA_OPTIMIZED_STATE:
if (num_outstanding_reqs < opt_min_qd) {
opt_min_qd = num_outstanding_reqs;
optimized = io_path;
}
break;
case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
if (num_outstanding_reqs < non_opt_min_qd) {
non_opt_min_qd = num_outstanding_reqs;
non_optimized = io_path;
}
break;
default:
break;
}
}
/* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
if (optimized != NULL) {
return optimized;
}
return non_optimized;
}
static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
@@ -881,7 +927,12 @@ bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
return nbdev_ch->current_io_path;
}
return _bdev_nvme_find_io_path(nbdev_ch);
if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
return _bdev_nvme_find_io_path(nbdev_ch);
} else {
return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
}
}
/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
@@ -3301,6 +3352,7 @@ nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
bdev->ref = 1;
bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE;
bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN;
TAILQ_INIT(&bdev->nvme_ns_list);
TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
bdev->opal = nvme_ctrlr->opal_dev != NULL;
@@ -4110,6 +4162,7 @@ _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i)
struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch);
nbdev_ch->mp_policy = nbdev->mp_policy;
nbdev_ch->mp_selector = nbdev->mp_selector;
nbdev_ch->current_io_path = NULL;
spdk_for_each_channel_continue(i, 0);
@@ -4117,7 +4170,7 @@ _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i)
void
bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy,
bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg)
enum bdev_nvme_multipath_selector selector, bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg)
{
struct bdev_nvme_set_multipath_policy_ctx *ctx;
struct spdk_bdev *bdev;
@@ -4153,6 +4206,7 @@ bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy
pthread_mutex_lock(&nbdev->mutex);
nbdev->mp_policy = policy;
nbdev->mp_selector = selector;
pthread_mutex_unlock(&nbdev->mutex);
spdk_for_each_channel(nbdev,


@@ -28,6 +28,11 @@ enum bdev_nvme_multipath_policy {
BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,
};
enum bdev_nvme_multipath_selector {
BDEV_NVME_MP_SELECTOR_ROUND_ROBIN = 1,
BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH,
};
typedef void (*spdk_bdev_create_nvme_fn)(void *ctx, size_t bdev_count, int rc);
typedef void (*spdk_bdev_nvme_start_discovery_fn)(void *ctx, int status);
typedef void (*spdk_bdev_nvme_stop_discovery_fn)(void *ctx);
@@ -158,6 +163,7 @@ struct nvme_bdev {
pthread_mutex_t mutex;
int ref;
enum bdev_nvme_multipath_policy mp_policy;
enum bdev_nvme_multipath_selector mp_selector;
TAILQ_HEAD(, nvme_ns) nvme_ns_list;
bool opal;
TAILQ_ENTRY(nvme_bdev) tailq;
@@ -196,6 +202,7 @@ struct nvme_io_path {
struct nvme_bdev_channel {
struct nvme_io_path *current_io_path;
enum bdev_nvme_multipath_policy mp_policy;
enum bdev_nvme_multipath_selector mp_selector;
STAILQ_HEAD(, nvme_io_path) io_path_list;
TAILQ_HEAD(retry_io_head, spdk_bdev_io) retry_io_list;
struct spdk_poller *retry_io_poller;
@@ -345,10 +352,12 @@ typedef void (*bdev_nvme_set_multipath_policy_cb)(void *cb_arg, int rc);
*
* \param name NVMe bdev name
* \param policy Multipath policy (active-passive or active-active)
* \param selector Multipath selector (round_robin, queue_depth)
* \param cb_fn Function to be called back after completion.
*/
void bdev_nvme_set_multipath_policy(const char *name,
enum bdev_nvme_multipath_policy policy,
enum bdev_nvme_multipath_selector selector,
bdev_nvme_set_multipath_policy_cb cb_fn,
void *cb_arg);


@@ -2209,6 +2209,7 @@ SPDK_RPC_REGISTER("bdev_nvme_set_preferred_path", rpc_bdev_nvme_set_preferred_pa
struct rpc_set_multipath_policy {
char *name;
enum bdev_nvme_multipath_policy policy;
enum bdev_nvme_multipath_selector selector;
};
static void
@@ -2234,9 +2235,27 @@ rpc_decode_mp_policy(const struct spdk_json_val *val, void *out)
return 0;
}
static int
rpc_decode_mp_selector(const struct spdk_json_val *val, void *out)
{
enum bdev_nvme_multipath_selector *selector = out;
if (spdk_json_strequal(val, "round_robin") == true) {
*selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN;
} else if (spdk_json_strequal(val, "queue_depth") == true) {
*selector = BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH;
} else {
SPDK_NOTICELOG("Invalid parameter value: selector\n");
return -EINVAL;
}
return 0;
}
static const struct spdk_json_object_decoder rpc_set_multipath_policy_decoders[] = {
{"name", offsetof(struct rpc_set_multipath_policy, name), spdk_json_decode_string},
{"policy", offsetof(struct rpc_set_multipath_policy, policy), rpc_decode_mp_policy},
{"selector", offsetof(struct rpc_set_multipath_policy, selector), rpc_decode_mp_selector, true},
};
struct rpc_set_multipath_policy_ctx {
@@ -2282,7 +2301,14 @@ rpc_bdev_nvme_set_multipath_policy(struct spdk_jsonrpc_request *request,
ctx->request = request;
bdev_nvme_set_multipath_policy(ctx->req.name, ctx->req.policy,
if (ctx->req.policy != BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && ctx->req.selector > 0) {
SPDK_ERRLOG("selector only works in active_active mode\n");
spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
"spdk_json_decode_object failed");
goto cleanup;
}
bdev_nvme_set_multipath_policy(ctx->req.name, ctx->req.policy, ctx->req.selector,
rpc_bdev_nvme_set_multipath_policy_done, ctx);
return;


@@ -943,16 +943,19 @@ def bdev_nvme_set_preferred_path(client, name, cntlid):
return client.call('bdev_nvme_set_preferred_path', params)
def bdev_nvme_set_multipath_policy(client, name, policy):
def bdev_nvme_set_multipath_policy(client, name, policy, selector):
"""Set multipath policy of the NVMe bdev
Args:
name: NVMe bdev name
policy: Multipath policy (active_passive or active_active)
selector: Multipath selector (round_robin, queue_depth)
"""
params = {'name': name,
'policy': policy}
if selector:
params['selector'] = selector
return client.call('bdev_nvme_set_multipath_policy', params)


@@ -860,12 +860,14 @@ if __name__ == "__main__":
def bdev_nvme_set_multipath_policy(args):
rpc.bdev.bdev_nvme_set_multipath_policy(args.client,
name=args.name,
policy=args.policy)
policy=args.policy,
selector=args.selector)
p = subparsers.add_parser('bdev_nvme_set_multipath_policy',
help="""Set multipath policy of the NVMe bdev""")
p.add_argument('-b', '--name', help='Name of the NVMe bdev', required=True)
p.add_argument('-p', '--policy', help='Multipath policy (active_passive or active_active)', required=True)
p.add_argument('-s', '--selector', help='Multipath selector (round_robin, queue_depth)', required=False)
p.set_defaults(func=bdev_nvme_set_multipath_policy)
def bdev_nvme_cuse_register(args):


@@ -305,6 +305,12 @@ spdk_nvme_ctrlr_get_next_active_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid)
return 0;
}
uint32_t
spdk_nvme_qpair_get_num_outstanding_reqs(struct spdk_nvme_qpair *qpair)
{
return qpair->num_outstanding_reqs;
}
static TAILQ_HEAD(, spdk_nvme_ctrlr) g_ut_init_ctrlrs = TAILQ_HEAD_INITIALIZER(g_ut_init_ctrlrs);
static TAILQ_HEAD(, spdk_nvme_ctrlr) g_ut_attached_ctrlrs = TAILQ_HEAD_INITIALIZER(
g_ut_attached_ctrlrs);
@@ -5845,6 +5851,7 @@ test_find_next_io_path(void)
struct nvme_bdev_channel nbdev_ch = {
.io_path_list = STAILQ_HEAD_INITIALIZER(nbdev_ch.io_path_list),
.mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,
.mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN,
};
struct spdk_nvme_qpair qpair1 = {}, qpair2 = {}, qpair3 = {};
struct spdk_nvme_ctrlr ctrlr1 = {}, ctrlr2 = {}, ctrlr3 = {};
@@ -5866,7 +5873,9 @@ test_find_next_io_path(void)
STAILQ_INSERT_TAIL(&nbdev_ch.io_path_list, &io_path2, stailq);
STAILQ_INSERT_TAIL(&nbdev_ch.io_path_list, &io_path3, stailq);
/* nbdev_ch->current_io_path is filled always when bdev_nvme_find_next_io_path() is called. */
/* Test the case where nbdev_ch->current_io_path is filled; the case of current_io_path == NULL
* is covered by test_find_io_path.
*/
nbdev_ch.current_io_path = &io_path2;
nvme_ns1.ana_state = SPDK_NVME_ANA_INACCESSIBLE_STATE;
@@ -5891,6 +5900,59 @@ test_find_next_io_path(void)
CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path2);
}
static void
test_find_io_path_min_qd(void)
{
struct nvme_bdev_channel nbdev_ch = {
.io_path_list = STAILQ_HEAD_INITIALIZER(nbdev_ch.io_path_list),
.mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,
.mp_selector = BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH,
};
struct spdk_nvme_qpair qpair1 = {}, qpair2 = {}, qpair3 = {};
struct spdk_nvme_ctrlr ctrlr1 = {}, ctrlr2 = {}, ctrlr3 = {};
struct nvme_ctrlr nvme_ctrlr1 = { .ctrlr = &ctrlr1, };
struct nvme_ctrlr nvme_ctrlr2 = { .ctrlr = &ctrlr2, };
struct nvme_ctrlr nvme_ctrlr3 = { .ctrlr = &ctrlr3, };
struct nvme_ctrlr_channel ctrlr_ch1 = {};
struct nvme_ctrlr_channel ctrlr_ch2 = {};
struct nvme_ctrlr_channel ctrlr_ch3 = {};
struct nvme_qpair nvme_qpair1 = { .ctrlr_ch = &ctrlr_ch1, .ctrlr = &nvme_ctrlr1, .qpair = &qpair1, };
struct nvme_qpair nvme_qpair2 = { .ctrlr_ch = &ctrlr_ch2, .ctrlr = &nvme_ctrlr2, .qpair = &qpair2, };
struct nvme_qpair nvme_qpair3 = { .ctrlr_ch = &ctrlr_ch3, .ctrlr = &nvme_ctrlr3, .qpair = &qpair3, };
struct nvme_ns nvme_ns1 = {}, nvme_ns2 = {}, nvme_ns3 = {};
struct nvme_io_path io_path1 = { .qpair = &nvme_qpair1, .nvme_ns = &nvme_ns1, };
struct nvme_io_path io_path2 = { .qpair = &nvme_qpair2, .nvme_ns = &nvme_ns2, };
struct nvme_io_path io_path3 = { .qpair = &nvme_qpair3, .nvme_ns = &nvme_ns3, };
STAILQ_INSERT_TAIL(&nbdev_ch.io_path_list, &io_path1, stailq);
STAILQ_INSERT_TAIL(&nbdev_ch.io_path_list, &io_path2, stailq);
STAILQ_INSERT_TAIL(&nbdev_ch.io_path_list, &io_path3, stailq);
/* Test whether the minimum number of outstanding I/Os or the ANA optimized state is
* prioritized when using the least queue depth selector.
*/
qpair1.num_outstanding_reqs = 2;
qpair2.num_outstanding_reqs = 1;
qpair3.num_outstanding_reqs = 0;
nvme_ns1.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
nvme_ns2.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
nvme_ns3.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE;
CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path2);
nvme_ns1.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE;
nvme_ns2.ana_state = SPDK_NVME_ANA_NON_OPTIMIZED_STATE;
nvme_ns3.ana_state = SPDK_NVME_ANA_INACCESSIBLE_STATE;
CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path2);
nvme_ns1.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
nvme_ns2.ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
nvme_ns3.ana_state = SPDK_NVME_ANA_INACCESSIBLE_STATE;
CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path2);
qpair2.num_outstanding_reqs = 4;
CU_ASSERT(bdev_nvme_find_io_path(&nbdev_ch) == &io_path1);
}
static void
test_disable_auto_failback(void)
{
@@ -6115,29 +6177,35 @@ test_set_multipath_policy(void)
*/
done = -1;
bdev_nvme_set_multipath_policy(bdev->disk.name, BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,
BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH,
ut_set_multipath_policy_done, &done);
poll_threads();
CU_ASSERT(done == 0);
CU_ASSERT(bdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE);
CU_ASSERT(bdev->mp_selector == BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH);
ch = spdk_get_io_channel(bdev);
SPDK_CU_ASSERT_FATAL(ch != NULL);
nbdev_ch = spdk_io_channel_get_ctx(ch);
CU_ASSERT(nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE);
CU_ASSERT(nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH);
/* If multipath policy is updated while a I/O channel is active,
* the update should be applied to the I/O channel immediately.
*/
done = -1;
bdev_nvme_set_multipath_policy(bdev->disk.name, BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE,
BDEV_NVME_MP_SELECTOR_ROUND_ROBIN,
ut_set_multipath_policy_done, &done);
poll_threads();
CU_ASSERT(done == 0);
CU_ASSERT(bdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE);
CU_ASSERT(nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE);
CU_ASSERT(bdev->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN);
CU_ASSERT(nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN);
spdk_put_io_channel(ch);
@@ -6257,17 +6325,19 @@ test_retry_io_to_same_path(void)
done = -1;
bdev_nvme_set_multipath_policy(bdev->disk.name, BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,
ut_set_multipath_policy_done, &done);
BDEV_NVME_MP_SELECTOR_ROUND_ROBIN, ut_set_multipath_policy_done, &done);
poll_threads();
CU_ASSERT(done == 0);
CU_ASSERT(bdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE);
CU_ASSERT(bdev->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN);
ch = spdk_get_io_channel(bdev);
SPDK_CU_ASSERT_FATAL(ch != NULL);
nbdev_ch = spdk_io_channel_get_ctx(ch);
CU_ASSERT(nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE);
CU_ASSERT(nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN);
bdev_io = ut_alloc_bdev_io(SPDK_BDEV_IO_TYPE_WRITE, bdev, ch);
ut_bdev_io_set_buf(bdev_io);
@@ -6408,6 +6478,7 @@ main(int argc, const char **argv)
CU_ADD_TEST(suite, test_ana_transition);
CU_ADD_TEST(suite, test_set_preferred_path);
CU_ADD_TEST(suite, test_find_next_io_path);
CU_ADD_TEST(suite, test_find_io_path_min_qd);
CU_ADD_TEST(suite, test_disable_auto_failback);
CU_ADD_TEST(suite, test_set_multipath_policy);
CU_ADD_TEST(suite, test_uuid_generation);