bdev_nvme: update nvme_io_path stat when IO completes

Currently we have stat per bdev I/O channel, but for NVMe bdev
multipath, we don't have stat per I/O path. Especially for
active-active mode, we may want to observe each path's statistics.

This patch supports I/O statistics for nvme_io_path. Each nvme_io_path's
stat is recorded using the structure spdk_bdev_io_stat.

The following is the comparison of bdevperf test.

Test on Arm server with the following basic configuration.
1 Null bdev: block size: 4K, num_blocks:16k
run bdevperf with io size=4k, qdepth=1/32/128, rw type=randwrite/mixed with 70% read/randread

Each run lasts 30 seconds; each item is run 16 times and the average is taken.

The result is as follows.

qdepth type   IOPS(default) IOPS(this patch)  diff
1   randwrite   7795157.27  7859909.78       0.83%
1   mix(70% r)  7418607.08  7404026.54      -0.20%
1   randread    8053560.83  8046315.44      -0.09%

32  randwrite   15409191.3  15327642.11	    -0.53%
32  mix(70% r)  13760145.97 13714666.28	    -0.33%
32  randread    16136922.98 16038855.39	    -0.61%

128 randwrite   14815647.56 14944902.74	     0.87%
128 mix(70% r)  13414858.59 13412317.46	    -0.02%
128 randread    15508642.43 15521752.41	     0.08%

Change-Id: I4eb5673f49d65d3ff9b930361d2f31ab0ccfa021
Signed-off-by: Richael Zhuang <richael.zhuang@arm.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14743
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
This commit is contained in:
Richael Zhuang 2022-09-29 11:52:43 +08:00 committed by Jim Harris
parent 8ddc102a31
commit f61b004197
8 changed files with 148 additions and 2 deletions

View File

@ -66,6 +66,9 @@ collecting NVMe error counts.
New APIs `spdk_bdev_reset_io_stat`, `spdk_bdev_add_io_stat` and `spdk_bdev_dump_io_stat_json` New APIs `spdk_bdev_reset_io_stat`, `spdk_bdev_add_io_stat` and `spdk_bdev_dump_io_stat_json`
were added to process I/O statistics outside the generic bdev layer, especially in bdev modules. were added to process I/O statistics outside the generic bdev layer, especially in bdev modules.
Added I/O statistics per I/O path to the NVMe bdev module for NVMe bdev multipath. It can be
enabled by a new option io_path_stat of RPC bdev_nvme_set_options.
### event ### event
Added core lock file mechanism to prevent the same CPU cores from being used by multiple Added core lock file mechanism to prevent the same CPU cores from being used by multiple
@ -101,6 +104,8 @@ Added `rr_min_io` option to RPC bdev_nvme_set_multipath_policy. It switches I/O
another path after rr_min_io I/Os are routed to current io path for the round-robin another path after rr_min_io I/Os are routed to current io path for the round-robin
path selector. path selector.
Added option `--io-path-stat` for RPC bdev_nvme_set_options to enable collecting io path stat.
### bdevperf ### bdevperf
Promoted the application to example to match similar programs: fio_plugin and perf. Promoted the application to example to match similar programs: fio_plugin and perf.

View File

@ -3618,6 +3618,7 @@ generate_uuids | Optional | boolean | Enable generation of UUIDs
transport_tos | Optional | number | IPv4 Type of Service value. Only applicable for RDMA transport. Default: 0 (no TOS is applied). transport_tos | Optional | number | IPv4 Type of Service value. Only applicable for RDMA transport. Default: 0 (no TOS is applied).
nvme_error_stat | Optional | boolean | Enable collecting NVMe error counts. nvme_error_stat | Optional | boolean | Enable collecting NVMe error counts.
rdma_srq_size | Optional | number | Set the size of a shared rdma receive queue. Default: 0 (disabled). rdma_srq_size | Optional | number | Set the size of a shared rdma receive queue. Default: 0 (disabled).
io_path_stat | Optional | boolean | Enable collecting I/O stat of each nvme bdev io path. Default: `false`.
#### Example #### Example

View File

@ -93,6 +93,9 @@ struct nvme_bdev_io {
/* How many times the current I/O was retried. */ /* How many times the current I/O was retried. */
int32_t retry_count; int32_t retry_count;
/* Current tsc at submit time. */
uint64_t submit_tsc;
}; };
struct nvme_probe_skip_entry { struct nvme_probe_skip_entry {
@ -126,6 +129,7 @@ static struct spdk_bdev_nvme_opts g_opts = {
.generate_uuids = false, .generate_uuids = false,
.transport_tos = 0, .transport_tos = 0,
.nvme_error_stat = false, .nvme_error_stat = false,
.io_path_stat = false,
}; };
#define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL
@ -588,10 +592,21 @@ _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_
return -ENOMEM; return -ENOMEM;
} }
if (g_opts.io_path_stat) {
io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
if (io_path->stat == NULL) {
free(io_path);
SPDK_ERRLOG("Failed to alloc io_path stat.\n");
return -ENOMEM;
}
spdk_bdev_reset_io_stat(io_path->stat, BDEV_RESET_STAT_MAXMIN);
}
io_path->nvme_ns = nvme_ns; io_path->nvme_ns = nvme_ns;
ch = spdk_get_io_channel(nvme_ns->ctrlr); ch = spdk_get_io_channel(nvme_ns->ctrlr);
if (ch == NULL) { if (ch == NULL) {
free(io_path->stat);
free(io_path); free(io_path);
SPDK_ERRLOG("Failed to alloc io_channel.\n"); SPDK_ERRLOG("Failed to alloc io_channel.\n");
return -ENOMEM; return -ENOMEM;
@ -635,6 +650,7 @@ _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_pat
ch = spdk_io_channel_from_ctx(ctrlr_ch); ch = spdk_io_channel_from_ctx(ctrlr_ch);
spdk_put_io_channel(ch); spdk_put_io_channel(ch);
free(io_path->stat);
free(io_path); free(io_path);
} }
@ -1122,6 +1138,99 @@ bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk
pthread_mutex_unlock(&nbdev->mutex); pthread_mutex_unlock(&nbdev->mutex);
} }
static inline void
bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
{
struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
uint32_t blocklen = bdev_io->bdev->blocklen;
struct spdk_bdev_io_stat *stat;
uint64_t tsc_diff;
if (bio->io_path->stat == NULL) {
return;
}
tsc_diff = spdk_get_ticks() - bio->submit_tsc;
stat = bio->io_path->stat;
switch (bdev_io->type) {
case SPDK_BDEV_IO_TYPE_READ:
stat->bytes_read += num_blocks * blocklen;
stat->num_read_ops++;
stat->read_latency_ticks += tsc_diff;
if (stat->max_read_latency_ticks < tsc_diff) {
stat->max_read_latency_ticks = tsc_diff;
}
if (stat->min_read_latency_ticks > tsc_diff) {
stat->min_read_latency_ticks = tsc_diff;
}
break;
case SPDK_BDEV_IO_TYPE_WRITE:
stat->bytes_written += num_blocks * blocklen;
stat->num_write_ops++;
stat->write_latency_ticks += tsc_diff;
if (stat->max_write_latency_ticks < tsc_diff) {
stat->max_write_latency_ticks = tsc_diff;
}
if (stat->min_write_latency_ticks > tsc_diff) {
stat->min_write_latency_ticks = tsc_diff;
}
break;
case SPDK_BDEV_IO_TYPE_UNMAP:
stat->bytes_unmapped += num_blocks * blocklen;
stat->num_unmap_ops++;
stat->unmap_latency_ticks += tsc_diff;
if (stat->max_unmap_latency_ticks < tsc_diff) {
stat->max_unmap_latency_ticks = tsc_diff;
}
if (stat->min_unmap_latency_ticks > tsc_diff) {
stat->min_unmap_latency_ticks = tsc_diff;
}
break;
case SPDK_BDEV_IO_TYPE_ZCOPY:
/* Track the data in the start phase only */
if (!bdev_io->u.bdev.zcopy.start) {
break;
}
if (bdev_io->u.bdev.zcopy.populate) {
stat->bytes_read += num_blocks * blocklen;
stat->num_read_ops++;
stat->read_latency_ticks += tsc_diff;
if (stat->max_read_latency_ticks < tsc_diff) {
stat->max_read_latency_ticks = tsc_diff;
}
if (stat->min_read_latency_ticks > tsc_diff) {
stat->min_read_latency_ticks = tsc_diff;
}
} else {
stat->bytes_written += num_blocks * blocklen;
stat->num_write_ops++;
stat->write_latency_ticks += tsc_diff;
if (stat->max_write_latency_ticks < tsc_diff) {
stat->max_write_latency_ticks = tsc_diff;
}
if (stat->min_write_latency_ticks > tsc_diff) {
stat->min_write_latency_ticks = tsc_diff;
}
}
break;
case SPDK_BDEV_IO_TYPE_COPY:
stat->bytes_copied += num_blocks * blocklen;
stat->num_copy_ops++;
stat->copy_latency_ticks += tsc_diff;
if (stat->max_copy_latency_ticks < tsc_diff) {
stat->max_copy_latency_ticks = tsc_diff;
}
if (stat->min_copy_latency_ticks > tsc_diff) {
stat->min_copy_latency_ticks = tsc_diff;
}
break;
default:
break;
}
}
static inline void static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
const struct spdk_nvme_cpl *cpl) const struct spdk_nvme_cpl *cpl)
@ -1136,6 +1245,7 @@ bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); assert(!bdev_nvme_io_type_is_admin(bdev_io->type));
if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
bdev_nvme_update_io_path_stat(bio);
goto complete; goto complete;
} }
@ -1188,6 +1298,7 @@ bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
complete: complete:
bio->retry_count = 0; bio->retry_count = 0;
bio->submit_tsc = 0;
__bdev_nvme_io_complete(bdev_io, 0, cpl); __bdev_nvme_io_complete(bdev_io, 0, cpl);
} }
@ -1223,6 +1334,7 @@ bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
} }
bio->retry_count = 0; bio->retry_count = 0;
bio->submit_tsc = 0;
__bdev_nvme_io_complete(bdev_io, io_status, NULL); __bdev_nvme_io_complete(bdev_io, io_status, NULL);
} }
@ -2328,6 +2440,15 @@ bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_i
struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
if (spdk_likely(nbdev_io->submit_tsc == 0)) {
nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io);
} else {
/* There are cases where submit_tsc != 0, i.e. retry I/O.
* We need to update submit_tsc here.
*/
nbdev_io->submit_tsc = spdk_get_ticks();
}
spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io);
nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch);
if (spdk_unlikely(!nbdev_io->io_path)) { if (spdk_unlikely(!nbdev_io->io_path)) {
@ -6942,6 +7063,7 @@ bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids);
spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos);
spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat);
spdk_json_write_object_end(w); spdk_json_write_object_end(w);
spdk_json_write_object_end(w); spdk_json_write_object_end(w);

View File

@ -198,6 +198,9 @@ struct nvme_io_path {
/* The following are used to update io_path cache of the nvme_bdev_channel. */ /* The following are used to update io_path cache of the nvme_bdev_channel. */
struct nvme_bdev_channel *nbdev_ch; struct nvme_bdev_channel *nbdev_ch;
TAILQ_ENTRY(nvme_io_path) tailq; TAILQ_ENTRY(nvme_io_path) tailq;
/* allocation of stat is decided by option io_path_stat of RPC bdev_nvme_set_options */
struct spdk_bdev_io_stat *stat;
}; };
struct nvme_bdev_channel { struct nvme_bdev_channel {
@ -274,6 +277,7 @@ struct spdk_bdev_nvme_opts {
uint8_t transport_tos; uint8_t transport_tos;
bool nvme_error_stat; bool nvme_error_stat;
uint32_t rdma_srq_size; uint32_t rdma_srq_size;
bool io_path_stat;
}; };
struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch); struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch);

View File

@ -73,6 +73,7 @@ static const struct spdk_json_object_decoder rpc_bdev_nvme_options_decoders[] =
{"transport_tos", offsetof(struct spdk_bdev_nvme_opts, transport_tos), spdk_json_decode_uint8, true}, {"transport_tos", offsetof(struct spdk_bdev_nvme_opts, transport_tos), spdk_json_decode_uint8, true},
{"nvme_error_stat", offsetof(struct spdk_bdev_nvme_opts, nvme_error_stat), spdk_json_decode_bool, true}, {"nvme_error_stat", offsetof(struct spdk_bdev_nvme_opts, nvme_error_stat), spdk_json_decode_bool, true},
{"rdma_srq_size", offsetof(struct spdk_bdev_nvme_opts, rdma_srq_size), spdk_json_decode_uint32, true}, {"rdma_srq_size", offsetof(struct spdk_bdev_nvme_opts, rdma_srq_size), spdk_json_decode_uint32, true},
{"io_path_stat", offsetof(struct spdk_bdev_nvme_opts, io_path_stat), spdk_json_decode_bool, true},
}; };
static void static void

View File

@ -533,7 +533,7 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
delay_cmd_submit=None, transport_retry_count=None, bdev_retry_count=None, delay_cmd_submit=None, transport_retry_count=None, bdev_retry_count=None,
transport_ack_timeout=None, ctrlr_loss_timeout_sec=None, reconnect_delay_sec=None, transport_ack_timeout=None, ctrlr_loss_timeout_sec=None, reconnect_delay_sec=None,
fast_io_fail_timeout_sec=None, disable_auto_failback=None, generate_uuids=None, fast_io_fail_timeout_sec=None, disable_auto_failback=None, generate_uuids=None,
transport_tos=None, nvme_error_stat=None, rdma_srq_size=None): transport_tos=None, nvme_error_stat=None, rdma_srq_size=None, io_path_stat=None):
"""Set options for the bdev nvme. This is startup command. """Set options for the bdev nvme. This is startup command.
Args: Args:
@ -577,6 +577,7 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
The default is 0 which means no TOS is applied. (optional) The default is 0 which means no TOS is applied. (optional)
nvme_error_stat: Enable collecting NVMe error counts. (optional) nvme_error_stat: Enable collecting NVMe error counts. (optional)
rdma_srq_size: Set the size of a shared rdma receive queue. Default: 0 (disabled) (optional) rdma_srq_size: Set the size of a shared rdma receive queue. Default: 0 (disabled) (optional)
io_path_stat: Enable collection I/O path stat of each io path. (optional)
""" """
params = {} params = {}
@ -654,6 +655,9 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
if rdma_srq_size is not None: if rdma_srq_size is not None:
params['rdma_srq_size'] = rdma_srq_size params['rdma_srq_size'] = rdma_srq_size
if io_path_stat is not None:
params['io_path_stat'] = io_path_stat
return client.call('bdev_nvme_set_options', params) return client.call('bdev_nvme_set_options', params)

View File

@ -562,7 +562,8 @@ if __name__ == "__main__":
generate_uuids=args.generate_uuids, generate_uuids=args.generate_uuids,
transport_tos=args.transport_tos, transport_tos=args.transport_tos,
nvme_error_stat=args.nvme_error_stat, nvme_error_stat=args.nvme_error_stat,
rdma_srq_size=args.rdma_srq_size) rdma_srq_size=args.rdma_srq_size,
io_path_stat=args.io_path_stat)
p = subparsers.add_parser('bdev_nvme_set_options', p = subparsers.add_parser('bdev_nvme_set_options',
help='Set options for the bdev nvme type. This is startup command.') help='Set options for the bdev nvme type. This is startup command.')
@ -637,6 +638,9 @@ if __name__ == "__main__":
p.add_argument('-m', '--nvme-error-stat', help="Enable collecting NVMe error counts.", action='store_true') p.add_argument('-m', '--nvme-error-stat', help="Enable collecting NVMe error counts.", action='store_true')
p.add_argument('-q', '--rdma-srq-size', p.add_argument('-q', '--rdma-srq-size',
help='Set the size of a shared rdma receive queue. Default: 0 (disabled)', type=int) help='Set the size of a shared rdma receive queue. Default: 0 (disabled)', type=int)
p.add_argument('--io-path-stat',
help="""Enable collecting I/O path stat of each io path.""",
action='store_true')
p.set_defaults(func=bdev_nvme_set_options) p.set_defaults(func=bdev_nvme_set_options)

View File

@ -58,6 +58,11 @@ DEFINE_STUB_V(spdk_nvme_transport_get_opts, (struct spdk_nvme_transport_opts *op
DEFINE_STUB(spdk_nvme_transport_set_opts, int, (const struct spdk_nvme_transport_opts *opts, DEFINE_STUB(spdk_nvme_transport_set_opts, int, (const struct spdk_nvme_transport_opts *opts,
size_t opts_size), 0); size_t opts_size), 0);
DEFINE_STUB(spdk_bdev_io_get_submit_tsc, uint64_t, (struct spdk_bdev_io *bdev_io), 0);
DEFINE_STUB_V(spdk_bdev_reset_io_stat, (struct spdk_bdev_io_stat *stat,
enum spdk_bdev_reset_stat_mode mode));
int int
spdk_nvme_ctrlr_get_memory_domains(const struct spdk_nvme_ctrlr *ctrlr, spdk_nvme_ctrlr_get_memory_domains(const struct spdk_nvme_ctrlr *ctrlr,
struct spdk_memory_domain **domains, int array_size) struct spdk_memory_domain **domains, int array_size)