bdev_nvme: update nvme_io_path stat when IO completes

Currently we have stats per bdev I/O channel, but for NVMe bdev
multipath we do not have stats per I/O path. Especially for
active-active mode, we may want to observe each path's statistics.

This patch adds I/O stat support for nvme_io_path. Each nvme_io_path's
stat is recorded using struct spdk_bdev_io_stat.

The following is a before/after comparison from a bdevperf test.

Tested on an Arm server with the following basic configuration:
1 null bdev: block size 4K, num_blocks 16k
bdevperf run with io size=4k, qdepth=1/32/128,
rw type=randwrite/mix (70% read)/randread

Each run lasts 30 seconds; each item is run 16 times and the results are averaged.

The results are as follows, where diff = (IOPS(this patch) - IOPS(default)) / IOPS(default).

qdepth  type        IOPS(default)  IOPS(this patch)  diff
1       randwrite   7795157.27     7859909.78         0.83%
1       mix(70% r)  7418607.08     7404026.54        -0.20%
1       randread    8053560.83     8046315.44        -0.09%

32      randwrite   15409191.3     15327642.11       -0.53%
32      mix(70% r)  13760145.97    13714666.28       -0.33%
32      randread    16136922.98    16038855.39       -0.61%

128     randwrite   14815647.56    14944902.74        0.87%
128     mix(70% r)  13414858.59    13412317.46       -0.02%
128     randread    15508642.43    15521752.41        0.08%

Change-Id: I4eb5673f49d65d3ff9b930361d2f31ab0ccfa021
Signed-off-by: Richael Zhuang <richael.zhuang@arm.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14743
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Richael Zhuang, 2022-09-29 11:52:43 +08:00, committed by Jim Harris
parent 8ddc102a31
commit f61b004197
8 changed files with 148 additions and 2 deletions

CHANGELOG.md

@@ -66,6 +66,9 @@ collecting NVMe error counts.
New APIs `spdk_bdev_reset_io_stat`, `spdk_bdev_add_io_stat` and `spdk_bdev_dump_io_stat_json`
were added to process I/O statistics outside the generic bdev layer, especially in bdev modules.
Added I/O statistics per I/O path to the NVMe bdev module for NVMe bdev multipath. They can be
enabled by a new option `io_path_stat` of RPC `bdev_nvme_set_options` (see the sketch after this diff).
### event
Added core lock file mechanism to prevent the same CPU cores from being used by multiple
@@ -101,6 +104,8 @@ Added `rr_min_io` option to RPC bdev_nvme_set_multipath_policy. It switches I/O
another path after rr_min_io I/Os are routed to current io path for the round-robin
path selector.
Added option `--io-path-stat` for RPC `bdev_nvme_set_options` to enable collecting I/O path statistics.
### bdevperf
Promoted the application to an example to match similar programs: fio_plugin and perf.
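
To illustrate the new `io_path_stat` option described above, here is a minimal, hypothetical sketch (not part of this commit) of enabling per-path statistics at startup through the SPDK Python RPC client, assuming the default RPC socket path:

from spdk.rpc.client import JSONRPCClient
from spdk.rpc.bdev import bdev_nvme_set_options

# Connect to the SPDK target's RPC socket (default path assumed).
client = JSONRPCClient("/var/tmp/spdk.sock")

# bdev_nvme_set_options is a startup command, so it must be issued
# before the NVMe bdev module attaches any controllers.
bdev_nvme_set_options(client, io_path_stat=True)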

doc/jsonrpc.md

@@ -3618,6 +3618,7 @@ generate_uuids | Optional | boolean | Enable generation of UUIDs
transport_tos | Optional | number | IPv4 Type of Service value. Only applicable for RDMA transport. Default: 0 (no TOS is applied).
nvme_error_stat | Optional | boolean | Enable collecting NVMe error counts.
rdma_srq_size | Optional | number | Set the size of a shared rdma receive queue. Default: 0 (disabled).
io_path_stat | Optional | boolean | Enable collecting I/O statistics for each NVMe bdev I/O path. Default: `false`.
#### Example

module/bdev/nvme/bdev_nvme.c

@@ -93,6 +93,9 @@ struct nvme_bdev_io {
/* How many times the current I/O was retried. */
int32_t retry_count;
/* Current tsc at submit time. */
uint64_t submit_tsc;
};
struct nvme_probe_skip_entry {
@@ -126,6 +129,7 @@ static struct spdk_bdev_nvme_opts g_opts = {
.generate_uuids = false,
.transport_tos = 0,
.nvme_error_stat = false,
.io_path_stat = false,
};
#define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL
@@ -588,10 +592,21 @@ _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_
return -ENOMEM;
}
if (g_opts.io_path_stat) {
io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
if (io_path->stat == NULL) {
free(io_path);
SPDK_ERRLOG("Failed to alloc io_path stat.\n");
return -ENOMEM;
}
spdk_bdev_reset_io_stat(io_path->stat, BDEV_RESET_STAT_MAXMIN);
}
io_path->nvme_ns = nvme_ns;
ch = spdk_get_io_channel(nvme_ns->ctrlr);
if (ch == NULL) {
free(io_path->stat);
free(io_path);
SPDK_ERRLOG("Failed to alloc io_channel.\n");
return -ENOMEM;
@@ -635,6 +650,7 @@ _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_pat
ch = spdk_io_channel_from_ctx(ctrlr_ch);
spdk_put_io_channel(ch);
free(io_path->stat);
free(io_path);
}
@@ -1122,6 +1138,99 @@ bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk
pthread_mutex_unlock(&nbdev->mutex);
}
static inline void
bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
{
struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
uint32_t blocklen = bdev_io->bdev->blocklen;
struct spdk_bdev_io_stat *stat;
uint64_t tsc_diff;
if (bio->io_path->stat == NULL) {
return;
}
tsc_diff = spdk_get_ticks() - bio->submit_tsc;
stat = bio->io_path->stat;
switch (bdev_io->type) {
case SPDK_BDEV_IO_TYPE_READ:
stat->bytes_read += num_blocks * blocklen;
stat->num_read_ops++;
stat->read_latency_ticks += tsc_diff;
if (stat->max_read_latency_ticks < tsc_diff) {
stat->max_read_latency_ticks = tsc_diff;
}
if (stat->min_read_latency_ticks > tsc_diff) {
stat->min_read_latency_ticks = tsc_diff;
}
break;
case SPDK_BDEV_IO_TYPE_WRITE:
stat->bytes_written += num_blocks * blocklen;
stat->num_write_ops++;
stat->write_latency_ticks += tsc_diff;
if (stat->max_write_latency_ticks < tsc_diff) {
stat->max_write_latency_ticks = tsc_diff;
}
if (stat->min_write_latency_ticks > tsc_diff) {
stat->min_write_latency_ticks = tsc_diff;
}
break;
case SPDK_BDEV_IO_TYPE_UNMAP:
stat->bytes_unmapped += num_blocks * blocklen;
stat->num_unmap_ops++;
stat->unmap_latency_ticks += tsc_diff;
if (stat->max_unmap_latency_ticks < tsc_diff) {
stat->max_unmap_latency_ticks = tsc_diff;
}
if (stat->min_unmap_latency_ticks > tsc_diff) {
stat->min_unmap_latency_ticks = tsc_diff;
}
break;
case SPDK_BDEV_IO_TYPE_ZCOPY:
/* Track the data in the start phase only */
if (!bdev_io->u.bdev.zcopy.start) {
break;
}
if (bdev_io->u.bdev.zcopy.populate) {
stat->bytes_read += num_blocks * blocklen;
stat->num_read_ops++;
stat->read_latency_ticks += tsc_diff;
if (stat->max_read_latency_ticks < tsc_diff) {
stat->max_read_latency_ticks = tsc_diff;
}
if (stat->min_read_latency_ticks > tsc_diff) {
stat->min_read_latency_ticks = tsc_diff;
}
} else {
stat->bytes_written += num_blocks * blocklen;
stat->num_write_ops++;
stat->write_latency_ticks += tsc_diff;
if (stat->max_write_latency_ticks < tsc_diff) {
stat->max_write_latency_ticks = tsc_diff;
}
if (stat->min_write_latency_ticks > tsc_diff) {
stat->min_write_latency_ticks = tsc_diff;
}
}
break;
case SPDK_BDEV_IO_TYPE_COPY:
stat->bytes_copied += num_blocks * blocklen;
stat->num_copy_ops++;
stat->copy_latency_ticks += tsc_diff;
if (stat->max_copy_latency_ticks < tsc_diff) {
stat->max_copy_latency_ticks = tsc_diff;
}
if (stat->min_copy_latency_ticks > tsc_diff) {
stat->min_copy_latency_ticks = tsc_diff;
}
break;
default:
break;
}
}
static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
const struct spdk_nvme_cpl *cpl)
@@ -1136,6 +1245,7 @@ bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
assert(!bdev_nvme_io_type_is_admin(bdev_io->type));
if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
bdev_nvme_update_io_path_stat(bio);
goto complete;
}
@@ -1188,6 +1298,7 @@ bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
complete:
bio->retry_count = 0;
bio->submit_tsc = 0;
__bdev_nvme_io_complete(bdev_io, 0, cpl);
}
@@ -1223,6 +1334,7 @@ bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
}
bio->retry_count = 0;
bio->submit_tsc = 0;
__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}
@@ -2328,6 +2440,15 @@ bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_i
struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
if (spdk_likely(nbdev_io->submit_tsc == 0)) {
nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io);
} else {
/* There are cases where submit_tsc != 0, i.e. retry I/O.
* We need to update submit_tsc here.
*/
nbdev_io->submit_tsc = spdk_get_ticks();
}
spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io);
nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch);
if (spdk_unlikely(!nbdev_io->io_path)) {
@@ -6942,6 +7063,7 @@ bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids);
spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos);
spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat);
spdk_json_write_object_end(w);
spdk_json_write_object_end(w);

module/bdev/nvme/bdev_nvme.h

@@ -198,6 +198,9 @@ struct nvme_io_path {
/* The following are used to update io_path cache of the nvme_bdev_channel. */
struct nvme_bdev_channel *nbdev_ch;
TAILQ_ENTRY(nvme_io_path) tailq;
/* allocation of stat is decided by option io_path_stat of RPC bdev_nvme_set_options */
struct spdk_bdev_io_stat *stat;
};
struct nvme_bdev_channel {
@@ -274,6 +277,7 @@ struct spdk_bdev_nvme_opts {
uint8_t transport_tos;
bool nvme_error_stat;
uint32_t rdma_srq_size;
bool io_path_stat;
};
struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch);

module/bdev/nvme/bdev_nvme_rpc.c

@@ -73,6 +73,7 @@ static const struct spdk_json_object_decoder rpc_bdev_nvme_options_decoders[] =
{"transport_tos", offsetof(struct spdk_bdev_nvme_opts, transport_tos), spdk_json_decode_uint8, true},
{"nvme_error_stat", offsetof(struct spdk_bdev_nvme_opts, nvme_error_stat), spdk_json_decode_bool, true},
{"rdma_srq_size", offsetof(struct spdk_bdev_nvme_opts, rdma_srq_size), spdk_json_decode_uint32, true},
{"io_path_stat", offsetof(struct spdk_bdev_nvme_opts, io_path_stat), spdk_json_decode_bool, true},
};
static void

python/spdk/rpc/bdev.py

@@ -533,7 +533,7 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
delay_cmd_submit=None, transport_retry_count=None, bdev_retry_count=None,
transport_ack_timeout=None, ctrlr_loss_timeout_sec=None, reconnect_delay_sec=None,
fast_io_fail_timeout_sec=None, disable_auto_failback=None, generate_uuids=None,
transport_tos=None, nvme_error_stat=None, rdma_srq_size=None):
transport_tos=None, nvme_error_stat=None, rdma_srq_size=None, io_path_stat=None):
"""Set options for the bdev nvme. This is startup command.
Args:
@@ -577,6 +577,7 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
The default is 0 which means no TOS is applied. (optional)
nvme_error_stat: Enable collecting NVMe error counts. (optional)
rdma_srq_size: Set the size of a shared rdma receive queue. Default: 0 (disabled) (optional)
io_path_stat: Enable collecting I/O path stat of each io path. (optional)
"""
params = {}
@@ -654,6 +655,9 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
if rdma_srq_size is not None:
params['rdma_srq_size'] = rdma_srq_size
if io_path_stat is not None:
params['io_path_stat'] = io_path_stat
return client.call('bdev_nvme_set_options', params)
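
For reference, the helper above simply forwards the new keyword argument into the JSON-RPC params dict; an equivalent raw call with the same client object would be:

# Equivalent raw JSON-RPC request that the helper builds from its
# keyword arguments.
client.call('bdev_nvme_set_options', {'io_path_stat': True})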

scripts/rpc.py

@@ -562,7 +562,8 @@ if __name__ == "__main__":
generate_uuids=args.generate_uuids,
transport_tos=args.transport_tos,
nvme_error_stat=args.nvme_error_stat,
rdma_srq_size=args.rdma_srq_size)
rdma_srq_size=args.rdma_srq_size,
io_path_stat=args.io_path_stat)
p = subparsers.add_parser('bdev_nvme_set_options',
help='Set options for the bdev nvme type. This is startup command.')
@@ -637,6 +638,9 @@ if __name__ == "__main__":
p.add_argument('-m', '--nvme-error-stat', help="Enable collecting NVMe error counts.", action='store_true')
p.add_argument('-q', '--rdma-srq-size',
help='Set the size of a shared rdma receive queue. Default: 0 (disabled)', type=int)
p.add_argument('--io-path-stat',
help="""Enable collecting I/O path stat of each io path.""",
action='store_true')
p.set_defaults(func=bdev_nvme_set_options)
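
As a usage note, with this flag registered, per-path stat collection can also be enabled from the command line at startup, e.g. `rpc.py bdev_nvme_set_options --io-path-stat` (the rpc.py invocation path is assumed here).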

test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c

@@ -58,6 +58,11 @@ DEFINE_STUB_V(spdk_nvme_transport_get_opts, (struct spdk_nvme_transport_opts *op
DEFINE_STUB(spdk_nvme_transport_set_opts, int, (const struct spdk_nvme_transport_opts *opts,
size_t opts_size), 0);
DEFINE_STUB(spdk_bdev_io_get_submit_tsc, uint64_t, (struct spdk_bdev_io *bdev_io), 0);
DEFINE_STUB_V(spdk_bdev_reset_io_stat, (struct spdk_bdev_io_stat *stat,
enum spdk_bdev_reset_stat_mode mode));
int
spdk_nvme_ctrlr_get_memory_domains(const struct spdk_nvme_ctrlr *ctrlr,
struct spdk_memory_domain **domains, int array_size)