bdev_nvme: update nvme_io_path stat when IO completes
Currently we have stat per bdev I/O channel, but for NVMe bdev multipath, we don't have stat per I/O path. Especially for active-active mode, we may want to observe each path's statistics. This patch supports I/O stat for nvme_io_path. Record each nvme_io_path stat using structure spdk_bdev_io_stat. The following is the comparison of bdevperf test. Test on Arm server with the following basic configuration. 1 Null bdev: block size: 4K, num_blocks:16k run bdevperf with io size=4k, qdepth=1/32/128, rw type=randwrite/mixed with 70% read/randread Each run lasts 30 seconds; each item is run 16 times and the average is taken. The result is as follows. qdepth type IOPS(default) IOPS(this patch) diff 1 randwrite 7795157.27 7859909.78 0.83% 1 mix(70% r) 7418607.08 7404026.54 -0.20% 1 randread 8053560.83 8046315.44 -0.09% 32 randwrite 15409191.3 15327642.11 -0.53% 32 mix(70% r) 13760145.97 13714666.28 -0.33% 32 randread 16136922.98 16038855.39 -0.61% 128 randwrite 14815647.56 14944902.74 0.87% 128 mix(70% r) 13414858.59 13412317.46 -0.02% 128 randread 15508642.43 15521752.41 0.08% Change-Id: I4eb5673f49d65d3ff9b930361d2f31ab0ccfa021 Signed-off-by: Richael Zhuang <richael.zhuang@arm.com> Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14743 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Jim Harris <james.r.harris@intel.com> Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com> Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
This commit is contained in:
parent
8ddc102a31
commit
f61b004197
@ -66,6 +66,9 @@ collecting NVMe error counts.
|
||||
New APIs `spdk_bdev_reset_io_stat`, `spdk_bdev_add_io_stat` and `spdk_bdev_dump_io_stat_json`
|
||||
were added to process I/O statistics outside the generic bdev layer, especially in bdev modules.
|
||||
|
||||
Added I/O statistics per I/O path to the NVMe bdev module for NVMe bdev multipath. It can be
|
||||
enabled by a new option io_path_stat of RPC bdev_nvme_set_options.
|
||||
|
||||
### event
|
||||
|
||||
Added core lock file mechanism to prevent the same CPU cores from being used by multiple
|
||||
@ -101,6 +104,8 @@ Added `rr_min_io` option to RPC bdev_nvme_set_multipath_policy. It switches I/O
|
||||
another path after rr_min_io I/Os are routed to current io path for the round-robin
|
||||
path selector.
|
||||
|
||||
Added option `--io-path-stat` for RPC bdev_nvme_set_options to enable collecting io path stat.
|
||||
|
||||
### bdevperf
|
||||
|
||||
Promoted the application to example to match similar programs: fio_plugin and perf.
|
||||
|
@ -3618,6 +3618,7 @@ generate_uuids | Optional | boolean | Enable generation of UUIDs
|
||||
transport_tos | Optional | number | IPv4 Type of Service value. Only applicable for RDMA transport. Default: 0 (no TOS is applied).
|
||||
nvme_error_stat | Optional | boolean | Enable collecting NVMe error counts.
|
||||
rdma_srq_size | Optional | number | Set the size of a shared rdma receive queue. Default: 0 (disabled).
|
||||
io_path_stat | Optional | boolean | Enable collecting I/O stat of each nvme bdev io path. Default: `false`.
|
||||
|
||||
#### Example
|
||||
|
||||
|
@ -93,6 +93,9 @@ struct nvme_bdev_io {
|
||||
|
||||
/* How many times the current I/O was retried. */
|
||||
int32_t retry_count;
|
||||
|
||||
/* Current tsc at submit time. */
|
||||
uint64_t submit_tsc;
|
||||
};
|
||||
|
||||
struct nvme_probe_skip_entry {
|
||||
@ -126,6 +129,7 @@ static struct spdk_bdev_nvme_opts g_opts = {
|
||||
.generate_uuids = false,
|
||||
.transport_tos = 0,
|
||||
.nvme_error_stat = false,
|
||||
.io_path_stat = false,
|
||||
};
|
||||
|
||||
#define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL
|
||||
@ -588,10 +592,21 @@ _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
if (g_opts.io_path_stat) {
|
||||
io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
|
||||
if (io_path->stat == NULL) {
|
||||
free(io_path);
|
||||
SPDK_ERRLOG("Failed to alloc io_path stat.\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
spdk_bdev_reset_io_stat(io_path->stat, BDEV_RESET_STAT_MAXMIN);
|
||||
}
|
||||
|
||||
io_path->nvme_ns = nvme_ns;
|
||||
|
||||
ch = spdk_get_io_channel(nvme_ns->ctrlr);
|
||||
if (ch == NULL) {
|
||||
free(io_path->stat);
|
||||
free(io_path);
|
||||
SPDK_ERRLOG("Failed to alloc io_channel.\n");
|
||||
return -ENOMEM;
|
||||
@ -635,6 +650,7 @@ _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_pat
|
||||
ch = spdk_io_channel_from_ctx(ctrlr_ch);
|
||||
spdk_put_io_channel(ch);
|
||||
|
||||
free(io_path->stat);
|
||||
free(io_path);
|
||||
}
|
||||
|
||||
@ -1122,6 +1138,99 @@ bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk
|
||||
pthread_mutex_unlock(&nbdev->mutex);
|
||||
}
|
||||
|
||||
static inline void
|
||||
bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
|
||||
{
|
||||
struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
|
||||
uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
|
||||
uint32_t blocklen = bdev_io->bdev->blocklen;
|
||||
struct spdk_bdev_io_stat *stat;
|
||||
uint64_t tsc_diff;
|
||||
|
||||
if (bio->io_path->stat == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
tsc_diff = spdk_get_ticks() - bio->submit_tsc;
|
||||
stat = bio->io_path->stat;
|
||||
|
||||
switch (bdev_io->type) {
|
||||
case SPDK_BDEV_IO_TYPE_READ:
|
||||
stat->bytes_read += num_blocks * blocklen;
|
||||
stat->num_read_ops++;
|
||||
stat->read_latency_ticks += tsc_diff;
|
||||
if (stat->max_read_latency_ticks < tsc_diff) {
|
||||
stat->max_read_latency_ticks = tsc_diff;
|
||||
}
|
||||
if (stat->min_read_latency_ticks > tsc_diff) {
|
||||
stat->min_read_latency_ticks = tsc_diff;
|
||||
}
|
||||
break;
|
||||
case SPDK_BDEV_IO_TYPE_WRITE:
|
||||
stat->bytes_written += num_blocks * blocklen;
|
||||
stat->num_write_ops++;
|
||||
stat->write_latency_ticks += tsc_diff;
|
||||
if (stat->max_write_latency_ticks < tsc_diff) {
|
||||
stat->max_write_latency_ticks = tsc_diff;
|
||||
}
|
||||
if (stat->min_write_latency_ticks > tsc_diff) {
|
||||
stat->min_write_latency_ticks = tsc_diff;
|
||||
}
|
||||
break;
|
||||
case SPDK_BDEV_IO_TYPE_UNMAP:
|
||||
stat->bytes_unmapped += num_blocks * blocklen;
|
||||
stat->num_unmap_ops++;
|
||||
stat->unmap_latency_ticks += tsc_diff;
|
||||
if (stat->max_unmap_latency_ticks < tsc_diff) {
|
||||
stat->max_unmap_latency_ticks = tsc_diff;
|
||||
}
|
||||
if (stat->min_unmap_latency_ticks > tsc_diff) {
|
||||
stat->min_unmap_latency_ticks = tsc_diff;
|
||||
}
|
||||
break;
|
||||
case SPDK_BDEV_IO_TYPE_ZCOPY:
|
||||
/* Track the data in the start phase only */
|
||||
if (!bdev_io->u.bdev.zcopy.start) {
|
||||
break;
|
||||
}
|
||||
if (bdev_io->u.bdev.zcopy.populate) {
|
||||
stat->bytes_read += num_blocks * blocklen;
|
||||
stat->num_read_ops++;
|
||||
stat->read_latency_ticks += tsc_diff;
|
||||
if (stat->max_read_latency_ticks < tsc_diff) {
|
||||
stat->max_read_latency_ticks = tsc_diff;
|
||||
}
|
||||
if (stat->min_read_latency_ticks > tsc_diff) {
|
||||
stat->min_read_latency_ticks = tsc_diff;
|
||||
}
|
||||
} else {
|
||||
stat->bytes_written += num_blocks * blocklen;
|
||||
stat->num_write_ops++;
|
||||
stat->write_latency_ticks += tsc_diff;
|
||||
if (stat->max_write_latency_ticks < tsc_diff) {
|
||||
stat->max_write_latency_ticks = tsc_diff;
|
||||
}
|
||||
if (stat->min_write_latency_ticks > tsc_diff) {
|
||||
stat->min_write_latency_ticks = tsc_diff;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case SPDK_BDEV_IO_TYPE_COPY:
|
||||
stat->bytes_copied += num_blocks * blocklen;
|
||||
stat->num_copy_ops++;
|
||||
stat->copy_latency_ticks += tsc_diff;
|
||||
if (stat->max_copy_latency_ticks < tsc_diff) {
|
||||
stat->max_copy_latency_ticks = tsc_diff;
|
||||
}
|
||||
if (stat->min_copy_latency_ticks > tsc_diff) {
|
||||
stat->min_copy_latency_ticks = tsc_diff;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
|
||||
const struct spdk_nvme_cpl *cpl)
|
||||
@ -1136,6 +1245,7 @@ bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
|
||||
assert(!bdev_nvme_io_type_is_admin(bdev_io->type));
|
||||
|
||||
if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
|
||||
bdev_nvme_update_io_path_stat(bio);
|
||||
goto complete;
|
||||
}
|
||||
|
||||
@ -1188,6 +1298,7 @@ bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
|
||||
|
||||
complete:
|
||||
bio->retry_count = 0;
|
||||
bio->submit_tsc = 0;
|
||||
__bdev_nvme_io_complete(bdev_io, 0, cpl);
|
||||
}
|
||||
|
||||
@ -1223,6 +1334,7 @@ bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
|
||||
}
|
||||
|
||||
bio->retry_count = 0;
|
||||
bio->submit_tsc = 0;
|
||||
__bdev_nvme_io_complete(bdev_io, io_status, NULL);
|
||||
}
|
||||
|
||||
@ -2328,6 +2440,15 @@ bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_i
|
||||
struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
|
||||
struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
|
||||
|
||||
if (spdk_likely(nbdev_io->submit_tsc == 0)) {
|
||||
nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io);
|
||||
} else {
|
||||
/* There are cases where submit_tsc != 0, i.e. retry I/O.
|
||||
* We need to update submit_tsc here.
|
||||
*/
|
||||
nbdev_io->submit_tsc = spdk_get_ticks();
|
||||
}
|
||||
|
||||
spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io);
|
||||
nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch);
|
||||
if (spdk_unlikely(!nbdev_io->io_path)) {
|
||||
@ -6942,6 +7063,7 @@ bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
|
||||
spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
|
||||
spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids);
|
||||
spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos);
|
||||
spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat);
|
||||
spdk_json_write_object_end(w);
|
||||
|
||||
spdk_json_write_object_end(w);
|
||||
|
@ -198,6 +198,9 @@ struct nvme_io_path {
|
||||
/* The following are used to update io_path cache of the nvme_bdev_channel. */
|
||||
struct nvme_bdev_channel *nbdev_ch;
|
||||
TAILQ_ENTRY(nvme_io_path) tailq;
|
||||
|
||||
/* allocation of stat is decided by option io_path_stat of RPC bdev_nvme_set_options */
|
||||
struct spdk_bdev_io_stat *stat;
|
||||
};
|
||||
|
||||
struct nvme_bdev_channel {
|
||||
@ -274,6 +277,7 @@ struct spdk_bdev_nvme_opts {
|
||||
uint8_t transport_tos;
|
||||
bool nvme_error_stat;
|
||||
uint32_t rdma_srq_size;
|
||||
bool io_path_stat;
|
||||
};
|
||||
|
||||
struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch);
|
||||
|
@ -73,6 +73,7 @@ static const struct spdk_json_object_decoder rpc_bdev_nvme_options_decoders[] =
|
||||
{"transport_tos", offsetof(struct spdk_bdev_nvme_opts, transport_tos), spdk_json_decode_uint8, true},
|
||||
{"nvme_error_stat", offsetof(struct spdk_bdev_nvme_opts, nvme_error_stat), spdk_json_decode_bool, true},
|
||||
{"rdma_srq_size", offsetof(struct spdk_bdev_nvme_opts, rdma_srq_size), spdk_json_decode_uint32, true},
|
||||
{"io_path_stat", offsetof(struct spdk_bdev_nvme_opts, io_path_stat), spdk_json_decode_bool, true},
|
||||
};
|
||||
|
||||
static void
|
||||
|
@ -533,7 +533,7 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
|
||||
delay_cmd_submit=None, transport_retry_count=None, bdev_retry_count=None,
|
||||
transport_ack_timeout=None, ctrlr_loss_timeout_sec=None, reconnect_delay_sec=None,
|
||||
fast_io_fail_timeout_sec=None, disable_auto_failback=None, generate_uuids=None,
|
||||
transport_tos=None, nvme_error_stat=None, rdma_srq_size=None):
|
||||
transport_tos=None, nvme_error_stat=None, rdma_srq_size=None, io_path_stat=None):
|
||||
"""Set options for the bdev nvme. This is startup command.
|
||||
|
||||
Args:
|
||||
@ -577,6 +577,7 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
|
||||
The default is 0 which means no TOS is applied. (optional)
|
||||
nvme_error_stat: Enable collecting NVMe error counts. (optional)
|
||||
rdma_srq_size: Set the size of a shared rdma receive queue. Default: 0 (disabled) (optional)
|
||||
io_path_stat: Enable collecting I/O path stat of each io path. (optional)
|
||||
|
||||
"""
|
||||
params = {}
|
||||
@ -654,6 +655,9 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
|
||||
if rdma_srq_size is not None:
|
||||
params['rdma_srq_size'] = rdma_srq_size
|
||||
|
||||
if io_path_stat is not None:
|
||||
params['io_path_stat'] = io_path_stat
|
||||
|
||||
return client.call('bdev_nvme_set_options', params)
|
||||
|
||||
|
||||
|
@ -562,7 +562,8 @@ if __name__ == "__main__":
|
||||
generate_uuids=args.generate_uuids,
|
||||
transport_tos=args.transport_tos,
|
||||
nvme_error_stat=args.nvme_error_stat,
|
||||
rdma_srq_size=args.rdma_srq_size)
|
||||
rdma_srq_size=args.rdma_srq_size,
|
||||
io_path_stat=args.io_path_stat)
|
||||
|
||||
p = subparsers.add_parser('bdev_nvme_set_options',
|
||||
help='Set options for the bdev nvme type. This is startup command.')
|
||||
@ -637,6 +638,9 @@ if __name__ == "__main__":
|
||||
p.add_argument('-m', '--nvme-error-stat', help="Enable collecting NVMe error counts.", action='store_true')
|
||||
p.add_argument('-q', '--rdma-srq-size',
|
||||
help='Set the size of a shared rdma receive queue. Default: 0 (disabled)', type=int)
|
||||
p.add_argument('--io-path-stat',
|
||||
help="""Enable collecting I/O path stat of each io path.""",
|
||||
action='store_true')
|
||||
|
||||
p.set_defaults(func=bdev_nvme_set_options)
|
||||
|
||||
|
@ -58,6 +58,11 @@ DEFINE_STUB_V(spdk_nvme_transport_get_opts, (struct spdk_nvme_transport_opts *op
|
||||
DEFINE_STUB(spdk_nvme_transport_set_opts, int, (const struct spdk_nvme_transport_opts *opts,
|
||||
size_t opts_size), 0);
|
||||
|
||||
DEFINE_STUB(spdk_bdev_io_get_submit_tsc, uint64_t, (struct spdk_bdev_io *bdev_io), 0);
|
||||
|
||||
DEFINE_STUB_V(spdk_bdev_reset_io_stat, (struct spdk_bdev_io_stat *stat,
|
||||
enum spdk_bdev_reset_stat_mode mode));
|
||||
|
||||
int
|
||||
spdk_nvme_ctrlr_get_memory_domains(const struct spdk_nvme_ctrlr *ctrlr,
|
||||
struct spdk_memory_domain **domains, int array_size)
|
||||
|
Loading…
Reference in New Issue
Block a user