diff --git a/CHANGELOG.md b/CHANGELOG.md index 2db35330f..52758736f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -66,6 +66,9 @@ collecting NVMe error counts. New APIs `spdk_bdev_reset_io_stat`, `spdk_bdev_add_io_stat` and `spdk_bdev_dump_io_stat_json` were added to process I/O statistics outside the generic bdev layer, especially in bdev modules. +Added I/O statistics per I/O path to the NVMe bdev module for NVMe bdev multipath. It can be +enabled by a new option io_path_stat of RPC bdev_nvme_set_options. + ### event Added core lock file mechanism to prevent the same CPU cores from being used by multiple @@ -101,6 +104,8 @@ Added `rr_min_io` option to RPC bdev_nvme_set_multipath_policy. It switches I/O another path after rr_min_io I/Os are routed to current io path for the round-robin path selector. +Added option `--io-path-stat` for RPC bdev_nvme_set_options to enable collecting io path stat. + ### bdevperf Promoted the application to example to match similar programs: fio_plugin and perf. diff --git a/doc/jsonrpc.md b/doc/jsonrpc.md index 84cf13f98..c2723374c 100644 --- a/doc/jsonrpc.md +++ b/doc/jsonrpc.md @@ -3618,6 +3618,7 @@ generate_uuids | Optional | boolean | Enable generation of UUIDs transport_tos | Optional | number | IPv4 Type of Service value. Only applicable for RDMA transport. Default: 0 (no TOS is applied). nvme_error_stat | Optional | boolean | Enable collecting NVMe error counts. rdma_srq_size | Optional | number | Set the size of a shared rdma receive queue. Default: 0 (disabled). +io_path_stat | Optional | boolean | Enable collecting I/O stat of each nvme bdev io path. Default: `false`. #### Example diff --git a/module/bdev/nvme/bdev_nvme.c b/module/bdev/nvme/bdev_nvme.c index 75bb534e9..59da17c14 100644 --- a/module/bdev/nvme/bdev_nvme.c +++ b/module/bdev/nvme/bdev_nvme.c @@ -93,6 +93,9 @@ struct nvme_bdev_io { /* How many times the current I/O was retried. */ int32_t retry_count; + + /* Current tsc at submit time. 
*/ + uint64_t submit_tsc; }; struct nvme_probe_skip_entry { @@ -126,6 +129,7 @@ static struct spdk_bdev_nvme_opts g_opts = { .generate_uuids = false, .transport_tos = 0, .nvme_error_stat = false, + .io_path_stat = false, }; #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL @@ -588,10 +592,21 @@ _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ return -ENOMEM; } + if (g_opts.io_path_stat) { + io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); + if (io_path->stat == NULL) { + free(io_path); + SPDK_ERRLOG("Failed to alloc io_path stat.\n"); + return -ENOMEM; + } + spdk_bdev_reset_io_stat(io_path->stat, BDEV_RESET_STAT_MAXMIN); + } + io_path->nvme_ns = nvme_ns; ch = spdk_get_io_channel(nvme_ns->ctrlr); if (ch == NULL) { + free(io_path->stat); free(io_path); SPDK_ERRLOG("Failed to alloc io_channel.\n"); return -ENOMEM; @@ -635,6 +650,7 @@ _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_pat ch = spdk_io_channel_from_ctx(ctrlr_ch); spdk_put_io_channel(ch); + free(io_path->stat); free(io_path); } @@ -1122,6 +1138,99 @@ bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk pthread_mutex_unlock(&nbdev->mutex); } +static inline void +bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) +{ + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); + uint64_t num_blocks = bdev_io->u.bdev.num_blocks; + uint32_t blocklen = bdev_io->bdev->blocklen; + struct spdk_bdev_io_stat *stat; + uint64_t tsc_diff; + + if (bio->io_path->stat == NULL) { + return; + } + + tsc_diff = spdk_get_ticks() - bio->submit_tsc; + stat = bio->io_path->stat; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + stat->bytes_read += num_blocks * blocklen; + stat->num_read_ops++; + stat->read_latency_ticks += tsc_diff; + if (stat->max_read_latency_ticks < tsc_diff) { + stat->max_read_latency_ticks = tsc_diff; + } + if (stat->min_read_latency_ticks > tsc_diff) { + stat->min_read_latency_ticks = tsc_diff; 
+ } + break; + case SPDK_BDEV_IO_TYPE_WRITE: + stat->bytes_written += num_blocks * blocklen; + stat->num_write_ops++; + stat->write_latency_ticks += tsc_diff; + if (stat->max_write_latency_ticks < tsc_diff) { + stat->max_write_latency_ticks = tsc_diff; + } + if (stat->min_write_latency_ticks > tsc_diff) { + stat->min_write_latency_ticks = tsc_diff; + } + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + stat->bytes_unmapped += num_blocks * blocklen; + stat->num_unmap_ops++; + stat->unmap_latency_ticks += tsc_diff; + if (stat->max_unmap_latency_ticks < tsc_diff) { + stat->max_unmap_latency_ticks = tsc_diff; + } + if (stat->min_unmap_latency_ticks > tsc_diff) { + stat->min_unmap_latency_ticks = tsc_diff; + } + break; + case SPDK_BDEV_IO_TYPE_ZCOPY: + /* Track the data in the start phase only */ + if (!bdev_io->u.bdev.zcopy.start) { + break; + } + if (bdev_io->u.bdev.zcopy.populate) { + stat->bytes_read += num_blocks * blocklen; + stat->num_read_ops++; + stat->read_latency_ticks += tsc_diff; + if (stat->max_read_latency_ticks < tsc_diff) { + stat->max_read_latency_ticks = tsc_diff; + } + if (stat->min_read_latency_ticks > tsc_diff) { + stat->min_read_latency_ticks = tsc_diff; + } + } else { + stat->bytes_written += num_blocks * blocklen; + stat->num_write_ops++; + stat->write_latency_ticks += tsc_diff; + if (stat->max_write_latency_ticks < tsc_diff) { + stat->max_write_latency_ticks = tsc_diff; + } + if (stat->min_write_latency_ticks > tsc_diff) { + stat->min_write_latency_ticks = tsc_diff; + } + } + break; + case SPDK_BDEV_IO_TYPE_COPY: + stat->bytes_copied += num_blocks * blocklen; + stat->num_copy_ops++; + stat->copy_latency_ticks += tsc_diff; + if (stat->max_copy_latency_ticks < tsc_diff) { + stat->max_copy_latency_ticks = tsc_diff; + } + if (stat->min_copy_latency_ticks > tsc_diff) { + stat->min_copy_latency_ticks = tsc_diff; + } + break; + default: + break; + } +} + static inline void bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, const struct spdk_nvme_cpl 
*cpl) @@ -1136,6 +1245,7 @@ bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { + bdev_nvme_update_io_path_stat(bio); goto complete; } @@ -1188,6 +1298,7 @@ bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, complete: bio->retry_count = 0; + bio->submit_tsc = 0; __bdev_nvme_io_complete(bdev_io, 0, cpl); } @@ -1223,6 +1334,7 @@ bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) } bio->retry_count = 0; + bio->submit_tsc = 0; __bdev_nvme_io_complete(bdev_io, io_status, NULL); } @@ -2328,6 +2440,15 @@ bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_i struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; + if (spdk_likely(nbdev_io->submit_tsc == 0)) { + nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); + } else { + /* There are cases where submit_tsc != 0, i.e. retry I/O. + * We need to update submit_tsc here. 
+ */ + nbdev_io->submit_tsc = spdk_get_ticks(); + } + spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); if (spdk_unlikely(!nbdev_io->io_path)) { @@ -6942,6 +7063,7 @@ bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); + spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); spdk_json_write_object_end(w); spdk_json_write_object_end(w); diff --git a/module/bdev/nvme/bdev_nvme.h b/module/bdev/nvme/bdev_nvme.h index 09ac1bb7a..32aa5cfde 100644 --- a/module/bdev/nvme/bdev_nvme.h +++ b/module/bdev/nvme/bdev_nvme.h @@ -198,6 +198,9 @@ struct nvme_io_path { /* The following are used to update io_path cache of the nvme_bdev_channel. */ struct nvme_bdev_channel *nbdev_ch; TAILQ_ENTRY(nvme_io_path) tailq; + + /* allocation of stat is decided by option io_path_stat of RPC bdev_nvme_set_options */ + struct spdk_bdev_io_stat *stat; }; struct nvme_bdev_channel { @@ -274,6 +277,7 @@ struct spdk_bdev_nvme_opts { uint8_t transport_tos; bool nvme_error_stat; uint32_t rdma_srq_size; + bool io_path_stat; }; struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch); diff --git a/module/bdev/nvme/bdev_nvme_rpc.c b/module/bdev/nvme/bdev_nvme_rpc.c index 7481cadae..a2f3cc2fe 100644 --- a/module/bdev/nvme/bdev_nvme_rpc.c +++ b/module/bdev/nvme/bdev_nvme_rpc.c @@ -73,6 +73,7 @@ static const struct spdk_json_object_decoder rpc_bdev_nvme_options_decoders[] = {"transport_tos", offsetof(struct spdk_bdev_nvme_opts, transport_tos), spdk_json_decode_uint8, true}, {"nvme_error_stat", offsetof(struct spdk_bdev_nvme_opts, nvme_error_stat), spdk_json_decode_bool, true}, {"rdma_srq_size", offsetof(struct spdk_bdev_nvme_opts, 
rdma_srq_size), spdk_json_decode_uint32, true}, + {"io_path_stat", offsetof(struct spdk_bdev_nvme_opts, io_path_stat), spdk_json_decode_bool, true}, }; static void diff --git a/python/spdk/rpc/bdev.py b/python/spdk/rpc/bdev.py index e95d82c18..3bbaba911 100644 --- a/python/spdk/rpc/bdev.py +++ b/python/spdk/rpc/bdev.py @@ -533,7 +533,7 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo delay_cmd_submit=None, transport_retry_count=None, bdev_retry_count=None, transport_ack_timeout=None, ctrlr_loss_timeout_sec=None, reconnect_delay_sec=None, fast_io_fail_timeout_sec=None, disable_auto_failback=None, generate_uuids=None, - transport_tos=None, nvme_error_stat=None, rdma_srq_size=None): + transport_tos=None, nvme_error_stat=None, rdma_srq_size=None, io_path_stat=None): """Set options for the bdev nvme. This is startup command. Args: @@ -577,6 +577,7 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo The default is 0 which means no TOS is applied. (optional) nvme_error_stat: Enable collecting NVMe error counts. (optional) rdma_srq_size: Set the size of a shared rdma receive queue. Default: 0 (disabled) (optional) + io_path_stat: Enable collecting I/O path stat of each io path. 
(optional) """ params = {} @@ -654,6 +655,9 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo if rdma_srq_size is not None: params['rdma_srq_size'] = rdma_srq_size + if io_path_stat is not None: + params['io_path_stat'] = io_path_stat + return client.call('bdev_nvme_set_options', params) diff --git a/scripts/rpc.py b/scripts/rpc.py index c7c5402bb..a3ff17323 100755 --- a/scripts/rpc.py +++ b/scripts/rpc.py @@ -562,7 +562,8 @@ if __name__ == "__main__": generate_uuids=args.generate_uuids, transport_tos=args.transport_tos, nvme_error_stat=args.nvme_error_stat, - rdma_srq_size=args.rdma_srq_size) + rdma_srq_size=args.rdma_srq_size, + io_path_stat=args.io_path_stat) p = subparsers.add_parser('bdev_nvme_set_options', help='Set options for the bdev nvme type. This is startup command.') @@ -637,6 +638,9 @@ if __name__ == "__main__": p.add_argument('-m', '--nvme-error-stat', help="Enable collecting NVMe error counts.", action='store_true') p.add_argument('-q', '--rdma-srq-size', help='Set the size of a shared rdma receive queue. 
Default: 0 (disabled)', type=int) + p.add_argument('--io-path-stat', + help="""Enable collecting I/O path stat of each io path.""", + action='store_true') p.set_defaults(func=bdev_nvme_set_options) diff --git a/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c b/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c index cf924a357..1a143aa01 100644 --- a/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c +++ b/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c @@ -58,6 +58,11 @@ DEFINE_STUB_V(spdk_nvme_transport_get_opts, (struct spdk_nvme_transport_opts *op DEFINE_STUB(spdk_nvme_transport_set_opts, int, (const struct spdk_nvme_transport_opts *opts, size_t opts_size), 0); +DEFINE_STUB(spdk_bdev_io_get_submit_tsc, uint64_t, (struct spdk_bdev_io *bdev_io), 0); + +DEFINE_STUB_V(spdk_bdev_reset_io_stat, (struct spdk_bdev_io_stat *stat, + enum spdk_bdev_reset_stat_mode mode)); + int spdk_nvme_ctrlr_get_memory_domains(const struct spdk_nvme_ctrlr *ctrlr, struct spdk_memory_domain **domains, int array_size)