From dfc989439662457d39bac524be72e8ea1c20e817 Mon Sep 17 00:00:00 2001
From: Krzysztof Karas
Date: Wed, 14 Sep 2022 11:57:03 +0200
Subject: [PATCH] bdev: send bdev reset based on outstanding IO and a new
 timeout parameter

A new parameter, reset_io_drain_timeout, has been added to the spdk_bdev
structure. If this value is left unset, the bdev reset behavior does not
change. The reset_io_drain_timeout controls how long a bdev reset must
wait for IO to complete prior to issuing a reset to the underlying device.
If there is no outstanding IO at the end of that period, the reset is
skipped.

Change-Id: I585af427064ce234a4f60afc3d69bc9fc3252432
Signed-off-by: Krzysztof Karas
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14501
Reviewed-by: Shuhei Matsumoto
Reviewed-by: Jim Harris
Reviewed-by: Tomasz Zawadzki
Tested-by: SPDK CI Jenkins
Community-CI: Mellanox Build Bot
---
 include/spdk/bdev_module.h             |  30 ++++
 lib/bdev/bdev.c                        |  85 +++++++++-
 module/bdev/lvol/vbdev_lvol.c          |   7 +
 test/unit/lib/bdev/mt/bdev.c/bdev_ut.c | 212 +++++++++++++++++++++++++
 4 files changed, 330 insertions(+), 4 deletions(-)

diff --git a/include/spdk/bdev_module.h b/include/spdk/bdev_module.h
index 3081e236a..53d642f26 100644
--- a/include/spdk/bdev_module.h
+++ b/include/spdk/bdev_module.h
@@ -28,6 +28,11 @@
 extern "C" {
 #endif
 
+/* This parameter is best defined for bdevs that share an underlying bdev,
+ * such as multiple lvol bdevs sharing an nvme device, to avoid unnecessarily
+ * resetting the underlying bdev and affecting other bdevs that share it. */
+#define BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE 5
+
 /** Block device module */
 struct spdk_bdev_module {
 	/**
@@ -431,6 +436,25 @@ struct spdk_bdev {
 	 */
 	bool media_events;
 
+	/* Upon receiving a reset request, this is the amount of time in seconds
+	 * to wait for all I/O to complete before moving forward with the reset.
+	 * If all I/O completes prior to this timeout, the reset will be skipped.
+	 * A value of 0 is special and will always send resets immediately, even
+	 * if there is no I/O outstanding.
+	 *
+	 * Use case example:
+	 * A shared bdev (e.g. multiple lvol bdevs sharing an underlying nvme bdev)
+	 * needs to be reset. For a non-zero value, the bdev reset code will wait
+	 * `reset_io_drain_timeout` seconds for outstanding I/O present on
+	 * any bdev channel, before sending a reset down to the underlying device.
+	 * That way we can avoid sending "empty" resets and interrupting the work
+	 * of other lvols that use the same bdev. BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE
+	 * is a good choice for the value of this parameter.
+	 *
+	 * If this parameter remains equal to zero, the bdev reset will be forcefully
+	 * sent down to the device, without any delay and without waiting for outstanding I/O. */
+	uint16_t reset_io_drain_timeout;
+
 	/**
 	 * Pointer to the bdev module that registered this bdev.
 	 */
@@ -629,6 +653,12 @@ struct spdk_bdev_io {
 		struct {
 			/** Channel reference held while messages for this reset are in progress. */
 			struct spdk_io_channel *ch_ref;
+			struct {
+				/* Handle to the timed poller that checks each channel for outstanding IO. */
+				struct spdk_poller *poller;
+				/* Stores the calculated time at which the poller should stop its work. */
+				uint64_t stop_time_tsc;
+			} wait_poller;
 		} reset;
 		struct {
 			/** The outstanding request matching bio_cb_arg which this abort attempts to cancel.
			 */
diff --git a/lib/bdev/bdev.c b/lib/bdev/bdev.c
index 4e7248c41..e3570c0e5 100644
--- a/lib/bdev/bdev.c
+++ b/lib/bdev/bdev.c
@@ -54,6 +54,7 @@ int __itt_init_ittlib(const char *, __itt_group_id);
  * when splitting into children requests at a time.
  */
 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
+#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000
 
 static const char *qos_rpc_type[] = {"rw_ios_per_sec",
 	"rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
@@ -5276,15 +5277,91 @@ spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
 	return 0;
 }
 
+static int bdev_reset_poll_for_outstanding_io(void *ctx);
+
 static void
-bdev_reset_dev(struct spdk_io_channel_iter *i, int status)
+bdev_reset_check_outstanding_io_done(struct spdk_io_channel_iter *i, int status)
 {
 	struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
 	struct spdk_bdev_io *bdev_io;
 
 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
-	TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
-	bdev_io_submit_reset(bdev_io);
+
+	if (status == -EBUSY) {
+		if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
+			bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
+					ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
+		} else {
+			/* If outstanding IO is still present and reset_io_drain_timeout
+			 * seconds have passed, start the reset. */
+			TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
+			bdev_io_submit_reset(bdev_io);
+		}
+	} else {
+		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
+		SPDK_DEBUGLOG(bdev,
+			      "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
+			      ch->bdev->name);
+		/* Mark the completion status as SUCCESS and complete the reset. */
+		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+	}
+}
+
+static void
+bdev_reset_check_outstanding_io(struct spdk_io_channel_iter *i)
+{
+	struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
+	struct spdk_bdev_channel *cur_ch = spdk_io_channel_get_ctx(io_ch);
+	int status = 0;
+
+	if (cur_ch->io_outstanding > 0) {
+		/* If a channel has outstanding IO, set the status to -EBUSY. This will stop
+		 * further iteration over the rest of the channels and pass a non-zero status
+		 * to the callback function. */
+		status = -EBUSY;
+	}
+	spdk_for_each_channel_continue(i, status);
+}
+
+static int
+bdev_reset_poll_for_outstanding_io(void *ctx)
+{
+	struct spdk_bdev_channel *ch = ctx;
+	struct spdk_bdev_io *bdev_io;
+
+	bdev_io = TAILQ_FIRST(&ch->queued_resets);
+
+	spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller);
+	spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_check_outstanding_io,
+			      ch, bdev_reset_check_outstanding_io_done);
+
+	return SPDK_POLLER_BUSY;
+}
+
+static void
+bdev_reset_freeze_channel_done(struct spdk_io_channel_iter *i, int status)
+{
+	struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
+	struct spdk_bdev *bdev = ch->bdev;
+	struct spdk_bdev_io *bdev_io;
+
+	bdev_io = TAILQ_FIRST(&ch->queued_resets);
+
+	if (bdev->reset_io_drain_timeout == 0) {
+		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
+
+		bdev_io_submit_reset(bdev_io);
+		return;
+	}
+
+	bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
+			(ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz());
+
+	/* Since bdev->reset_io_drain_timeout is not equal to zero,
+	 * submit the reset to the underlying module only if outstanding I/O
+	 * remain after reset_io_drain_timeout seconds have passed.
+	 */
+	spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_check_outstanding_io,
+			      ch, bdev_reset_check_outstanding_io_done);
 }
 
 static void
@@ -5331,7 +5408,7 @@ bdev_start_reset(void *ctx)
 	struct spdk_bdev_channel *ch = ctx;
 
 	spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel,
-			      ch, bdev_reset_dev);
+			      ch, bdev_reset_freeze_channel_done);
 }
 
 static void
diff --git a/module/bdev/lvol/vbdev_lvol.c b/module/bdev/lvol/vbdev_lvol.c
index dadf6c01d..1ce041436 100644
--- a/module/bdev/lvol/vbdev_lvol.c
+++ b/module/bdev/lvol/vbdev_lvol.c
@@ -1041,6 +1041,13 @@ _create_lvol_disk(struct spdk_lvol *lvol, bool destroy)
 	bdev->fn_table = &vbdev_lvol_fn_table;
 	bdev->module = &g_lvol_if;
 
+	/* Set the default bdev reset waiting time. This value indicates how long
+	 * a reset should wait for outstanding IO before forcing a reset down to
+	 * the underlying bdev module.
+	 * This parameter is set mainly to avoid "empty" resets to a shared
+	 * bdev that may be used by multiple lvols. */
+	bdev->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
+
 	rc = spdk_bdev_register(bdev);
 	if (rc) {
 		free(lvol_bdev);
diff --git a/test/unit/lib/bdev/mt/bdev.c/bdev_ut.c b/test/unit/lib/bdev/mt/bdev.c/bdev_ut.c
index bf9c52ef1..1af31c8ff 100644
--- a/test/unit/lib/bdev/mt/bdev.c/bdev_ut.c
+++ b/test/unit/lib/bdev/mt/bdev.c/bdev_ut.c
@@ -486,6 +486,8 @@ aborted_reset_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 	spdk_bdev_free_io(bdev_io);
 }
 
+static void io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
+
 static void
 aborted_reset(void)
 {
@@ -542,6 +544,58 @@ aborted_reset(void)
 	teardown_test();
 }
 
+static void
+aborted_reset_no_outstanding_io(void)
+{
+	struct spdk_io_channel *io_ch[2];
+	struct spdk_bdev_channel *bdev_ch[2];
+	struct spdk_bdev *bdev[2];
+	enum spdk_bdev_io_status status1 = SPDK_BDEV_IO_STATUS_PENDING,
+				 status2 = SPDK_BDEV_IO_STATUS_PENDING;
+
+	setup_test();
+
+	/*
+	 * This time we test the reset without any outstanding IO
+	 * present on the bdev channel, so both resets should finish
+	 * immediately.
+	 */
+
+	set_thread(0);
+	/* Set reset_io_drain_timeout so the reset takes
+	 * the outstanding IO check path. */
+	io_ch[0] = spdk_bdev_get_io_channel(g_desc);
+	bdev_ch[0] = spdk_io_channel_get_ctx(io_ch[0]);
+	bdev[0] = bdev_ch[0]->bdev;
+	bdev[0]->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
+	CU_ASSERT(io_ch[0] != NULL);
+	spdk_bdev_reset(g_desc, io_ch[0], aborted_reset_done, &status1);
+	poll_threads();
+	CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
+	CU_ASSERT(status1 == SPDK_BDEV_IO_STATUS_SUCCESS);
+	spdk_put_io_channel(io_ch[0]);
+
+	set_thread(1);
+	/* Set reset_io_drain_timeout so the reset takes
+	 * the outstanding IO check path.
+	 */
+	io_ch[1] = spdk_bdev_get_io_channel(g_desc);
+	bdev_ch[1] = spdk_io_channel_get_ctx(io_ch[1]);
+	bdev[1] = bdev_ch[1]->bdev;
+	bdev[1]->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
+	CU_ASSERT(io_ch[1] != NULL);
+	spdk_bdev_reset(g_desc, io_ch[1], aborted_reset_done, &status2);
+	poll_threads();
+	CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
+	CU_ASSERT(status2 == SPDK_BDEV_IO_STATUS_SUCCESS);
+	spdk_put_io_channel(io_ch[1]);
+
+	stub_complete_io(g_bdev.io_target, 0);
+	poll_threads();
+
+	teardown_test();
+}
+
 static void
 io_during_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 {
@@ -655,6 +709,162 @@ io_during_reset(void)
 	teardown_test();
 }
 
+static uint32_t
+count_queued_resets(void *io_target)
+{
+	struct spdk_io_channel *_ch = spdk_get_io_channel(io_target);
+	struct ut_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
+	struct spdk_bdev_io *io;
+	uint32_t submitted_resets = 0;
+
+	TAILQ_FOREACH(io, &ch->outstanding_io, module_link) {
+		if (io->type == SPDK_BDEV_IO_TYPE_RESET) {
+			submitted_resets++;
+		}
+	}
+
+	spdk_put_io_channel(_ch);
+
+	return submitted_resets;
+}
+
+static void
+reset_completions(void)
+{
+	struct spdk_io_channel *io_ch;
+	struct spdk_bdev_channel *bdev_ch;
+	struct spdk_bdev *bdev;
+	enum spdk_bdev_io_status status0, status_reset;
+	int rc, iter;
+
+	setup_test();
+
+	/* This test covers four test cases:
+	 * 1) reset_io_drain_timeout of a bdev is equal to 0
+	 * 2) No outstanding IO are present on any bdev channel
+	 * 3) Outstanding IO finish during bdev reset
+	 * 4) Outstanding IO do not finish before reset is done waiting
+	 *    for them.
+	 *
+	 * Above conditions mainly affect the timing of bdev reset completion
+	 * and whether a reset should be skipped via spdk_bdev_io_complete()
+	 * or sent down to the underlying bdev module via bdev_io_submit_reset(). */
+
+	/* Test preparation */
+	set_thread(0);
+	io_ch = spdk_bdev_get_io_channel(g_desc);
+	bdev_ch = spdk_io_channel_get_ctx(io_ch);
+	CU_ASSERT(bdev_ch->flags == 0);
+
+
+	/* Test case 1) reset_io_drain_timeout set to 0. Reset should be sent down immediately. */
+	bdev = &g_bdev.bdev;
+	bdev->reset_io_drain_timeout = 0;
+
+	status_reset = SPDK_BDEV_IO_STATUS_PENDING;
+	rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
+	CU_ASSERT(rc == 0);
+	poll_threads();
+	CU_ASSERT(count_queued_resets(g_bdev.io_target) == 1);
+
+	/* Call reset completion inside bdev module. */
+	stub_complete_io(g_bdev.io_target, 0);
+	poll_threads();
+	CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
+	CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
+	CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
+
+
+	/* Test case 2) no outstanding IO are present. Reset should perform one iteration over
+	 * channels and then be skipped. */
+	bdev->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
+	status_reset = SPDK_BDEV_IO_STATUS_PENDING;
+
+	rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
+	CU_ASSERT(rc == 0);
+	poll_threads();
+	/* Reset was never submitted to the bdev module. */
+	CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
+	CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
+	CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
+
+
+	/* Test case 3) outstanding IO finish during bdev reset procedure. Reset should initiate
+	 * a wait poller to check for IO completions every second, until reset_io_drain_timeout
+	 * is reached, but finish earlier than this threshold.
+	 */
+	status0 = SPDK_BDEV_IO_STATUS_PENDING;
+	status_reset = SPDK_BDEV_IO_STATUS_PENDING;
+	rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, io_during_io_done, &status0);
+	CU_ASSERT(rc == 0);
+
+	rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
+	CU_ASSERT(rc == 0);
+	poll_threads();
+	/* The reset just started and should not have been submitted yet. */
+	CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
+
+	poll_threads();
+	CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_PENDING);
+	/* Let the poller run for two of the reset_io_drain_timeout seconds, then complete the outstanding IO. */
+	for (iter = 0; iter < 2; iter++) {
+		/* Reset is still processing and not submitted at this point. */
+		CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
+		spdk_delay_us(1000 * 1000);
+		poll_threads();
+		poll_threads();
+	}
+	CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_PENDING);
+	stub_complete_io(g_bdev.io_target, 0);
+	poll_threads();
+	spdk_delay_us(BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
+	poll_threads();
+	poll_threads();
+	CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
+	/* Sending the reset to the bdev module has been skipped. */
+	CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
+	CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
+
+
+	/* Test case 4) outstanding IO are still present after reset_io_drain_timeout
+	 * seconds have passed. */
+	status0 = SPDK_BDEV_IO_STATUS_PENDING;
+	status_reset = SPDK_BDEV_IO_STATUS_PENDING;
+	rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, io_during_io_done, &status0);
+	CU_ASSERT(rc == 0);
+
+	rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
+	CU_ASSERT(rc == 0);
+	poll_threads();
+	/* The reset just started and should not have been submitted yet. */
+	CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
+
+	poll_threads();
+	CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_PENDING);
+	/* Let the poller wait for reset_io_drain_timeout seconds. */
+	for (iter = 0; iter < bdev->reset_io_drain_timeout; iter++) {
+		CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
+		spdk_delay_us(BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
+		poll_threads();
+		poll_threads();
+	}
+
+	/* After timing out, the reset should have been sent to the module. */
+	CU_ASSERT(count_queued_resets(g_bdev.io_target) == 1);
+	/* Complete the reset submitted to the module and the read IO. */
+	stub_complete_io(g_bdev.io_target, 0);
+	poll_threads();
+	CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
+	CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
+
+
+	/* Destroy the channel and end the test. */
+	spdk_put_io_channel(io_ch);
+	poll_threads();
+
+	teardown_test();
+}
+
 static void
 basic_qos(void)
 {
@@ -2092,7 +2302,9 @@ main(int argc, char **argv)
 	CU_ADD_TEST(suite, basic_qos);
 	CU_ADD_TEST(suite, put_channel_during_reset);
 	CU_ADD_TEST(suite, aborted_reset);
+	CU_ADD_TEST(suite, aborted_reset_no_outstanding_io);
 	CU_ADD_TEST(suite, io_during_reset);
+	CU_ADD_TEST(suite, reset_completions);
 	CU_ADD_TEST(suite, io_during_qos_queue);
 	CU_ADD_TEST(suite, io_during_qos_reset);
 	CU_ADD_TEST(suite, enomem);
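
Usage note (not part of the patch): the sketch below shows how another virtual bdev module could opt into the new drain behavior at registration time, mirroring the vbdev_lvol change above. The my_vbdev_register() helper is hypothetical; only spdk_bdev_register(), the reset_io_drain_timeout field and BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE come from SPDK and this patch.

#include "spdk/bdev_module.h"

/* Hypothetical registration helper for an illustrative virtual bdev. */
static int
my_vbdev_register(struct spdk_bdev *bdev)
{
	/* Assumes name, fn_table, module, blocklen and blockcnt were already
	 * filled in by the caller, as vbdev_lvol does in _create_lvol_disk(). */

	/* Opt into the drain behavior added by this patch: a reset first waits
	 * up to reset_io_drain_timeout seconds for outstanding IO and is skipped
	 * entirely if the IO drains, instead of always hitting the shared device. */
	bdev->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;

	/* Leaving reset_io_drain_timeout at 0 keeps the old behavior: the reset
	 * is sent down to the underlying module immediately. */
	return spdk_bdev_register(bdev);
}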