bdev: send bdev reset based on outstanding IO and a new timeout parameter

A new parameter, reset_io_drain_timeout, has been added to the spdk_bdev
structure. If this value is left at zero, the bdev reset behavior
does not change.
The reset_io_drain_timeout controls how long a bdev reset waits for
outstanding IO to complete before issuing a reset to the underlying device.
If there is no outstanding IO at the end of that period, the reset
is skipped and completed as successful.

Change-Id: I585af427064ce234a4f60afc3d69bc9fc3252432
Signed-off-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14501
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
This commit is contained in:
Krzysztof Karas 2022-09-14 11:57:03 +02:00 committed by Tomasz Zawadzki
parent eafb489c0d
commit dfc9894396
4 changed files with 330 additions and 4 deletions

View File

@ -28,6 +28,11 @@
extern "C" {
#endif
/* This parameter is best defined for bdevs that share an underlying bdev,
* such as multiple lvol bdevs sharing an nvme device, to avoid unnecessarily
* resetting the underlying bdev and affecting other bdevs that are sharing it. */
#define BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE 5
/** Block device module */
struct spdk_bdev_module {
/**
@ -431,6 +436,25 @@ struct spdk_bdev {
*/
bool media_events;
/* Upon receiving a reset request, this is the amount of time in seconds
* to wait for all I/O to complete before moving forward with the reset.
* If all I/O completes prior to this time out, the reset will be skipped.
* A value of 0 is special and will always send resets immediately, even
* if there is no I/O outstanding.
*
* Use case example:
* A shared bdev (e.g. multiple lvol bdevs sharing an underlying nvme bdev)
* needs to be reset. For a non-zero value bdev reset code will wait
* `reset_io_drain_timeout` seconds for outstanding IO that are present
* on any bdev channel, before sending a reset down to the underlying device.
* That way we can avoid sending "empty" resets and interrupting work of
* other lvols that use the same bdev. BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE
* is a good choice for the value of this parameter.
*
* If this parameter remains equal to zero, the bdev reset will be forcefully
* sent down to the device, without any delays and waiting for outstanding IO. */
uint16_t reset_io_drain_timeout;
/**
* Pointer to the bdev module that registered this bdev.
*/
@ -629,6 +653,12 @@ struct spdk_bdev_io {
struct {
/** Channel reference held while messages for this reset are in progress. */
struct spdk_io_channel *ch_ref;
struct {
/* Handle to timed poller that checks each channel for outstanding IO. */
struct spdk_poller *poller;
/* Store calculated time value, when a poller should stop its work. */
uint64_t stop_time_tsc;
} wait_poller;
} reset;
struct {
/** The outstanding request matching bio_cb_arg which this abort attempts to cancel. */

View File

@ -54,6 +54,7 @@ int __itt_init_ittlib(const char *, __itt_group_id);
* when splitting into children requests at a time.
*/
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000
static const char *qos_rpc_type[] = {"rw_ios_per_sec",
"rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
@ -5276,16 +5277,92 @@ spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
return 0;
}
static int bdev_reset_poll_for_outstanding_io(void *ctx);
static void
/* NOTE(review): two signature lines appear below — the old name (bdev_reset_dev)
 * and the new one. This looks like diff-rendering residue; confirm only the
 * renamed line belongs in the actual file. */
bdev_reset_dev(struct spdk_io_channel_iter *i, int status)
bdev_reset_check_outstanding_io_done(struct spdk_io_channel_iter *i, int status)
{
/* Completion callback for the per-channel outstanding-IO scan started via
 * spdk_for_each_channel(). 'status' is -EBUSY when at least one channel
 * reported IO in flight, 0 when all channels were idle. */
struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
struct spdk_bdev_io *bdev_io;
/* The reset currently being processed sits at the head of queued_resets. */
bdev_io = TAILQ_FIRST(&ch->queued_resets);
if (status == -EBUSY) {
if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
/* IO is still outstanding but the drain deadline has not passed yet:
 * re-arm the poller to re-scan the channels after one period. */
bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
} else {
/* If outstanding IOs are still present and reset_io_drain_timeout seconds passed,
 * start the reset. */
TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
bdev_io_submit_reset(bdev_io);
}
} else {
/* No IO anywhere: the reset is unnecessary, so skip it entirely. */
TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
SPDK_DEBUGLOG(bdev,
"Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
ch->bdev->name);
/* Mark the completion status as a SUCCESS and complete the reset. */
spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
}
}
static void
bdev_reset_check_outstanding_io(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch);

	/* Report -EBUSY for a channel that still has IO in flight. A non-zero
	 * status stops the iteration over the remaining channels and is handed
	 * to the done callback as-is; an idle channel continues with 0. */
	spdk_for_each_channel_continue(i, bdev_ch->io_outstanding > 0 ? -EBUSY : 0);
}
static int
bdev_reset_poll_for_outstanding_io(void *ctx)
{
	struct spdk_bdev_channel *bdev_ch = ctx;
	struct spdk_bdev_io *reset_io = TAILQ_FIRST(&bdev_ch->queued_resets);

	/* One-shot poller: unregister before kicking off another pass over the
	 * channels; the done callback re-registers it if IO is still pending. */
	spdk_poller_unregister(&reset_io->u.reset.wait_poller.poller);
	spdk_for_each_channel(__bdev_to_io_dev(bdev_ch->bdev), bdev_reset_check_outstanding_io,
			      bdev_ch, bdev_reset_check_outstanding_io_done);

	return SPDK_POLLER_BUSY;
}
static void
bdev_reset_freeze_channel_done(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = ch->bdev;
	struct spdk_bdev_io *bdev_io = TAILQ_FIRST(&ch->queued_resets);

	/* A zero reset_io_drain_timeout means "reset unconditionally": hand the
	 * reset to the underlying module right away, without draining IO. */
	if (bdev->reset_io_drain_timeout == 0) {
		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
		bdev_io_submit_reset(bdev_io);
		return;
	}

	/* Otherwise record the absolute tick deadline after which the reset is
	 * forced, then start scanning every channel for outstanding IO. The
	 * reset is submitted to the module only if IO is still present when
	 * reset_io_drain_timeout seconds have elapsed. */
	bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
			(bdev->reset_io_drain_timeout * spdk_get_ticks_hz());
	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_reset_check_outstanding_io,
			      ch, bdev_reset_check_outstanding_io_done);
}
static void
bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
@ -5331,7 +5408,7 @@ bdev_start_reset(void *ctx)
struct spdk_bdev_channel *ch = ctx;
spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel,
ch, bdev_reset_dev);
ch, bdev_reset_freeze_channel_done);
}
static void

View File

@ -1041,6 +1041,13 @@ _create_lvol_disk(struct spdk_lvol *lvol, bool destroy)
bdev->fn_table = &vbdev_lvol_fn_table;
bdev->module = &g_lvol_if;
/* Set default bdev reset waiting time. This value indicates how much
* time a reset should wait before forcing a reset down to the underlying
* bdev module.
* Setting this parameter is mainly to avoid "empty" resets to a shared
* bdev that may be used by multiple lvols. */
bdev->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
rc = spdk_bdev_register(bdev);
if (rc) {
free(lvol_bdev);

View File

@ -486,6 +486,8 @@ aborted_reset_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
spdk_bdev_free_io(bdev_io);
}
static void io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static void
aborted_reset(void)
{
@ -542,6 +544,58 @@ aborted_reset(void)
teardown_test();
}
static void
aborted_reset_no_outstanding_io(void)
{
struct spdk_io_channel *io_ch[2];
struct spdk_bdev_channel *bdev_ch[2];
struct spdk_bdev *bdev[2];
enum spdk_bdev_io_status status1 = SPDK_BDEV_IO_STATUS_PENDING,
status2 = SPDK_BDEV_IO_STATUS_PENDING;
setup_test();
/*
* This time we test the reset without any outstanding IO
* present on the bdev channel, so both resets should finish
* immediately.
*/
set_thread(0);
/* Set reset_io_drain_timeout to allow bdev
* reset to stay pending until we call abort. */
io_ch[0] = spdk_bdev_get_io_channel(g_desc);
bdev_ch[0] = spdk_io_channel_get_ctx(io_ch[0]);
bdev[0] = bdev_ch[0]->bdev;
bdev[0]->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
CU_ASSERT(io_ch[0] != NULL);
spdk_bdev_reset(g_desc, io_ch[0], aborted_reset_done, &status1);
poll_threads();
CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
CU_ASSERT(status1 == SPDK_BDEV_IO_STATUS_SUCCESS);
spdk_put_io_channel(io_ch[0]);
set_thread(1);
/* Set reset_io_drain_timeout to allow bdev
* reset to stay pending until we call abort. */
io_ch[1] = spdk_bdev_get_io_channel(g_desc);
bdev_ch[1] = spdk_io_channel_get_ctx(io_ch[1]);
bdev[1] = bdev_ch[1]->bdev;
bdev[1]->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
CU_ASSERT(io_ch[1] != NULL);
spdk_bdev_reset(g_desc, io_ch[1], aborted_reset_done, &status2);
poll_threads();
CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
CU_ASSERT(status2 == SPDK_BDEV_IO_STATUS_SUCCESS);
spdk_put_io_channel(io_ch[1]);
stub_complete_io(g_bdev.io_target, 0);
poll_threads();
teardown_test();
}
static void
io_during_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
@ -655,6 +709,162 @@ io_during_reset(void)
teardown_test();
}
static uint32_t
count_queued_resets(void *io_target)
{
	struct spdk_io_channel *raw_ch = spdk_get_io_channel(io_target);
	struct ut_bdev_channel *ut_ch = spdk_io_channel_get_ctx(raw_ch);
	struct spdk_bdev_io *io;
	uint32_t reset_count = 0;

	/* Walk the stub module's outstanding IO list and tally only the resets
	 * that were actually submitted down to the module. */
	TAILQ_FOREACH(io, &ut_ch->outstanding_io, module_link) {
		if (io->type == SPDK_BDEV_IO_TYPE_RESET) {
			reset_count++;
		}
	}

	spdk_put_io_channel(raw_ch);
	return reset_count;
}
static void
reset_completions(void)
{
struct spdk_io_channel *io_ch;
struct spdk_bdev_channel *bdev_ch;
struct spdk_bdev *bdev;
enum spdk_bdev_io_status status0, status_reset;
int rc, iter;
setup_test();
/* This test covers four test cases:
 * 1) reset_io_drain_timeout of a bdev is equal to 0 (reset sent immediately)
 * 2) No outstanding IO are present on any bdev channel
 * 3) Outstanding IO finish during bdev reset
 * 4) Outstanding IO do not finish before reset is done waiting
 * for them.
 *
 * Above conditions mainly affect the timing of bdev reset completion
 * and whether a reset should be skipped via spdk_bdev_io_complete()
 * or sent down to the underlying bdev module via bdev_io_submit_reset(). */
/* Test preparation */
set_thread(0);
io_ch = spdk_bdev_get_io_channel(g_desc);
bdev_ch = spdk_io_channel_get_ctx(io_ch);
CU_ASSERT(bdev_ch->flags == 0);
/* Test case 1) reset_io_drain_timeout set to 0. Reset should be sent down immediately. */
bdev = &g_bdev.bdev;
bdev->reset_io_drain_timeout = 0;
status_reset = SPDK_BDEV_IO_STATUS_PENDING;
rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
CU_ASSERT(rc == 0);
poll_threads();
/* With a zero timeout the reset must reach the module without any drain wait. */
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 1);
/* Call reset completion inside bdev module. */
stub_complete_io(g_bdev.io_target, 0);
poll_threads();
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
/* Test case 2) no outstanding IO are present. Reset should perform one iteration over
 * channels and then be skipped. */
bdev->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
status_reset = SPDK_BDEV_IO_STATUS_PENDING;
rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
CU_ASSERT(rc == 0);
poll_threads();
/* Reset was never submitted to the bdev module. */
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
/* Test case 3) outstanding IO finish during bdev reset procedure. Reset should initiate
 * wait poller to check for IO completions every second, until reset_io_drain_timeout is
 * reached, but finish earlier than this threshold. */
status0 = SPDK_BDEV_IO_STATUS_PENDING;
status_reset = SPDK_BDEV_IO_STATUS_PENDING;
rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, io_during_io_done, &status0);
CU_ASSERT(rc == 0);
rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
CU_ASSERT(rc == 0);
poll_threads();
/* The reset just started and should not have been submitted yet. */
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
poll_threads();
CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_PENDING);
/* Let the poller wait for about half the time then complete outstanding IO. */
for (iter = 0; iter < 2; iter++) {
/* Reset is still processing and not submitted at this point. */
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
spdk_delay_us(1000 * 1000);
poll_threads();
poll_threads();
}
CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_PENDING);
/* Completing the read lets the next poller pass observe zero outstanding IO. */
stub_complete_io(g_bdev.io_target, 0);
poll_threads();
spdk_delay_us(BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
poll_threads();
poll_threads();
CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
/* Sending reset to the bdev module has been skipped. */
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
/* Test case 4) outstanding IO are still present after reset_io_drain_timeout
 * seconds have passed. */
status0 = SPDK_BDEV_IO_STATUS_PENDING;
status_reset = SPDK_BDEV_IO_STATUS_PENDING;
rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, io_during_io_done, &status0);
CU_ASSERT(rc == 0);
rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
CU_ASSERT(rc == 0);
poll_threads();
/* The reset just started and should not have been submitted yet. */
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
poll_threads();
CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_PENDING);
/* Let the poller wait for reset_io_drain_timeout seconds. */
for (iter = 0; iter < bdev->reset_io_drain_timeout; iter++) {
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
spdk_delay_us(BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
poll_threads();
poll_threads();
}
/* After timing out, the reset should have been sent to the module. */
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 1);
/* Complete reset submitted to the module and the read IO. */
stub_complete_io(g_bdev.io_target, 0);
poll_threads();
CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
/* Destroy the channel and end the test. */
spdk_put_io_channel(io_ch);
poll_threads();
teardown_test();
}
static void
basic_qos(void)
{
@ -2092,7 +2302,9 @@ main(int argc, char **argv)
CU_ADD_TEST(suite, basic_qos);
CU_ADD_TEST(suite, put_channel_during_reset);
CU_ADD_TEST(suite, aborted_reset);
CU_ADD_TEST(suite, aborted_reset_no_outstanding_io);
CU_ADD_TEST(suite, io_during_reset);
CU_ADD_TEST(suite, reset_completions);
CU_ADD_TEST(suite, io_during_qos_queue);
CU_ADD_TEST(suite, io_during_qos_reset);
CU_ADD_TEST(suite, enomem);