bdev: send bdev reset based on outstanding IO and a new timeout parameter
A new parameter io_drain_timeout has been added to spdk_bdev structure. If this value is unset, the bdev reset behavior does not change. The io_drain_timeout controls how long a bdev reset must wait for IO to complete prior to issuing a reset to the underlying device. If there is no outstanding IO at the end of that period, the reset is skipped. Change-Id: I585af427064ce234a4f60afc3d69bc9fc3252432 Signed-off-by: Krzysztof Karas <krzysztof.karas@intel.com> Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14501 Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com> Reviewed-by: Jim Harris <james.r.harris@intel.com> Reviewed-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com> Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Community-CI: Mellanox Build Bot
This commit is contained in:
parent
eafb489c0d
commit
dfc9894396
@ -28,6 +28,11 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* This parameter is best defined for bdevs that share an underlying bdev,
|
||||
* such as multiple lvol bdevs sharing an nvme device, to avoid unnecessarily
|
||||
* resetting the underlying bdev and affecting other bdevs that are sharing it. */
|
||||
#define BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE 5
|
||||
|
||||
/** Block device module */
|
||||
struct spdk_bdev_module {
|
||||
/**
|
||||
@ -431,6 +436,25 @@ struct spdk_bdev {
|
||||
*/
|
||||
bool media_events;
|
||||
|
||||
/* Upon receiving a reset request, this is the amount of time in seconds
|
||||
* to wait for all I/O to complete before moving forward with the reset.
|
||||
* If all I/O completes prior to this time out, the reset will be skipped.
|
||||
* A value of 0 is special and will always send resets immediately, even
|
||||
* if there is no I/O outstanding.
|
||||
*
|
||||
* Use case example:
|
||||
* A shared bdev (e.g. multiple lvol bdevs sharing an underlying nvme bdev)
|
||||
* needs to be reset. For a non-zero value bdev reset code will wait
|
||||
* `reset_io_drain_timeout` seconds for outstanding IO that are present
|
||||
* on any bdev channel, before sending a reset down to the underlying device.
|
||||
* That way we can avoid sending "empty" resets and interrupting work of
|
||||
* other lvols that use the same bdev. BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE
|
||||
* is a good choice for the value of this parameter.
|
||||
*
|
||||
* If this parameter remains equal to zero, the bdev reset will be forcefully
|
||||
* sent down to the device, without any delays and waiting for outstanding IO. */
|
||||
uint16_t reset_io_drain_timeout;
|
||||
|
||||
/**
|
||||
* Pointer to the bdev module that registered this bdev.
|
||||
*/
|
||||
@ -629,6 +653,12 @@ struct spdk_bdev_io {
|
||||
struct {
|
||||
/** Channel reference held while messages for this reset are in progress. */
|
||||
struct spdk_io_channel *ch_ref;
|
||||
struct {
|
||||
/* Handle to timed poller that checks each channel for outstanding IO. */
|
||||
struct spdk_poller *poller;
|
||||
/* Store calculated time value, when a poller should stop its work. */
|
||||
uint64_t stop_time_tsc;
|
||||
} wait_poller;
|
||||
} reset;
|
||||
struct {
|
||||
/** The outstanding request matching bio_cb_arg which this abort attempts to cancel. */
|
||||
|
@ -54,6 +54,7 @@ int __itt_init_ittlib(const char *, __itt_group_id);
|
||||
* when splitting into children requests at a time.
|
||||
*/
|
||||
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
|
||||
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000
|
||||
|
||||
static const char *qos_rpc_type[] = {"rw_ios_per_sec",
|
||||
"rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
|
||||
@ -5276,16 +5277,92 @@ spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bdev_reset_poll_for_outstanding_io(void *ctx);
|
||||
|
||||
static void
|
||||
bdev_reset_dev(struct spdk_io_channel_iter *i, int status)
|
||||
bdev_reset_check_outstanding_io_done(struct spdk_io_channel_iter *i, int status)
|
||||
{
|
||||
struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
|
||||
struct spdk_bdev_io *bdev_io;
|
||||
|
||||
bdev_io = TAILQ_FIRST(&ch->queued_resets);
|
||||
|
||||
if (status == -EBUSY) {
|
||||
if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
|
||||
bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
|
||||
ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
|
||||
} else {
|
||||
/* If outstanding IOs are still present and reset_io_drain_timeout seconds passed,
|
||||
* start the reset. */
|
||||
TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
|
||||
bdev_io_submit_reset(bdev_io);
|
||||
}
|
||||
} else {
|
||||
TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
|
||||
SPDK_DEBUGLOG(bdev,
|
||||
"Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
|
||||
ch->bdev->name);
|
||||
/* Mark the completion status as a SUCCESS and complete the reset. */
|
||||
spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
bdev_reset_check_outstanding_io(struct spdk_io_channel_iter *i)
|
||||
{
|
||||
struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
|
||||
struct spdk_bdev_channel *cur_ch = spdk_io_channel_get_ctx(io_ch);
|
||||
int status = 0;
|
||||
|
||||
if (cur_ch->io_outstanding > 0) {
|
||||
/* If a channel has outstanding IO, set status to -EBUSY code. This will stop
|
||||
* further iteration over the rest of the channels and pass non-zero status
|
||||
* to the callback function. */
|
||||
status = -EBUSY;
|
||||
}
|
||||
spdk_for_each_channel_continue(i, status);
|
||||
}
|
||||
|
||||
static int
|
||||
bdev_reset_poll_for_outstanding_io(void *ctx)
|
||||
{
|
||||
struct spdk_bdev_channel *ch = ctx;
|
||||
struct spdk_bdev_io *bdev_io;
|
||||
|
||||
bdev_io = TAILQ_FIRST(&ch->queued_resets);
|
||||
|
||||
spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller);
|
||||
spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_check_outstanding_io,
|
||||
ch, bdev_reset_check_outstanding_io_done);
|
||||
|
||||
return SPDK_POLLER_BUSY;
|
||||
}
|
||||
|
||||
static void
|
||||
bdev_reset_freeze_channel_done(struct spdk_io_channel_iter *i, int status)
|
||||
{
|
||||
struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
|
||||
struct spdk_bdev *bdev = ch->bdev;
|
||||
struct spdk_bdev_io *bdev_io;
|
||||
|
||||
bdev_io = TAILQ_FIRST(&ch->queued_resets);
|
||||
|
||||
if (bdev->reset_io_drain_timeout == 0) {
|
||||
TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
|
||||
|
||||
bdev_io_submit_reset(bdev_io);
|
||||
return;
|
||||
}
|
||||
|
||||
bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
|
||||
(ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz());
|
||||
|
||||
/* In case bdev->reset_io_drain_timeout is not equal to zero,
|
||||
* submit the reset to the underlying module only if outstanding I/O
|
||||
* remain after reset_io_drain_timeout seconds have passed. */
|
||||
spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_check_outstanding_io,
|
||||
ch, bdev_reset_check_outstanding_io_done);
|
||||
}
|
||||
|
||||
static void
|
||||
bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
|
||||
@ -5331,7 +5408,7 @@ bdev_start_reset(void *ctx)
|
||||
struct spdk_bdev_channel *ch = ctx;
|
||||
|
||||
spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel,
|
||||
ch, bdev_reset_dev);
|
||||
ch, bdev_reset_freeze_channel_done);
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -1041,6 +1041,13 @@ _create_lvol_disk(struct spdk_lvol *lvol, bool destroy)
|
||||
bdev->fn_table = &vbdev_lvol_fn_table;
|
||||
bdev->module = &g_lvol_if;
|
||||
|
||||
/* Set default bdev reset waiting time. This value indicates how much
|
||||
* time a reset should wait before forcing a reset down to the underlying
|
||||
* bdev module.
|
||||
* Setting this parameter is mainly to avoid "empty" resets to a shared
|
||||
* bdev that may be used by multiple lvols. */
|
||||
bdev->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
|
||||
|
||||
rc = spdk_bdev_register(bdev);
|
||||
if (rc) {
|
||||
free(lvol_bdev);
|
||||
|
@ -486,6 +486,8 @@ aborted_reset_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
|
||||
spdk_bdev_free_io(bdev_io);
|
||||
}
|
||||
|
||||
static void io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
|
||||
|
||||
static void
|
||||
aborted_reset(void)
|
||||
{
|
||||
@ -542,6 +544,58 @@ aborted_reset(void)
|
||||
teardown_test();
|
||||
}
|
||||
|
||||
static void
|
||||
aborted_reset_no_outstanding_io(void)
|
||||
{
|
||||
struct spdk_io_channel *io_ch[2];
|
||||
struct spdk_bdev_channel *bdev_ch[2];
|
||||
struct spdk_bdev *bdev[2];
|
||||
enum spdk_bdev_io_status status1 = SPDK_BDEV_IO_STATUS_PENDING,
|
||||
status2 = SPDK_BDEV_IO_STATUS_PENDING;
|
||||
|
||||
setup_test();
|
||||
|
||||
/*
|
||||
* This time we test the reset without any outstanding IO
|
||||
* present on the bdev channel, so both resets should finish
|
||||
* immediately.
|
||||
*/
|
||||
|
||||
set_thread(0);
|
||||
/* Set reset_io_drain_timeout to allow bdev
|
||||
* reset to stay pending until we call abort. */
|
||||
io_ch[0] = spdk_bdev_get_io_channel(g_desc);
|
||||
bdev_ch[0] = spdk_io_channel_get_ctx(io_ch[0]);
|
||||
bdev[0] = bdev_ch[0]->bdev;
|
||||
bdev[0]->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
|
||||
CU_ASSERT(io_ch[0] != NULL);
|
||||
spdk_bdev_reset(g_desc, io_ch[0], aborted_reset_done, &status1);
|
||||
poll_threads();
|
||||
CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
|
||||
CU_ASSERT(status1 == SPDK_BDEV_IO_STATUS_SUCCESS);
|
||||
spdk_put_io_channel(io_ch[0]);
|
||||
|
||||
set_thread(1);
|
||||
/* Set reset_io_drain_timeout to allow bdev
|
||||
* reset to stay pending until we call abort. */
|
||||
io_ch[1] = spdk_bdev_get_io_channel(g_desc);
|
||||
bdev_ch[1] = spdk_io_channel_get_ctx(io_ch[1]);
|
||||
bdev[1] = bdev_ch[1]->bdev;
|
||||
bdev[1]->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
|
||||
CU_ASSERT(io_ch[1] != NULL);
|
||||
spdk_bdev_reset(g_desc, io_ch[1], aborted_reset_done, &status2);
|
||||
poll_threads();
|
||||
CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
|
||||
CU_ASSERT(status2 == SPDK_BDEV_IO_STATUS_SUCCESS);
|
||||
spdk_put_io_channel(io_ch[1]);
|
||||
|
||||
stub_complete_io(g_bdev.io_target, 0);
|
||||
poll_threads();
|
||||
|
||||
teardown_test();
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
io_during_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
|
||||
{
|
||||
@ -655,6 +709,162 @@ io_during_reset(void)
|
||||
teardown_test();
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
count_queued_resets(void *io_target)
|
||||
{
|
||||
struct spdk_io_channel *_ch = spdk_get_io_channel(io_target);
|
||||
struct ut_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
|
||||
struct spdk_bdev_io *io;
|
||||
uint32_t submitted_resets = 0;
|
||||
|
||||
TAILQ_FOREACH(io, &ch->outstanding_io, module_link) {
|
||||
if (io->type == SPDK_BDEV_IO_TYPE_RESET) {
|
||||
submitted_resets++;
|
||||
}
|
||||
}
|
||||
|
||||
spdk_put_io_channel(_ch);
|
||||
|
||||
return submitted_resets;
|
||||
}
|
||||
|
||||
static void
|
||||
reset_completions(void)
|
||||
{
|
||||
struct spdk_io_channel *io_ch;
|
||||
struct spdk_bdev_channel *bdev_ch;
|
||||
struct spdk_bdev *bdev;
|
||||
enum spdk_bdev_io_status status0, status_reset;
|
||||
int rc, iter;
|
||||
|
||||
setup_test();
|
||||
|
||||
/* This test covers four test cases:
|
||||
* 1) reset_io_drain_timeout of a bdev is greater than 0
|
||||
* 2) No outstandind IO are present on any bdev channel
|
||||
* 3) Outstanding IO finish during bdev reset
|
||||
* 4) Outstanding IO do not finish before reset is done waiting
|
||||
* for them.
|
||||
*
|
||||
* Above conditions mainly affect the timing of bdev reset completion
|
||||
* and whether a reset should be skipped via spdk_bdev_io_complete()
|
||||
* or sent down to the underlying bdev module via bdev_io_submit_reset(). */
|
||||
|
||||
/* Test preparation */
|
||||
set_thread(0);
|
||||
io_ch = spdk_bdev_get_io_channel(g_desc);
|
||||
bdev_ch = spdk_io_channel_get_ctx(io_ch);
|
||||
CU_ASSERT(bdev_ch->flags == 0);
|
||||
|
||||
|
||||
/* Test case 1) reset_io_drain_timeout set to 0. Reset should be sent down immediately. */
|
||||
bdev = &g_bdev.bdev;
|
||||
bdev->reset_io_drain_timeout = 0;
|
||||
|
||||
status_reset = SPDK_BDEV_IO_STATUS_PENDING;
|
||||
rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
|
||||
CU_ASSERT(rc == 0);
|
||||
poll_threads();
|
||||
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 1);
|
||||
|
||||
/* Call reset completion inside bdev module. */
|
||||
stub_complete_io(g_bdev.io_target, 0);
|
||||
poll_threads();
|
||||
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
|
||||
CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
|
||||
CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
|
||||
|
||||
|
||||
/* Test case 2) no outstanding IO are present. Reset should perform one iteration over
|
||||
* channels and then be skipped. */
|
||||
bdev->reset_io_drain_timeout = BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE;
|
||||
status_reset = SPDK_BDEV_IO_STATUS_PENDING;
|
||||
|
||||
rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
|
||||
CU_ASSERT(rc == 0);
|
||||
poll_threads();
|
||||
/* Reset was never submitted to the bdev module. */
|
||||
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
|
||||
CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
|
||||
CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
|
||||
|
||||
|
||||
/* Test case 3) outstanding IO finish during bdev reset procedure. Reset should initiate
|
||||
* wait poller to check for IO completions every second, until reset_io_drain_timeout is
|
||||
* reached, but finish earlier than this threshold. */
|
||||
status0 = SPDK_BDEV_IO_STATUS_PENDING;
|
||||
status_reset = SPDK_BDEV_IO_STATUS_PENDING;
|
||||
rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, io_during_io_done, &status0);
|
||||
CU_ASSERT(rc == 0);
|
||||
|
||||
rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
|
||||
CU_ASSERT(rc == 0);
|
||||
poll_threads();
|
||||
/* The reset just started and should not have been submitted yet. */
|
||||
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
|
||||
|
||||
poll_threads();
|
||||
CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_PENDING);
|
||||
/* Let the poller wait for about half the time then complete outstanding IO. */
|
||||
for (iter = 0; iter < 2; iter++) {
|
||||
/* Reset is still processing and not submitted at this point. */
|
||||
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
|
||||
spdk_delay_us(1000 * 1000);
|
||||
poll_threads();
|
||||
poll_threads();
|
||||
}
|
||||
CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_PENDING);
|
||||
stub_complete_io(g_bdev.io_target, 0);
|
||||
poll_threads();
|
||||
spdk_delay_us(BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
|
||||
poll_threads();
|
||||
poll_threads();
|
||||
CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
|
||||
/* Sending reset to the bdev module has been skipped. */
|
||||
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
|
||||
CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
|
||||
|
||||
|
||||
/* Test case 4) outstanding IO are still present after reset_io_drain_timeout
|
||||
* seconds have passed. */
|
||||
status0 = SPDK_BDEV_IO_STATUS_PENDING;
|
||||
status_reset = SPDK_BDEV_IO_STATUS_PENDING;
|
||||
rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, io_during_io_done, &status0);
|
||||
CU_ASSERT(rc == 0);
|
||||
|
||||
rc = spdk_bdev_reset(g_desc, io_ch, io_during_io_done, &status_reset);
|
||||
CU_ASSERT(rc == 0);
|
||||
poll_threads();
|
||||
/* The reset just started and should not have been submitted yet. */
|
||||
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
|
||||
|
||||
poll_threads();
|
||||
CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_PENDING);
|
||||
/* Let the poller wait for reset_io_drain_timeout seconds. */
|
||||
for (iter = 0; iter < bdev->reset_io_drain_timeout; iter++) {
|
||||
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 0);
|
||||
spdk_delay_us(BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
|
||||
poll_threads();
|
||||
poll_threads();
|
||||
}
|
||||
|
||||
/* After timing out, the reset should have been sent to the module. */
|
||||
CU_ASSERT(count_queued_resets(g_bdev.io_target) == 1);
|
||||
/* Complete reset submitted to the module and the read IO. */
|
||||
stub_complete_io(g_bdev.io_target, 0);
|
||||
poll_threads();
|
||||
CU_ASSERT(status_reset == SPDK_BDEV_IO_STATUS_SUCCESS);
|
||||
CU_ASSERT(g_bdev.bdev.internal.reset_in_progress == NULL);
|
||||
|
||||
|
||||
/* Destroy the channel and end the test. */
|
||||
spdk_put_io_channel(io_ch);
|
||||
poll_threads();
|
||||
|
||||
teardown_test();
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
basic_qos(void)
|
||||
{
|
||||
@ -2092,7 +2302,9 @@ main(int argc, char **argv)
|
||||
CU_ADD_TEST(suite, basic_qos);
|
||||
CU_ADD_TEST(suite, put_channel_during_reset);
|
||||
CU_ADD_TEST(suite, aborted_reset);
|
||||
CU_ADD_TEST(suite, aborted_reset_no_outstanding_io);
|
||||
CU_ADD_TEST(suite, io_during_reset);
|
||||
CU_ADD_TEST(suite, reset_completions);
|
||||
CU_ADD_TEST(suite, io_during_qos_queue);
|
||||
CU_ADD_TEST(suite, io_during_qos_reset);
|
||||
CU_ADD_TEST(suite, enomem);
|
||||
|
Loading…
Reference in New Issue
Block a user