bdev: split writes based on write_unit_size

Add a new bdev property, split_on_write_unit, which, if set to true, causes
writes to be split to match write_unit_size and to fail if they are not
aligned to, or not a multiple of, write_unit_size.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Change-Id: Id49f58a3288ddf5cfe4921ce4020ae4bcdd67298
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/11390
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com>
Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
This commit is contained in:
Artur Paszkiewicz 2022-01-18 17:14:52 +01:00 committed by Tomasz Zawadzki
parent c89e20084b
commit d6e9827e9f
3 changed files with 183 additions and 6 deletions

View File

@ -309,6 +309,20 @@ struct spdk_bdev {
/** Number of blocks */
uint64_t blockcnt;
/**
* Specifies whether the write_unit_size is mandatory or
* only advisory. If set to true, the bdev layer will split
* WRITE I/Os that span a write_unit_size boundary before
* submitting them to the bdev module.
*
* This field takes precedence over split_on_optimal_io_boundary
* for WRITE I/O if both are set to true.
*
* Note that this field cannot be used to force splitting of
* UNMAP, WRITE_ZEROES or FLUSH I/O.
*/
bool split_on_write_unit;
/** Number of blocks required for write */
uint32_t write_unit_size;

View File

@ -2166,6 +2166,14 @@ bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_i
}
}
if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE &&
bdev_io->bdev->split_on_write_unit &&
bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) {
SPDK_ERRLOG("IO does not match the write_unit_size\n");
_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
return;
}
if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
bdev_ch->io_outstanding++;
shared_resource->io_outstanding++;
@ -2242,11 +2250,18 @@ bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb
static bool
bdev_rw_should_split(struct spdk_bdev_io *bdev_io)
{
uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary;
uint32_t max_size = bdev_io->bdev->max_segment_size;
int max_segs = bdev_io->bdev->max_num_segments;
uint32_t io_boundary;
struct spdk_bdev *bdev = bdev_io->bdev;
uint32_t max_size = bdev->max_segment_size;
int max_segs = bdev->max_num_segments;
io_boundary = bdev_io->bdev->split_on_optimal_io_boundary ? io_boundary : 0;
if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
io_boundary = bdev->write_unit_size;
} else if (bdev->split_on_optimal_io_boundary) {
io_boundary = bdev->optimal_io_boundary;
} else {
io_boundary = 0;
}
if (spdk_likely(!io_boundary && !max_segs && !max_size)) {
return false;
@ -2449,7 +2464,7 @@ _bdev_rw_split(void *_bdev_io)
uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes;
uint32_t iovcnt, iov_len, child_iovsize;
uint32_t blocklen = bdev->blocklen;
uint32_t io_boundary = bdev->optimal_io_boundary;
uint32_t io_boundary;
uint32_t max_segment_size = bdev->max_segment_size;
uint32_t max_child_iovcnt = bdev->max_num_segments;
void *md_buf = NULL;
@ -2458,7 +2473,14 @@ _bdev_rw_split(void *_bdev_io)
max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX;
max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, BDEV_IO_NUM_CHILD_IOV) :
BDEV_IO_NUM_CHILD_IOV;
io_boundary = bdev->split_on_optimal_io_boundary ? io_boundary : UINT32_MAX;
if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
io_boundary = bdev->write_unit_size;
} else if (bdev->split_on_optimal_io_boundary) {
io_boundary = bdev->optimal_io_boundary;
} else {
io_boundary = UINT32_MAX;
}
remaining = bdev_io->u.bdev.split_remaining_num_blocks;
current_offset = bdev_io->u.bdev.split_current_offset_blocks;

View File

@ -1249,6 +1249,25 @@ bdev_io_spans_split_test(void)
/* Exceed max_sizes */
CU_ASSERT(bdev_io_should_split(&bdev_io) == true);
bdev.max_segment_size = 0;
bdev.write_unit_size = 32;
bdev.split_on_write_unit = true;
bdev_io.type = SPDK_BDEV_IO_TYPE_WRITE;
/* This I/O is one write unit */
CU_ASSERT(bdev_io_should_split(&bdev_io) == false);
bdev_io.u.bdev.num_blocks = 32 * 2;
/* This I/O is more than one write unit */
CU_ASSERT(bdev_io_should_split(&bdev_io) == true);
bdev_io.u.bdev.offset_blocks = 1;
bdev_io.u.bdev.num_blocks = 32;
/* This I/O is not aligned to write unit size */
CU_ASSERT(bdev_io_should_split(&bdev_io) == true);
}
static void
@ -2846,6 +2865,127 @@ bdev_io_split_with_io_wait(void)
poll_threads();
}
/*
 * Unit test for WRITE splitting driven by bdev->split_on_write_unit.
 *
 * Scenarios exercised, in order:
 *  1. A write covering two write units is split into two child I/Os.
 *  2. write_unit_size takes precedence over a smaller optimal_io_boundary.
 *  3. A write shorter than write_unit_size completes with FAILED status.
 *  4. A write not aligned to write_unit_size completes with FAILED status.
 *  5. A write whose single write unit needs more iovs than a child I/O can
 *     carry completes with FAILED status.
 *
 * The scenarios share one bdev/descriptor/channel and mutate the bdev's
 * fields as they go, so their order matters.
 */
static void
bdev_io_write_unit_split_test(void)
{
struct spdk_bdev *bdev;
struct spdk_bdev_desc *desc = NULL;
struct spdk_io_channel *io_ch;
struct spdk_bdev_opts bdev_opts = {};
struct iovec iov[BDEV_IO_NUM_CHILD_IOV * 4];
struct ut_expected_io *expected_io;
uint64_t i;
int rc;
/* Start from the current bdev options and override only the bdev_io pool and cache sizes. */
spdk_bdev_get_opts(&bdev_opts, sizeof(bdev_opts));
bdev_opts.bdev_io_pool_size = 512;
bdev_opts.bdev_io_cache_size = 64;
rc = spdk_bdev_set_opts(&bdev_opts);
CU_ASSERT(rc == 0);
/* Bring up the bdev layer, then open a writable descriptor and an I/O channel on a test bdev. */
spdk_bdev_initialize(bdev_init_cb, NULL);
bdev = allocate_bdev("bdev0");
rc = spdk_bdev_open_ext(bdev->name, true, bdev_ut_event_cb, NULL, &desc);
CU_ASSERT(rc == 0);
/* Fatal assert: desc is dereferenced by every call below. */
SPDK_CU_ASSERT_FATAL(desc != NULL);
io_ch = spdk_bdev_get_io_channel(desc);
CU_ASSERT(io_ch != NULL);
/* Write I/O 2x larger than write_unit_size should get split into 2 I/Os */
bdev->write_unit_size = 32;
bdev->split_on_write_unit = true;
g_io_done = false;
/* First expected child: blocks [0, 32), one iov starting at the parent buffer. */
expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_WRITE, 0, 32, 1);
ut_expected_io_set_iov(expected_io, 0, (void *)0xF000, 32 * 512);
TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
/* Second expected child: blocks [32, 64), buffer advanced by 32 * 512 bytes. */
expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_WRITE, 32, 32, 1);
ut_expected_io_set_iov(expected_io, 0, (void *)(0xF000 + 32 * 512), 32 * 512);
TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
rc = spdk_bdev_write_blocks(desc, io_ch, (void *)0xF000, 0, 64, io_done, NULL);
CU_ASSERT(rc == 0);
/* The parent must not complete until both children have been completed. */
CU_ASSERT(g_io_done == false);
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 2);
stub_complete_io(2);
CU_ASSERT(g_io_done == true);
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_SUCCESS);
/* Same as above but with optimal_io_boundary < write_unit_size - the I/O should be split
 * based on write_unit_size, not optimal_io_boundary. Splitting on the 16-block boundary
 * would yield 4 children for a 64-block write; the test asserts exactly 2.
 *
 * NOTE(review): no ut_expected_io entries are queued for this resubmission, so only the
 * child count is asserted here - confirm whether the stub validates offsets/lengths when
 * its expected_io list is empty. */
bdev->split_on_optimal_io_boundary = true;
bdev->optimal_io_boundary = 16;
g_io_done = false;
rc = spdk_bdev_write_blocks(desc, io_ch, (void *)0xF000, 0, 64, io_done, NULL);
CU_ASSERT(rc == 0);
CU_ASSERT(g_io_done == false);
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 2);
stub_complete_io(2);
CU_ASSERT(g_io_done == true);
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_SUCCESS);
/* Write I/O should fail if it is smaller than write_unit_size.
 * The submit call itself still returns 0; the test expects nothing to reach the stub
 * (outstanding_io_count stays 0) and the FAILED completion to be observed only after
 * poll_threads(). */
g_io_done = false;
rc = spdk_bdev_write_blocks(desc, io_ch, (void *)0xF000, 0, 31, io_done, NULL);
CU_ASSERT(rc == 0);
CU_ASSERT(g_io_done == false);
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
poll_threads();
CU_ASSERT(g_io_done == true);
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_FAILED);
/* Same for I/O not aligned to write_unit_size: 32 blocks starting at block 1. */
g_io_done = false;
rc = spdk_bdev_write_blocks(desc, io_ch, (void *)0xF000, 1, 32, io_done, NULL);
CU_ASSERT(rc == 0);
CU_ASSERT(g_io_done == false);
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
poll_threads();
CU_ASSERT(g_io_done == true);
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_FAILED);
/* Write should fail if it needs to be split but there are not enough iovs to submit
 * an entire write unit.
 * write_unit_size becomes BDEV_IO_NUM_CHILD_IOV * 2 blocks (half of the iov array length),
 * while every iov below is only 512 bytes long.
 * NOTE(review): presumably the test bdev uses 512-byte blocks, making each iov exactly one
 * block - confirm against allocate_bdev(). */
bdev->write_unit_size = SPDK_COUNTOF(iov) / 2;
g_io_done = false;
for (i = 0; i < SPDK_COUNTOF(iov); i++) {
iov[i].iov_base = (void *)(0x1000 + 512 * i);
iov[i].iov_len = 512;
}
/* Submit one block per iov, i.e. SPDK_COUNTOF(iov) blocks = two write units. */
rc = spdk_bdev_writev_blocks(desc, io_ch, iov, SPDK_COUNTOF(iov), 0, SPDK_COUNTOF(iov),
io_done, NULL);
CU_ASSERT(rc == 0);
CU_ASSERT(g_io_done == false);
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
poll_threads();
CU_ASSERT(g_io_done == true);
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_FAILED);
/* Tear down in reverse order of setup: channel, descriptor, bdev, then the bdev layer. */
spdk_put_io_channel(io_ch);
spdk_bdev_close(desc);
free_bdev(bdev);
spdk_bdev_finish(bdev_fini_cb, NULL);
poll_threads();
}
static void
bdev_io_alignment(void)
{
@ -5874,6 +6014,7 @@ main(int argc, char **argv)
CU_ADD_TEST(suite, bdev_io_max_size_and_segment_split_test);
CU_ADD_TEST(suite, bdev_io_mix_split_test);
CU_ADD_TEST(suite, bdev_io_split_with_io_wait);
CU_ADD_TEST(suite, bdev_io_write_unit_split_test);
CU_ADD_TEST(suite, bdev_io_alignment_with_boundary);
CU_ADD_TEST(suite, bdev_io_alignment);
CU_ADD_TEST(suite, bdev_histograms);