diff --git a/include/spdk/bdev_module.h b/include/spdk/bdev_module.h index 53d642f26..1cacc7822 100644 --- a/include/spdk/bdev_module.h +++ b/include/spdk/bdev_module.h @@ -309,6 +309,20 @@ struct spdk_bdev { /** Number of blocks */ uint64_t blockcnt; + /** + * Specifies whether the write_unit_size is mandatory or only advisory. If set to true, + * the bdev layer will split WRITE I/Os that span a write_unit_size boundary before + * submitting them to the bdev module, and will fail any WRITE I/O that ends up smaller + * than write_unit_size (e.g. too short or unaligned) instead of submitting it. + * + * This field takes precedence over split_on_optimal_io_boundary + * for WRITE I/O if both are set to true. + * + * Note that this field cannot be used to force splitting of + * UNMAP, WRITE_ZEROES or FLUSH I/O. + */ + bool split_on_write_unit; + /** Number of blocks required for write */ uint32_t write_unit_size; diff --git a/lib/bdev/bdev.c b/lib/bdev/bdev.c index e3570c0e5..c21f50a0f 100644 --- a/lib/bdev/bdev.c +++ b/lib/bdev/bdev.c @@ -2166,6 +2166,14 @@ bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_i } } + if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && + bdev_io->bdev->split_on_write_unit && + bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { + SPDK_ERRLOG("IO does not match the write_unit_size\n"); + _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { bdev_ch->io_outstanding++; shared_resource->io_outstanding++; @@ -2242,11 +2250,18 @@ bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb static bool bdev_rw_should_split(struct spdk_bdev_io *bdev_io) { - uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary; - uint32_t max_size = bdev_io->bdev->max_segment_size; - int max_segs = bdev_io->bdev->max_num_segments; + uint32_t io_boundary; + struct spdk_bdev *bdev = bdev_io->bdev; + uint32_t max_size = bdev->max_segment_size; + int max_segs = bdev->max_num_segments; - io_boundary = 
bdev_io->bdev->split_on_optimal_io_boundary ? io_boundary : 0; + if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { + io_boundary = bdev->write_unit_size; + } else if (bdev->split_on_optimal_io_boundary) { + io_boundary = bdev->optimal_io_boundary; + } else { + io_boundary = 0; + } if (spdk_likely(!io_boundary && !max_segs && !max_size)) { return false; @@ -2449,7 +2464,7 @@ _bdev_rw_split(void *_bdev_io) uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; uint32_t iovcnt, iov_len, child_iovsize; uint32_t blocklen = bdev->blocklen; - uint32_t io_boundary = bdev->optimal_io_boundary; + uint32_t io_boundary; uint32_t max_segment_size = bdev->max_segment_size; uint32_t max_child_iovcnt = bdev->max_num_segments; void *md_buf = NULL; @@ -2458,7 +2473,14 @@ _bdev_rw_split(void *_bdev_io) max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, BDEV_IO_NUM_CHILD_IOV) : BDEV_IO_NUM_CHILD_IOV; - io_boundary = bdev->split_on_optimal_io_boundary ? 
io_boundary : UINT32_MAX; + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { + io_boundary = bdev->write_unit_size; + } else if (bdev->split_on_optimal_io_boundary) { + io_boundary = bdev->optimal_io_boundary; + } else { + io_boundary = UINT32_MAX; + } remaining = bdev_io->u.bdev.split_remaining_num_blocks; current_offset = bdev_io->u.bdev.split_current_offset_blocks; diff --git a/test/unit/lib/bdev/bdev.c/bdev_ut.c b/test/unit/lib/bdev/bdev.c/bdev_ut.c index 36589055a..9ad3af7a8 100644 --- a/test/unit/lib/bdev/bdev.c/bdev_ut.c +++ b/test/unit/lib/bdev/bdev.c/bdev_ut.c @@ -1249,6 +1249,25 @@ bdev_io_spans_split_test(void) /* Exceed max_sizes */ CU_ASSERT(bdev_io_should_split(&bdev_io) == true); + + bdev.max_segment_size = 0; + bdev.write_unit_size = 32; + bdev.split_on_write_unit = true; + bdev_io.type = SPDK_BDEV_IO_TYPE_WRITE; + + /* This I/O is one write unit */ + CU_ASSERT(bdev_io_should_split(&bdev_io) == false); + + bdev_io.u.bdev.num_blocks = 32 * 2; + + /* This I/O is more than one write unit */ + CU_ASSERT(bdev_io_should_split(&bdev_io) == true); + + bdev_io.u.bdev.offset_blocks = 1; + bdev_io.u.bdev.num_blocks = 32; + + /* This I/O is not aligned to write unit size */ + CU_ASSERT(bdev_io_should_split(&bdev_io) == true); } static void @@ -2846,6 +2865,127 @@ bdev_io_split_with_io_wait(void) poll_threads(); } +static void +bdev_io_write_unit_split_test(void) +{ + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc = NULL; + struct spdk_io_channel *io_ch; + struct spdk_bdev_opts bdev_opts = {}; + struct iovec iov[BDEV_IO_NUM_CHILD_IOV * 4]; + struct ut_expected_io *expected_io; + uint64_t i; + int rc; + + spdk_bdev_get_opts(&bdev_opts, sizeof(bdev_opts)); + bdev_opts.bdev_io_pool_size = 512; + bdev_opts.bdev_io_cache_size = 64; + + rc = spdk_bdev_set_opts(&bdev_opts); + CU_ASSERT(rc == 0); + spdk_bdev_initialize(bdev_init_cb, NULL); + + bdev = allocate_bdev("bdev0"); + + rc = spdk_bdev_open_ext(bdev->name, true, 
bdev_ut_event_cb, NULL, &desc); + CU_ASSERT(rc == 0); + SPDK_CU_ASSERT_FATAL(desc != NULL); + io_ch = spdk_bdev_get_io_channel(desc); + CU_ASSERT(io_ch != NULL); + + /* Write I/O 2x larger than write_unit_size should get split into 2 I/Os */ + bdev->write_unit_size = 32; + bdev->split_on_write_unit = true; + g_io_done = false; + + expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_WRITE, 0, 32, 1); + ut_expected_io_set_iov(expected_io, 0, (void *)0xF000, 32 * 512); + TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link); + + expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_WRITE, 32, 32, 1); + ut_expected_io_set_iov(expected_io, 0, (void *)(0xF000 + 32 * 512), 32 * 512); + TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link); + + rc = spdk_bdev_write_blocks(desc, io_ch, (void *)0xF000, 0, 64, io_done, NULL); + CU_ASSERT(rc == 0); + CU_ASSERT(g_io_done == false); + + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 2); + stub_complete_io(2); + CU_ASSERT(g_io_done == true); + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0); + CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_SUCCESS); + + /* Same as above but with optimal_io_boundary < write_unit_size - the I/O should be split + * based on write_unit_size, not optimal_io_boundary */ + bdev->split_on_optimal_io_boundary = true; + bdev->optimal_io_boundary = 16; + g_io_done = false; + + rc = spdk_bdev_write_blocks(desc, io_ch, (void *)0xF000, 0, 64, io_done, NULL); + CU_ASSERT(rc == 0); + CU_ASSERT(g_io_done == false); + + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 2); + stub_complete_io(2); + CU_ASSERT(g_io_done == true); + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0); + CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_SUCCESS); + + /* Write I/O should fail if it is smaller than write_unit_size */ + g_io_done = false; + + rc = spdk_bdev_write_blocks(desc, io_ch, (void *)0xF000, 0, 31, io_done, NULL); + CU_ASSERT(rc == 0); + CU_ASSERT(g_io_done == false); + 
+ CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0); + poll_threads(); + CU_ASSERT(g_io_done == true); + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0); + CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_FAILED); + + /* Write I/O should also fail if it is not aligned to write_unit_size */ + g_io_done = false; + + rc = spdk_bdev_write_blocks(desc, io_ch, (void *)0xF000, 1, 32, io_done, NULL); + CU_ASSERT(rc == 0); + CU_ASSERT(g_io_done == false); + + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0); + poll_threads(); + CU_ASSERT(g_io_done == true); + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0); + CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_FAILED); + + /* Write should fail if it needs to be split but there are not enough iovs to submit + * an entire write unit */ + bdev->write_unit_size = SPDK_COUNTOF(iov) / 2; + g_io_done = false; + + for (i = 0; i < SPDK_COUNTOF(iov); i++) { + iov[i].iov_base = (void *)(0x1000 + 512 * i); + iov[i].iov_len = 512; + } + + rc = spdk_bdev_writev_blocks(desc, io_ch, iov, SPDK_COUNTOF(iov), 0, SPDK_COUNTOF(iov), + io_done, NULL); + CU_ASSERT(rc == 0); + CU_ASSERT(g_io_done == false); + + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0); + poll_threads(); + CU_ASSERT(g_io_done == true); + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0); + CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_FAILED); + + spdk_put_io_channel(io_ch); + spdk_bdev_close(desc); + free_bdev(bdev); + spdk_bdev_finish(bdev_fini_cb, NULL); + poll_threads(); +} + static void bdev_io_alignment(void) { @@ -5874,6 +6014,7 @@ main(int argc, char **argv) CU_ADD_TEST(suite, bdev_io_max_size_and_segment_split_test); CU_ADD_TEST(suite, bdev_io_mix_split_test); CU_ADD_TEST(suite, bdev_io_split_with_io_wait); + CU_ADD_TEST(suite, bdev_io_write_unit_split_test); CU_ADD_TEST(suite, bdev_io_alignment_with_boundary); CU_ADD_TEST(suite, bdev_io_alignment); CU_ADD_TEST(suite, bdev_histograms);