From 5616c1ed9c3d8356d05106d36f28f69617a6c8e4 Mon Sep 17 00:00:00 2001 From: Shuhei Matsumoto Date: Thu, 4 Oct 2018 16:44:55 +0900 Subject: [PATCH] bdev: Change split IOV submission from sequential to batch Large read I/O will be typical in some use cases such as web stream services. On the other hand, large write I/O may not be typical but will be sufficiently probable. Currently when large I/O is submitted to the RAID bdev, the I/O will be divided by the strip size of it and then divided I/Os are submitted sequentially. This patch tries to improve the performance of the RAID bdev in large I/Os. Besides, when the RAID bdev supports higher levels of RAID (such as RAID5), it should issue multiple I/Os to multiple base bdevs by batch fasion in the parity update. Having experience in batched I/O will be helpful in the future case too. In this patch, submit split I/Os by batch until all child IOVs are consumed or all data are submitted. If all child IOVs are consumed before all data are submitted, wait until all batched split I/Os complete and then submit again. In this patch, test code is added too. Change-Id: If6cd81cc0c306e3875a93c39dbe4288723b78937 Signed-off-by: Shuhei Matsumoto Reviewed-on: https://review.gerrithub.io/424770 Tested-by: SPDK CI Jenkins Chandler-Test-Pool: SPDK Automated Test System Reviewed-by: Changpeng Liu Reviewed-by: Ben Walker Reviewed-by: Darek Stojaczyk Reviewed-by: Jim Harris --- include/spdk/bdev_module.h | 3 + lib/bdev/bdev.c | 143 +++++++++++++++++----------- test/unit/lib/bdev/bdev.c/bdev_ut.c | 86 ++++++++++++----- 3 files changed, 154 insertions(+), 78 deletions(-) diff --git a/include/spdk/bdev_module.h b/include/spdk/bdev_module.h index 0cfcbbbc3..d88be7adf 100644 --- a/include/spdk/bdev_module.h +++ b/include/spdk/bdev_module.h @@ -395,6 +395,9 @@ struct spdk_bdev_io { /** current offset of the split I/O in the bdev */ uint64_t split_current_offset_blocks; + + /** count of outstanding batched split I/Os */ + uint32_t split_outstanding; } bdev; struct { /** Channel reference held while messages for this reset are in progress. */ diff --git a/lib/bdev/bdev.c b/lib/bdev/bdev.c index 09df169be..3d4808252 100644 --- a/lib/bdev/bdev.c +++ b/lib/bdev/bdev.c @@ -1238,9 +1238,9 @@ _spdk_bdev_io_split_with_payload(void *_bdev_io) struct spdk_bdev_io *bdev_io = _bdev_io; uint64_t current_offset, remaining; uint32_t blocklen, to_next_boundary, to_next_boundary_bytes; - struct iovec *parent_iov; - uint64_t parent_iov_offset, child_iov_len; - uint32_t parent_iovpos, parent_iovcnt, child_iovcnt; + struct iovec *parent_iov, *iov; + uint64_t parent_iov_offset, iov_len; + uint32_t parent_iovpos, parent_iovcnt, child_iovcnt, iovcnt; int rc; remaining = bdev_io->u.bdev.split_remaining_num_blocks; @@ -1257,59 +1257,85 @@ _spdk_bdev_io_split_with_payload(void *_bdev_io) parent_iov_offset -= parent_iov->iov_len; } - to_next_boundary = _to_next_boundary(current_offset, bdev_io->bdev->optimal_io_boundary); - to_next_boundary = spdk_min(remaining, to_next_boundary); - to_next_boundary_bytes = to_next_boundary * blocklen; child_iovcnt = 0; - while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && - child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { - parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; - child_iov_len = spdk_min(to_next_boundary_bytes, parent_iov->iov_len - parent_iov_offset); - to_next_boundary_bytes -= child_iov_len; + while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { + to_next_boundary = _to_next_boundary(current_offset, bdev_io->bdev->optimal_io_boundary); + to_next_boundary = spdk_min(remaining, to_next_boundary); + to_next_boundary_bytes = to_next_boundary * blocklen; + iov = &bdev_io->child_iov[child_iovcnt]; + iovcnt = 0; + while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && + child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { + parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; + iov_len = spdk_min(to_next_boundary_bytes, parent_iov->iov_len - parent_iov_offset); + to_next_boundary_bytes -= iov_len; - bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; - bdev_io->child_iov[child_iovcnt].iov_len = child_iov_len; + bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; + bdev_io->child_iov[child_iovcnt].iov_len = iov_len; - parent_iovpos++; - parent_iov_offset = 0; - child_iovcnt++; - } + if (iov_len < parent_iov->iov_len - parent_iov_offset) { + parent_iov_offset += iov_len; + } else { + parent_iovpos++; + parent_iov_offset = 0; + } + child_iovcnt++; + iovcnt++; + } + + if (to_next_boundary_bytes > 0) { + /* We had to stop this child I/O early because we ran out of + * child_iov space. Make sure the iovs collected are valid and + * then adjust to_next_boundary before starting the child I/O. + */ + if ((to_next_boundary_bytes % blocklen) != 0) { + SPDK_ERRLOG("Remaining %" PRIu32 " is not multiple of block size %" PRIu32 "\n", + to_next_boundary_bytes, blocklen); + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + if (bdev_io->u.bdev.split_outstanding == 0) { + bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + } + return; + } + to_next_boundary -= to_next_boundary_bytes / blocklen; + } + + bdev_io->u.bdev.split_outstanding++; + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { + rc = spdk_bdev_readv_blocks(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), + iov, iovcnt, current_offset, to_next_boundary, + _spdk_bdev_io_split_done, bdev_io); + } else { + rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), + iov, iovcnt, current_offset, to_next_boundary, + _spdk_bdev_io_split_done, bdev_io); + } + + if (rc == 0) { + current_offset += to_next_boundary; + remaining -= to_next_boundary; + bdev_io->u.bdev.split_current_offset_blocks = current_offset; + bdev_io->u.bdev.split_remaining_num_blocks = remaining; + } else { + bdev_io->u.bdev.split_outstanding--; + if (rc == -ENOMEM) { + if (bdev_io->u.bdev.split_outstanding == 0) { + /* No I/O is outstanding. Hence we should wait here. */ + _spdk_bdev_queue_io_wait_with_cb(bdev_io, + _spdk_bdev_io_split_with_payload); + } + } else { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + if (bdev_io->u.bdev.split_outstanding == 0) { + bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + } + } - if (to_next_boundary_bytes > 0) { - /* We had to stop this child I/O early because we ran out of - * child_iov space. Make sure the iovs collected are valid and - * then adjust to_next_boundary before starting the child I/O. - */ - if ((to_next_boundary_bytes % blocklen) != 0) { - SPDK_ERRLOG("Remaining %" PRIu32 " is not multiple of block size %" PRIu32 "\n", - to_next_boundary_bytes, blocklen); - bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; - bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); return; } - to_next_boundary -= to_next_boundary_bytes / blocklen; - } - - if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { - rc = spdk_bdev_readv_blocks(bdev_io->internal.desc, - spdk_io_channel_from_ctx(bdev_io->internal.ch), - bdev_io->child_iov, child_iovcnt, current_offset, to_next_boundary, - _spdk_bdev_io_split_done, bdev_io); - } else { - rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, - spdk_io_channel_from_ctx(bdev_io->internal.ch), - bdev_io->child_iov, child_iovcnt, current_offset, to_next_boundary, - _spdk_bdev_io_split_done, bdev_io); - } - - if (rc == 0) { - bdev_io->u.bdev.split_current_offset_blocks += to_next_boundary; - bdev_io->u.bdev.split_remaining_num_blocks -= to_next_boundary; - } else if (rc == -ENOMEM) { - _spdk_bdev_queue_io_wait_with_cb(bdev_io, _spdk_bdev_io_split_with_payload); - } else { - bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; - bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); } } @@ -1322,13 +1348,20 @@ _spdk_bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_ar if (!success) { parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; - parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); + } + parent_io->u.bdev.split_outstanding--; + if (parent_io->u.bdev.split_outstanding != 0) { return; } - if (parent_io->u.bdev.split_remaining_num_blocks == 0) { - parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; - parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); + /* + * Parent I/O finishes when all blocks are consumed or there is any failure of + * child I/O and no outstanding child I/O. + */ + if (parent_io->u.bdev.split_remaining_num_blocks == 0 || + parent_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS) { + parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, + parent_io->internal.caller_ctx); return; } @@ -1346,6 +1379,8 @@ _spdk_bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; + bdev_io->u.bdev.split_outstanding = 0; + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; _spdk_bdev_io_split_with_payload(bdev_io); } diff --git a/test/unit/lib/bdev/bdev.c/bdev_ut.c b/test/unit/lib/bdev/bdev.c/bdev_ut.c index e2071fd29..3c14f7124 100644 --- a/test/unit/lib/bdev/bdev.c/bdev_ut.c +++ b/test/unit/lib/bdev/bdev.c/bdev_ut.c @@ -300,7 +300,7 @@ allocate_bdev(char *name) bdev->name = name; bdev->fn_table = &fn_table; bdev->module = &bdev_ut_if; - bdev->blockcnt = 256; + bdev->blockcnt = 1024; bdev->blocklen = 512; rc = spdk_bdev_register(bdev); @@ -891,19 +891,10 @@ bdev_io_split(void) CU_ASSERT(rc == 0); CU_ASSERT(g_io_done == false); - /* The second child will get submitted once the first child is completed by - * stub_complete_io(). - */ - CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1); - stub_complete_io(1); - CU_ASSERT(g_io_done == false); - - /* Complete the second child I/O. This should result in our callback getting - * invoked since the parent I/O is now complete. - */ - CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1); - stub_complete_io(1); + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 2); + stub_complete_io(2); CU_ASSERT(g_io_done == true); + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0); /* Now set up a more complex, multi-vector command that needs to be split, * including splitting iovecs. @@ -934,16 +925,8 @@ bdev_io_split(void) CU_ASSERT(rc == 0); CU_ASSERT(g_io_done == false); - CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1); - stub_complete_io(1); - CU_ASSERT(g_io_done == false); - - CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1); - stub_complete_io(1); - CU_ASSERT(g_io_done == false); - - CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1); - stub_complete_io(1); + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 3); + stub_complete_io(3); CU_ASSERT(g_io_done == true); /* Test multi vector command that needs to be split by strip and then needs to be @@ -983,6 +966,7 @@ bdev_io_split(void) CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1); stub_complete_io(1); CU_ASSERT(g_io_done == true); + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0); /* Test multi vector command that needs to be split by strip and then needs to be * split further due to the capacity of child iovs, but fails to split. The cause @@ -1066,6 +1050,7 @@ bdev_io_split_with_io_wait(void) .bdev_io_pool_size = 2, .bdev_io_cache_size = 1, }; + struct iovec iov[3]; struct ut_expected_io *expected_io; int rc; @@ -1094,7 +1079,6 @@ bdev_io_split_with_io_wait(void) * Child - Offset 14, length 2, payload 0xF000 * Child - Offset 16, length 6, payload 0xF000 + 2 * 512 * - * This I/O will consume three spdk_bdev_io. * Set up the expected values before calling spdk_bdev_read_blocks */ expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, 14, 2, 1); @@ -1105,6 +1089,10 @@ bdev_io_split_with_io_wait(void) ut_expected_io_set_iov(expected_io, 0, (void *)(0xF000 + 2 * 512), 6 * 512); TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link); + /* The following children will be submitted sequentially due to the capacity of + * spdk_bdev_io. + */ + /* The first child I/O will be queued to wait until an spdk_bdev_io becomes available */ rc = spdk_bdev_read_blocks(desc, io_ch, (void *)0xF000, 14, 8, io_done, NULL); CU_ASSERT(rc == 0); @@ -1126,6 +1114,56 @@ bdev_io_split_with_io_wait(void) stub_complete_io(1); CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0); + /* Now set up a more complex, multi-vector command that needs to be split, + * including splitting iovecs. + */ + iov[0].iov_base = (void *)0x10000; + iov[0].iov_len = 512; + iov[1].iov_base = (void *)0x20000; + iov[1].iov_len = 20 * 512; + iov[2].iov_base = (void *)0x30000; + iov[2].iov_len = 11 * 512; + + g_io_done = false; + expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_WRITE, 14, 2, 2); + ut_expected_io_set_iov(expected_io, 0, (void *)0x10000, 512); + ut_expected_io_set_iov(expected_io, 1, (void *)0x20000, 512); + TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link); + + expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_WRITE, 16, 16, 1); + ut_expected_io_set_iov(expected_io, 0, (void *)(0x20000 + 512), 16 * 512); + TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link); + + expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_WRITE, 32, 14, 2); + ut_expected_io_set_iov(expected_io, 0, (void *)(0x20000 + 17 * 512), 3 * 512); + ut_expected_io_set_iov(expected_io, 1, (void *)0x30000, 11 * 512); + TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link); + + rc = spdk_bdev_writev_blocks(desc, io_ch, iov, 3, 14, 32, io_done, NULL); + CU_ASSERT(rc == 0); + CU_ASSERT(g_io_done == false); + + /* The following children will be submitted sequentially due to the capacity of + * spdk_bdev_io. + */ + + /* Completing the first child will submit the second child */ + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1); + stub_complete_io(1); + CU_ASSERT(g_io_done == false); + + /* Completing the second child will submit the third child */ + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1); + stub_complete_io(1); + CU_ASSERT(g_io_done == false); + + /* Completing the third child will result in our callback getting invoked + * since the parent I/O is now complete. + */ + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1); + stub_complete_io(1); + CU_ASSERT(g_io_done == true); + CU_ASSERT(TAILQ_EMPTY(&g_bdev_ut_channel->expected_io)); spdk_put_io_channel(io_ch);