diff --git a/lib/bdev/bdev.c b/lib/bdev/bdev.c index 30c5bf458..d81ab37b4 100644 --- a/lib/bdev/bdev.c +++ b/lib/bdev/bdev.c @@ -1596,11 +1596,12 @@ _spdk_bdev_io_split(void *_bdev_io) { struct spdk_bdev_io *bdev_io = _bdev_io; uint64_t current_offset, remaining; - uint32_t blocklen, to_next_boundary, to_next_boundary_bytes; + uint32_t blocklen, to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; struct iovec *parent_iov, *iov; uint64_t parent_iov_offset, iov_len; uint32_t parent_iovpos, parent_iovcnt, child_iovcnt, iovcnt; void *md_buf = NULL; + bool child_iov_run_out = false; int rc; remaining = bdev_io->u.bdev.split_remaining_num_blocks; @@ -1652,18 +1653,28 @@ _spdk_bdev_io_split(void *_bdev_io) if (to_next_boundary_bytes > 0) { /* We had to stop this child I/O early because we ran out of - * child_iov space. Make sure the iovs collected are valid and - * then adjust to_next_boundary before starting the child I/O. + * child_iov space. Ensure the iovs are aligned with the block + * size and then adjust to_next_boundary before starting the + * child I/O. 
*/ - if ((to_next_boundary_bytes % blocklen) != 0) { - SPDK_ERRLOG("Remaining %" PRIu32 " is not multiple of block size %" PRIu32 "\n", - to_next_boundary_bytes, blocklen); - bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; - if (bdev_io->u.bdev.split_outstanding == 0) { - bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + to_last_block_bytes = to_next_boundary_bytes % blocklen; + if (to_last_block_bytes != 0) { + to_next_boundary_bytes += _to_next_boundary(to_next_boundary_bytes, blocklen);; + + while (to_last_block_bytes > 0 && iovcnt > 0) { + iov_len = spdk_min(to_last_block_bytes, + bdev_io->child_iov[child_iovcnt - 1].iov_len); + bdev_io->child_iov[child_iovcnt - 1].iov_len -= iov_len; + if (bdev_io->child_iov[child_iovcnt - 1].iov_len == 0) { + child_iovcnt--; + iovcnt--; + } + to_last_block_bytes -= iov_len; } - return; + + assert(to_last_block_bytes == 0); } + child_iov_run_out = true; to_next_boundary -= to_next_boundary_bytes / blocklen; } @@ -1688,6 +1699,10 @@ _spdk_bdev_io_split(void *_bdev_io) remaining -= to_next_boundary; bdev_io->u.bdev.split_current_offset_blocks = current_offset; bdev_io->u.bdev.split_remaining_num_blocks = remaining; + /* stop splitting until child_iov is available */ + if (spdk_unlikely(child_iov_run_out)) { + return; + } } else { bdev_io->u.bdev.split_outstanding--; if (rc == -ENOMEM) { diff --git a/test/unit/lib/bdev/bdev.c/bdev_ut.c b/test/unit/lib/bdev/bdev.c/bdev_ut.c index 9ada3605f..e2689744b 100644 --- a/test/unit/lib/bdev/bdev.c/bdev_ut.c +++ b/test/unit/lib/bdev/bdev.c/bdev_ut.c @@ -1149,8 +1149,8 @@ bdev_io_split(void) CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0); /* Test multi vector command that needs to be split by strip and then needs to be - * split further due to the capacity of child iovs, but fails to split. The cause - * of failure of split is that the length of an iovec is not multiple of block size. 
+ * split further due to the capacity of child iovs. The child request offset should + * be rewound to the last aligned offset and the I/O should succeed without error. */ for (i = 0; i < BDEV_IO_NUM_CHILD_IOV - 1; i++) { iov[i].iov_base = (void *)((i + 1) * 0x10000); @@ -1159,15 +1159,51 @@ bdev_io_split(void) iov[BDEV_IO_NUM_CHILD_IOV - 1].iov_base = (void *)(BDEV_IO_NUM_CHILD_IOV * 0x10000); iov[BDEV_IO_NUM_CHILD_IOV - 1].iov_len = 256; + iov[BDEV_IO_NUM_CHILD_IOV].iov_base = (void *)((BDEV_IO_NUM_CHILD_IOV + 1) * 0x10000); + iov[BDEV_IO_NUM_CHILD_IOV].iov_len = 256; + + iov[BDEV_IO_NUM_CHILD_IOV + 1].iov_base = (void *)((BDEV_IO_NUM_CHILD_IOV + 2) * 0x10000); + iov[BDEV_IO_NUM_CHILD_IOV + 1].iov_len = 512; + bdev->optimal_io_boundary = BDEV_IO_NUM_CHILD_IOV; g_io_done = false; g_io_status = 0; + /* The first expected io should start from offset 0 to BDEV_IO_NUM_CHILD_IOV - 1 */ + expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, 0, + BDEV_IO_NUM_CHILD_IOV - 1, BDEV_IO_NUM_CHILD_IOV - 1); + for (i = 0; i < BDEV_IO_NUM_CHILD_IOV - 1; i++) { + ut_expected_io_set_iov(expected_io, i, + (void *)((i + 1) * 0x10000), 512); + } + TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link); + /* The second expected io should start from offset BDEV_IO_NUM_CHILD_IOV - 1 to BDEV_IO_NUM_CHILD_IOV */ + expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, BDEV_IO_NUM_CHILD_IOV - 1, + 1, 2); + ut_expected_io_set_iov(expected_io, 0, + (void *)(BDEV_IO_NUM_CHILD_IOV * 0x10000), 256); + ut_expected_io_set_iov(expected_io, 1, + (void *)((BDEV_IO_NUM_CHILD_IOV + 1) * 0x10000), 256); + TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link); + /* The third expected io should start from offset BDEV_IO_NUM_CHILD_IOV to BDEV_IO_NUM_CHILD_IOV + 1 */ + expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, BDEV_IO_NUM_CHILD_IOV, + 1, 1); + ut_expected_io_set_iov(expected_io, 0, + (void *)((BDEV_IO_NUM_CHILD_IOV + 2) * 0x10000), 512); + 
TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link); rc = spdk_bdev_readv_blocks(desc, io_ch, iov, BDEV_IO_NUM_CHILD_IOV * 2, 0, - BDEV_IO_NUM_CHILD_IOV * 2, io_done, NULL); + BDEV_IO_NUM_CHILD_IOV + 1, io_done, NULL); CU_ASSERT(rc == 0); + CU_ASSERT(g_io_done == false); + + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1); + stub_complete_io(1); + CU_ASSERT(g_io_done == false); + + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 2); + stub_complete_io(2); CU_ASSERT(g_io_done == true); - CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_FAILED); + CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0); /* Test a WRITE_ZEROES that would span an I/O boundary. WRITE_ZEROES should not be * split, so test that.