bdev: rewind child offset to last block size aligned iov

Here is the an example to describe existing issue:

There is a Write request with 64KiB data length, and this IO is cross the IO
boundary.  We assume that the parent IO will have 2 children requests, one is
33KiB length, the other one is 31KiB.  Here is the view of parent iovs, the
first 33KiB length data has 33 iovs:

iov.[0].iov_length = 1024;
.
.
iov.[31].iov_length = 256;
iov.[32].iov_length = 768;
.
.
iov.[64].iov_length = 1024;

In function _spdk_bdev_io_split(), then you can see that for the 33KiB length
child request, exiting code will run out of child child_iov space and return
error due to last one data buffer is not block size aligned.

Here we can rewind the existing offset to last block size aligned buffer to
avoid the error case, for backend which need aligned data buffer such as
AIO backend, the request will go through spdk_bdev_io_get_buf() again to
do the data copy, otherwise for those backend devices such as NVMe with
hardware SGL support, 256 data segment is fine for them.

Change-Id: I96ebdf29829d86f9b38fab28a7406eedc9fa44ef
Signed-off-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/453604
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com>
This commit is contained in:
Changpeng Liu 2019-06-27 23:05:24 -04:00 committed by Darek Stojaczyk
parent 091bc429d7
commit efd7b514d4
2 changed files with 65 additions and 14 deletions

View File

@ -1596,11 +1596,12 @@ _spdk_bdev_io_split(void *_bdev_io)
{ {
struct spdk_bdev_io *bdev_io = _bdev_io; struct spdk_bdev_io *bdev_io = _bdev_io;
uint64_t current_offset, remaining; uint64_t current_offset, remaining;
uint32_t blocklen, to_next_boundary, to_next_boundary_bytes; uint32_t blocklen, to_next_boundary, to_next_boundary_bytes, to_last_block_bytes;
struct iovec *parent_iov, *iov; struct iovec *parent_iov, *iov;
uint64_t parent_iov_offset, iov_len; uint64_t parent_iov_offset, iov_len;
uint32_t parent_iovpos, parent_iovcnt, child_iovcnt, iovcnt; uint32_t parent_iovpos, parent_iovcnt, child_iovcnt, iovcnt;
void *md_buf = NULL; void *md_buf = NULL;
bool child_iov_run_out = false;
int rc; int rc;
remaining = bdev_io->u.bdev.split_remaining_num_blocks; remaining = bdev_io->u.bdev.split_remaining_num_blocks;
@ -1652,18 +1653,28 @@ _spdk_bdev_io_split(void *_bdev_io)
if (to_next_boundary_bytes > 0) { if (to_next_boundary_bytes > 0) {
/* We had to stop this child I/O early because we ran out of /* We had to stop this child I/O early because we ran out of
* child_iov space. Make sure the iovs collected are valid and * child_iov space. Ensure the iovs to be aligned with block
* then adjust to_next_boundary before starting the child I/O. * size and then adjust to_next_boundary before starting the
* child I/O.
*/ */
if ((to_next_boundary_bytes % blocklen) != 0) { to_last_block_bytes = to_next_boundary_bytes % blocklen;
SPDK_ERRLOG("Remaining %" PRIu32 " is not multiple of block size %" PRIu32 "\n", if (to_last_block_bytes != 0) {
to_next_boundary_bytes, blocklen); to_next_boundary_bytes += _to_next_boundary(to_next_boundary_bytes, blocklen);;
bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
if (bdev_io->u.bdev.split_outstanding == 0) { while (to_last_block_bytes > 0 && iovcnt > 0) {
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); iov_len = spdk_min(to_last_block_bytes,
bdev_io->child_iov[child_iovcnt - 1].iov_len);
bdev_io->child_iov[child_iovcnt - 1].iov_len -= iov_len;
if (bdev_io->child_iov[child_iovcnt - 1].iov_len == 0) {
child_iovcnt--;
iovcnt--;
}
to_last_block_bytes -= iov_len;
} }
return;
assert(to_last_block_bytes == 0);
} }
child_iov_run_out = true;
to_next_boundary -= to_next_boundary_bytes / blocklen; to_next_boundary -= to_next_boundary_bytes / blocklen;
} }
@ -1688,6 +1699,10 @@ _spdk_bdev_io_split(void *_bdev_io)
remaining -= to_next_boundary; remaining -= to_next_boundary;
bdev_io->u.bdev.split_current_offset_blocks = current_offset; bdev_io->u.bdev.split_current_offset_blocks = current_offset;
bdev_io->u.bdev.split_remaining_num_blocks = remaining; bdev_io->u.bdev.split_remaining_num_blocks = remaining;
/* stop splitting until child_iov is available */
if (spdk_unlikely(child_iov_run_out)) {
return;
}
} else { } else {
bdev_io->u.bdev.split_outstanding--; bdev_io->u.bdev.split_outstanding--;
if (rc == -ENOMEM) { if (rc == -ENOMEM) {

View File

@ -1149,8 +1149,8 @@ bdev_io_split(void)
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0); CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
/* Test multi vector command that needs to be split by strip and then needs to be /* Test multi vector command that needs to be split by strip and then needs to be
* split further due to the capacity of child iovs, but fails to split. The cause * split further due to the capacity of child iovs, the child request offset should
* of failure of split is that the length of an iovec is not multiple of block size. * be rewind to last aligned offset and go success without error.
*/ */
for (i = 0; i < BDEV_IO_NUM_CHILD_IOV - 1; i++) { for (i = 0; i < BDEV_IO_NUM_CHILD_IOV - 1; i++) {
iov[i].iov_base = (void *)((i + 1) * 0x10000); iov[i].iov_base = (void *)((i + 1) * 0x10000);
@ -1159,15 +1159,51 @@ bdev_io_split(void)
iov[BDEV_IO_NUM_CHILD_IOV - 1].iov_base = (void *)(BDEV_IO_NUM_CHILD_IOV * 0x10000); iov[BDEV_IO_NUM_CHILD_IOV - 1].iov_base = (void *)(BDEV_IO_NUM_CHILD_IOV * 0x10000);
iov[BDEV_IO_NUM_CHILD_IOV - 1].iov_len = 256; iov[BDEV_IO_NUM_CHILD_IOV - 1].iov_len = 256;
iov[BDEV_IO_NUM_CHILD_IOV].iov_base = (void *)((BDEV_IO_NUM_CHILD_IOV + 1) * 0x10000);
iov[BDEV_IO_NUM_CHILD_IOV].iov_len = 256;
iov[BDEV_IO_NUM_CHILD_IOV + 1].iov_base = (void *)((BDEV_IO_NUM_CHILD_IOV + 2) * 0x10000);
iov[BDEV_IO_NUM_CHILD_IOV + 1].iov_len = 512;
bdev->optimal_io_boundary = BDEV_IO_NUM_CHILD_IOV; bdev->optimal_io_boundary = BDEV_IO_NUM_CHILD_IOV;
g_io_done = false; g_io_done = false;
g_io_status = 0; g_io_status = 0;
/* The first expected io should be start from offset 0 to BDEV_IO_NUM_CHILD_IOV - 1 */
expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, 0,
BDEV_IO_NUM_CHILD_IOV - 1, BDEV_IO_NUM_CHILD_IOV - 1);
for (i = 0; i < BDEV_IO_NUM_CHILD_IOV - 1; i++) {
ut_expected_io_set_iov(expected_io, i,
(void *)((i + 1) * 0x10000), 512);
}
TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
/* The second expected io should be start from offset BDEV_IO_NUM_CHILD_IOV - 1 to BDEV_IO_NUM_CHILD_IOV */
expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, BDEV_IO_NUM_CHILD_IOV - 1,
1, 2);
ut_expected_io_set_iov(expected_io, 0,
(void *)(BDEV_IO_NUM_CHILD_IOV * 0x10000), 256);
ut_expected_io_set_iov(expected_io, 1,
(void *)((BDEV_IO_NUM_CHILD_IOV + 1) * 0x10000), 256);
TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
/* The third expected io should be start from offset BDEV_IO_NUM_CHILD_IOV to BDEV_IO_NUM_CHILD_IOV + 1 */
expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, BDEV_IO_NUM_CHILD_IOV,
1, 1);
ut_expected_io_set_iov(expected_io, 0,
(void *)((BDEV_IO_NUM_CHILD_IOV + 2) * 0x10000), 512);
TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
rc = spdk_bdev_readv_blocks(desc, io_ch, iov, BDEV_IO_NUM_CHILD_IOV * 2, 0, rc = spdk_bdev_readv_blocks(desc, io_ch, iov, BDEV_IO_NUM_CHILD_IOV * 2, 0,
BDEV_IO_NUM_CHILD_IOV * 2, io_done, NULL); BDEV_IO_NUM_CHILD_IOV + 1, io_done, NULL);
CU_ASSERT(rc == 0); CU_ASSERT(rc == 0);
CU_ASSERT(g_io_done == false);
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1);
stub_complete_io(1);
CU_ASSERT(g_io_done == false);
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 2);
stub_complete_io(2);
CU_ASSERT(g_io_done == true); CU_ASSERT(g_io_done == true);
CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_FAILED); CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
/* Test a WRITE_ZEROES that would span an I/O boundary. WRITE_ZEROES should not be /* Test a WRITE_ZEROES that would span an I/O boundary. WRITE_ZEROES should not be
* split, so test that. * split, so test that.