bdev: rewind child offset to last block size aligned iov
Here is the an example to describe existing issue: There is a Write request with 64KiB data length, and this IO is cross the IO boundary. We assume that the parent IO will have 2 children requests, one is 33KiB length, the other one is 31KiB. Here is the view of parent iovs, the first 33KiB length data has 33 iovs: iov.[0].iov_length = 1024; . . iov.[31].iov_length = 256; iov.[32].iov_length = 768; . . iov.[64].iov_length = 1024; In function _spdk_bdev_io_split(), then you can see that for the 33KiB length child request, exiting code will run out of child child_iov space and return error due to last one data buffer is not block size aligned. Here we can rewind the existing offset to last block size aligned buffer to avoid the error case, for backend which need aligned data buffer such as AIO backend, the request will go through spdk_bdev_io_get_buf() again to do the data copy, otherwise for those backend devices such as NVMe with hardware SGL support, 256 data segment is fine for them. Change-Id: I96ebdf29829d86f9b38fab28a7406eedc9fa44ef Signed-off-by: Changpeng Liu <changpeng.liu@intel.com> Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/453604 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com> Reviewed-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com>
This commit is contained in:
parent
091bc429d7
commit
efd7b514d4
@ -1596,11 +1596,12 @@ _spdk_bdev_io_split(void *_bdev_io)
|
||||
{
|
||||
struct spdk_bdev_io *bdev_io = _bdev_io;
|
||||
uint64_t current_offset, remaining;
|
||||
uint32_t blocklen, to_next_boundary, to_next_boundary_bytes;
|
||||
uint32_t blocklen, to_next_boundary, to_next_boundary_bytes, to_last_block_bytes;
|
||||
struct iovec *parent_iov, *iov;
|
||||
uint64_t parent_iov_offset, iov_len;
|
||||
uint32_t parent_iovpos, parent_iovcnt, child_iovcnt, iovcnt;
|
||||
void *md_buf = NULL;
|
||||
bool child_iov_run_out = false;
|
||||
int rc;
|
||||
|
||||
remaining = bdev_io->u.bdev.split_remaining_num_blocks;
|
||||
@ -1652,18 +1653,28 @@ _spdk_bdev_io_split(void *_bdev_io)
|
||||
|
||||
if (to_next_boundary_bytes > 0) {
|
||||
/* We had to stop this child I/O early because we ran out of
|
||||
* child_iov space. Make sure the iovs collected are valid and
|
||||
* then adjust to_next_boundary before starting the child I/O.
|
||||
* child_iov space. Ensure the iovs to be aligned with block
|
||||
* size and then adjust to_next_boundary before starting the
|
||||
* child I/O.
|
||||
*/
|
||||
if ((to_next_boundary_bytes % blocklen) != 0) {
|
||||
SPDK_ERRLOG("Remaining %" PRIu32 " is not multiple of block size %" PRIu32 "\n",
|
||||
to_next_boundary_bytes, blocklen);
|
||||
bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
|
||||
if (bdev_io->u.bdev.split_outstanding == 0) {
|
||||
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
|
||||
to_last_block_bytes = to_next_boundary_bytes % blocklen;
|
||||
if (to_last_block_bytes != 0) {
|
||||
to_next_boundary_bytes += _to_next_boundary(to_next_boundary_bytes, blocklen);;
|
||||
|
||||
while (to_last_block_bytes > 0 && iovcnt > 0) {
|
||||
iov_len = spdk_min(to_last_block_bytes,
|
||||
bdev_io->child_iov[child_iovcnt - 1].iov_len);
|
||||
bdev_io->child_iov[child_iovcnt - 1].iov_len -= iov_len;
|
||||
if (bdev_io->child_iov[child_iovcnt - 1].iov_len == 0) {
|
||||
child_iovcnt--;
|
||||
iovcnt--;
|
||||
}
|
||||
to_last_block_bytes -= iov_len;
|
||||
}
|
||||
return;
|
||||
|
||||
assert(to_last_block_bytes == 0);
|
||||
}
|
||||
child_iov_run_out = true;
|
||||
to_next_boundary -= to_next_boundary_bytes / blocklen;
|
||||
}
|
||||
|
||||
@ -1688,6 +1699,10 @@ _spdk_bdev_io_split(void *_bdev_io)
|
||||
remaining -= to_next_boundary;
|
||||
bdev_io->u.bdev.split_current_offset_blocks = current_offset;
|
||||
bdev_io->u.bdev.split_remaining_num_blocks = remaining;
|
||||
/* stop splitting until child_iov is available */
|
||||
if (spdk_unlikely(child_iov_run_out)) {
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
bdev_io->u.bdev.split_outstanding--;
|
||||
if (rc == -ENOMEM) {
|
||||
|
@ -1149,8 +1149,8 @@ bdev_io_split(void)
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
|
||||
|
||||
/* Test multi vector command that needs to be split by strip and then needs to be
|
||||
* split further due to the capacity of child iovs, but fails to split. The cause
|
||||
* of failure of split is that the length of an iovec is not multiple of block size.
|
||||
* split further due to the capacity of child iovs, the child request offset should
|
||||
* be rewind to last aligned offset and go success without error.
|
||||
*/
|
||||
for (i = 0; i < BDEV_IO_NUM_CHILD_IOV - 1; i++) {
|
||||
iov[i].iov_base = (void *)((i + 1) * 0x10000);
|
||||
@ -1159,15 +1159,51 @@ bdev_io_split(void)
|
||||
iov[BDEV_IO_NUM_CHILD_IOV - 1].iov_base = (void *)(BDEV_IO_NUM_CHILD_IOV * 0x10000);
|
||||
iov[BDEV_IO_NUM_CHILD_IOV - 1].iov_len = 256;
|
||||
|
||||
iov[BDEV_IO_NUM_CHILD_IOV].iov_base = (void *)((BDEV_IO_NUM_CHILD_IOV + 1) * 0x10000);
|
||||
iov[BDEV_IO_NUM_CHILD_IOV].iov_len = 256;
|
||||
|
||||
iov[BDEV_IO_NUM_CHILD_IOV + 1].iov_base = (void *)((BDEV_IO_NUM_CHILD_IOV + 2) * 0x10000);
|
||||
iov[BDEV_IO_NUM_CHILD_IOV + 1].iov_len = 512;
|
||||
|
||||
bdev->optimal_io_boundary = BDEV_IO_NUM_CHILD_IOV;
|
||||
g_io_done = false;
|
||||
g_io_status = 0;
|
||||
/* The first expected io should be start from offset 0 to BDEV_IO_NUM_CHILD_IOV - 1 */
|
||||
expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, 0,
|
||||
BDEV_IO_NUM_CHILD_IOV - 1, BDEV_IO_NUM_CHILD_IOV - 1);
|
||||
for (i = 0; i < BDEV_IO_NUM_CHILD_IOV - 1; i++) {
|
||||
ut_expected_io_set_iov(expected_io, i,
|
||||
(void *)((i + 1) * 0x10000), 512);
|
||||
}
|
||||
TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
|
||||
/* The second expected io should be start from offset BDEV_IO_NUM_CHILD_IOV - 1 to BDEV_IO_NUM_CHILD_IOV */
|
||||
expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, BDEV_IO_NUM_CHILD_IOV - 1,
|
||||
1, 2);
|
||||
ut_expected_io_set_iov(expected_io, 0,
|
||||
(void *)(BDEV_IO_NUM_CHILD_IOV * 0x10000), 256);
|
||||
ut_expected_io_set_iov(expected_io, 1,
|
||||
(void *)((BDEV_IO_NUM_CHILD_IOV + 1) * 0x10000), 256);
|
||||
TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
|
||||
/* The third expected io should be start from offset BDEV_IO_NUM_CHILD_IOV to BDEV_IO_NUM_CHILD_IOV + 1 */
|
||||
expected_io = ut_alloc_expected_io(SPDK_BDEV_IO_TYPE_READ, BDEV_IO_NUM_CHILD_IOV,
|
||||
1, 1);
|
||||
ut_expected_io_set_iov(expected_io, 0,
|
||||
(void *)((BDEV_IO_NUM_CHILD_IOV + 2) * 0x10000), 512);
|
||||
TAILQ_INSERT_TAIL(&g_bdev_ut_channel->expected_io, expected_io, link);
|
||||
|
||||
rc = spdk_bdev_readv_blocks(desc, io_ch, iov, BDEV_IO_NUM_CHILD_IOV * 2, 0,
|
||||
BDEV_IO_NUM_CHILD_IOV * 2, io_done, NULL);
|
||||
BDEV_IO_NUM_CHILD_IOV + 1, io_done, NULL);
|
||||
CU_ASSERT(rc == 0);
|
||||
CU_ASSERT(g_io_done == false);
|
||||
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 1);
|
||||
stub_complete_io(1);
|
||||
CU_ASSERT(g_io_done == false);
|
||||
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 2);
|
||||
stub_complete_io(2);
|
||||
CU_ASSERT(g_io_done == true);
|
||||
CU_ASSERT(g_io_status == SPDK_BDEV_IO_STATUS_FAILED);
|
||||
CU_ASSERT(g_bdev_ut_channel->outstanding_io_count == 0);
|
||||
|
||||
/* Test a WRITE_ZEROES that would span an I/O boundary. WRITE_ZEROES should not be
|
||||
* split, so test that.
|
||||
|
Loading…
Reference in New Issue
Block a user