From 4fb22601170e57bba95a7fe0cdee12ec320617dc Mon Sep 17 00:00:00 2001 From: Jim Harris Date: Mon, 18 Mar 2019 08:03:44 -0700 Subject: [PATCH] lib/reduce: queue overlapping I/O to same logical chunk Write operations are not in-place, meaning that once it is completed, it's chunk and backing io units are freed. Reads in progress to these backing io units, from other I/O to the same chunk, could cause data corruption. Simultaneous writes to the same chunk can cause similar issues. There are cases where we can relax this in the future - for example, overlapped reads to the same chunk poses no problems. We'll leave that as a future exercise. Signed-off-by: Jim Harris Change-Id: I3a4d9e8e3ab920f823084cd1bae91f179a4386ff Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/448333 Reviewed-by: Shuhei Matsumoto Reviewed-by: Changpeng Liu Tested-by: SPDK CI Jenkins --- lib/reduce/reduce.c | 79 +++++++++++++++++++++-- test/unit/lib/reduce/reduce.c/reduce_ut.c | 69 +++++++++++++++++++- 2 files changed, 142 insertions(+), 6 deletions(-) diff --git a/lib/reduce/reduce.c b/lib/reduce/reduce.c index 6c3ba4fe2..372e77f5a 100644 --- a/lib/reduce/reduce.c +++ b/lib/reduce/reduce.c @@ -78,6 +78,9 @@ struct spdk_reduce_pm_file { uint64_t size; }; +#define REDUCE_IO_READV 1 +#define REDUCE_IO_WRITEV 2 + struct spdk_reduce_vol_request { /** * Scratch buffer used for read/modify/write operations on @@ -88,6 +91,7 @@ struct spdk_reduce_vol_request { struct iovec *buf_iov; struct iovec *iov; struct spdk_reduce_vol *vol; + int type; int reduce_errno; int iovcnt; int num_backing_ops; @@ -119,12 +123,17 @@ struct spdk_reduce_vol { struct spdk_reduce_vol_request *request_mem; TAILQ_HEAD(, spdk_reduce_vol_request) free_requests; + TAILQ_HEAD(, spdk_reduce_vol_request) executing_requests; + TAILQ_HEAD(, spdk_reduce_vol_request) queued_requests; /* Single contiguous buffer used for all request buffers for this volume. */ uint8_t *reqbufspace; struct iovec *buf_iov_mem; }; +static void _start_readv_request(struct spdk_reduce_vol_request *req); +static void _start_writev_request(struct spdk_reduce_vol_request *req); + /* * Allocate extra metadata chunks and corresponding backing io units to account for * outstanding IO in worst case scenario where logical map is completely allocated @@ -449,6 +458,10 @@ spdk_reduce_vol_init(struct spdk_reduce_vol_params *params, return; } + TAILQ_INIT(&vol->free_requests); + TAILQ_INIT(&vol->executing_requests); + TAILQ_INIT(&vol->queued_requests); + vol->backing_super = spdk_dma_zmalloc(sizeof(*vol->backing_super), 0, NULL); if (vol->backing_super == NULL) { cb_fn(cb_arg, NULL, -ENOMEM); @@ -654,6 +667,10 @@ spdk_reduce_vol_load(struct spdk_reduce_backing_dev *backing_dev, return; } + TAILQ_INIT(&vol->free_requests); + TAILQ_INIT(&vol->executing_requests); + TAILQ_INIT(&vol->queued_requests); + vol->backing_super = spdk_dma_zmalloc(sizeof(*vol->backing_super), 64, NULL); if (vol->backing_super == NULL) { _init_load_cleanup(vol, NULL); @@ -812,8 +829,26 @@ typedef void (*reduce_request_fn)(void *_req, int reduce_errno); static void _reduce_vol_complete_req(struct spdk_reduce_vol_request *req, int reduce_errno) { + struct spdk_reduce_vol_request *next_req; + struct spdk_reduce_vol *vol = req->vol; + req->cb_fn(req->cb_arg, reduce_errno); - TAILQ_INSERT_HEAD(&req->vol->free_requests, req, tailq); + TAILQ_REMOVE(&vol->executing_requests, req, tailq); + + TAILQ_FOREACH(next_req, &vol->queued_requests, tailq) { + if (next_req->logical_map_index == req->logical_map_index) { + TAILQ_REMOVE(&vol->queued_requests, next_req, tailq); + if (next_req->type == REDUCE_IO_READV) { + _start_readv_request(next_req); + } else { + assert(next_req->type == REDUCE_IO_WRITEV); + _start_writev_request(next_req); + } + break; + } + } + + TAILQ_INSERT_HEAD(&vol->free_requests, req, tailq); } static void @@ -1010,9 +1045,24 @@ _iov_array_is_valid(struct spdk_reduce_vol *vol, struct iovec *iov, int iovcnt, return size == (length * vol->params.logical_block_size); } +static bool +_check_overlap(struct spdk_reduce_vol *vol, uint64_t logical_map_index) +{ + struct spdk_reduce_vol_request *req; + + TAILQ_FOREACH(req, &vol->executing_requests, tailq) { + if (logical_map_index == req->logical_map_index) { + return true; + } + } + + return false; +} + static void _start_readv_request(struct spdk_reduce_vol_request *req) { + TAILQ_INSERT_TAIL(&req->vol->executing_requests, req, tailq); _reduce_vol_read_chunk(req, _read_read_done); } @@ -1023,6 +1073,7 @@ spdk_reduce_vol_readv(struct spdk_reduce_vol *vol, { struct spdk_reduce_vol_request *req; uint64_t logical_map_index; + bool overlapped; int i; if (length == 0) { @@ -1041,7 +1092,9 @@ spdk_reduce_vol_readv(struct spdk_reduce_vol *vol, } logical_map_index = offset / vol->logical_blocks_per_chunk; - if (vol->pm_logical_map[logical_map_index] == REDUCE_EMPTY_MAP_ENTRY) { + overlapped = _check_overlap(vol, logical_map_index); + + if (!overlapped && vol->pm_logical_map[logical_map_index] == REDUCE_EMPTY_MAP_ENTRY) { /* * This chunk hasn't been allocated. So treat the data as all * zeroes for this chunk - do the memset and immediately complete @@ -1061,16 +1114,21 @@ spdk_reduce_vol_readv(struct spdk_reduce_vol *vol, } TAILQ_REMOVE(&vol->free_requests, req, tailq); + req->type = REDUCE_IO_READV; req->vol = vol; req->iov = iov; req->iovcnt = iovcnt; req->offset = offset; - req->logical_map_index = offset / vol->logical_blocks_per_chunk; + req->logical_map_index = logical_map_index; req->length = length; req->cb_fn = cb_fn; req->cb_arg = cb_arg; - _start_readv_request(req); + if (!overlapped) { + _start_readv_request(req); + } else { + TAILQ_INSERT_TAIL(&vol->queued_requests, req, tailq); + } } static void @@ -1082,6 +1140,7 @@ _start_writev_request(struct spdk_reduce_vol_request *req) int i; uint8_t *buf; + TAILQ_INSERT_TAIL(&req->vol->executing_requests, req, tailq); if (vol->pm_logical_map[req->logical_map_index] != REDUCE_EMPTY_MAP_ENTRY) { /* Read old chunk, then overwrite with data from this write operation. * TODO: bypass reading old chunk if this write operation overwrites @@ -1117,6 +1176,8 @@ spdk_reduce_vol_writev(struct spdk_reduce_vol *vol, spdk_reduce_vol_op_complete cb_fn, void *cb_arg) { struct spdk_reduce_vol_request *req; + uint64_t logical_map_index; + bool overlapped; if (length == 0) { cb_fn(cb_arg, 0); @@ -1133,6 +1194,9 @@ spdk_reduce_vol_writev(struct spdk_reduce_vol *vol, return; } + logical_map_index = offset / vol->logical_blocks_per_chunk; + overlapped = _check_overlap(vol, logical_map_index); + req = TAILQ_FIRST(&vol->free_requests); if (req == NULL) { cb_fn(cb_arg, -ENOMEM); @@ -1140,6 +1204,7 @@ spdk_reduce_vol_writev(struct spdk_reduce_vol *vol, } TAILQ_REMOVE(&vol->free_requests, req, tailq); + req->type = REDUCE_IO_WRITEV; req->vol = vol; req->iov = iov; req->iovcnt = iovcnt; @@ -1149,7 +1214,11 @@ spdk_reduce_vol_writev(struct spdk_reduce_vol *vol, req->cb_fn = cb_fn; req->cb_arg = cb_arg; - _start_writev_request(req); + if (!overlapped) { + _start_writev_request(req); + } else { + TAILQ_INSERT_TAIL(&vol->queued_requests, req, tailq); + } } SPDK_LOG_REGISTER_COMPONENT("reduce", SPDK_LOG_REDUCE) diff --git a/test/unit/lib/reduce/reduce.c/reduce_ut.c b/test/unit/lib/reduce/reduce.c/reduce_ut.c index 9107f8acb..c82e4e5d4 100644 --- a/test/unit/lib/reduce/reduce.c/reduce_ut.c +++ b/test/unit/lib/reduce/reduce.c/reduce_ut.c @@ -961,6 +961,72 @@ defer_bdev_io(void) backing_dev_destroy(&backing_dev); } +static void +overlapped(void) +{ + struct spdk_reduce_vol_params params = {}; + struct spdk_reduce_backing_dev backing_dev = {}; + const uint32_t logical_block_size = 512; + struct iovec iov; + char buf[2 * logical_block_size]; + char compare_buf[2 * logical_block_size]; + + params.chunk_size = 16 * 1024; + params.backing_io_unit_size = 4096; + params.logical_block_size = logical_block_size; + spdk_uuid_generate(¶ms.uuid); + + backing_dev_init(&backing_dev, ¶ms, 512); + + g_vol = NULL; + g_reduce_errno = -1; + spdk_reduce_vol_init(¶ms, &backing_dev, TEST_MD_PATH, init_cb, NULL); + CU_ASSERT(g_reduce_errno == 0); + SPDK_CU_ASSERT_FATAL(g_vol != NULL); + + /* Write 0xAA to 1 512-byte logical block. */ + memset(buf, 0xAA, logical_block_size); + iov.iov_base = buf; + iov.iov_len = logical_block_size; + g_reduce_errno = -100; + g_defer_bdev_io = true; + spdk_reduce_vol_writev(g_vol, &iov, 1, 0, 1, write_cb, NULL); + /* Callback should not have executed, so this should still equal -100. */ + CU_ASSERT(g_reduce_errno == -100); + CU_ASSERT(!TAILQ_EMPTY(&g_pending_bdev_io)); + CU_ASSERT(g_pending_bdev_io_count == params.chunk_size / params.backing_io_unit_size); + + /* Now do an overlapped I/O to the same chunk. */ + spdk_reduce_vol_writev(g_vol, &iov, 1, 1, 1, write_cb, NULL); + /* Callback should not have executed, so this should still equal -100. */ + CU_ASSERT(g_reduce_errno == -100); + CU_ASSERT(!TAILQ_EMPTY(&g_pending_bdev_io)); + /* The second I/O overlaps with the first one. So we should only see pending bdev_io + * related to the first I/O here - the second one won't start until the first one is completed. + */ + CU_ASSERT(g_pending_bdev_io_count == params.chunk_size / params.backing_io_unit_size); + + backing_dev_io_execute(0); + CU_ASSERT(g_reduce_errno == 0); + + g_defer_bdev_io = false; + memset(compare_buf, 0xAA, sizeof(compare_buf)); + memset(buf, 0xFF, sizeof(buf)); + iov.iov_base = buf; + iov.iov_len = 2 * logical_block_size; + g_reduce_errno = -100; + spdk_reduce_vol_readv(g_vol, &iov, 1, 0, 2, read_cb, NULL); + CU_ASSERT(g_reduce_errno == 0); + CU_ASSERT(memcmp(buf, compare_buf, 2 * logical_block_size) == 0); + + g_reduce_errno = -1; + spdk_reduce_vol_unload(g_vol, unload_cb, NULL); + CU_ASSERT(g_reduce_errno == 0); + + persistent_pm_buf_destroy(); + backing_dev_destroy(&backing_dev); +} + int main(int argc, char **argv) { @@ -987,7 +1053,8 @@ main(int argc, char **argv) CU_add_test(suite, "write_maps", write_maps) == NULL || CU_add_test(suite, "read_write", read_write) == NULL || CU_add_test(suite, "destroy", destroy) == NULL || - CU_add_test(suite, "defer_bdev_io", defer_bdev_io) == NULL + CU_add_test(suite, "defer_bdev_io", defer_bdev_io) == NULL || + CU_add_test(suite, "overlapped", overlapped) == NULL ) { CU_cleanup_registry(); return CU_get_error();