From 4fb22601170e57bba95a7fe0cdee12ec320617dc Mon Sep 17 00:00:00 2001
From: Jim Harris <james.r.harris@intel.com>
Date: Mon, 18 Mar 2019 08:03:44 -0700
Subject: [PATCH] lib/reduce: queue overlapping I/O to same logical chunk

Write operations are not in-place, meaning that once it
is completed, it's chunk and backing io units are freed.
Reads in progress to these backing io units, from other
I/O to the same chunk, could cause data corruption.
Simultaneous writes to the same chunk can cause similar
issues.

There are cases where we can relax this in the future -
for example, overlapped reads to the same chunk poses
no problems.  We'll leave that as a future exercise.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I3a4d9e8e3ab920f823084cd1bae91f179a4386ff
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/448333
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
---
 lib/reduce/reduce.c                       | 79 +++++++++++++++++++++--
 test/unit/lib/reduce/reduce.c/reduce_ut.c | 69 +++++++++++++++++++-
 2 files changed, 142 insertions(+), 6 deletions(-)

diff --git a/lib/reduce/reduce.c b/lib/reduce/reduce.c
index 6c3ba4fe2..372e77f5a 100644
--- a/lib/reduce/reduce.c
+++ b/lib/reduce/reduce.c
@@ -78,6 +78,9 @@ struct spdk_reduce_pm_file {
 	uint64_t		size;
 };
 
+#define REDUCE_IO_READV		1
+#define REDUCE_IO_WRITEV	2
+
 struct spdk_reduce_vol_request {
 	/**
 	 *  Scratch buffer used for read/modify/write operations on
@@ -88,6 +91,7 @@ struct spdk_reduce_vol_request {
 	struct iovec				*buf_iov;
 	struct iovec				*iov;
 	struct spdk_reduce_vol			*vol;
+	int					type;
 	int					reduce_errno;
 	int					iovcnt;
 	int					num_backing_ops;
@@ -119,12 +123,17 @@ struct spdk_reduce_vol {
 
 	struct spdk_reduce_vol_request		*request_mem;
 	TAILQ_HEAD(, spdk_reduce_vol_request)	free_requests;
+	TAILQ_HEAD(, spdk_reduce_vol_request)	executing_requests;
+	TAILQ_HEAD(, spdk_reduce_vol_request)	queued_requests;
 
 	/* Single contiguous buffer used for all request buffers for this volume. */
 	uint8_t					*reqbufspace;
 	struct iovec				*buf_iov_mem;
 };
 
+static void _start_readv_request(struct spdk_reduce_vol_request *req);
+static void _start_writev_request(struct spdk_reduce_vol_request *req);
+
 /*
  * Allocate extra metadata chunks and corresponding backing io units to account for
  *  outstanding IO in worst case scenario where logical map is completely allocated
@@ -449,6 +458,10 @@ spdk_reduce_vol_init(struct spdk_reduce_vol_params *params,
 		return;
 	}
 
+	TAILQ_INIT(&vol->free_requests);
+	TAILQ_INIT(&vol->executing_requests);
+	TAILQ_INIT(&vol->queued_requests);
+
 	vol->backing_super = spdk_dma_zmalloc(sizeof(*vol->backing_super), 0, NULL);
 	if (vol->backing_super == NULL) {
 		cb_fn(cb_arg, NULL, -ENOMEM);
@@ -654,6 +667,10 @@ spdk_reduce_vol_load(struct spdk_reduce_backing_dev *backing_dev,
 		return;
 	}
 
+	TAILQ_INIT(&vol->free_requests);
+	TAILQ_INIT(&vol->executing_requests);
+	TAILQ_INIT(&vol->queued_requests);
+
 	vol->backing_super = spdk_dma_zmalloc(sizeof(*vol->backing_super), 64, NULL);
 	if (vol->backing_super == NULL) {
 		_init_load_cleanup(vol, NULL);
@@ -812,8 +829,26 @@ typedef void (*reduce_request_fn)(void *_req, int reduce_errno);
 static void
 _reduce_vol_complete_req(struct spdk_reduce_vol_request *req, int reduce_errno)
 {
+	struct spdk_reduce_vol_request *next_req;
+	struct spdk_reduce_vol *vol = req->vol;
+
 	req->cb_fn(req->cb_arg, reduce_errno);
-	TAILQ_INSERT_HEAD(&req->vol->free_requests, req, tailq);
+	TAILQ_REMOVE(&vol->executing_requests, req, tailq);
+
+	TAILQ_FOREACH(next_req, &vol->queued_requests, tailq) {
+		if (next_req->logical_map_index == req->logical_map_index) {
+			TAILQ_REMOVE(&vol->queued_requests, next_req, tailq);
+			if (next_req->type == REDUCE_IO_READV) {
+				_start_readv_request(next_req);
+			} else {
+				assert(next_req->type == REDUCE_IO_WRITEV);
+				_start_writev_request(next_req);
+			}
+			break;
+		}
+	}
+
+	TAILQ_INSERT_HEAD(&vol->free_requests, req, tailq);
 }
 
 static void
@@ -1010,9 +1045,24 @@ _iov_array_is_valid(struct spdk_reduce_vol *vol, struct iovec *iov, int iovcnt,
 	return size == (length * vol->params.logical_block_size);
 }
 
+static bool
+_check_overlap(struct spdk_reduce_vol *vol, uint64_t logical_map_index)
+{
+	struct spdk_reduce_vol_request *req;
+
+	TAILQ_FOREACH(req, &vol->executing_requests, tailq) {
+		if (logical_map_index == req->logical_map_index) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
 static void
 _start_readv_request(struct spdk_reduce_vol_request *req)
 {
+	TAILQ_INSERT_TAIL(&req->vol->executing_requests, req, tailq);
 	_reduce_vol_read_chunk(req, _read_read_done);
 }
 
@@ -1023,6 +1073,7 @@ spdk_reduce_vol_readv(struct spdk_reduce_vol *vol,
 {
 	struct spdk_reduce_vol_request *req;
 	uint64_t logical_map_index;
+	bool overlapped;
 	int i;
 
 	if (length == 0) {
@@ -1041,7 +1092,9 @@ spdk_reduce_vol_readv(struct spdk_reduce_vol *vol,
 	}
 
 	logical_map_index = offset / vol->logical_blocks_per_chunk;
-	if (vol->pm_logical_map[logical_map_index] == REDUCE_EMPTY_MAP_ENTRY) {
+	overlapped = _check_overlap(vol, logical_map_index);
+
+	if (!overlapped && vol->pm_logical_map[logical_map_index] == REDUCE_EMPTY_MAP_ENTRY) {
 		/*
 		 * This chunk hasn't been allocated.  So treat the data as all
 		 * zeroes for this chunk - do the memset and immediately complete
@@ -1061,16 +1114,21 @@ spdk_reduce_vol_readv(struct spdk_reduce_vol *vol,
 	}
 
 	TAILQ_REMOVE(&vol->free_requests, req, tailq);
+	req->type = REDUCE_IO_READV;
 	req->vol = vol;
 	req->iov = iov;
 	req->iovcnt = iovcnt;
 	req->offset = offset;
-	req->logical_map_index = offset / vol->logical_blocks_per_chunk;
+	req->logical_map_index = logical_map_index;
 	req->length = length;
 	req->cb_fn = cb_fn;
 	req->cb_arg = cb_arg;
 
-	_start_readv_request(req);
+	if (!overlapped) {
+		_start_readv_request(req);
+	} else {
+		TAILQ_INSERT_TAIL(&vol->queued_requests, req, tailq);
+	}
 }
 
 static void
@@ -1082,6 +1140,7 @@ _start_writev_request(struct spdk_reduce_vol_request *req)
 	int i;
 	uint8_t *buf;
 
+	TAILQ_INSERT_TAIL(&req->vol->executing_requests, req, tailq);
 	if (vol->pm_logical_map[req->logical_map_index] != REDUCE_EMPTY_MAP_ENTRY) {
 		/* Read old chunk, then overwrite with data from this write operation.
 		 * TODO: bypass reading old chunk if this write operation overwrites
@@ -1117,6 +1176,8 @@ spdk_reduce_vol_writev(struct spdk_reduce_vol *vol,
 		       spdk_reduce_vol_op_complete cb_fn, void *cb_arg)
 {
 	struct spdk_reduce_vol_request *req;
+	uint64_t logical_map_index;
+	bool overlapped;
 
 	if (length == 0) {
 		cb_fn(cb_arg, 0);
@@ -1133,6 +1194,9 @@ spdk_reduce_vol_writev(struct spdk_reduce_vol *vol,
 		return;
 	}
 
+	logical_map_index = offset / vol->logical_blocks_per_chunk;
+	overlapped = _check_overlap(vol, logical_map_index);
+
 	req = TAILQ_FIRST(&vol->free_requests);
 	if (req == NULL) {
 		cb_fn(cb_arg, -ENOMEM);
@@ -1140,6 +1204,7 @@ spdk_reduce_vol_writev(struct spdk_reduce_vol *vol,
 	}
 
 	TAILQ_REMOVE(&vol->free_requests, req, tailq);
+	req->type = REDUCE_IO_WRITEV;
 	req->vol = vol;
 	req->iov = iov;
 	req->iovcnt = iovcnt;
@@ -1149,7 +1214,11 @@ spdk_reduce_vol_writev(struct spdk_reduce_vol *vol,
 	req->cb_fn = cb_fn;
 	req->cb_arg = cb_arg;
 
-	_start_writev_request(req);
+	if (!overlapped) {
+		_start_writev_request(req);
+	} else {
+		TAILQ_INSERT_TAIL(&vol->queued_requests, req, tailq);
+	}
 }
 
 SPDK_LOG_REGISTER_COMPONENT("reduce", SPDK_LOG_REDUCE)
diff --git a/test/unit/lib/reduce/reduce.c/reduce_ut.c b/test/unit/lib/reduce/reduce.c/reduce_ut.c
index 9107f8acb..c82e4e5d4 100644
--- a/test/unit/lib/reduce/reduce.c/reduce_ut.c
+++ b/test/unit/lib/reduce/reduce.c/reduce_ut.c
@@ -961,6 +961,72 @@ defer_bdev_io(void)
 	backing_dev_destroy(&backing_dev);
 }
 
+static void
+overlapped(void)
+{
+	struct spdk_reduce_vol_params params = {};
+	struct spdk_reduce_backing_dev backing_dev = {};
+	const uint32_t logical_block_size = 512;
+	struct iovec iov;
+	char buf[2 * logical_block_size];
+	char compare_buf[2 * logical_block_size];
+
+	params.chunk_size = 16 * 1024;
+	params.backing_io_unit_size = 4096;
+	params.logical_block_size = logical_block_size;
+	spdk_uuid_generate(&params.uuid);
+
+	backing_dev_init(&backing_dev, &params, 512);
+
+	g_vol = NULL;
+	g_reduce_errno = -1;
+	spdk_reduce_vol_init(&params, &backing_dev, TEST_MD_PATH, init_cb, NULL);
+	CU_ASSERT(g_reduce_errno == 0);
+	SPDK_CU_ASSERT_FATAL(g_vol != NULL);
+
+	/* Write 0xAA to 1 512-byte logical block. */
+	memset(buf, 0xAA, logical_block_size);
+	iov.iov_base = buf;
+	iov.iov_len = logical_block_size;
+	g_reduce_errno = -100;
+	g_defer_bdev_io = true;
+	spdk_reduce_vol_writev(g_vol, &iov, 1, 0, 1, write_cb, NULL);
+	/* Callback should not have executed, so this should still equal -100. */
+	CU_ASSERT(g_reduce_errno == -100);
+	CU_ASSERT(!TAILQ_EMPTY(&g_pending_bdev_io));
+	CU_ASSERT(g_pending_bdev_io_count == params.chunk_size / params.backing_io_unit_size);
+
+	/* Now do an overlapped I/O to the same chunk. */
+	spdk_reduce_vol_writev(g_vol, &iov, 1, 1, 1, write_cb, NULL);
+	/* Callback should not have executed, so this should still equal -100. */
+	CU_ASSERT(g_reduce_errno == -100);
+	CU_ASSERT(!TAILQ_EMPTY(&g_pending_bdev_io));
+	/* The second I/O overlaps with the first one.  So we should only see pending bdev_io
+	 * related to the first I/O here - the second one won't start until the first one is completed.
+	 */
+	CU_ASSERT(g_pending_bdev_io_count == params.chunk_size / params.backing_io_unit_size);
+
+	backing_dev_io_execute(0);
+	CU_ASSERT(g_reduce_errno == 0);
+
+	g_defer_bdev_io = false;
+	memset(compare_buf, 0xAA, sizeof(compare_buf));
+	memset(buf, 0xFF, sizeof(buf));
+	iov.iov_base = buf;
+	iov.iov_len = 2 * logical_block_size;
+	g_reduce_errno = -100;
+	spdk_reduce_vol_readv(g_vol, &iov, 1, 0, 2, read_cb, NULL);
+	CU_ASSERT(g_reduce_errno == 0);
+	CU_ASSERT(memcmp(buf, compare_buf, 2 * logical_block_size) == 0);
+
+	g_reduce_errno = -1;
+	spdk_reduce_vol_unload(g_vol, unload_cb, NULL);
+	CU_ASSERT(g_reduce_errno == 0);
+
+	persistent_pm_buf_destroy();
+	backing_dev_destroy(&backing_dev);
+}
+
 int
 main(int argc, char **argv)
 {
@@ -987,7 +1053,8 @@ main(int argc, char **argv)
 		CU_add_test(suite, "write_maps", write_maps) == NULL ||
 		CU_add_test(suite, "read_write", read_write) == NULL ||
 		CU_add_test(suite, "destroy", destroy) == NULL ||
-		CU_add_test(suite, "defer_bdev_io", defer_bdev_io) == NULL
+		CU_add_test(suite, "defer_bdev_io", defer_bdev_io) == NULL ||
+		CU_add_test(suite, "overlapped", overlapped) == NULL
 	) {
 		CU_cleanup_registry();
 		return CU_get_error();