bdev: add ENOMEM handling

At very high queue depths, bdev modules may not have enough
internal resources to track all of the incoming I/O.  For example,
we allocate a finite number of nvme_request objects per allocated
queue pair.  Currently, if these resources are exhausted, the
bdev module returns a generic failure (with no indication why),
which gets propagated all the way back to the application.

So instead, add SPDK_BDEV_IO_STATUS_NOMEM to allow bdev modules
to indicate this type of failure.  Also add handling for this
status in the generic bdev layer: queue these I/O and retry
them after other I/O on the failing channel have completed.
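
For bdev module authors, the pattern is to translate a transient
-ENOMEM from the submit path into the new status and let the
generic layer requeue the I/O.  A minimal sketch, assuming a
hypothetical module whose _example_submit() helper returns
-ENOMEM when its per-channel requests run out (the shape mirrors
the nvme and aio hunks below):

static void
example_bdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
    int rc = _example_submit(ch, bdev_io);

    if (spdk_unlikely(rc != 0)) {
        if (rc == -ENOMEM) {
            /* The generic bdev layer queues this I/O on the channel's
             * nomem_io list and resubmits it after other I/O on the
             * same channel complete.
             */
            spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
        } else {
            spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
        }
    }
}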

This does place an expectation on the bdev module that these
internal resources are allocated per io_channel.  Otherwise we
cannot guarantee forward progress solely on reception of
completions.  For example, without this guarantee, a bdev
module could theoretically return ENOMEM even if there were
no I/O outstanding for that io_channel.  The nvme, aio, rbd,
virtio and null drivers already comply with this expectation;
malloc complies only when not using copy offload.
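
One way to honor this expectation is to carve the request pool
out of the io_channel itself, so an empty pool always implies
I/O in flight on that same channel whose completions will
replenish it.  A hedged sketch of that layout (all names here
are illustrative, not part of this patch):

struct example_request {
    TAILQ_ENTRY(example_request)    link;
    /* ... per-request state ... */
};

struct example_io_channel {
    /* Fixed pool of requests, filled when the io_channel is created. */
    TAILQ_HEAD(, example_request)   free_requests;
};

static int
_example_submit(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
    struct example_io_channel *ex_ch = spdk_io_channel_get_ctx(ch);
    struct example_request *req = TAILQ_FIRST(&ex_ch->free_requests);

    if (req == NULL) {
        /* The pool is empty, so this channel necessarily has I/O in
         * flight; their completions return requests to the pool and
         * guarantee forward progress for the retry queue.
         */
        return -ENOMEM;
    }
    TAILQ_REMOVE(&ex_ch->free_requests, req, link);
    /* ... fill out req and start the I/O ... */
    return 0;
}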

This patch also fixes malloc with the copy engine to at least
return ENOMEM when no copy descriptors are available.  If the
condition above occurs, I/O waiting for resources will be
failed as part of a subsequent reset, which matches the
behavior it has today.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: Iea7cd51a611af8abe882794d0b2361fdbb74e84e

Reviewed-on: https://review.gerrithub.io/378853
Tested-by: SPDK Automated Test System <sys_sgsw@intel.com>
Reviewed-by: Daniel Verkamp <daniel.verkamp@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>

@@ -161,6 +161,13 @@ struct spdk_bdev_fn_table {
 /** bdev I/O completion status */
 enum spdk_bdev_io_status {
+    /*
+     * NOMEM should be returned when a bdev module cannot start an I/O because of
+     * some lack of resources.  It may not be returned for RESET I/O.  I/O completed
+     * with NOMEM status will be retried after some I/O from the same channel have
+     * completed.
+     */
+    SPDK_BDEV_IO_STATUS_NOMEM = -4,
     SPDK_BDEV_IO_STATUS_SCSI_ERROR = -3,
     SPDK_BDEV_IO_STATUS_NVME_ERROR = -2,
     SPDK_BDEV_IO_STATUS_FAILED = -1,

@@ -120,7 +120,11 @@ bdev_aio_readv(struct file_disk *fdisk, struct spdk_io_channel *ch,
     rc = io_submit(aio_ch->io_ctx, 1, &iocb);
     if (rc < 0) {
-        spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
+        if (rc == -EAGAIN) {
+            spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
+        } else {
+            spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
+        }
         SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
         return -1;
     }
@@ -146,7 +150,11 @@ bdev_aio_writev(struct file_disk *fdisk, struct spdk_io_channel *ch,
     rc = io_submit(aio_ch->io_ctx, 1, &iocb);
     if (rc < 0) {
-        spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
+        if (rc == -EAGAIN) {
+            spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
+        } else {
+            spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
+        }
         SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
         return -1;
     }

@@ -57,6 +57,7 @@ int __itt_init_ittlib(const char *, __itt_group_id);
 #define SPDK_BDEV_IO_POOL_SIZE    (64 * 1024)
 #define BUF_SMALL_POOL_SIZE       8192
 #define BUF_LARGE_POOL_SIZE       1024
+#define NOMEM_THRESHOLD_COUNT     8
 
 typedef TAILQ_HEAD(, spdk_bdev_io) bdev_io_tailq_t;
@@ -128,6 +129,17 @@ struct spdk_bdev_channel {
     bdev_io_tailq_t    queued_resets;
 
+    /*
+     * Queue of IO awaiting retry because of a previous NOMEM status returned
+     * on this channel.
+     */
+    bdev_io_tailq_t    nomem_io;
+
+    /*
+     * Threshold which io_outstanding must drop to before retrying nomem_io.
+     */
+    uint64_t           nomem_threshold;
+
     uint32_t           flags;
 
 #ifdef SPDK_CONFIG_VTUNE
@@ -612,7 +624,12 @@ spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
     bdev_ch->io_outstanding++;
     bdev_io->in_submit_request = true;
     if (spdk_likely(bdev_ch->flags == 0)) {
-        bdev->fn_table->submit_request(ch, bdev_io);
+        if (spdk_likely(TAILQ_EMPTY(&bdev_ch->nomem_io))) {
+            bdev->fn_table->submit_request(ch, bdev_io);
+        } else {
+            bdev_ch->io_outstanding--;
+            TAILQ_INSERT_TAIL(&bdev_ch->nomem_io, bdev_io, link);
+        }
     } else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
         spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
     } else {
@@ -676,6 +693,8 @@ spdk_bdev_channel_create(void *io_device, void *ctx_buf)
     memset(&ch->stat, 0, sizeof(ch->stat));
     ch->io_outstanding = 0;
     TAILQ_INIT(&ch->queued_resets);
+    TAILQ_INIT(&ch->nomem_io);
+    ch->nomem_threshold = 0;
     ch->flags = 0;
 
 #ifdef SPDK_CONFIG_VTUNE
@@ -725,6 +744,15 @@ _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
     TAILQ_FOREACH_SAFE(bdev_io, queue, link, tmp) {
         if (bdev_io->ch == ch) {
             TAILQ_REMOVE(queue, bdev_io, link);
+            /*
+             * spdk_bdev_io_complete() assumes that the completed I/O had
+             * been submitted to the bdev module.  Since in this case it
+             * hadn't, bump io_outstanding to account for the decrement
+             * that spdk_bdev_io_complete() will do.
+             */
+            if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
+                ch->io_outstanding++;
+            }
             spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
         }
     }
@@ -739,6 +767,7 @@ spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
     mgmt_channel = spdk_io_channel_get_ctx(ch->mgmt_channel);
 
     _spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
+    _spdk_bdev_abort_queued_io(&ch->nomem_io, ch);
     _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, ch);
     _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, ch);
@@ -1201,6 +1230,7 @@ _spdk_bdev_reset_abort_channel(void *io_device, struct spdk_io_channel *ch,
     channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
 
+    _spdk_bdev_abort_queued_io(&channel->nomem_io, channel);
     _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
     _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);
 }
@@ -1378,6 +1408,36 @@ spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
     return 0;
 }
 
+static void
+_spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
+{
+    struct spdk_bdev *bdev = bdev_ch->bdev;
+    struct spdk_bdev_io *bdev_io;
+
+    if (bdev_ch->io_outstanding > bdev_ch->nomem_threshold) {
+        /*
+         * Allow some more I/O to complete before retrying the nomem_io queue.
+         * Some drivers (such as nvme) cannot immediately take a new I/O in
+         * the context of a completion, because the resources for the I/O are
+         * not released until control returns to the bdev poller.  Also, we
+         * may require several small I/O to complete before a larger I/O
+         * (that requires splitting) can be submitted.
+         */
+        return;
+    }
+
+    while (!TAILQ_EMPTY(&bdev_ch->nomem_io)) {
+        bdev_io = TAILQ_FIRST(&bdev_ch->nomem_io);
+        TAILQ_REMOVE(&bdev_ch->nomem_io, bdev_io, link);
+        bdev_ch->io_outstanding++;
+        bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
+        bdev->fn_table->submit_request(bdev_ch->channel, bdev_io);
+        if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) {
+            break;
+        }
+    }
+}
+
 static void
 _spdk_bdev_io_complete(void *ctx)
 {
@@ -1396,6 +1456,9 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
     bdev_io->status = status;
 
     if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
+        if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
+            SPDK_ERRLOG("NOMEM returned for reset\n");
+        }
         pthread_mutex_lock(&bdev->mutex);
         if (bdev_io == bdev->reset_in_progress) {
             bdev->reset_in_progress = NULL;
@@ -1408,6 +1471,22 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
     } else {
         assert(bdev_ch->io_outstanding > 0);
         bdev_ch->io_outstanding--;
+        if (spdk_likely(status != SPDK_BDEV_IO_STATUS_NOMEM)) {
+            if (spdk_unlikely(!TAILQ_EMPTY(&bdev_ch->nomem_io))) {
+                _spdk_bdev_ch_retry_io(bdev_ch);
+            }
+        } else {
+            TAILQ_INSERT_HEAD(&bdev_ch->nomem_io, bdev_io, link);
+            /*
+             * Wait for some of the outstanding I/O to complete before we
+             * retry any of the nomem_io.  Normally we will wait for
+             * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
+             * depth channels we will instead wait for half to complete.
+             */
+            bdev_ch->nomem_threshold = spdk_max((int64_t)bdev_ch->io_outstanding / 2,
+                                                (int64_t)bdev_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
+            return;
+        }
     }
 
     if (status == SPDK_BDEV_IO_STATUS_SUCCESS) {
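
To make the retry threshold computed above concrete: if an I/O
completes with NOMEM while 20 other I/O remain outstanding on the
channel, nomem_threshold becomes spdk_max(20 / 2, 20 -
NOMEM_THRESHOLD_COUNT) = spdk_max(10, 12) = 12, so the queued I/O
are not retried until 8 of those completions arrive.  On a shallow
channel with only 4 outstanding, the threshold is spdk_max(2, -4)
= 2, i.e. half must complete first (the int64_t casts keep the
subtraction from wrapping).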

@@ -75,7 +75,11 @@ malloc_done(void *ref, int status)
     struct malloc_task *task = __malloc_task_from_copy_task(ref);
 
     if (status != 0) {
-        task->status = SPDK_BDEV_IO_STATUS_FAILED;
+        if (status == -ENOMEM) {
+            task->status = SPDK_BDEV_IO_STATUS_NOMEM;
+        } else {
+            task->status = SPDK_BDEV_IO_STATUS_FAILED;
+        }
     }
 
     if (--task->num_outstanding == 0) {

@@ -347,7 +347,11 @@ bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
                  bdev_io->u.bdev.num_blocks,
                  bdev_io->u.bdev.offset_blocks);
 
-    if (ret != 0) {
+    if (spdk_likely(ret == 0)) {
+        return;
+    } else if (ret == -ENOMEM) {
+        spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+    } else {
         spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
     }
 }
@@ -428,7 +432,11 @@ bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
     int rc = _bdev_nvme_submit_request(ch, bdev_io);
 
     if (spdk_unlikely(rc != 0)) {
-        spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+        if (rc == -ENOMEM) {
+            spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+        } else {
+            spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+        }
     }
 }
@@ -1256,7 +1264,7 @@ bdev_nvme_queue_cmd(struct nvme_bdev *bdev, struct spdk_nvme_qpair *qpair,
                    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge);
     }
 
-    if (rc != 0) {
+    if (rc != 0 && rc != -ENOMEM) {
         SPDK_ERRLOG("%s failed: rc = %d\n", direction == BDEV_DISK_READ ? "readv" : "writev", rc);
     }
     return rc;

@@ -99,12 +99,24 @@ stub_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io)
 {
     struct ut_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
 
+    if (bdev_io->type == SPDK_BDEV_IO_TYPE_RESET) {
+        struct spdk_bdev_io *io;
+
+        while (!TAILQ_EMPTY(&ch->outstanding_io)) {
+            io = TAILQ_FIRST(&ch->outstanding_io);
+            TAILQ_REMOVE(&ch->outstanding_io, io, module_link);
+            ch->outstanding_cnt--;
+            spdk_bdev_io_complete(io, SPDK_BDEV_IO_STATUS_FAILED);
+            ch->avail_cnt++;
+        }
+    }
+
     if (ch->avail_cnt > 0) {
         TAILQ_INSERT_TAIL(&ch->outstanding_io, bdev_io, module_link);
         ch->outstanding_cnt++;
         ch->avail_cnt--;
     } else {
-        spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+        spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
     }
 }
@@ -413,6 +425,122 @@ io_during_reset(void)
     teardown_test();
 }
 
+static void
+enomem_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+    enum spdk_bdev_io_status *status = cb_arg;
+
+    *status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
+    spdk_bdev_free_io(bdev_io);
+}
+
+static uint32_t
+bdev_io_tailq_cnt(bdev_io_tailq_t *tailq)
+{
+    struct spdk_bdev_io *io;
+    uint32_t cnt = 0;
+
+    TAILQ_FOREACH(io, tailq, link) {
+        cnt++;
+    }
+
+    return cnt;
+}
+
+static void
+enomem(void)
+{
+    struct spdk_io_channel *io_ch;
+    struct spdk_bdev_channel *bdev_ch;
+    struct ut_bdev_channel *ut_ch;
+    const uint32_t IO_ARRAY_SIZE = 64;
+    const uint32_t AVAIL = 20;
+    enum spdk_bdev_io_status status[IO_ARRAY_SIZE], status_reset;
+    uint32_t nomem_cnt, i;
+    struct spdk_bdev_io *first_io;
+    int rc;
+
+    setup_test();
+
+    set_thread(0);
+    io_ch = spdk_bdev_get_io_channel(g_desc);
+    bdev_ch = spdk_io_channel_get_ctx(io_ch);
+    ut_ch = spdk_io_channel_get_ctx(bdev_ch->channel);
+    ut_ch->avail_cnt = AVAIL;
+
+    /* First submit a number of IOs equal to what the channel can support. */
+    for (i = 0; i < AVAIL; i++) {
+        status[i] = SPDK_BDEV_IO_STATUS_PENDING;
+        rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[i]);
+        CU_ASSERT(rc == 0);
+    }
+    CU_ASSERT(TAILQ_EMPTY(&bdev_ch->nomem_io));
+
+    /*
+     * Next, submit one additional I/O.  This one should fail with ENOMEM and then go onto
+     * the nomem_io list.
+     */
+    status[AVAIL] = SPDK_BDEV_IO_STATUS_PENDING;
+    rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[AVAIL]);
+    CU_ASSERT(rc == 0);
+    SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&bdev_ch->nomem_io));
+    first_io = TAILQ_FIRST(&bdev_ch->nomem_io);
+
+    /*
+     * Now submit a bunch more I/O.  These should all fail with ENOMEM and get queued behind
+     * the first_io above.
+     */
+    for (i = AVAIL + 1; i < IO_ARRAY_SIZE; i++) {
+        status[i] = SPDK_BDEV_IO_STATUS_PENDING;
+        rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[i]);
+        CU_ASSERT(rc == 0);
+    }
+
+    /* Assert that first_io is still at the head of the list. */
+    CU_ASSERT(TAILQ_FIRST(&bdev_ch->nomem_io) == first_io);
+    CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == (IO_ARRAY_SIZE - AVAIL));
+    nomem_cnt = bdev_io_tailq_cnt(&bdev_ch->nomem_io);
+    CU_ASSERT(bdev_ch->nomem_threshold == (AVAIL - NOMEM_THRESHOLD_COUNT));
+
+    /*
+     * Complete 1 I/O only.  The key check here is bdev_io_tailq_cnt - this should not have
+     * changed since completing just 1 I/O should not trigger retrying the queued nomem_io
+     * list.
+     */
+    stub_complete_io(1);
+    CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == nomem_cnt);
+
+    /*
+     * Complete enough I/O to hit the nomem_threshold.  This should trigger retrying nomem_io,
+     * and we should see I/O get resubmitted to the test bdev module.
+     */
+    stub_complete_io(NOMEM_THRESHOLD_COUNT - 1);
+    CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) < nomem_cnt);
+    nomem_cnt = bdev_io_tailq_cnt(&bdev_ch->nomem_io);
+
+    /* Complete 1 I/O only.  This should not trigger retrying the queued nomem_io. */
+    stub_complete_io(1);
+    CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == nomem_cnt);
+
+    /*
+     * Send a reset and confirm that all I/O are completed, including the ones that
+     * were queued on the nomem_io list.
+     */
+    status_reset = SPDK_BDEV_IO_STATUS_PENDING;
+    rc = spdk_bdev_reset(g_desc, io_ch, enomem_done, &status_reset);
+    poll_threads();
+    CU_ASSERT(rc == 0);
+    /* This will complete the reset. */
+    stub_complete_io(0);
+
+    CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == 0);
+    CU_ASSERT(bdev_ch->io_outstanding == 0);
+
+    spdk_put_io_channel(io_ch);
+    poll_threads();
+    teardown_test();
+}
+
 int
 main(int argc, char **argv)
 {
@@ -433,7 +561,8 @@ main(int argc, char **argv)
         CU_add_test(suite, "basic", basic) == NULL ||
         CU_add_test(suite, "put_channel_during_reset", put_channel_during_reset) == NULL ||
         CU_add_test(suite, "aborted_reset", aborted_reset) == NULL ||
-        CU_add_test(suite, "io_during_reset", io_during_reset) == NULL
+        CU_add_test(suite, "io_during_reset", io_during_reset) == NULL ||
+        CU_add_test(suite, "enomem", enomem) == NULL
     ) {
         CU_cleanup_registry();
         return CU_get_error();