diff --git a/include/spdk_internal/bdev.h b/include/spdk_internal/bdev.h
index 899c5bffa..4c65ab4d4 100644
--- a/include/spdk_internal/bdev.h
+++ b/include/spdk_internal/bdev.h
@@ -161,6 +161,13 @@ struct spdk_bdev_fn_table {
 
 /** bdev I/O completion status */
 enum spdk_bdev_io_status {
+	/*
+	 * NOMEM should be returned when a bdev module cannot start an I/O because of
+	 * some lack of resources. It must not be returned for RESET I/O. I/O completed
+	 * with NOMEM status will be retried after some I/O from the same channel have
+	 * completed.
+	 */
+	SPDK_BDEV_IO_STATUS_NOMEM = -4,
 	SPDK_BDEV_IO_STATUS_SCSI_ERROR = -3,
 	SPDK_BDEV_IO_STATUS_NVME_ERROR = -2,
 	SPDK_BDEV_IO_STATUS_FAILED = -1,
diff --git a/lib/bdev/aio/bdev_aio.c b/lib/bdev/aio/bdev_aio.c
index 96f53abad..0080082ba 100644
--- a/lib/bdev/aio/bdev_aio.c
+++ b/lib/bdev/aio/bdev_aio.c
@@ -120,7 +120,11 @@ bdev_aio_readv(struct file_disk *fdisk, struct spdk_io_channel *ch,
 
 	rc = io_submit(aio_ch->io_ctx, 1, &iocb);
 	if (rc < 0) {
-		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
-		SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
+		if (rc == -EAGAIN) {
+			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
+		} else {
+			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
+			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
+		}
 		return -1;
 	}
@@ -146,7 +150,11 @@ bdev_aio_writev(struct file_disk *fdisk, struct spdk_io_channel *ch,
 
 	rc = io_submit(aio_ch->io_ctx, 1, &iocb);
 	if (rc < 0) {
-		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
-		SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
+		if (rc == -EAGAIN) {
+			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
+		} else {
+			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
+			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
+		}
 		return -1;
 	}
diff --git a/lib/bdev/bdev.c b/lib/bdev/bdev.c
index 74eb74776..e59ea21be 100644
--- a/lib/bdev/bdev.c
+++ b/lib/bdev/bdev.c
@@ -57,6 +57,7 @@ int __itt_init_ittlib(const char *, __itt_group_id);
 #define SPDK_BDEV_IO_POOL_SIZE	(64 * 1024)
 #define BUF_SMALL_POOL_SIZE	8192
 #define BUF_LARGE_POOL_SIZE	1024
+#define NOMEM_THRESHOLD_COUNT	8
 
 typedef TAILQ_HEAD(, spdk_bdev_io) bdev_io_tailq_t;
 
@@ -128,6 +129,17 @@ struct spdk_bdev_channel {
 
 	bdev_io_tailq_t	queued_resets;
 
+	/*
+	 * Queue of IO awaiting retry because of a previous NOMEM status returned
+	 * on this channel.
+	 */
+	bdev_io_tailq_t	nomem_io;
+
+	/*
+	 * Threshold which io_outstanding must drop to before retrying nomem_io.
+	 */
+	uint64_t	nomem_threshold;
+
 	uint32_t	flags;
 
 #ifdef SPDK_CONFIG_VTUNE
@@ -612,7 +624,12 @@ spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
 	bdev_ch->io_outstanding++;
 	bdev_io->in_submit_request = true;
 	if (spdk_likely(bdev_ch->flags == 0)) {
-		bdev->fn_table->submit_request(ch, bdev_io);
+		if (spdk_likely(TAILQ_EMPTY(&bdev_ch->nomem_io))) {
+			bdev->fn_table->submit_request(ch, bdev_io);
+		} else {
+			bdev_ch->io_outstanding--;
+			TAILQ_INSERT_TAIL(&bdev_ch->nomem_io, bdev_io, link);
+		}
 	} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 	} else {
@@ -676,6 +693,8 @@ spdk_bdev_channel_create(void *io_device, void *ctx_buf)
 	memset(&ch->stat, 0, sizeof(ch->stat));
 	ch->io_outstanding = 0;
 	TAILQ_INIT(&ch->queued_resets);
+	TAILQ_INIT(&ch->nomem_io);
+	ch->nomem_threshold = 0;
 	ch->flags = 0;
 
 #ifdef SPDK_CONFIG_VTUNE
@@ -725,6 +744,15 @@ _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
 	TAILQ_FOREACH_SAFE(bdev_io, queue, link, tmp) {
 		if (bdev_io->ch == ch) {
 			TAILQ_REMOVE(queue, bdev_io, link);
+			/*
+			 * spdk_bdev_io_complete() assumes that the completed I/O had
+			 * been submitted to the bdev module. Since in this case it
+			 * hadn't, bump io_outstanding to account for the decrement
+			 * that spdk_bdev_io_complete() will do.
+			 */
+			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
+				ch->io_outstanding++;
+			}
 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 		}
 	}
@@ -739,6 +767,7 @@ spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
 	mgmt_channel = spdk_io_channel_get_ctx(ch->mgmt_channel);
 
 	_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
+	_spdk_bdev_abort_queued_io(&ch->nomem_io, ch);
 	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, ch);
 	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, ch);
 
@@ -1201,6 +1230,7 @@ _spdk_bdev_reset_abort_channel(void *io_device, struct spdk_io_channel *ch,
 
 	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
 
+	_spdk_bdev_abort_queued_io(&channel->nomem_io, channel);
 	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
 	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);
 }
@@ -1378,6 +1408,36 @@ spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
 	return 0;
 }
 
+static void
+_spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
+{
+	struct spdk_bdev *bdev = bdev_ch->bdev;
+	struct spdk_bdev_io *bdev_io;
+
+	if (bdev_ch->io_outstanding > bdev_ch->nomem_threshold) {
+		/*
+		 * Allow some more I/O to complete before retrying the nomem_io queue.
+		 * Some drivers (such as nvme) cannot immediately take a new I/O in
+		 * the context of a completion, because the resources for the I/O are
+		 * not released until control returns to the bdev poller. Also, we
+		 * may require several small I/O to complete before a larger I/O
+		 * (that requires splitting) can be submitted.
+		 */
+		return;
+	}
+
+	while (!TAILQ_EMPTY(&bdev_ch->nomem_io)) {
+		bdev_io = TAILQ_FIRST(&bdev_ch->nomem_io);
+		TAILQ_REMOVE(&bdev_ch->nomem_io, bdev_io, link);
+		bdev_ch->io_outstanding++;
+		bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
+		bdev->fn_table->submit_request(bdev_ch->channel, bdev_io);
+		if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) {
+			break;
+		}
+	}
+}
+
 static void
 _spdk_bdev_io_complete(void *ctx)
 {
@@ -1396,6 +1456,9 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status sta
 	bdev_io->status = status;
 
 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
+		if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
+			SPDK_ERRLOG("NOMEM returned for reset\n");
+		}
 		pthread_mutex_lock(&bdev->mutex);
 		if (bdev_io == bdev->reset_in_progress) {
 			bdev->reset_in_progress = NULL;
@@ -1408,6 +1471,22 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status sta
 	} else {
 		assert(bdev_ch->io_outstanding > 0);
 		bdev_ch->io_outstanding--;
+		if (spdk_likely(status != SPDK_BDEV_IO_STATUS_NOMEM)) {
+			if (spdk_unlikely(!TAILQ_EMPTY(&bdev_ch->nomem_io))) {
+				_spdk_bdev_ch_retry_io(bdev_ch);
+			}
+		} else {
+			TAILQ_INSERT_HEAD(&bdev_ch->nomem_io, bdev_io, link);
+			/*
+			 * Wait for some of the outstanding I/O to complete before we
+			 * retry any of the nomem_io. Normally we will wait for
+			 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
+			 * depth channels we will instead wait for half to complete.
+			 */
+			bdev_ch->nomem_threshold = spdk_max((int64_t)bdev_ch->io_outstanding / 2,
+							    (int64_t)bdev_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
+			return;
+		}
 	}
 
 	if (status == SPDK_BDEV_IO_STATUS_SUCCESS) {
diff --git a/lib/bdev/malloc/bdev_malloc.c b/lib/bdev/malloc/bdev_malloc.c
index 1b6200dfe..42461ab3d 100644
--- a/lib/bdev/malloc/bdev_malloc.c
+++ b/lib/bdev/malloc/bdev_malloc.c
@@ -75,7 +75,11 @@ malloc_done(void *ref, int status)
 	struct malloc_task *task = __malloc_task_from_copy_task(ref);
 
 	if (status != 0) {
-		task->status = SPDK_BDEV_IO_STATUS_FAILED;
+		if (status == -ENOMEM) {
+			task->status = SPDK_BDEV_IO_STATUS_NOMEM;
+		} else {
+			task->status = SPDK_BDEV_IO_STATUS_FAILED;
+		}
 	}
 
 	if (--task->num_outstanding == 0) {
diff --git a/lib/bdev/nvme/bdev_nvme.c b/lib/bdev/nvme/bdev_nvme.c
index 7edde71a9..9a7a00515 100644
--- a/lib/bdev/nvme/bdev_nvme.c
+++ b/lib/bdev/nvme/bdev_nvme.c
@@ -347,7 +347,11 @@ bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
 				     bdev_io->u.bdev.num_blocks,
 				     bdev_io->u.bdev.offset_blocks);
 
-	if (ret != 0) {
+	if (spdk_likely(ret == 0)) {
+		return;
+	} else if (ret == -ENOMEM) {
+		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+	} else {
 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 	}
 }
@@ -428,7 +432,11 @@ bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_i
 	int rc = _bdev_nvme_submit_request(ch, bdev_io);
 
 	if (spdk_unlikely(rc != 0)) {
-		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+		if (rc == -ENOMEM) {
+			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+		} else {
+			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+		}
 	}
 }
 
@@ -1256,7 +1264,7 @@ bdev_nvme_queue_cmd(struct nvme_bdev *bdev, struct spdk_nvme_qpair *qpair,
 					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge);
 	}
 
-	if (rc != 0) {
+	if (rc != 0 && rc != -ENOMEM) {
 		SPDK_ERRLOG("%s failed: rc = %d\n",
"readv" : "writev", rc); } return rc; diff --git a/test/unit/lib/bdev/mt/bdev.c/bdev_ut.c b/test/unit/lib/bdev/mt/bdev.c/bdev_ut.c index 780fd79b0..9e37164bd 100644 --- a/test/unit/lib/bdev/mt/bdev.c/bdev_ut.c +++ b/test/unit/lib/bdev/mt/bdev.c/bdev_ut.c @@ -99,12 +99,24 @@ stub_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) { struct ut_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + if (bdev_io->type == SPDK_BDEV_IO_TYPE_RESET) { + struct spdk_bdev_io *io; + + while (!TAILQ_EMPTY(&ch->outstanding_io)) { + io = TAILQ_FIRST(&ch->outstanding_io); + TAILQ_REMOVE(&ch->outstanding_io, io, module_link); + ch->outstanding_cnt--; + spdk_bdev_io_complete(io, SPDK_BDEV_IO_STATUS_FAILED); + ch->avail_cnt++; + } + } + if (ch->avail_cnt > 0) { TAILQ_INSERT_TAIL(&ch->outstanding_io, bdev_io, module_link); ch->outstanding_cnt++; ch->avail_cnt--; } else { - spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); } } @@ -413,6 +425,122 @@ io_during_reset(void) teardown_test(); } +static void +enomem_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + enum spdk_bdev_io_status *status = cb_arg; + + *status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + spdk_bdev_free_io(bdev_io); +} + +static uint32_t +bdev_io_tailq_cnt(bdev_io_tailq_t *tailq) +{ + struct spdk_bdev_io *io; + uint32_t cnt = 0; + + TAILQ_FOREACH(io, tailq, link) { + cnt++; + } + + return cnt; +} + +static void +enomem(void) +{ + struct spdk_io_channel *io_ch; + struct spdk_bdev_channel *bdev_ch; + struct ut_bdev_channel *ut_ch; + const uint32_t IO_ARRAY_SIZE = 64; + const uint32_t AVAIL = 20; + enum spdk_bdev_io_status status[IO_ARRAY_SIZE], status_reset; + uint32_t nomem_cnt, i; + struct spdk_bdev_io *first_io; + int rc; + + setup_test(); + + set_thread(0); + io_ch = spdk_bdev_get_io_channel(g_desc); + bdev_ch = spdk_io_channel_get_ctx(io_ch); + ut_ch = spdk_io_channel_get_ctx(bdev_ch->channel); + ut_ch->avail_cnt = AVAIL; + + /* First submit a number of IOs equal to what the channel can support. */ + for (i = 0; i < AVAIL; i++) { + status[i] = SPDK_BDEV_IO_STATUS_PENDING; + rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[i]); + CU_ASSERT(rc == 0); + } + CU_ASSERT(TAILQ_EMPTY(&bdev_ch->nomem_io)); + + /* + * Next, submit one additional I/O. This one should fail with ENOMEM and then go onto + * the enomem_io list. + */ + status[AVAIL] = SPDK_BDEV_IO_STATUS_PENDING; + rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[AVAIL]); + CU_ASSERT(rc == 0); + SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&bdev_ch->nomem_io)); + first_io = TAILQ_FIRST(&bdev_ch->nomem_io); + + /* + * Now submit a bunch more I/O. These should all fail with ENOMEM and get queued behind + * the first_io above. + */ + for (i = AVAIL + 1; i < IO_ARRAY_SIZE; i++) { + status[i] = SPDK_BDEV_IO_STATUS_PENDING; + rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[i]); + CU_ASSERT(rc == 0); + } + + /* Assert that first_io is still at the head of the list. */ + CU_ASSERT(TAILQ_FIRST(&bdev_ch->nomem_io) == first_io); + CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == (IO_ARRAY_SIZE - AVAIL)); + nomem_cnt = bdev_io_tailq_cnt(&bdev_ch->nomem_io); + CU_ASSERT(bdev_ch->nomem_threshold == (AVAIL - NOMEM_THRESHOLD_COUNT)); + + /* + * Complete 1 I/O only. 
+	 * Complete 1 I/O only. The key check here is bdev_io_tailq_cnt - this should not have
+	 * changed since completing just 1 I/O should not trigger retrying the queued nomem_io
+	 * list.
+	 */
+	stub_complete_io(1);
+	CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == nomem_cnt);
+
+	/*
+	 * Complete enough I/O to hit the nomem_threshold. This should trigger retrying nomem_io,
+	 * and we should see I/O get resubmitted to the test bdev module.
+	 */
+	stub_complete_io(NOMEM_THRESHOLD_COUNT - 1);
+	CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) < nomem_cnt);
+	nomem_cnt = bdev_io_tailq_cnt(&bdev_ch->nomem_io);
+
+	/* Complete 1 I/O only. This should not trigger retrying the queued nomem_io. */
+	stub_complete_io(1);
+	CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == nomem_cnt);
+
+	/*
+	 * Send a reset and confirm that all I/O are completed, including the ones that
+	 * were queued on the nomem_io list.
+	 */
+	status_reset = SPDK_BDEV_IO_STATUS_PENDING;
+	rc = spdk_bdev_reset(g_desc, io_ch, enomem_done, &status_reset);
+	CU_ASSERT(rc == 0);
+	poll_threads();
+	/* This will complete the reset. */
+	stub_complete_io(0);
+
+	CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == 0);
+	CU_ASSERT(bdev_ch->io_outstanding == 0);
+
+	spdk_put_io_channel(io_ch);
+	poll_threads();
+	teardown_test();
+}
+
 int
 main(int argc, char **argv)
 {
@@ -433,7 +561,8 @@ main(int argc, char **argv)
 		CU_add_test(suite, "basic", basic) == NULL ||
 		CU_add_test(suite, "put_channel_during_reset", put_channel_during_reset) == NULL ||
 		CU_add_test(suite, "aborted_reset", aborted_reset) == NULL ||
-		CU_add_test(suite, "io_during_reset", io_during_reset) == NULL
+		CU_add_test(suite, "io_during_reset", io_during_reset) == NULL ||
+		CU_add_test(suite, "enomem", enomem) == NULL
 	) {
 		CU_cleanup_registry();
 		return CU_get_error();
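
Note for bdev module authors (an illustrative sketch, not part of the patch above): with this
change, a module signals a transient resource shortage by completing the I/O with
SPDK_BDEV_IO_STATUS_NOMEM instead of SPDK_BDEV_IO_STATUS_FAILED. The bdev layer then parks the
I/O on the channel's nomem_io list and resubmits it once io_outstanding drains to
nomem_threshold. A hypothetical module's submit path could look like the following, where
example_driver_submit() is an assumed driver entry point returning 0 on success or a negative
errno on failure:

	static void
	example_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
	{
		/* Hypothetical driver call, not part of this patch. */
		int rc = example_driver_submit(ch, bdev_io);

		if (rc == -ENOMEM) {
			/*
			 * Transient shortage: the bdev layer queues this I/O on
			 * nomem_io and retries it after enough completions bring
			 * io_outstanding down to nomem_threshold.
			 */
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
		} else if (rc != 0) {
			/* Hard failure: completed immediately, never retried. */
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}

Per the new comment in include/spdk_internal/bdev.h, NOMEM must not be returned for
SPDK_BDEV_IO_TYPE_RESET; spdk_bdev_io_complete() logs an error in that case.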