diff --git a/lib/bdev/bdev.c b/lib/bdev/bdev.c
index b2f3fca93..1c545e8fc 100644
--- a/lib/bdev/bdev.c
+++ b/lib/bdev/bdev.c
@@ -113,6 +113,8 @@ struct spdk_bdev_mgmt_channel {
 	 */
 	bdev_io_stailq_t per_thread_cache;
 	uint32_t	per_thread_cache_count;
+
+	TAILQ_HEAD(, spdk_bdev_module_channel) module_channels;
 };
 
 struct spdk_bdev_desc {
@@ -136,14 +138,33 @@ struct spdk_bdev_channel {
 
 	struct spdk_bdev_io_stat stat;
 
+	bdev_io_tailq_t		queued_resets;
+
+	uint32_t		flags;
+
+	/* Per-device channel */
+	struct spdk_bdev_module_channel *module_ch;
+
+#ifdef SPDK_CONFIG_VTUNE
+	uint64_t		start_tsc;
+	uint64_t		interval_tsc;
+	__itt_string_handle	*handle;
+#endif
+
+};
+
+/*
+ * Per-module (or per-io_device) channel. Multiple bdevs built on the same io_device
+ * will queue their I/O awaiting retry here. This makes it possible to retry sending
+ * I/O to one bdev after I/O from another bdev completes.
+ */
+struct spdk_bdev_module_channel {
 	/*
 	 * Count of I/O submitted to bdev module and waiting for completion.
 	 * Incremented before submit_request() is called on an spdk_bdev_io.
 	 */
 	uint64_t		io_outstanding;
 
-	bdev_io_tailq_t		queued_resets;
-
 	/*
 	 * Queue of IO awaiting retry because of a previous NOMEM status returned
 	 * on this channel.
@@ -155,14 +176,12 @@ struct spdk_bdev_channel {
 	 */
 	uint64_t		nomem_threshold;
 
-	uint32_t		flags;
+	/* I/O channel allocated by a bdev module */
+	struct spdk_io_channel	*module_ch;
 
-#ifdef SPDK_CONFIG_VTUNE
-	uint64_t		start_tsc;
-	uint64_t		interval_tsc;
-	__itt_string_handle	*handle;
-#endif
+	uint32_t		ref;
 
+	TAILQ_ENTRY(spdk_bdev_module_channel) link;
 };
 
 static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
@@ -379,6 +398,8 @@ spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
 	STAILQ_INIT(&ch->per_thread_cache);
 	ch->per_thread_cache_count = 0;
 
+	TAILQ_INIT(&ch->module_channels);
+
 	return 0;
 }
 
@@ -782,17 +803,18 @@ spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
 	struct spdk_bdev *bdev = bdev_io->bdev;
 	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
 	struct spdk_io_channel *ch = bdev_ch->channel;
+	struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;
 
 	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);
-	bdev_ch->io_outstanding++;
+	shared_ch->io_outstanding++;
 	bdev_io->in_submit_request = true;
 	if (spdk_likely(bdev_ch->flags == 0)) {
-		if (spdk_likely(TAILQ_EMPTY(&bdev_ch->nomem_io))) {
+		if (spdk_likely(TAILQ_EMPTY(&shared_ch->nomem_io))) {
 			bdev->fn_table->submit_request(ch, bdev_io);
 		} else {
-			bdev_ch->io_outstanding--;
-			TAILQ_INSERT_TAIL(&bdev_ch->nomem_io, bdev_io, link);
+			shared_ch->io_outstanding--;
+			TAILQ_INSERT_TAIL(&shared_ch->nomem_io, bdev_io, link);
 		}
 	} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 	} else {
@@ -851,6 +873,8 @@ spdk_bdev_channel_create(void *io_device, void *ctx_buf)
 {
 	struct spdk_bdev *bdev = io_device;
 	struct spdk_bdev_channel *ch = ctx_buf;
+	struct spdk_bdev_mgmt_channel *mgmt_ch;
+	struct spdk_bdev_module_channel *shared_ch;
 
 	ch->bdev = io_device;
 	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
@@ -864,12 +888,34 @@ spdk_bdev_channel_create(void *io_device, void *ctx_buf)
 		return -1;
 	}
 
+	mgmt_ch = spdk_io_channel_get_ctx(ch->mgmt_channel);
+	TAILQ_FOREACH(shared_ch, &mgmt_ch->module_channels, link) {
+		if (shared_ch->module_ch == ch->channel) {
+			shared_ch->ref++;
+			break;
+		}
+	}
+
+	if (shared_ch == NULL) {
+		shared_ch = calloc(1, sizeof(*shared_ch));
+		if (!shared_ch) {
+			spdk_put_io_channel(ch->channel);
+			spdk_put_io_channel(ch->mgmt_channel);
+			return -1;
+		}
+
+		shared_ch->io_outstanding = 0;
+		TAILQ_INIT(&shared_ch->nomem_io);
+		shared_ch->nomem_threshold = 0;
+		shared_ch->module_ch = ch->channel;
+		shared_ch->ref = 1;
+		TAILQ_INSERT_TAIL(&mgmt_ch->module_channels, shared_ch, link);
+	}
+
 	memset(&ch->stat, 0, sizeof(ch->stat));
-	ch->io_outstanding = 0;
 	TAILQ_INIT(&ch->queued_resets);
-	TAILQ_INIT(&ch->nomem_io);
-	ch->nomem_threshold = 0;
 	ch->flags = 0;
+	ch->module_ch = shared_ch;
 
 #ifdef SPDK_CONFIG_VTUNE
 	{
@@ -935,7 +981,7 @@ _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
 			 * that spdk_bdev_io_complete() will do.
 			 */
 			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
-				ch->io_outstanding++;
+				ch->module_ch->io_outstanding++;
 			}
 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 		}
@@ -947,17 +993,24 @@ spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
 {
 	struct spdk_bdev_channel *ch = ctx_buf;
 	struct spdk_bdev_mgmt_channel *mgmt_channel;
+	struct spdk_bdev_module_channel *shared_ch = ch->module_ch;
 
 	mgmt_channel = spdk_io_channel_get_ctx(ch->mgmt_channel);
 
 	_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
-	_spdk_bdev_abort_queued_io(&ch->nomem_io, ch);
+	_spdk_bdev_abort_queued_io(&shared_ch->nomem_io, ch);
 	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, ch);
 	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, ch);
 
+	assert(shared_ch->ref > 0);
+	shared_ch->ref--;
+	if (shared_ch->ref == 0) {
+		assert(shared_ch->io_outstanding == 0);
+		TAILQ_REMOVE(&mgmt_channel->module_channels, shared_ch, link);
+		free(shared_ch);
+	}
+
 	spdk_put_io_channel(ch->channel);
 	spdk_put_io_channel(ch->mgmt_channel);
-	assert(ch->io_outstanding == 0);
 }
 
 int
@@ -1502,14 +1555,16 @@ _spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
 	struct spdk_io_channel *ch;
 	struct spdk_bdev_channel *channel;
 	struct spdk_bdev_mgmt_channel *mgmt_channel;
+	struct spdk_bdev_module_channel *shared_ch;
 
 	ch = spdk_io_channel_iter_get_channel(i);
 	channel = spdk_io_channel_get_ctx(ch);
 	mgmt_channel = spdk_io_channel_get_ctx(channel->mgmt_channel);
+	shared_ch = channel->module_ch;
 
 	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
 
-	_spdk_bdev_abort_queued_io(&channel->nomem_io, channel);
+	_spdk_bdev_abort_queued_io(&shared_ch->nomem_io, channel);
 	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
 	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);
 
@@ -1724,9 +1779,10 @@ static void
 _spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
 {
 	struct spdk_bdev *bdev = bdev_ch->bdev;
+	struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;
 	struct spdk_bdev_io *bdev_io;
 
-	if (bdev_ch->io_outstanding > bdev_ch->nomem_threshold) {
+	if (shared_ch->io_outstanding > shared_ch->nomem_threshold) {
 		/*
 		 * Allow some more I/O to complete before retrying the nomem_io queue.
 		 * Some drivers (such as nvme) cannot immediately take a new I/O in
@@ -1738,12 +1794,12 @@ _spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
 		return;
 	}
 
-	while (!TAILQ_EMPTY(&bdev_ch->nomem_io)) {
-		bdev_io = TAILQ_FIRST(&bdev_ch->nomem_io);
-		TAILQ_REMOVE(&bdev_ch->nomem_io, bdev_io, link);
-		bdev_ch->io_outstanding++;
+	while (!TAILQ_EMPTY(&shared_ch->nomem_io)) {
+		bdev_io = TAILQ_FIRST(&shared_ch->nomem_io);
+		TAILQ_REMOVE(&shared_ch->nomem_io, bdev_io, link);
+		shared_ch->io_outstanding++;
 		bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
-		bdev->fn_table->submit_request(bdev_ch->channel, bdev_io);
+		bdev->fn_table->submit_request(bdev_io->ch->channel, bdev_io);
 		if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) {
 			break;
 		}
@@ -1791,6 +1847,7 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status sta
 {
 	struct spdk_bdev *bdev = bdev_io->bdev;
 	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
+	struct spdk_bdev_module_channel *shared_ch = bdev_ch->module_ch;
 
 	bdev_io->status = status;
 
@@ -1813,22 +1870,22 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status sta
 			return;
 		}
 	} else {
-		assert(bdev_ch->io_outstanding > 0);
-		bdev_ch->io_outstanding--;
+		assert(shared_ch->io_outstanding > 0);
+		shared_ch->io_outstanding--;
 		if (spdk_likely(status != SPDK_BDEV_IO_STATUS_NOMEM)) {
-			if (spdk_unlikely(!TAILQ_EMPTY(&bdev_ch->nomem_io))) {
+			if (spdk_unlikely(!TAILQ_EMPTY(&shared_ch->nomem_io))) {
 				_spdk_bdev_ch_retry_io(bdev_ch);
 			}
 		} else {
-			TAILQ_INSERT_HEAD(&bdev_ch->nomem_io, bdev_io, link);
+			TAILQ_INSERT_HEAD(&shared_ch->nomem_io, bdev_io, link);
 			/*
 			 * Wait for some of the outstanding I/O to complete before we
 			 * retry any of the nomem_io. Normally we will wait for
 			 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
 			 * depth channels we will instead wait for half to complete.
 			 */
-			bdev_ch->nomem_threshold = spdk_max((int64_t)bdev_ch->io_outstanding / 2,
-							    (int64_t)bdev_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
+			shared_ch->nomem_threshold = spdk_max((int64_t)shared_ch->io_outstanding / 2,
+							      (int64_t)shared_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
 			return;
 		}
 	}
diff --git a/test/unit/lib/bdev/mt/bdev.c/bdev_ut.c b/test/unit/lib/bdev/mt/bdev.c/bdev_ut.c
index f0c267976..8f4b04dd1 100644
--- a/test/unit/lib/bdev/mt/bdev.c/bdev_ut.c
+++ b/test/unit/lib/bdev/mt/bdev.c/bdev_ut.c
@@ -486,6 +486,7 @@ enomem(void)
 {
 	struct spdk_io_channel *io_ch;
 	struct spdk_bdev_channel *bdev_ch;
+	struct spdk_bdev_module_channel *module_ch;
 	struct ut_bdev_channel *ut_ch;
 	const uint32_t IO_ARRAY_SIZE = 64;
 	const uint32_t AVAIL = 20;
@@ -499,6 +500,7 @@ enomem(void)
 	set_thread(0);
 	io_ch = spdk_bdev_get_io_channel(g_desc);
 	bdev_ch = spdk_io_channel_get_ctx(io_ch);
+	module_ch = bdev_ch->module_ch;
 	ut_ch = spdk_io_channel_get_ctx(bdev_ch->channel);
 	ut_ch->avail_cnt = AVAIL;
 
@@ -508,7 +510,7 @@ enomem(void)
 		rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[i]);
 		CU_ASSERT(rc == 0);
 	}
-	CU_ASSERT(TAILQ_EMPTY(&bdev_ch->nomem_io));
+	CU_ASSERT(TAILQ_EMPTY(&module_ch->nomem_io));
 
 	/*
 	 * Next, submit one additional I/O. This one should fail with ENOMEM and then go onto
@@ -517,8 +519,8 @@ enomem(void)
 	status[AVAIL] = SPDK_BDEV_IO_STATUS_PENDING;
 	rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[AVAIL]);
 	CU_ASSERT(rc == 0);
-	SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&bdev_ch->nomem_io));
-	first_io = TAILQ_FIRST(&bdev_ch->nomem_io);
+	SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&module_ch->nomem_io));
+	first_io = TAILQ_FIRST(&module_ch->nomem_io);
 
 	/*
 	 * Now submit a bunch more I/O. These should all fail with ENOMEM and get queued behind
@@ -531,10 +533,10 @@ enomem(void)
 	}
 
 	/* Assert that first_io is still at the head of the list. */
-	CU_ASSERT(TAILQ_FIRST(&bdev_ch->nomem_io) == first_io);
-	CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == (IO_ARRAY_SIZE - AVAIL));
-	nomem_cnt = bdev_io_tailq_cnt(&bdev_ch->nomem_io);
-	CU_ASSERT(bdev_ch->nomem_threshold == (AVAIL - NOMEM_THRESHOLD_COUNT));
+	CU_ASSERT(TAILQ_FIRST(&module_ch->nomem_io) == first_io);
+	CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == (IO_ARRAY_SIZE - AVAIL));
+	nomem_cnt = bdev_io_tailq_cnt(&module_ch->nomem_io);
+	CU_ASSERT(module_ch->nomem_threshold == (AVAIL - NOMEM_THRESHOLD_COUNT));
 
 	/*
 	 * Complete 1 I/O only. The key check here is bdev_io_tailq_cnt - this should not have
@@ -542,19 +544,19 @@ enomem(void)
 	 * list.
 	 */
 	stub_complete_io(g_bdev.io_target, 1);
-	CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == nomem_cnt);
+	CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == nomem_cnt);
 
 	/*
 	 * Complete enough I/O to hit the nomem_threshold. This should trigger retrying nomem_io,
 	 * and we should see I/O get resubmitted to the test bdev module.
 	 */
	stub_complete_io(g_bdev.io_target, NOMEM_THRESHOLD_COUNT - 1);
-	CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) < nomem_cnt);
-	nomem_cnt = bdev_io_tailq_cnt(&bdev_ch->nomem_io);
+	CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) < nomem_cnt);
+	nomem_cnt = bdev_io_tailq_cnt(&module_ch->nomem_io);
 
 	/* Complete 1 I/O only. This should not trigger retrying the queued nomem_io. */
 	stub_complete_io(g_bdev.io_target, 1);
-	CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == nomem_cnt);
+	CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == nomem_cnt);
 
 	/*
 	 * Send a reset and confirm that all I/O are completed, including the ones that
@@ -567,14 +569,86 @@ enomem(void)
 
 	/* This will complete the reset. */
 	stub_complete_io(g_bdev.io_target, 0);
-	CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == 0);
-	CU_ASSERT(bdev_ch->io_outstanding == 0);
+	CU_ASSERT(bdev_io_tailq_cnt(&module_ch->nomem_io) == 0);
+	CU_ASSERT(module_ch->io_outstanding == 0);
 
 	spdk_put_io_channel(io_ch);
 	poll_threads();
 	teardown_test();
 }
 
+static void
+enomem_multi_bdev(void)
+{
+	struct spdk_io_channel *io_ch;
+	struct spdk_bdev_channel *bdev_ch;
+	struct spdk_bdev_module_channel *module_ch;
+	struct ut_bdev_channel *ut_ch;
+	const uint32_t IO_ARRAY_SIZE = 64;
+	const uint32_t AVAIL = 20;
+	enum spdk_bdev_io_status status[IO_ARRAY_SIZE];
+	uint32_t i;
+	struct ut_bdev *second_bdev;
+	struct spdk_bdev_desc *second_desc;
+	struct spdk_bdev_channel *second_bdev_ch;
+	struct spdk_io_channel *second_ch;
+	int rc;
+
+	setup_test();
+
+	/* Register second bdev with the same io_target */
+	second_bdev = calloc(1, sizeof(*second_bdev));
+	SPDK_CU_ASSERT_FATAL(second_bdev != NULL);
+	register_bdev(second_bdev, "ut_bdev2", g_bdev.io_target);
+	spdk_bdev_open(&second_bdev->bdev, true, NULL, NULL, &second_desc);
+
+	set_thread(0);
+	io_ch = spdk_bdev_get_io_channel(g_desc);
+	bdev_ch = spdk_io_channel_get_ctx(io_ch);
+	module_ch = bdev_ch->module_ch;
+	ut_ch = spdk_io_channel_get_ctx(bdev_ch->channel);
+	ut_ch->avail_cnt = AVAIL;
+
+	second_ch = spdk_bdev_get_io_channel(second_desc);
+	second_bdev_ch = spdk_io_channel_get_ctx(second_ch);
+	SPDK_CU_ASSERT_FATAL(module_ch == second_bdev_ch->module_ch);
+
+	/* Saturate io_target through the first bdev. */
+	for (i = 0; i < AVAIL; i++) {
+		status[i] = SPDK_BDEV_IO_STATUS_PENDING;
+		rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[i]);
+		CU_ASSERT(rc == 0);
+	}
+	CU_ASSERT(TAILQ_EMPTY(&module_ch->nomem_io));
+
+	/*
+	 * Now submit I/O through the second bdev. This should fail with ENOMEM
+	 * and then go onto the nomem_io list.
+	 */
+	status[AVAIL] = SPDK_BDEV_IO_STATUS_PENDING;
+	rc = spdk_bdev_read_blocks(second_desc, second_ch, NULL, 0, 1, enomem_done, &status[AVAIL]);
+	CU_ASSERT(rc == 0);
+	SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&module_ch->nomem_io));
+
+	/* Completing the first bdev's I/O should retry the second bdev's nomem_io. */
+	stub_complete_io(g_bdev.io_target, AVAIL);
+
+	SPDK_CU_ASSERT_FATAL(TAILQ_EMPTY(&module_ch->nomem_io));
+	CU_ASSERT(module_ch->io_outstanding == 1);
+
+	/* Now complete our retried I/O. */
+	stub_complete_io(g_bdev.io_target, 1);
+	SPDK_CU_ASSERT_FATAL(module_ch->io_outstanding == 0);
+
+	spdk_put_io_channel(io_ch);
+	spdk_put_io_channel(second_ch);
+	spdk_bdev_close(second_desc);
+	unregister_bdev(second_bdev);
+	free(second_bdev);
+	poll_threads();
+	teardown_test();
+}
+
 int
 main(int argc, char **argv)
 {
@@ -596,7 +670,8 @@ main(int argc, char **argv)
 		CU_add_test(suite, "put_channel_during_reset", put_channel_during_reset) == NULL ||
 		CU_add_test(suite, "aborted_reset", aborted_reset) == NULL ||
 		CU_add_test(suite, "io_during_reset", io_during_reset) == NULL ||
-		CU_add_test(suite, "enomem", enomem) == NULL
+		CU_add_test(suite, "enomem", enomem) == NULL ||
+		CU_add_test(suite, "enomem_multi_bdev", enomem_multi_bdev) == NULL
 	) {
 		CU_cleanup_registry();
 		return CU_get_error();
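
The core of this patch is the find-or-create lookup in spdk_bdev_channel_create(): the shared structure is keyed on the spdk_io_channel pointer returned by the module's get_io_channel() callback, so every bdev channel created on the same thread on top of the same io_device resolves to the same spdk_bdev_module_channel, and the last spdk_bdev_channel_destroy() tears it down. Below is a minimal standalone sketch of that refcounted find-or-create pattern, compilable on its own; the names here (shared_res, res_list, shared_res_get, shared_res_put) are hypothetical stand-ins for illustration, not SPDK APIs.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>

struct shared_res {
	void			*key;	/* stands in for the module's spdk_io_channel pointer */
	unsigned int		ref;	/* one reference per bdev channel using this entry */
	TAILQ_ENTRY(shared_res)	link;
};

static TAILQ_HEAD(, shared_res) res_list = TAILQ_HEAD_INITIALIZER(res_list);

/* Find the entry matching key and bump its refcount, or create a new one. */
static struct shared_res *
shared_res_get(void *key)
{
	struct shared_res *res;

	TAILQ_FOREACH(res, &res_list, link) {
		if (res->key == key) {
			res->ref++;
			return res;
		}
	}

	/* Not found: the first user of this key allocates the shared state. */
	res = calloc(1, sizeof(*res));
	if (res == NULL) {
		return NULL;
	}
	res->key = key;
	res->ref = 1;
	TAILQ_INSERT_TAIL(&res_list, res, link);
	return res;
}

/* Drop one reference; the last user unlinks and frees the entry. */
static void
shared_res_put(struct shared_res *res)
{
	assert(res->ref > 0);
	if (--res->ref == 0) {
		TAILQ_REMOVE(&res_list, res, link);
		free(res);
	}
}

int
main(void)
{
	int key;	/* any stable address serves as a key */
	struct shared_res *a = shared_res_get(&key);
	struct shared_res *b = shared_res_get(&key);

	assert(a != NULL && a == b && a->ref == 2);	/* second lookup shares the first entry */
	shared_res_put(b);
	shared_res_put(a);
	printf("list empty after last put: %s\n", TAILQ_EMPTY(&res_list) ? "yes" : "no");
	return 0;
}

Keying the shared entry on the module channel pointer rather than on the bdev is what yields a single nomem_io queue and a single io_outstanding count per io_device per thread; that is exactly the property the enomem_multi_bdev test pins down with SPDK_CU_ASSERT_FATAL(module_ch == second_bdev_ch->module_ch).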