bdev: add ENOMEM handling

At very high queue depths, bdev modules may not have enough
internal resources to track all of the incoming I/O.  For example,
we allocate a finite number of nvme_request objects per allocated
queue pair.  Currently, if these resources are exhausted, the
bdev module returns a generic failure (with no indication why),
which gets propagated all the way back to the application.

So instead, add SPDK_BDEV_IO_STATUS_NOMEM to allow bdev modules
to indicate this type of failure.  Also add handling for this
status in the generic bdev layer: queue these I/O and retry
them after other I/O on the failing channel have completed.
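
For bdev module authors, the pattern is to translate a transient
-ENOMEM from the submit path into the new status and let the
generic layer requeue the I/O.  A minimal sketch, assuming a
hypothetical module whose _example_submit() helper returns
-ENOMEM when its per-channel requests run out (the shape mirrors
the nvme and aio hunks below):

static void
example_bdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
    int rc = _example_submit(ch, bdev_io);

    if (spdk_unlikely(rc != 0)) {
        if (rc == -ENOMEM) {
            /* The generic bdev layer queues this I/O on the channel's
             * nomem_io list and resubmits it after other I/O on the
             * same channel complete.
             */
            spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
        } else {
            spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
        }
    }
}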

This does place an expectation on the bdev module that these
internal resources are allocated per io_channel.  Otherwise we
cannot guarantee forward progress solely on reception of
completions.  For example, without this guarantee, a bdev
module could theoretically return ENOMEM even if there were
no I/O outstanding for that io_channel.  The nvme, aio, rbd,
virtio and null drivers already comply with this expectation;
malloc complies only when not using copy offload.
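
One way to honor this expectation is to carve the request pool
out of the io_channel itself, so an empty pool always implies
I/O in flight on that same channel whose completions will
replenish it.  A hedged sketch of that layout (all names here
are illustrative, not part of this patch):

struct example_request {
    TAILQ_ENTRY(example_request)    link;
    /* ... per-request state ... */
};

struct example_io_channel {
    /* Fixed pool of requests, filled when the io_channel is created. */
    TAILQ_HEAD(, example_request)   free_requests;
};

static int
_example_submit(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
    struct example_io_channel *ex_ch = spdk_io_channel_get_ctx(ch);
    struct example_request *req = TAILQ_FIRST(&ex_ch->free_requests);

    if (req == NULL) {
        /* The pool is empty, so this channel necessarily has I/O in
         * flight; their completions return requests to the pool and
         * guarantee forward progress for the retry queue.
         */
        return -ENOMEM;
    }
    TAILQ_REMOVE(&ex_ch->free_requests, req, link);
    /* ... fill out req and start the I/O ... */
    return 0;
}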

This patch also fixes malloc with the copy engine to at least
return ENOMEM when no copy descriptors are available.  If the
condition above occurs, I/O waiting for resources will be
failed as part of a subsequent reset, which matches the
behavior it has today.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: Iea7cd51a611af8abe882794d0b2361fdbb74e84e

Reviewed-on: https://review.gerrithub.io/378853
Tested-by: SPDK Automated Test System <sys_sgsw@intel.com>
Reviewed-by: Daniel Verkamp <daniel.verkamp@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>

@@ -161,6 +161,13 @@ struct spdk_bdev_fn_table {
 /** bdev I/O completion status */
 enum spdk_bdev_io_status {
+    /*
+     * NOMEM should be returned when a bdev module cannot start an I/O because of
+     * some lack of resources.  It may not be returned for RESET I/O.  I/O completed
+     * with NOMEM status will be retried after some I/O from the same channel have
+     * completed.
+     */
+    SPDK_BDEV_IO_STATUS_NOMEM = -4,
     SPDK_BDEV_IO_STATUS_SCSI_ERROR = -3,
     SPDK_BDEV_IO_STATUS_NVME_ERROR = -2,
     SPDK_BDEV_IO_STATUS_FAILED = -1,

@@ -120,7 +120,11 @@ bdev_aio_readv(struct file_disk *fdisk, struct spdk_io_channel *ch,
     rc = io_submit(aio_ch->io_ctx, 1, &iocb);
     if (rc < 0) {
-        spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
+        if (rc == -EAGAIN) {
+            spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
+        } else {
+            spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
+        }
         SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
         return -1;
     }
@@ -146,7 +150,11 @@ bdev_aio_writev(struct file_disk *fdisk, struct spdk_io_channel *ch,
     rc = io_submit(aio_ch->io_ctx, 1, &iocb);
     if (rc < 0) {
-        spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
+        if (rc == -EAGAIN) {
+            spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
+        } else {
+            spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
+        }
         SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
         return -1;
     }

@@ -57,6 +57,7 @@ int __itt_init_ittlib(const char *, __itt_group_id);
 #define SPDK_BDEV_IO_POOL_SIZE    (64 * 1024)
 #define BUF_SMALL_POOL_SIZE       8192
 #define BUF_LARGE_POOL_SIZE       1024
+#define NOMEM_THRESHOLD_COUNT     8
 
 typedef TAILQ_HEAD(, spdk_bdev_io) bdev_io_tailq_t;
@@ -128,6 +129,17 @@ struct spdk_bdev_channel {
     bdev_io_tailq_t    queued_resets;
 
+    /*
+     * Queue of IO awaiting retry because of a previous NOMEM status returned
+     * on this channel.
+     */
+    bdev_io_tailq_t    nomem_io;
+
+    /*
+     * Threshold which io_outstanding must drop to before retrying nomem_io.
+     */
+    uint64_t           nomem_threshold;
+
     uint32_t           flags;
 
 #ifdef SPDK_CONFIG_VTUNE
@@ -612,7 +624,12 @@ spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
     bdev_ch->io_outstanding++;
     bdev_io->in_submit_request = true;
     if (spdk_likely(bdev_ch->flags == 0)) {
-        bdev->fn_table->submit_request(ch, bdev_io);
+        if (spdk_likely(TAILQ_EMPTY(&bdev_ch->nomem_io))) {
+            bdev->fn_table->submit_request(ch, bdev_io);
+        } else {
+            bdev_ch->io_outstanding--;
+            TAILQ_INSERT_TAIL(&bdev_ch->nomem_io, bdev_io, link);
+        }
     } else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
         spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
     } else {
@@ -676,6 +693,8 @@ spdk_bdev_channel_create(void *io_device, void *ctx_buf)
     memset(&ch->stat, 0, sizeof(ch->stat));
     ch->io_outstanding = 0;
     TAILQ_INIT(&ch->queued_resets);
+    TAILQ_INIT(&ch->nomem_io);
+    ch->nomem_threshold = 0;
     ch->flags = 0;
 
 #ifdef SPDK_CONFIG_VTUNE
@@ -725,6 +744,15 @@ _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
     TAILQ_FOREACH_SAFE(bdev_io, queue, link, tmp) {
         if (bdev_io->ch == ch) {
             TAILQ_REMOVE(queue, bdev_io, link);
+            /*
+             * spdk_bdev_io_complete() assumes that the completed I/O had
+             * been submitted to the bdev module.  Since in this case it
+             * hadn't, bump io_outstanding to account for the decrement
+             * that spdk_bdev_io_complete() will do.
+             */
+            if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
+                ch->io_outstanding++;
+            }
             spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
         }
     }
@@ -739,6 +767,7 @@ spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
     mgmt_channel = spdk_io_channel_get_ctx(ch->mgmt_channel);
 
     _spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
+    _spdk_bdev_abort_queued_io(&ch->nomem_io, ch);
     _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, ch);
     _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, ch);
@@ -1201,6 +1230,7 @@ _spdk_bdev_reset_abort_channel(void *io_device, struct spdk_io_channel *ch,
     channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
 
+    _spdk_bdev_abort_queued_io(&channel->nomem_io, channel);
     _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
     _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);
 }
@@ -1378,6 +1408,36 @@ spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
     return 0;
 }
 
+static void
+_spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
+{
+    struct spdk_bdev *bdev = bdev_ch->bdev;
+    struct spdk_bdev_io *bdev_io;
+
+    if (bdev_ch->io_outstanding > bdev_ch->nomem_threshold) {
+        /*
+         * Allow some more I/O to complete before retrying the nomem_io queue.
+         * Some drivers (such as nvme) cannot immediately take a new I/O in
+         * the context of a completion, because the resources for the I/O are
+         * not released until control returns to the bdev poller.  Also, we
+         * may require several small I/O to complete before a larger I/O
+         * (that requires splitting) can be submitted.
+         */
+        return;
+    }
+
+    while (!TAILQ_EMPTY(&bdev_ch->nomem_io)) {
+        bdev_io = TAILQ_FIRST(&bdev_ch->nomem_io);
+        TAILQ_REMOVE(&bdev_ch->nomem_io, bdev_io, link);
+        bdev_ch->io_outstanding++;
+        bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
+        bdev->fn_table->submit_request(bdev_ch->channel, bdev_io);
+        if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) {
+            break;
+        }
+    }
+}
+
 static void
 _spdk_bdev_io_complete(void *ctx)
 {
@@ -1396,6 +1456,9 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
     bdev_io->status = status;
 
     if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
+        if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
+            SPDK_ERRLOG("NOMEM returned for reset\n");
+        }
         pthread_mutex_lock(&bdev->mutex);
         if (bdev_io == bdev->reset_in_progress) {
             bdev->reset_in_progress = NULL;
@@ -1408,6 +1471,22 @@ spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
     } else {
         assert(bdev_ch->io_outstanding > 0);
         bdev_ch->io_outstanding--;
+        if (spdk_likely(status != SPDK_BDEV_IO_STATUS_NOMEM)) {
+            if (spdk_unlikely(!TAILQ_EMPTY(&bdev_ch->nomem_io))) {
+                _spdk_bdev_ch_retry_io(bdev_ch);
+            }
+        } else {
+            TAILQ_INSERT_HEAD(&bdev_ch->nomem_io, bdev_io, link);
+            /*
+             * Wait for some of the outstanding I/O to complete before we
+             * retry any of the nomem_io.  Normally we will wait for
+             * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
+             * depth channels we will instead wait for half to complete.
+             */
+            bdev_ch->nomem_threshold = spdk_max((int64_t)bdev_ch->io_outstanding / 2,
+                                                (int64_t)bdev_ch->io_outstanding - NOMEM_THRESHOLD_COUNT);
+            return;
+        }
     }
 
     if (status == SPDK_BDEV_IO_STATUS_SUCCESS) {
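
To make the retry threshold computed above concrete: if an I/O
completes with NOMEM while 20 other I/O remain outstanding on the
channel, nomem_threshold becomes spdk_max(20 / 2, 20 -
NOMEM_THRESHOLD_COUNT) = spdk_max(10, 12) = 12, so the queued I/O
are not retried until 8 of those completions arrive.  On a shallow
channel with only 4 outstanding, the threshold is spdk_max(2, -4)
= 2, i.e. half must complete first (the int64_t casts keep the
subtraction from wrapping).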

@@ -75,7 +75,11 @@ malloc_done(void *ref, int status)
     struct malloc_task *task = __malloc_task_from_copy_task(ref);
 
     if (status != 0) {
-        task->status = SPDK_BDEV_IO_STATUS_FAILED;
+        if (status == -ENOMEM) {
+            task->status = SPDK_BDEV_IO_STATUS_NOMEM;
+        } else {
+            task->status = SPDK_BDEV_IO_STATUS_FAILED;
+        }
     }
 
     if (--task->num_outstanding == 0) {

@@ -347,7 +347,11 @@ bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
                  bdev_io->u.bdev.num_blocks,
                  bdev_io->u.bdev.offset_blocks);
 
-    if (ret != 0) {
+    if (spdk_likely(ret == 0)) {
+        return;
+    } else if (ret == -ENOMEM) {
+        spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+    } else {
         spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
     }
 }
@@ -428,7 +432,11 @@ bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
     int rc = _bdev_nvme_submit_request(ch, bdev_io);
 
     if (spdk_unlikely(rc != 0)) {
-        spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+        if (rc == -ENOMEM) {
+            spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+        } else {
+            spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+        }
     }
 }
@@ -1256,7 +1264,7 @@ bdev_nvme_queue_cmd(struct nvme_bdev *bdev, struct spdk_nvme_qpair *qpair,
                    bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge);
     }
 
-    if (rc != 0) {
+    if (rc != 0 && rc != -ENOMEM) {
         SPDK_ERRLOG("%s failed: rc = %d\n", direction == BDEV_DISK_READ ? "readv" : "writev", rc);
     }
     return rc;

@@ -99,12 +99,24 @@ stub_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io)
 {
     struct ut_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
 
+    if (bdev_io->type == SPDK_BDEV_IO_TYPE_RESET) {
+        struct spdk_bdev_io *io;
+
+        while (!TAILQ_EMPTY(&ch->outstanding_io)) {
+            io = TAILQ_FIRST(&ch->outstanding_io);
+            TAILQ_REMOVE(&ch->outstanding_io, io, module_link);
+            ch->outstanding_cnt--;
+            spdk_bdev_io_complete(io, SPDK_BDEV_IO_STATUS_FAILED);
+            ch->avail_cnt++;
+        }
+    }
+
     if (ch->avail_cnt > 0) {
         TAILQ_INSERT_TAIL(&ch->outstanding_io, bdev_io, module_link);
         ch->outstanding_cnt++;
         ch->avail_cnt--;
     } else {
-        spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+        spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
     }
 }
@@ -413,6 +425,122 @@ io_during_reset(void)
     teardown_test();
 }
 
+static void
+enomem_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+    enum spdk_bdev_io_status *status = cb_arg;
+
+    *status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
+    spdk_bdev_free_io(bdev_io);
+}
+
+static uint32_t
+bdev_io_tailq_cnt(bdev_io_tailq_t *tailq)
+{
+    struct spdk_bdev_io *io;
+    uint32_t cnt = 0;
+
+    TAILQ_FOREACH(io, tailq, link) {
+        cnt++;
+    }
+
+    return cnt;
+}
+
+static void
+enomem(void)
+{
+    struct spdk_io_channel *io_ch;
+    struct spdk_bdev_channel *bdev_ch;
+    struct ut_bdev_channel *ut_ch;
+    const uint32_t IO_ARRAY_SIZE = 64;
+    const uint32_t AVAIL = 20;
+    enum spdk_bdev_io_status status[IO_ARRAY_SIZE], status_reset;
+    uint32_t nomem_cnt, i;
+    struct spdk_bdev_io *first_io;
+    int rc;
+
+    setup_test();
+
+    set_thread(0);
+    io_ch = spdk_bdev_get_io_channel(g_desc);
+    bdev_ch = spdk_io_channel_get_ctx(io_ch);
+    ut_ch = spdk_io_channel_get_ctx(bdev_ch->channel);
+    ut_ch->avail_cnt = AVAIL;
+
+    /* First submit a number of IOs equal to what the channel can support. */
+    for (i = 0; i < AVAIL; i++) {
+        status[i] = SPDK_BDEV_IO_STATUS_PENDING;
+        rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[i]);
+        CU_ASSERT(rc == 0);
+    }
+    CU_ASSERT(TAILQ_EMPTY(&bdev_ch->nomem_io));
+
+    /*
+     * Next, submit one additional I/O.  This one should fail with ENOMEM and then go onto
+     * the nomem_io list.
+     */
+    status[AVAIL] = SPDK_BDEV_IO_STATUS_PENDING;
+    rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[AVAIL]);
+    CU_ASSERT(rc == 0);
+    SPDK_CU_ASSERT_FATAL(!TAILQ_EMPTY(&bdev_ch->nomem_io));
+    first_io = TAILQ_FIRST(&bdev_ch->nomem_io);
+
+    /*
+     * Now submit a bunch more I/O.  These should all fail with ENOMEM and get queued behind
+     * the first_io above.
+     */
+    for (i = AVAIL + 1; i < IO_ARRAY_SIZE; i++) {
+        status[i] = SPDK_BDEV_IO_STATUS_PENDING;
+        rc = spdk_bdev_read_blocks(g_desc, io_ch, NULL, 0, 1, enomem_done, &status[i]);
+        CU_ASSERT(rc == 0);
+    }
+
+    /* Assert that first_io is still at the head of the list. */
+    CU_ASSERT(TAILQ_FIRST(&bdev_ch->nomem_io) == first_io);
+    CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == (IO_ARRAY_SIZE - AVAIL));
+    nomem_cnt = bdev_io_tailq_cnt(&bdev_ch->nomem_io);
+    CU_ASSERT(bdev_ch->nomem_threshold == (AVAIL - NOMEM_THRESHOLD_COUNT));
+
+    /*
+     * Complete 1 I/O only.  The key check here is bdev_io_tailq_cnt - this should not have
+     * changed since completing just 1 I/O should not trigger retrying the queued nomem_io
+     * list.
+     */
+    stub_complete_io(1);
+    CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == nomem_cnt);
+
+    /*
+     * Complete enough I/O to hit the nomem_threshold.  This should trigger retrying nomem_io,
+     * and we should see I/O get resubmitted to the test bdev module.
+     */
+    stub_complete_io(NOMEM_THRESHOLD_COUNT - 1);
+    CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) < nomem_cnt);
+    nomem_cnt = bdev_io_tailq_cnt(&bdev_ch->nomem_io);
+
+    /* Complete 1 I/O only.  This should not trigger retrying the queued nomem_io. */
+    stub_complete_io(1);
+    CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == nomem_cnt);
+
+    /*
+     * Send a reset and confirm that all I/O are completed, including the ones that
+     * were queued on the nomem_io list.
+     */
+    status_reset = SPDK_BDEV_IO_STATUS_PENDING;
+    rc = spdk_bdev_reset(g_desc, io_ch, enomem_done, &status_reset);
+    poll_threads();
+    CU_ASSERT(rc == 0);
+    /* This will complete the reset. */
+    stub_complete_io(0);
+
+    CU_ASSERT(bdev_io_tailq_cnt(&bdev_ch->nomem_io) == 0);
+    CU_ASSERT(bdev_ch->io_outstanding == 0);
+
+    spdk_put_io_channel(io_ch);
+    poll_threads();
+    teardown_test();
+}
+
 int
 main(int argc, char **argv)
 {
@@ -433,7 +561,8 @@ main(int argc, char **argv)
         CU_add_test(suite, "basic", basic) == NULL ||
         CU_add_test(suite, "put_channel_during_reset", put_channel_during_reset) == NULL ||
         CU_add_test(suite, "aborted_reset", aborted_reset) == NULL ||
-        CU_add_test(suite, "io_during_reset", io_during_reset) == NULL
+        CU_add_test(suite, "io_during_reset", io_during_reset) == NULL ||
+        CU_add_test(suite, "enomem", enomem) == NULL
     ) {
         CU_cleanup_registry();
         return CU_get_error();