diff --git a/lib/bdev/raid/bdev_raid.c b/lib/bdev/raid/bdev_raid.c
index cbae9d0a4..c77796a13 100644
--- a/lib/bdev/raid/bdev_raid.c
+++ b/lib/bdev/raid/bdev_raid.c
@@ -327,75 +327,58 @@ raid_bdev_submit_children(struct spdk_bdev_io *bdev_io,
 	uint64_t		pd_lba;
 	uint64_t		pd_blocks;
 	uint32_t		pd_idx;
-	int			ret;
+	int			ret = 0;
 
-	for (uint64_t strip = cur_strip; strip <= end_strip; strip++) {
+	if (start_strip != end_strip) {
+		SPDK_ERRLOG("I/O spans strip boundary\n");
+		assert(false);
+	}
+
+	pd_strip = start_strip / raid_bdev->num_base_bdevs;
+	pd_idx = start_strip % raid_bdev->num_base_bdevs;
+	offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
+	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
+	pd_blocks = bdev_io->u.bdev.num_blocks;
+	raid_io->splits_comp_outstanding++;
+	assert(raid_io->splits_pending);
+	raid_io->splits_pending--;
+	if (raid_bdev->base_bdev_info[pd_idx].desc == NULL) {
+		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
+		assert(0);
+	}
+
+	/*
+	 * Submit the child io to the bdev layer using the base bdev descriptor,
+	 * base bdev lba, child io length in blocks, buffer, completion function
+	 * and callback context
+	 */
+	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
+		ret = spdk_bdev_read_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
+					    raid_ch->base_channel[pd_idx],
+					    buf, pd_lba, pd_blocks, raid_bdev_io_completion,
+					    bdev_io);
+	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
+		ret = spdk_bdev_write_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
+					     raid_ch->base_channel[pd_idx],
+					     buf, pd_lba, pd_blocks, raid_bdev_io_completion,
+					     bdev_io);
+	} else {
+		SPDK_ERRLOG("Received unsupported io type %u\n", bdev_io->type);
+		assert(0);
+	}
+	if (ret != 0) {
 		/*
-		 * For each strip of parent bdev io, process for each strip and submit
-		 * child io to bdev layer. Calculate base bdev level start lba, length
-		 * and buffer for this child io
+		 * If a child io fails to submit to the bdev layer, queue the parent
+		 * bdev io, along with its current split state, in this core's wait
+		 * queue; submission will resume from this point. For example, if 4
+		 * splits are required and 2 children have been submitted, the parent
+		 * io is queued to this core's io waitq and, once resumed, it will
+		 * try to submit the remaining 3rd and 4th children
 		 */
-		pd_strip = strip / raid_bdev->num_base_bdevs;
-		pd_idx = strip % raid_bdev->num_base_bdevs;
-		if (strip == start_strip) {
-			offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
-			pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
-			if (strip == end_strip) {
-				pd_blocks = bdev_io->u.bdev.num_blocks;
-			} else {
-				pd_blocks = raid_bdev->strip_size - offset_in_strip;
-			}
-		} else if (strip == end_strip) {
-			pd_lba = pd_strip << raid_bdev->strip_size_shift;
-			pd_blocks = ((bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) &
-				     (raid_bdev->strip_size - 1)) + 1;
-		} else {
-			pd_lba = pd_strip << raid_bdev->strip_size_shift;
-			pd_blocks = raid_bdev->strip_size;
-		}
-		raid_io->splits_comp_outstanding++;
-		assert(raid_io->splits_pending);
-		raid_io->splits_pending--;
-		if (raid_bdev->base_bdev_info[pd_idx].desc == NULL) {
-			SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
-			assert(0);
-		}
-
-		/*
-		 * Submit child io to bdev layer with using base bdev descriptors, base
-		 * bdev lba, base bdev child io length in blocks, buffer, completion
-		 * function and function callback context
-		 */
-		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
-			ret = spdk_bdev_read_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
-						    raid_ch->base_channel[pd_idx],
-						    buf, pd_lba, pd_blocks, raid_bdev_io_completion,
-						    bdev_io);
-
-		} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
-			ret = spdk_bdev_write_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
-						     raid_ch->base_channel[pd_idx],
-						     buf, pd_lba, pd_blocks, raid_bdev_io_completion,
-						     bdev_io);
-		} else {
-			SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type);
-			assert(0);
-		}
-		if (ret != 0) {
-			/*
-			 * If failed to submit child io to bdev layer then queue the parent
-			 * bdev io with current active split information in the wait queue
-			 * for that core. This will get resume from this point only. Assume
-			 * if 4 splits are required and 2 childs are submitted, then parent
-			 * io is queued to io waitq of this core and it will get resumed and
-			 * try to submit the remaining 3 and 4 childs
-			 */
-			raid_io->buf = buf;
-			raid_io->splits_comp_outstanding--;
-			raid_io->splits_pending++;
-			return ret;
-		}
-		buf += (pd_blocks << raid_bdev->blocklen_shift);
+		raid_io->buf = buf;
+		raid_io->splits_comp_outstanding--;
+		raid_io->splits_pending++;
+		return ret;
 	}
 
 	return 0;
@@ -1257,11 +1240,14 @@ raid_bdev_configure(struct raid_bdev *raid_bdev)
 	raid_bdev_gen = &raid_bdev->bdev;
 	raid_bdev_gen->write_cache = 0;
 	raid_bdev_gen->blocklen = blocklen;
-	raid_bdev_gen->optimal_io_boundary = 0;
-
+	raid_bdev_gen->ctxt = raid_bdev;
+	raid_bdev_gen->fn_table = &g_raid_bdev_fn_table;
+	raid_bdev_gen->module = &g_raid_if;
 	raid_bdev->strip_size = (raid_bdev->strip_size * 1024) / blocklen;
 	raid_bdev->strip_size_shift = spdk_u32log2(raid_bdev->strip_size);
 	raid_bdev->blocklen_shift = spdk_u32log2(blocklen);
+	raid_bdev_gen->optimal_io_boundary = raid_bdev->strip_size;
+	raid_bdev_gen->split_on_optimal_io_boundary = true;
 
 	/*
 	 * RAID bdev logic is for striping so take the minimum block count based
diff --git a/test/unit/lib/bdev/bdev_raid.c/bdev_raid_ut.c b/test/unit/lib/bdev/bdev_raid.c/bdev_raid_ut.c
index fbbe98b56..d1733a5e3 100644
--- a/test/unit/lib/bdev/bdev_raid.c/bdev_raid_ut.c
+++ b/test/unit/lib/bdev/bdev_raid.c/bdev_raid_ut.c
@@ -863,7 +863,8 @@ verify_raid_bdev(struct rpc_construct_raid_bdev *r, bool presence, uint32_t raid
 	CU_ASSERT(strcmp(pbdev->bdev.product_name, "Pooled Device") == 0);
 	CU_ASSERT(pbdev->bdev.write_cache == 0);
 	CU_ASSERT(pbdev->bdev.blocklen == g_block_len);
-	CU_ASSERT(pbdev->bdev.optimal_io_boundary == 0);
+	CU_ASSERT(pbdev->bdev.optimal_io_boundary == pbdev->strip_size);
+	CU_ASSERT(pbdev->bdev.split_on_optimal_io_boundary == true);
 	CU_ASSERT(pbdev->bdev.ctxt == pbdev);
 	CU_ASSERT(pbdev->bdev.fn_table == &g_raid_bdev_fn_table);
 	CU_ASSERT(pbdev->bdev.module == &g_raid_if);
@@ -1375,9 +1376,9 @@ test_write_io(void)
 	for (count = 0; count < g_max_qd; count++) {
 		bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
 		SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
-		io_len = (rand() % g_max_io_size) + 1;
+		io_len = (rand() % g_strip_size) + 1;
 		bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_WRITE);
-		lba += io_len;
+		lba += g_strip_size;
 		memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
 		g_io_output_index = 0;
 		raid_bdev_submit_request(ch, bdev_io);
@@ -1454,9 +1455,9 @@ test_read_io(void)
 	for (count = 0; count < g_max_qd; count++) {
 		bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
 		SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
-		io_len = (rand() % g_max_io_size) + 1;
+		io_len = (rand() % g_strip_size) + 1;
 		bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_READ);
-		lba += io_len;
+		lba += g_strip_size;
 		memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
 		g_io_output_index = 0;
 		raid_bdev_submit_request(ch, bdev_io);
@@ -1533,9 +1534,9 @@ test_io_failure(void)
 	for (count = 0; count < 1; count++) {
 		bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
 		SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
-		io_len = (rand() % g_max_io_size) + 1;
+		io_len = (rand() % g_strip_size) + 1;
 		bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_INVALID);
-		lba += io_len;
+		lba += g_strip_size;
 		memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
 		g_io_output_index = 0;
 		raid_bdev_submit_request(ch, bdev_io);
@@ -1551,9 +1552,9 @@ test_io_failure(void)
 	for (count = 0; count < 1; count++) {
 		bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
 		SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
-		io_len = (rand() % g_max_io_size) + 1;
+		io_len = (rand() % g_strip_size) + 1;
 		bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_WRITE);
-		lba += io_len;
+		lba += g_strip_size;
 		memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
 		g_io_output_index = 0;
 		raid_bdev_submit_request(ch, bdev_io);
@@ -1635,10 +1636,10 @@ test_io_waitq(void)
 		bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
 		SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
 		TAILQ_INSERT_TAIL(&head_io, bdev_io, module_link);
-		io_len = (rand() % g_max_io_size) + 1;
+		io_len = (rand() % g_strip_size) + 1;
 		bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, SPDK_BDEV_IO_TYPE_WRITE);
 		g_bdev_io_submit_status = -ENOMEM;
-		lba += io_len;
+		lba += g_strip_size;
 		raid_bdev_submit_request(ch, bdev_io);
 	}
 
@@ -1869,7 +1870,7 @@ test_multi_raid_with_io(void)
 	for (count = 0; count < g_max_qd; count++) {
 		bdev_io = calloc(1, sizeof(struct spdk_bdev_io) + sizeof(struct raid_bdev_io));
 		SPDK_CU_ASSERT_FATAL(bdev_io != NULL);
-		io_len = (rand() % g_max_io_size) + 1;
+		io_len = (rand() % g_strip_size) + 1;
 		iotype = (rand() % 2) ? SPDK_BDEV_IO_TYPE_WRITE : SPDK_BDEV_IO_TYPE_READ;
 		memset(g_io_output, 0, (g_max_io_size / g_strip_size) + 1 * sizeof(struct io_output));
 		g_io_output_index = 0;
@@ -1882,7 +1883,7 @@ test_multi_raid_with_io(void)
 			}
 		}
 		bdev_io_initialize(bdev_io, &pbdev->bdev, lba, io_len, iotype);
-		lba += io_len;
+		lba += g_strip_size;
 		CU_ASSERT(pbdev != NULL);
 		raid_bdev_submit_request(ch_random, bdev_io);
 		verify_io(bdev_io, g_max_base_drives, ch_ctx_random, pbdev,
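
Note (editorial, not part of the patch): the heart of this change is the raid_bdev_configure() hunk. The raid bdev now advertises optimal_io_boundary = strip_size and sets split_on_optimal_io_boundary = true, so the generic bdev layer splits every parent io at strip boundaries before raid_bdev_submit_children() runs. Each child io therefore fits inside exactly one strip, and the old per-strip loop collapses to a single address computation. Below is a minimal, self-contained sketch of that computation for review context; the struct, helper names and example values are illustrative, not SPDK APIs, and it assumes a power-of-two strip size in blocks, matching the raid code (strip_size_shift = spdk_u32log2(strip_size)).

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct strip_map {
	uint32_t pd_idx;	/* index of the base bdev that receives the child io */
	uint64_t pd_lba;	/* start lba of the child io on that base bdev */
};

/*
 * Mirror of the patched submit path's math: map a child io (already split
 * by the bdev layer, so it cannot span a strip boundary) onto one base bdev.
 */
static struct strip_map
map_io_to_base_bdev(uint64_t offset_blocks, uint64_t num_blocks,
		    uint32_t strip_size, uint32_t strip_size_shift,
		    uint8_t num_base_bdevs)
{
	uint64_t start_strip = offset_blocks >> strip_size_shift;
	uint64_t end_strip = (offset_blocks + num_blocks - 1) >> strip_size_shift;
	uint64_t pd_strip = start_strip / num_base_bdevs;
	uint32_t offset_in_strip = offset_blocks & (strip_size - 1);

	/* Guaranteed by split_on_optimal_io_boundary in the patched code */
	assert(start_strip == end_strip);

	return (struct strip_map) {
		.pd_idx = (uint32_t)(start_strip % num_base_bdevs),
		.pd_lba = (pd_strip << strip_size_shift) + offset_in_strip,
	};
}

int
main(void)
{
	/*
	 * 2 base bdevs, 64-block strips: strips 0,2,4,... land on bdev 0 and
	 * strips 1,3,5,... on bdev 1. An io of 20 blocks at lba 100 sits
	 * entirely within strip 1.
	 */
	struct strip_map m = map_io_to_base_bdev(100, 20, 64, 6, 2);

	printf("pd_idx=%" PRIu32 " pd_lba=%" PRIu64 "\n", m.pd_idx, m.pd_lba);
	return 0;
}

Compiled and run, this prints pd_idx=1 pd_lba=36: strip 1 is the first strip stored on base bdev 1, and lba 100 is 36 blocks into that strip, the same pd_idx/pd_lba the patched raid_bdev_submit_children() would pass to spdk_bdev_read_blocks() or spdk_bdev_write_blocks().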