Spdk/module/bdev/raid/concat.c
yupeng 64eebbd132 bdev/raid: Add concat module
The concat module combines multiple underlying bdevs into a single
bdev. It is a special raid level. A new bdev can be appended to the end
of a concat bdev; the concat bdev's size then grows without changing
the layout of the existing data. This is the major difference between
concat and raid0: adding a new underlying device to raid0 changes the
whole data layout. The concat bdev is therefore extendable.
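
For example (illustrative numbers, each size assumed to be a whole
number of strips): concatenating base bdevs of 100, 200 and 300 blocks
yields a 600-block concat bdev; logical blocks 0-99 map to the first
bdev, 100-299 to the second, and 300-599 to the third. Appending a
fourth bdev only adds new blocks after 599 and leaves these mappings
unchanged.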

Change-Id: Ibbeeaf0606ff79b595320c597a5605ab9e4e13c4
Signed-off-by: Peng Yu <yupeng0921@gmail.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/11070
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2022-04-05 07:39:00 +00:00


/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Peng Yu yupeng0921@gmail.com.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "bdev_raid.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/log.h"
struct concat_block_range {
	uint64_t start;
	uint64_t length;
};

/*
 * brief:
 * concat_bdev_io_completion function is called by lower layers to notify the
 * raid module that a particular bdev_io is completed.
 * params:
 * bdev_io - pointer to bdev io submitted to lower layers, like child io
 * success - bdev_io status
 * cb_arg - function callback context (parent raid_bdev_io)
 * returns:
 * none
 */
static void
concat_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
concat_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_concat_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	concat_submit_rw_request(raid_io);
}

/*
 * brief:
 * concat_submit_rw_request function is used to submit I/O to the correct
 * member disk for concat bdevs.
 * params:
 * raid_io - pointer to parent raid_bdev_io
 * returns:
 * none
 */
static void
concat_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct concat_block_range *block_range = raid_bdev->module_private;
	uint64_t pd_lba;
	uint64_t pd_blocks;
	int pd_idx;
	int ret = 0;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;
	int i;

	/*
	 * Find the last base bdev whose range starts at or before the I/O offset.
	 * The bdev layer splits I/O on strip boundaries (split_on_optimal_io_boundary)
	 * and every base bdev contributes a whole number of strips, so a read or
	 * write never spans two base bdevs.
	 */
	pd_idx = -1;
	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
		if (block_range[i].start > bdev_io->u.bdev.offset_blocks) {
			break;
		}
		pd_idx = i;
	}
	assert(pd_idx >= 0);
	assert(bdev_io->u.bdev.offset_blocks >= block_range[pd_idx].start);
	pd_lba = bdev_io->u.bdev.offset_blocks - block_range[pd_idx].start;
	pd_blocks = bdev_io->u.bdev.num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
		assert(0);
	}

	/*
	 * Submit child io to the bdev layer using the base bdev descriptor, base
	 * bdev lba, base bdev child io length in blocks, buffer, completion
	 * function and function callback context
	 */
	assert(raid_ch != NULL);
	assert(raid_ch->base_channel);
	base_ch = raid_ch->base_channel[pd_idx];
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = spdk_bdev_readv_blocks(base_info->desc, base_ch,
					     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					     pd_lba, pd_blocks, concat_bdev_io_completion,
					     raid_io);
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		ret = spdk_bdev_writev_blocks(base_info->desc, base_ch,
					      bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					      pd_lba, pd_blocks, concat_bdev_io_completion,
					      raid_io);
	} else {
		SPDK_ERRLOG("Received unsupported io type %u\n", bdev_io->type);
		assert(0);
	}

	if (ret == -ENOMEM) {
		raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
					_concat_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submit error not due to ENOMEM; it should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
concat_submit_null_payload_request(struct raid_bdev_io *raid_io);

static void
_concat_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	concat_submit_null_payload_request(raid_io);
}
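
/*
 * Completion callback for the per-base-bdev child I/Os of a null-payload
 * request.  raid_bdev_io_complete_part() accounts one finished child against
 * raid_io->base_bdev_io_remaining and completes the parent raid_io (with a
 * failed status if any child failed) once all children have finished.
 */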
static void
concat_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}

/*
 * brief:
 * concat_submit_null_payload_request function submits the next batch of
 * io requests with range but without payload, like FLUSH and UNMAP, to member
 * disks; it submits as many as possible unless one base io request fails with
 * -ENOMEM, in which case it queues itself for later submission.
 * params:
 * raid_io - pointer to parent raid_bdev_io
 * returns:
 * none
 */
static void
concat_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io *bdev_io;
	struct raid_bdev *raid_bdev;
	int ret;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;
	uint64_t pd_lba;
	uint64_t pd_blocks;
	uint64_t offset_blocks;
	uint64_t num_blocks;
	struct concat_block_range *block_range;
	int i, start_idx, stop_idx;

	bdev_io = spdk_bdev_io_from_ctx(raid_io);
	raid_bdev = raid_io->raid_bdev;
	block_range = raid_bdev->module_private;
	offset_blocks = bdev_io->u.bdev.offset_blocks;
	num_blocks = bdev_io->u.bdev.num_blocks;
	start_idx = -1;
	stop_idx = -1;

	/*
	 * Go through all base bdevs, find the first bdev and the last bdev
	 */
	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
		/* skip the bdevs before the offset_blocks */
		if (offset_blocks >= block_range[i].start + block_range[i].length) {
			continue;
		}
		if (start_idx == -1) {
			start_idx = i;
		} else {
			/*
			 * The offset_blocks might be in the middle of the first bdev.
			 * Except for the first bdev, offset_blocks is always at the
			 * start of the bdev.
			 */
			assert(offset_blocks == block_range[i].start);
		}
		pd_lba = offset_blocks - block_range[i].start;
		pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba);
		offset_blocks += pd_blocks;
		num_blocks -= pd_blocks;
		if (num_blocks == 0) {
			stop_idx = i;
			break;
		}
	}
	assert(start_idx >= 0);
	assert(stop_idx >= 0);

	/*
	 * base_bdev_io_remaining is only initialized on the first pass; on a
	 * resubmission after -ENOMEM it already holds the total child count.
	 */
	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = stop_idx - start_idx + 1;
	}
	offset_blocks = bdev_io->u.bdev.offset_blocks;
	num_blocks = bdev_io->u.bdev.num_blocks;
	for (i = start_idx; i <= stop_idx; i++) {
		assert(offset_blocks >= block_range[i].start);
		assert(offset_blocks < block_range[i].start + block_range[i].length);
		pd_lba = offset_blocks - block_range[i].start;
		pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba);
		offset_blocks += pd_blocks;
		num_blocks -= pd_blocks;
		/*
		 * Skip the IOs we have submitted
		 */
		if (i < start_idx + raid_io->base_bdev_io_submitted) {
			continue;
		}
		base_info = &raid_bdev->base_bdev_info[i];
		base_ch = raid_io->raid_ch->base_channel[i];
		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch,
						     pd_lba, pd_blocks,
						     concat_base_io_complete, raid_io);
			break;
		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = spdk_bdev_flush_blocks(base_info->desc, base_ch,
						     pd_lba, pd_blocks,
						     concat_base_io_complete, raid_io);
			break;
		default:
			SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", bdev_io->type);
			assert(false);
			ret = -EIO;
		}
		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
						_concat_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error not due to ENOMEM; it should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}
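
/*
 * brief:
 * concat_start is called when the raid is starting.  It builds the
 * concat_block_range table: each base bdev contributes its block count
 * rounded down to a whole number of strips, placed back to back in the
 * logical address space.
 *
 * Illustrative example (made-up numbers): with strip_size_shift = 3
 * (8-block strips), a base bdev of 100 blocks contributes 96 blocks.
 * params:
 * raid_bdev - pointer to raid bdev
 * returns:
 * 0 on success, non zero on failure
 */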
static int
concat_start(struct raid_bdev *raid_bdev)
{
	uint64_t total_blockcnt = 0;
	struct raid_base_bdev_info *base_info;
	struct concat_block_range *block_range;

	block_range = calloc(raid_bdev->num_base_bdevs, sizeof(struct concat_block_range));
	if (!block_range) {
		SPDK_ERRLOG("Cannot allocate block_range, num_base_bdevs: %u\n",
			    raid_bdev->num_base_bdevs);
		return -ENOMEM;
	}

	int idx = 0;
	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		uint64_t strip_cnt = base_info->bdev->blockcnt >> raid_bdev->strip_size_shift;
		uint64_t pd_block_cnt = strip_cnt << raid_bdev->strip_size_shift;

		block_range[idx].start = total_blockcnt;
		block_range[idx].length = pd_block_cnt;
		total_blockcnt += pd_block_cnt;
		idx++;
	}

	raid_bdev->module_private = block_range;

	SPDK_DEBUGLOG(bdev_concat, "total blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n",
		      total_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);

	raid_bdev->bdev.blockcnt = total_blockcnt;

	raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
	raid_bdev->bdev.split_on_optimal_io_boundary = true;

	return 0;
}

static void
concat_stop(struct raid_bdev *raid_bdev)
{
	struct concat_block_range *block_range = raid_bdev->module_private;

	free(block_range);
}

static struct raid_bdev_module g_concat_module = {
	.level = CONCAT,
	.base_bdevs_min = 1,
	.start = concat_start,
	.stop = concat_stop,
	.submit_rw_request = concat_submit_rw_request,
	.submit_null_payload_request = concat_submit_null_payload_request,
};
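
/*
 * RAID_MODULE_REGISTER adds g_concat_module to the raid bdev framework's
 * module list, so a raid bdev created with level CONCAT is driven by the
 * callbacks above.  SPDK_LOG_REGISTER_COMPONENT registers the "bdev_concat"
 * log flag used by the SPDK_DEBUGLOG() call in concat_start.
 */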
RAID_MODULE_REGISTER(&g_concat_module)
SPDK_LOG_REGISTER_COMPONENT(bdev_concat)