module/raid: allow assembly of a degraded raid

Add num_base_bdevs_operational to raid_bdev and use it to determine the
required number of base bdevs.

Change-Id: I31b39cc8ea708b6cdce748f015949e4c9fdeb3cd
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
This commit is contained in:
Artur Paszkiewicz 2022-12-30 15:43:40 +01:00 committed by David Ko
parent ea4b2f6d75
commit e325fbafec
5 changed files with 151 additions and 23 deletions

View File

@ -10045,6 +10045,7 @@ Example response:
"raid_level": "raid0",
"num_base_bdevs": 2,
"num_base_bdevs_discovered": 2,
"num_base_bdevs_operational": 2,
"base_bdevs_list": [
{
"name": "malloc0",
@ -10070,6 +10071,7 @@ Example response:
"raid_level": "raid0",
"num_base_bdevs": 2,
"num_base_bdevs_discovered": 1,
"num_base_bdevs_operational": 2,
"base_bdevs_list": [
{
"name": "malloc2",

View File

@ -640,6 +640,8 @@ raid_bdev_write_info_json(struct raid_bdev *raid_bdev, struct spdk_json_write_ct
spdk_json_write_named_bool(w, "superblock", raid_bdev->sb != NULL);
spdk_json_write_named_uint32(w, "num_base_bdevs", raid_bdev->num_base_bdevs);
spdk_json_write_named_uint32(w, "num_base_bdevs_discovered", raid_bdev->num_base_bdevs_discovered);
spdk_json_write_named_uint32(w, "num_base_bdevs_operational",
raid_bdev->num_base_bdevs_operational);
spdk_json_write_name(w, "base_bdevs_list");
spdk_json_write_array_begin(w);
RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
@ -1142,6 +1144,8 @@ raid_bdev_create(const char *name, uint32_t strip_size, uint8_t num_base_bdevs,
return rc;
}
raid_bdev->num_base_bdevs_operational = num_base_bdevs;
if (superblock) {
spdk_uuid_generate(&raid_bdev->bdev.uuid);
}
@ -1170,6 +1174,10 @@ raid_bdev_configure_md(struct raid_bdev *raid_bdev)
for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
base_bdev = raid_bdev->base_bdev_info[i].bdev;
if (base_bdev == NULL) {
continue;
}
if (i == 0) {
raid_bdev->bdev.md_len = spdk_bdev_get_md_size(base_bdev);
raid_bdev->bdev.md_interleave = spdk_bdev_is_md_interleaved(base_bdev);
@ -1350,10 +1358,12 @@ raid_bdev_configure(struct raid_bdev *raid_bdev)
int rc = 0;
assert(raid_bdev->state == RAID_BDEV_STATE_CONFIGURING);
assert(raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs);
assert(raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs_operational);
RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
assert(base_info->bdev != NULL);
if (base_info->bdev == NULL) {
continue;
}
/* Check that blocklen is the same for all base bdevs */
if (blocklen == 0) {
blocklen = base_info->bdev->blocklen;
@ -1739,7 +1749,7 @@ raid_bdev_remove_base_bdev(struct spdk_bdev *base_bdev)
/* There is no base bdev for this raid, so free the raid device. */
raid_bdev_cleanup_and_free(raid_bdev);
}
} else if (raid_bdev->num_base_bdevs_discovered == raid_bdev->min_base_bdevs_operational) {
} else if (raid_bdev->num_base_bdevs_operational-- == raid_bdev->min_base_bdevs_operational) {
raid_bdev_deconfigure(raid_bdev, NULL, NULL);
} else {
return raid_bdev_suspend(raid_bdev, raid_bdev_remove_base_bdev_on_suspended, base_info);
@ -1878,8 +1888,10 @@ raid_bdev_configure_base_bdev_cont(struct raid_base_bdev_info *base_info)
raid_bdev->num_base_bdevs_discovered++;
assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs);
assert(raid_bdev->num_base_bdevs_operational <= raid_bdev->num_base_bdevs);
assert(raid_bdev->num_base_bdevs_operational >= raid_bdev->min_base_bdevs_operational);
if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs) {
if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs_operational) {
rc = raid_bdev_configure(raid_bdev);
if (rc != 0) {
SPDK_ERRLOG("Failed to configure raid bdev: %s\n", spdk_strerror(-rc));
@ -2047,8 +2059,6 @@ _raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name,
struct raid_base_bdev_info *base_info;
int rc;
assert(name != NULL || uuid != NULL);
if (slot >= raid_bdev->num_base_bdevs) {
return -EINVAL;
}
@ -2084,6 +2094,10 @@ _raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name,
base_info->data_offset = data_offset;
base_info->data_size = data_size;
if (name == NULL && uuid == NULL) {
return 0;
}
rc = raid_bdev_configure_base_bdev(base_info);
if (rc != 0) {
if (rc != -ENODEV) {
@ -2111,6 +2125,8 @@ _raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name,
/*
 * Add a base bdev, identified by name, to the given slot of a raid bdev.
 *
 * Thin wrapper around _raid_bdev_add_base_device(): it passes no UUID and
 * zero for both data_offset and data_size.
 *
 * name must not be NULL (asserted).
 * Returns the result of _raid_bdev_add_base_device() unchanged.
 */
int
raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name, uint8_t slot)
{
	int rc;

	/* This entry point identifies the base bdev by name only, so a name is mandatory. */
	assert(name != NULL);

	rc = _raid_bdev_add_base_device(raid_bdev, name, NULL, slot, 0, 0);
	return rc;
}
@ -2119,8 +2135,15 @@ raid_bdev_add_base_device_from_sb(struct raid_bdev *raid_bdev,
const struct raid_bdev_sb_base_bdev *sb_base_bdev)
{
int rc;
const struct spdk_uuid *uuid;
rc = _raid_bdev_add_base_device(raid_bdev, NULL, &sb_base_bdev->uuid, sb_base_bdev->slot,
if (sb_base_bdev->state == RAID_SB_BASE_BDEV_CONFIGURED) {
uuid = &sb_base_bdev->uuid;
} else {
uuid = NULL;
}
rc = _raid_bdev_add_base_device(raid_bdev, NULL, uuid, sb_base_bdev->slot,
sb_base_bdev->data_offset, sb_base_bdev->data_size);
if (rc == -ENODEV) {
@ -2149,13 +2172,15 @@ raid_bdev_create_from_sb(const struct raid_bdev_superblock *sb)
memcpy(raid_bdev->sb, sb, sb->length);
for (i = 0; i < sb->base_bdevs_size; i++) {
const struct raid_bdev_sb_base_bdev *sb_base_bdev = &sb->base_bdevs[i];
if (sb->base_bdevs[i].state == RAID_SB_BASE_BDEV_CONFIGURED) {
raid_bdev->num_base_bdevs_operational++;
}
}
if (sb_base_bdev->state == RAID_SB_BASE_BDEV_CONFIGURED) {
rc = raid_bdev_add_base_device_from_sb(raid_bdev, sb_base_bdev);
if (rc != 0) {
goto err;
}
for (i = 0; i < sb->base_bdevs_size; i++) {
rc = raid_bdev_add_base_device_from_sb(raid_bdev, &sb->base_bdevs[i]);
if (rc != 0) {
goto err;
}
}

View File

@ -153,6 +153,9 @@ struct raid_bdev {
/* number of base bdevs discovered */
uint8_t num_base_bdevs_discovered;
/* number of operational base bdevs */
uint8_t num_base_bdevs_operational;
/* minimum number of viable base bdevs that are required by array to operate */
uint8_t min_base_bdevs_operational;

View File

@ -1089,7 +1089,9 @@ raid5f_start(struct raid_bdev *raid_bdev)
RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
min_blockcnt = spdk_min(min_blockcnt, base_info->data_size);
alignment = spdk_max(alignment, spdk_bdev_get_buf_align(base_info->bdev));
if (base_info->bdev) {
alignment = spdk_max(alignment, spdk_bdev_get_buf_align(base_info->bdev));
}
}
base_bdev_data_size = (min_blockcnt / raid_bdev->strip_size) * raid_bdev->strip_size;

View File

@ -124,6 +124,7 @@ function verify_raid_bdev_state() (
local expected_state=$2
local raid_level=$3
local strip_size=$4
local num_base_bdevs_operational=$5
local raid_bdev
local raid_bdev_info
local num_base_bdevs
@ -173,6 +174,12 @@ function verify_raid_bdev_state() (
echo "incorrect num_base_bdevs_discovered: $tmp, expected: $num_base_bdevs_discovered"
return 1
fi
tmp=$(echo $raid_bdev_info | jq -r '.num_base_bdevs_operational')
if [ "$num_base_bdevs_operational" != "$tmp" ]; then
echo "incorrect num_base_bdevs_operational $tmp, expected: $num_base_bdevs_operational"
return 1
fi
)
function has_redundancy() {
@ -206,7 +213,7 @@ function raid_state_function_test() {
# Step1: create a RAID bdev with no base bdevs
# Expect state: CONFIGURING
$rpc_py bdev_raid_create $strip_size_create_arg -r $raid_level -b "${base_bdevs[*]}" -n $raid_bdev_name
if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size; then
if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs; then
return 1
fi
$rpc_py bdev_raid_delete $raid_bdev_name
@ -216,7 +223,7 @@ function raid_state_function_test() {
$rpc_py bdev_raid_create $strip_size_create_arg -r $raid_level -b "${base_bdevs[*]}" -n $raid_bdev_name
$rpc_py bdev_malloc_create 32 512 -b ${base_bdevs[0]}
waitforbdev ${base_bdevs[0]}
if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size; then
if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs; then
return 1
fi
$rpc_py bdev_raid_delete $raid_bdev_name
@ -225,13 +232,13 @@ function raid_state_function_test() {
# Expect state: ONLINE
$rpc_py bdev_raid_create $strip_size_create_arg -r $raid_level -b "${base_bdevs[*]}" -n $raid_bdev_name
for ((i = 1; i < num_base_bdevs; i++)); do
if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size; then
if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs; then
return 1
fi
$rpc_py bdev_malloc_create 32 512 -b ${base_bdevs[$i]}
waitforbdev ${base_bdevs[$i]}
done
if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size; then
if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $num_base_bdevs; then
return 1
fi
@ -243,7 +250,7 @@ function raid_state_function_test() {
else
expected_state="online"
fi
if ! verify_raid_bdev_state $raid_bdev_name $expected_state $raid_level $strip_size; then
if ! verify_raid_bdev_state $raid_bdev_name $expected_state $raid_level $strip_size $((num_base_bdevs - 1)); then
return 1
fi
@ -353,7 +360,7 @@ function raid_superblock_test() {
# Create RAID bdev with superblock
$rpc_py bdev_raid_create $strip_size_create_arg -r $raid_level -b "${base_bdevs_pt[*]}" -n $raid_bdev_name -s
if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size; then
if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $num_base_bdevs; then
return 1
fi
@ -381,7 +388,7 @@ function raid_superblock_test() {
# Try to create new RAID bdev from malloc bdevs
# Should not reach online state due to superblock still present on base bdevs
$rpc_py bdev_raid_create $strip_size_create_arg -r $raid_level -b "${base_bdevs_malloc[*]}" -n $raid_bdev_name
if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size; then
if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs; then
return 1
fi
@ -396,7 +403,7 @@ function raid_superblock_test() {
$rpc_py bdev_passthru_create -b ${base_bdevs_malloc[0]} -p ${base_bdevs_pt[0]} -u ${base_bdevs_pt_uuid[0]}
# Check if the RAID bdev was assembled from superblock
if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size; then
if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs; then
return 1
fi
@ -406,7 +413,7 @@ function raid_superblock_test() {
done
# Check if the RAID bdev is in online state
if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size; then
if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $num_base_bdevs; then
return 1
fi
@ -415,6 +422,95 @@ function raid_superblock_test() {
return 1
fi
if has_redundancy $raid_level; then
# Delete one base bdev
$rpc_py bdev_passthru_delete ${base_bdevs_pt[0]}
# Check if the RAID bdev is in online state (degraded)
if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $((num_base_bdevs - 1)); then
return 1
fi
# Stop the RAID bdev
$rpc_py bdev_raid_delete $raid_bdev_name
raid_bdev=$($rpc_py bdev_raid_get_bdevs all | jq -r '.[]')
if [ -n "$raid_bdev" ]; then
return 1
fi
# Delete remaining base bdevs
for ((i = 1; i < num_base_bdevs; i++)); do
$rpc_py bdev_passthru_delete ${base_bdevs_pt[$i]}
done
# Re-add base bdevs from the second up to (not including) the last one
for ((i = 1; i < num_base_bdevs - 1; i++)); do
$rpc_py bdev_passthru_create -b ${base_bdevs_malloc[$i]} -p ${base_bdevs_pt[$i]} -u ${base_bdevs_pt_uuid[$i]}
# Check if the RAID bdev is in configuring state
if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $((num_base_bdevs - 1)); then
return 1
fi
done
# Re-add the last base bdev
i=$((num_base_bdevs - 1))
$rpc_py bdev_passthru_create -b ${base_bdevs_malloc[$i]} -p ${base_bdevs_pt[$i]} -u ${base_bdevs_pt_uuid[$i]}
# Check if the RAID bdev is in online state (degraded)
if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $((num_base_bdevs - 1)); then
return 1
fi
if [ $num_base_bdevs -gt 2 ]; then
# Stop the RAID bdev
$rpc_py bdev_raid_delete $raid_bdev_name
raid_bdev=$($rpc_py bdev_raid_get_bdevs all | jq -r '.[]')
if [ -n "$raid_bdev" ]; then
return 1
fi
# Delete remaining base bdevs
for ((i = 1; i < num_base_bdevs; i++)); do
$rpc_py bdev_passthru_delete ${base_bdevs_pt[$i]}
done
# Re-add first base bdev
# This is the "failed" device and contains the "old" version of the superblock
$rpc_py bdev_passthru_create -b ${base_bdevs_malloc[0]} -p ${base_bdevs_pt[0]} -u ${base_bdevs_pt_uuid[0]}
# Check if the RAID bdev is in configuring state
if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs; then
return 1
fi
# Re-add the last base bdev
i=$((num_base_bdevs - 1))
$rpc_py bdev_passthru_create -b ${base_bdevs_malloc[$i]} -p ${base_bdevs_pt[$i]} -u ${base_bdevs_pt_uuid[$i]}
# Check if the RAID bdev is in configuring state
# This should use the newer superblock version and have n-1 online base bdevs
if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $((num_base_bdevs - 1)); then
return 1
fi
# Re-add remaining base bdevs
for ((i = 1; i < num_base_bdevs - 1; i++)); do
$rpc_py bdev_passthru_create -b ${base_bdevs_malloc[$i]} -p ${base_bdevs_pt[$i]} -u ${base_bdevs_pt_uuid[$i]}
done
# Check if the RAID bdev is in online state (degraded)
if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $((num_base_bdevs - 1)); then
return 1
fi
fi
# Check if the RAID bdev has the same UUID as when first created
if [ "$($rpc_py bdev_get_bdevs -b $raid_bdev_name | jq -r '.[] | .uuid')" != "$raid_bdev_uuid" ]; then
return 1
fi
fi
killprocess $raid_pid
return 0