From e325fbafecd9f0d749e3f0e17697f2de1acc7020 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Fri, 30 Dec 2022 15:43:40 +0100 Subject: [PATCH] module/raid: allow assembly of a degraded raid Add num_base_bdevs_operational to raid_bdev and use it to determine the required number of base bdevs. Change-Id: I31b39cc8ea708b6cdce748f015949e4c9fdeb3cd Signed-off-by: Artur Paszkiewicz --- doc/jsonrpc.md | 2 + module/bdev/raid/bdev_raid.c | 51 ++++++++++++---- module/bdev/raid/bdev_raid.h | 3 + module/bdev/raid/raid5f.c | 4 +- test/bdev/bdev_raid.sh | 114 ++++++++++++++++++++++++++++++++--- 5 files changed, 151 insertions(+), 23 deletions(-) diff --git a/doc/jsonrpc.md b/doc/jsonrpc.md index 2ced5ef3c..4c894d556 100644 --- a/doc/jsonrpc.md +++ b/doc/jsonrpc.md @@ -10045,6 +10045,7 @@ Example response: "raid_level": "raid0", "num_base_bdevs": 2, "num_base_bdevs_discovered": 2, + "num_base_bdevs_operational": 2, "base_bdevs_list": [ { "name": "malloc0", @@ -10070,6 +10071,7 @@ Example response: "raid_level": "raid0", "num_base_bdevs": 2, "num_base_bdevs_discovered": 1, + "num_base_bdevs_operational": 2, "base_bdevs_list": [ { "name": "malloc2", diff --git a/module/bdev/raid/bdev_raid.c b/module/bdev/raid/bdev_raid.c index 4c6206b62..65ebf5996 100644 --- a/module/bdev/raid/bdev_raid.c +++ b/module/bdev/raid/bdev_raid.c @@ -640,6 +640,8 @@ raid_bdev_write_info_json(struct raid_bdev *raid_bdev, struct spdk_json_write_ct spdk_json_write_named_bool(w, "superblock", raid_bdev->sb != NULL); spdk_json_write_named_uint32(w, "num_base_bdevs", raid_bdev->num_base_bdevs); spdk_json_write_named_uint32(w, "num_base_bdevs_discovered", raid_bdev->num_base_bdevs_discovered); + spdk_json_write_named_uint32(w, "num_base_bdevs_operational", + raid_bdev->num_base_bdevs_operational); spdk_json_write_name(w, "base_bdevs_list"); spdk_json_write_array_begin(w); RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { @@ -1142,6 +1144,8 @@ raid_bdev_create(const char *name, uint32_t strip_size, uint8_t num_base_bdevs, return rc; } + raid_bdev->num_base_bdevs_operational = num_base_bdevs; + if (superblock) { spdk_uuid_generate(&raid_bdev->bdev.uuid); } @@ -1170,6 +1174,10 @@ raid_bdev_configure_md(struct raid_bdev *raid_bdev) for (i = 0; i < raid_bdev->num_base_bdevs; i++) { base_bdev = raid_bdev->base_bdev_info[i].bdev; + if (base_bdev == NULL) { + continue; + } + if (i == 0) { raid_bdev->bdev.md_len = spdk_bdev_get_md_size(base_bdev); raid_bdev->bdev.md_interleave = spdk_bdev_is_md_interleaved(base_bdev); @@ -1350,10 +1358,12 @@ raid_bdev_configure(struct raid_bdev *raid_bdev) int rc = 0; assert(raid_bdev->state == RAID_BDEV_STATE_CONFIGURING); - assert(raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs); + assert(raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs_operational); RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { - assert(base_info->bdev != NULL); + if (base_info->bdev == NULL) { + continue; + } /* Check blocklen for all base bdevs that it should be same */ if (blocklen == 0) { blocklen = base_info->bdev->blocklen; @@ -1739,7 +1749,7 @@ raid_bdev_remove_base_bdev(struct spdk_bdev *base_bdev) /* There is no base bdev for this raid, so free the raid device. */ raid_bdev_cleanup_and_free(raid_bdev); } - } else if (raid_bdev->num_base_bdevs_discovered == raid_bdev->min_base_bdevs_operational) { + } else if (raid_bdev->num_base_bdevs_operational-- == raid_bdev->min_base_bdevs_operational) { raid_bdev_deconfigure(raid_bdev, NULL, NULL); } else { return raid_bdev_suspend(raid_bdev, raid_bdev_remove_base_bdev_on_suspended, base_info); @@ -1878,8 +1888,10 @@ raid_bdev_configure_base_bdev_cont(struct raid_base_bdev_info *base_info) raid_bdev->num_base_bdevs_discovered++; assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs); + assert(raid_bdev->num_base_bdevs_operational <= raid_bdev->num_base_bdevs); + assert(raid_bdev->num_base_bdevs_operational >= raid_bdev->min_base_bdevs_operational); - if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs) { + if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs_operational) { rc = raid_bdev_configure(raid_bdev); if (rc != 0) { SPDK_ERRLOG("Failed to configure raid bdev: %s\n", spdk_strerror(-rc)); @@ -2047,8 +2059,6 @@ _raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name, struct raid_base_bdev_info *base_info; int rc; - assert(name != NULL || uuid != NULL); - if (slot >= raid_bdev->num_base_bdevs) { return -EINVAL; } @@ -2084,6 +2094,10 @@ _raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name, base_info->data_offset = data_offset; base_info->data_size = data_size; + if (name == NULL && uuid == NULL) { + return 0; + } + rc = raid_bdev_configure_base_bdev(base_info); if (rc != 0) { if (rc != -ENODEV) { @@ -2111,6 +2125,8 @@ _raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name, int raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name, uint8_t slot) { + assert(name != NULL); + return _raid_bdev_add_base_device(raid_bdev, name, NULL, slot, 0, 0); } @@ -2119,8 +2135,15 @@ raid_bdev_add_base_device_from_sb(struct raid_bdev *raid_bdev, const struct raid_bdev_sb_base_bdev *sb_base_bdev) { int rc; + const struct spdk_uuid *uuid; - rc = _raid_bdev_add_base_device(raid_bdev, NULL, &sb_base_bdev->uuid, sb_base_bdev->slot, + if (sb_base_bdev->state == RAID_SB_BASE_BDEV_CONFIGURED) { + uuid = &sb_base_bdev->uuid; + } else { + uuid = NULL; + } + + rc = _raid_bdev_add_base_device(raid_bdev, NULL, uuid, sb_base_bdev->slot, sb_base_bdev->data_offset, sb_base_bdev->data_size); if (rc == -ENODEV) { @@ -2149,13 +2172,15 @@ raid_bdev_create_from_sb(const struct raid_bdev_superblock *sb) memcpy(raid_bdev->sb, sb, sb->length); for (i = 0; i < sb->base_bdevs_size; i++) { - const struct raid_bdev_sb_base_bdev *sb_base_bdev = &sb->base_bdevs[i]; + if (sb->base_bdevs[i].state == RAID_SB_BASE_BDEV_CONFIGURED) { + raid_bdev->num_base_bdevs_operational++; + } + } - if (sb_base_bdev->state == RAID_SB_BASE_BDEV_CONFIGURED) { - rc = raid_bdev_add_base_device_from_sb(raid_bdev, sb_base_bdev); - if (rc != 0) { - goto err; - } + for (i = 0; i < sb->base_bdevs_size; i++) { + rc = raid_bdev_add_base_device_from_sb(raid_bdev, &sb->base_bdevs[i]); + if (rc != 0) { + goto err; } } diff --git a/module/bdev/raid/bdev_raid.h b/module/bdev/raid/bdev_raid.h index a98be2485..afec963a9 100644 --- a/module/bdev/raid/bdev_raid.h +++ b/module/bdev/raid/bdev_raid.h @@ -153,6 +153,9 @@ struct raid_bdev { /* number of base bdevs discovered */ uint8_t num_base_bdevs_discovered; + /* number of operational base bdevs */ + uint8_t num_base_bdevs_operational; + /* minimum number of viable base bdevs that are required by array to operate */ uint8_t min_base_bdevs_operational; diff --git a/module/bdev/raid/raid5f.c b/module/bdev/raid/raid5f.c index 199fcae71..6a0d296ec 100644 --- a/module/bdev/raid/raid5f.c +++ b/module/bdev/raid/raid5f.c @@ -1089,7 +1089,9 @@ raid5f_start(struct raid_bdev *raid_bdev) RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { min_blockcnt = spdk_min(min_blockcnt, base_info->data_size); - alignment = spdk_max(alignment, spdk_bdev_get_buf_align(base_info->bdev)); + if (base_info->bdev) { + alignment = spdk_max(alignment, spdk_bdev_get_buf_align(base_info->bdev)); + } } base_bdev_data_size = (min_blockcnt / raid_bdev->strip_size) * raid_bdev->strip_size; diff --git a/test/bdev/bdev_raid.sh b/test/bdev/bdev_raid.sh index fecedd2b7..e341f9e63 100755 --- a/test/bdev/bdev_raid.sh +++ b/test/bdev/bdev_raid.sh @@ -124,6 +124,7 @@ function verify_raid_bdev_state() ( local expected_state=$2 local raid_level=$3 local strip_size=$4 + local num_base_bdevs_operational=$5 local raid_bdev local raid_bdev_info local num_base_bdevs @@ -173,6 +174,12 @@ function verify_raid_bdev_state() ( echo "incorrect num_base_bdevs_discovered: $tmp, expected: $num_base_bdevs_discovered" return 1 fi + + tmp=$(echo $raid_bdev_info | jq -r '.num_base_bdevs_operational') + if [ "$num_base_bdevs_operational" != "$tmp" ]; then + echo "incorrect num_base_bdevs_operational $tmp, expected: $num_base_bdevs_operational" + return 1 + fi ) function has_redundancy() { @@ -206,7 +213,7 @@ function raid_state_function_test() { # Step1: create a RAID bdev with no base bdevs # Expect state: CONFIGURING $rpc_py bdev_raid_create $strip_size_create_arg -r $raid_level -b "${base_bdevs[*]}" -n $raid_bdev_name - if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size; then + if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs; then return 1 fi $rpc_py bdev_raid_delete $raid_bdev_name @@ -216,7 +223,7 @@ function raid_state_function_test() { $rpc_py bdev_raid_create $strip_size_create_arg -r $raid_level -b "${base_bdevs[*]}" -n $raid_bdev_name $rpc_py bdev_malloc_create 32 512 -b ${base_bdevs[0]} waitforbdev ${base_bdevs[0]} - if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size; then + if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs; then return 1 fi $rpc_py bdev_raid_delete $raid_bdev_name @@ -225,13 +232,13 @@ function raid_state_function_test() { # Expect state: ONLINE $rpc_py bdev_raid_create $strip_size_create_arg -r $raid_level -b "${base_bdevs[*]}" -n $raid_bdev_name for ((i = 1; i < num_base_bdevs; i++)); do - if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size; then + if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs; then return 1 fi $rpc_py bdev_malloc_create 32 512 -b ${base_bdevs[$i]} waitforbdev ${base_bdevs[$i]} done - if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size; then + if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $num_base_bdevs; then return 1 fi @@ -243,7 +250,7 @@ function raid_state_function_test() { else expected_state="online" fi - if ! verify_raid_bdev_state $raid_bdev_name $expected_state $raid_level $strip_size; then + if ! verify_raid_bdev_state $raid_bdev_name $expected_state $raid_level $strip_size $((num_base_bdevs - 1)); then return 1 fi @@ -353,7 +360,7 @@ function raid_superblock_test() { # Create RAID bdev with superblock $rpc_py bdev_raid_create $strip_size_create_arg -r $raid_level -b "${base_bdevs_pt[*]}" -n $raid_bdev_name -s - if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size; then + if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $num_base_bdevs; then return 1 fi @@ -381,7 +388,7 @@ function raid_superblock_test() { # Try to create new RAID bdev from malloc bdevs # Should not reach online state due to superblock still present on base bdevs $rpc_py bdev_raid_create $strip_size_create_arg -r $raid_level -b "${base_bdevs_malloc[*]}" -n $raid_bdev_name - if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size; then + if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs; then return 1 fi @@ -396,7 +403,7 @@ function raid_superblock_test() { $rpc_py bdev_passthru_create -b ${base_bdevs_malloc[0]} -p ${base_bdevs_pt[0]} -u ${base_bdevs_pt_uuid[0]} # Check if the RAID bdev was assembled from superblock - if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size; then + if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs; then return 1 fi @@ -406,7 +413,7 @@ function raid_superblock_test() { done # Check if the RAID bdev is in online state - if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size; then + if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $num_base_bdevs; then return 1 fi @@ -415,6 +422,95 @@ function raid_superblock_test() { return 1 fi + if has_redundancy $raid_level; then + # Delete one base bdev + $rpc_py bdev_passthru_delete ${base_bdevs_pt[0]} + + # Check if the RAID bdev is in online state (degraded) + if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $((num_base_bdevs - 1)); then + return 1 + fi + + # Stop the RAID bdev + $rpc_py bdev_raid_delete $raid_bdev_name + raid_bdev=$($rpc_py bdev_raid_get_bdevs all | jq -r '.[]') + if [ -n "$raid_bdev" ]; then + return 1 + fi + + # Delete remaining base bdevs + for ((i = 1; i < num_base_bdevs; i++)); do + $rpc_py bdev_passthru_delete ${base_bdevs_pt[$i]} + done + + # Re-add base bdevs from the second up to (not including) the last one + for ((i = 1; i < num_base_bdevs - 1; i++)); do + $rpc_py bdev_passthru_create -b ${base_bdevs_malloc[$i]} -p ${base_bdevs_pt[$i]} -u ${base_bdevs_pt_uuid[$i]} + + # Check if the RAID bdev is in configuring state + if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $((num_base_bdevs - 1)); then + return 1 + fi + done + + # Re-add the last base bdev + i=$((num_base_bdevs - 1)) + $rpc_py bdev_passthru_create -b ${base_bdevs_malloc[$i]} -p ${base_bdevs_pt[$i]} -u ${base_bdevs_pt_uuid[$i]} + + # Check if the RAID bdev is in online state (degraded) + if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $((num_base_bdevs - 1)); then + return 1 + fi + + if [ $num_base_bdevs -gt 2 ]; then + # Stop the RAID bdev + $rpc_py bdev_raid_delete $raid_bdev_name + raid_bdev=$($rpc_py bdev_raid_get_bdevs all | jq -r '.[]') + if [ -n "$raid_bdev" ]; then + return 1 + fi + + # Delete remaining base bdevs + for ((i = 1; i < num_base_bdevs; i++)); do + $rpc_py bdev_passthru_delete ${base_bdevs_pt[$i]} + done + + # Re-add first base bdev + # This is the "failed" device and contains the "old" version of the superblock + $rpc_py bdev_passthru_create -b ${base_bdevs_malloc[0]} -p ${base_bdevs_pt[0]} -u ${base_bdevs_pt_uuid[0]} + + # Check if the RAID bdev is in configuring state + if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs; then + return 1 + fi + + # Re-add the last base bdev + i=$((num_base_bdevs - 1)) + $rpc_py bdev_passthru_create -b ${base_bdevs_malloc[$i]} -p ${base_bdevs_pt[$i]} -u ${base_bdevs_pt_uuid[$i]} + + # Check if the RAID bdev is in configuring state + # This should use the newer superblock version and have n-1 online base bdevs + if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $((num_base_bdevs - 1)); then + return 1 + fi + + # Re-add remaining base bdevs + for ((i = 1; i < num_base_bdevs - 1; i++)); do + $rpc_py bdev_passthru_create -b ${base_bdevs_malloc[$i]} -p ${base_bdevs_pt[$i]} -u ${base_bdevs_pt_uuid[$i]} + done + + # Check if the RAID bdev is in online state (degraded) + if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $((num_base_bdevs - 1)); then + return 1 + fi + fi + + # Check if the RAID bdev has the same UUID as when first created + if [ "$($rpc_py bdev_get_bdevs -b $raid_bdev_name | jq -r '.[] | .uuid')" != "$raid_bdev_uuid" ]; then + return 1 + fi + fi + killprocess $raid_pid return 0