From 95aa1a7337a10833897be37438c0ec21e4cb9d21 Mon Sep 17 00:00:00 2001 From: Krzysztof Karas Date: Thu, 15 Dec 2022 14:45:32 +0100 Subject: [PATCH] sw_hotplug: avoid hotplug timeouts Avoid hotplug application timeouts on machines with multiple NVMe drives by scaling app run time to number of NVMe drives. Furthermore, change the way we wait for hotplug app initialization by using "perform_tests" RPC, and termination by starting it via timeout command. Second part of the series fixing #2201. Fixes #2201 Change-Id: Id82c8e8f6b9e870a55c4f43a11c755982855deeb Signed-off-by: Krzysztof Karas Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15965 Tested-by: SPDK CI Jenkins Reviewed-by: Konrad Sztyber Reviewed-by: Tomasz Zawadzki --- test/nvme/sw_hotplug.sh | 42 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/test/nvme/sw_hotplug.sh b/test/nvme/sw_hotplug.sh index 43ed89170..5bc149042 100755 --- a/test/nvme/sw_hotplug.sh +++ b/test/nvme/sw_hotplug.sh @@ -8,6 +8,9 @@ rootdir=$(readlink -f $testdir/../..) source $rootdir/scripts/common.sh source $rootdir/test/common/autotest_common.sh +export PYTHONPATH="$rootdir/examples/nvme/hotplug/" +rpc_py=$rootdir/scripts/rpc.py + # Pci bus hotplug # Helper function to remove/attach cotrollers remove_attach_helper() { @@ -16,12 +19,6 @@ remove_attach_helper() { local use_bdev=$3 local dev - # We need to make sure we wait long enough for hotplug to initialize the devices - # and start IO - if we start removing devices before that happens we will end up - # stepping on hotplug's toes forcing it to fail to report proper count of given - # events. - sleep "$hotplug_wait" - while ((hotplug_events--)); do for dev in "${nvmes[@]}"; do echo 1 > "/sys/bus/pci/devices/$dev/remove" @@ -72,30 +69,29 @@ remove_attach_helper() { run_hotplug() { trap 'killprocess $hotplug_pid; exit 1' SIGINT SIGTERM EXIT - "$SPDK_EXAMPLE_DIR/hotplug" \ + test_time=$((hotplug_events * hotplug_wait * nvme_count)) + + # Hotplug may sometimes hang, so start it via timeout command. + timeout -k 2s $((test_time + hotplug_wait)) "$SPDK_EXAMPLE_DIR/hotplug" \ -i 0 \ - -t $((hotplug_events * hotplug_wait + hotplug_wait * 3)) \ + -t $((test_time)) \ -n $((hotplug_events * nvme_count)) \ -r $((hotplug_events * nvme_count)) \ - -l warning & - hotplug_pid=$! + -l warning --wait-for-rpc & + timeout_pid=$! + hotplug_pid=$(ps -o pid= --ppid "$timeout_pid") + + # Make sure Hotplug started before removing and inserting devices. + waitforlisten "$hotplug_pid" + + $rpc_py --plugin hotplug_plugin perform_tests remove_attach_helper "$hotplug_events" "$hotplug_wait" false - # Wait in case hotplug app is lagging behind - # and kill it, if it hung. - sleep $hotplug_wait - - if ! kill -0 "$hotplug_pid"; then - # hotplug already finished, check for the error code. - wait "$hotplug_pid" - else - echo "Killing hotplug application" - killprocess $hotplug_pid - return 1 - fi - trap - SIGINT SIGTERM EXIT + + # Check timeout return code. + wait "$timeout_pid" } # SPDK target hotplug