diff --git a/test/vhost/common/common.sh b/test/vhost/common/common.sh
index 2ebcb767a..fb001c31e 100644
--- a/test/vhost/common/common.sh
+++ b/test/vhost/common/common.sh
@@ -220,8 +220,10 @@ function spdk_vhost_kill()
 			rm $vhost_pid_file
 			rc=1
 		else
-			#check vhost return code, activate trap on error
-			wait $vhost_pid
+			while kill -0 $vhost_pid; do
+				echo "."
+				sleep 0.5
+			done
 		fi
 	elif /bin/kill -0 $vhost_pid; then
 		error "vhost NOT killed - you need to kill it manually"
@@ -294,9 +296,12 @@ function vm_create_ssh_config()
 			echo "  UserKnownHostsFile=/dev/null"
 			echo "  StrictHostKeyChecking=no"
 			echo "  User root"
-			echo "  ControlPath=$VM_BASE_DIR/%r@%h:%p.ssh"
+			echo "  ControlPath=/tmp/%r@%h:%p.ssh"
 			echo ""
 		) > $ssh_config
+		# The control path is created in /tmp because of live migration test case 3.
+		# When an sshfs share is used for the test, the control path cannot be placed
+		# on the share because ssh commands on the remote server would fail.
 	fi
 }
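For context on the ControlPath change above, ssh connection multiplexing works as in the minimal sketch below; the host name and options shown are illustrative, not values from this patch. The first call creates a control socket at the given path, and later calls reuse it:

```bash
# First call opens the connection and creates the control (master) socket.
ssh -o ControlMaster=auto -o ControlPath=/tmp/%r@%h:%p.ssh root@vm-host uname -a
# Subsequent calls multiplex over the existing socket instead of reconnecting.
ssh -o ControlMaster=auto -o ControlPath=/tmp/%r@%h:%p.ssh root@vm-host uptime
```

The control path is a unix domain socket, and such sockets cannot be created on an sshfs (FUSE/SFTP) mount, which is why the path has to point at a local directory like /tmp.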
-z "$1" ]]; then + pkill --signal $1 -F $nvmf_dir/nvmf_tgt.pid + else + pkill -F $nvmf_dir/nvmf_tgt.pid + fi + rm -f $nvmf_dir/nvmf_tgt.pid +} + +function host1_cleanup_vhost() +{ + trap 'host1_cleanup_nvmf SIGKILL; error_exit "${FUNCNAME}" "${LINENO}"' INT ERR EXIT + notice "Shutting down VM $incoming_vm" + vm_kill $incoming_vm + + notice "Removing bdev & controller from vhost on local server" + $rpc_0 delete_bdev Nvme0n1 + $rpc_0 remove_vhost_controller $incoming_vm_ctrlr + + notice "Shutting down vhost app" + spdk_vhost_kill 0 + + host1_cleanup_nvmf +} + +function host1_start_nvmf() +{ + nvmf_dir="$TEST_DIR/nvmf_tgt" + rpc_nvmf="python $SPDK_BUILD_DIR/scripts/rpc.py -s $nvmf_dir/nvmf_rpc.sock" + + notice "Starting nvmf_tgt instance on local server" + mkdir -p $nvmf_dir + rm -rf $nvmf_dir/* + + cp $SPDK_BUILD_DIR/test/nvmf/nvmf.conf $nvmf_dir/nvmf.conf + $SPDK_BUILD_DIR/scripts/gen_nvme.sh >> $nvmf_dir/nvmf.conf + + trap 'host1_cleanup_nvmf SIGKILL; error_exit "${FUNCNAME}" "${LINENO}"' INT ERR EXIT + $SPDK_BUILD_DIR/app/nvmf_tgt/nvmf_tgt -s 512 -c $nvmf_dir/nvmf.conf -r $nvmf_dir/nvmf_rpc.sock & + nvmf_tgt_pid=$! + echo $nvmf_tgt_pid > $nvmf_dir/nvmf_tgt.pid + waitforlisten "$nvmf_tgt_pid" "$nvmf_dir/nvmf_rpc.sock" + + $rpc_nvmf construct_nvmf_subsystem nqn.2018-02.io.spdk:cnode1 \ + "trtype:RDMA traddr:$RDMA_TARGET_IP trsvcid:4420" "" -a -s SPDK01 -n Nvme0n1 +} + +function host1_start_vhost() +{ + rpc_0="python $SPDK_BUILD_DIR/scripts/rpc.py -s $(get_vhost_dir 0)/rpc.sock" + + notice "Starting vhost0 instance on local server" + trap 'host1_cleanup_vhost; error_exit "${FUNCNAME}" "${LINENO}"' INT ERR EXIT + spdk_vhost_run --conf-path=$BASE_DIR --vhost-num=0 + $rpc_0 construct_nvme_bdev -b Nvme0 -t rdma -f ipv4 -a $RDMA_TARGET_IP -s 4420 -n "nqn.2018-02.io.spdk:cnode1" + $rpc_0 construct_vhost_scsi_controller $incoming_vm_ctrlr + $rpc_0 add_vhost_scsi_lun $incoming_vm_ctrlr 0 Nvme0n1 + + vm_setup --os="$share_dir/migration.qcow2" --force=$incoming_vm --disk-type=spdk_vhost_scsi --disks=VhostScsi0 \ + --migrate-to=$target_vm --memory=512 --queue_num=1 + + # TODO: Fix loop calculating cpu_num in common.sh + # We need -smp 1 and -queue_num 1 for this test to work, and this loop + # in some cases calculates wrong cpu_num. + sed -i "s#smp 2#smp 1#g" $VM_BASE_DIR/$incoming_vm/run.sh + vm_run $incoming_vm + vm_wait_for_boot 300 $incoming_vm +} + +function cleanup_share() +{ + set +e + notice "Cleaning up share directory on remote and local server" + ssh_remote $MGMT_INITIATOR_IP "umount $VM_BASE_DIR" + ssh_remote $MGMT_INITIATOR_IP "umount $share_dir; rm -f $share_dir/*" + rm -f $share_dir/migration.qcow2 + rm -f $share_dir/spdk.tar.gz + set -e +} + +function host_1_create_share() +{ + notice "Creating share directory on local server to re-use on remote" + mkdir -p $share_dir + mkdir -p $VM_BASE_DIR # This dir would've been created later but we need it now + rm -rf $share_dir/spdk.tar.gz $share_dir/spdk || true + cp $os_image $share_dir/migration.qcow2 + tar --exclude="*.o"--exclude="*.d" --exclude="*.git" -C $SPDK_BUILD_DIR -zcf $share_dir/spdk.tar.gz . +} + +function host_2_create_share() +{ + # Copy & compile the sources for later use on remote server. 
diff --git a/test/vhost/migration/migration-tc3a.sh b/test/vhost/migration/migration-tc3a.sh
new file mode 100644
index 000000000..77dc56222
--- /dev/null
+++ b/test/vhost/migration/migration-tc3a.sh
@@ -0,0 +1,206 @@
+source $SPDK_BUILD_DIR/test/nvmf/common.sh
+source $BASE_DIR/autotest.config
+
+MGMT_TARGET_IP="10.102.17.181"
+MGMT_INITIATOR_IP="10.102.17.180"
+RDMA_TARGET_IP="10.0.0.1"
+RDMA_INITIATOR_IP="10.0.0.2"
+incoming_vm=1
+target_vm=2
+incoming_vm_ctrlr=naa.VhostScsi0.$incoming_vm
+target_vm_ctrlr=naa.VhostScsi0.$target_vm
+share_dir=$TEST_DIR/share
+job_file=$BASE_DIR/migration-tc3.job
+
+function ssh_remote()
+{
+	local ssh_cmd="ssh -i $SPDK_VHOST_SSH_KEY_FILE \
+		-o UserKnownHostsFile=/dev/null \
+		-o StrictHostKeyChecking=no -o ControlMaster=auto \
+		root@$1"
+
+	shift
+	$ssh_cmd "$@"
+}
+
+function wait_for_remote()
+{
+	local timeout=40
+	set +x
+	while [[ ! -f $share_dir/DONE ]]; do
+		echo -n "."
+		if (( timeout-- == 0 )); then
+			error "timeout while waiting for the remote host!"
+		fi
+		sleep 1
+	done
+	set -x
+	rm -f $share_dir/DONE
+}
+
+function check_rdma_connection()
+{
+	local nic_name=$(ip -4 -o addr show to $RDMA_TARGET_IP up | cut -d' ' -f2)
+	if [[ -z $nic_name ]]; then
+		error "There is no NIC with IP address $RDMA_TARGET_IP configured"
+	fi
+
+	if ! ls /sys/class/infiniband/*/device/net/$nic_name &> /dev/null; then
+		error "$nic_name with IP $RDMA_TARGET_IP is not an RDMA-capable NIC"
+	fi
+}
+
+function host1_cleanup_nvmf()
+{
+	notice "Shutting down nvmf_tgt on local server"
+	if [[ -n "$1" ]]; then
+		pkill --signal $1 -F $nvmf_dir/nvmf_tgt.pid
+	else
+		pkill -F $nvmf_dir/nvmf_tgt.pid
+	fi
+	rm -f $nvmf_dir/nvmf_tgt.pid
+}
+
+function host1_cleanup_vhost()
+{
+	trap 'host1_cleanup_nvmf SIGKILL; error_exit "${FUNCNAME}" "${LINENO}"' INT ERR EXIT
+	notice "Shutting down VM $incoming_vm"
+	vm_kill $incoming_vm
+
+	notice "Removing bdev & controller from vhost on local server"
+	$rpc_0 delete_bdev Nvme0n1
+	$rpc_0 remove_vhost_controller $incoming_vm_ctrlr
+
+	notice "Shutting down vhost app"
+	spdk_vhost_kill 0
+
+	host1_cleanup_nvmf
+}
+
+function host1_start_nvmf()
+{
+	nvmf_dir="$TEST_DIR/nvmf_tgt"
+	rpc_nvmf="python $SPDK_BUILD_DIR/scripts/rpc.py -s $nvmf_dir/nvmf_rpc.sock"
+
+	notice "Starting nvmf_tgt instance on local server"
+	mkdir -p $nvmf_dir
+	rm -rf $nvmf_dir/*
+
+	cp $SPDK_BUILD_DIR/test/nvmf/nvmf.conf $nvmf_dir/nvmf.conf
+	$SPDK_BUILD_DIR/scripts/gen_nvme.sh >> $nvmf_dir/nvmf.conf
+
+	trap 'host1_cleanup_nvmf SIGKILL; error_exit "${FUNCNAME}" "${LINENO}"' INT ERR EXIT
+	$SPDK_BUILD_DIR/app/nvmf_tgt/nvmf_tgt -s 512 -c $nvmf_dir/nvmf.conf -r $nvmf_dir/nvmf_rpc.sock &
+	nvmf_tgt_pid=$!
+	echo $nvmf_tgt_pid > $nvmf_dir/nvmf_tgt.pid
+	waitforlisten "$nvmf_tgt_pid" "$nvmf_dir/nvmf_rpc.sock"
+
+	$rpc_nvmf construct_nvmf_subsystem nqn.2018-02.io.spdk:cnode1 \
+		"trtype:RDMA traddr:$RDMA_TARGET_IP trsvcid:4420" "" -a -s SPDK01 -n Nvme0n1
+}
+
+function host1_start_vhost()
+{
+	rpc_0="python $SPDK_BUILD_DIR/scripts/rpc.py -s $(get_vhost_dir 0)/rpc.sock"
+
+	notice "Starting vhost0 instance on local server"
+	trap 'host1_cleanup_vhost; error_exit "${FUNCNAME}" "${LINENO}"' INT ERR EXIT
+	spdk_vhost_run --conf-path=$BASE_DIR --vhost-num=0
+	$rpc_0 construct_nvme_bdev -b Nvme0 -t rdma -f ipv4 -a $RDMA_TARGET_IP -s 4420 -n "nqn.2018-02.io.spdk:cnode1"
+	$rpc_0 construct_vhost_scsi_controller $incoming_vm_ctrlr
+	$rpc_0 add_vhost_scsi_lun $incoming_vm_ctrlr 0 Nvme0n1
+
+	vm_setup --os="$share_dir/migration.qcow2" --force=$incoming_vm --disk-type=spdk_vhost_scsi --disks=VhostScsi0 \
+		--migrate-to=$target_vm --memory=512 --queue_num=1
+
+	# TODO: Fix the loop calculating cpu_num in common.sh.
+	# We need -smp 1 and -queue_num 1 for this test to work, and in some
+	# cases that loop calculates a wrong cpu_num.
+	sed -i "s#smp 2#smp 1#g" $VM_BASE_DIR/$incoming_vm/run.sh
+	vm_run $incoming_vm
+	vm_wait_for_boot 300 $incoming_vm
+}
+
+function cleanup_share()
+{
+	set +e
+	notice "Cleaning up share directory on remote and local server"
+	ssh_remote $MGMT_INITIATOR_IP "umount $VM_BASE_DIR"
+	ssh_remote $MGMT_INITIATOR_IP "umount $share_dir; rm -f $share_dir/*"
+	rm -f $share_dir/migration.qcow2
+	rm -f $share_dir/spdk.tar.gz
+	set -e
+}
+
+function host_1_create_share()
+{
+	notice "Creating share directory on local server to re-use on remote"
+	mkdir -p $share_dir
+	mkdir -p $VM_BASE_DIR # This dir would've been created later but we need it now
+	rm -rf $share_dir/spdk.tar.gz $share_dir/spdk || true
+	cp $os_image $share_dir/migration.qcow2
+	tar --exclude="*.o" --exclude="*.d" --exclude="*.git" -C $SPDK_BUILD_DIR -zcf $share_dir/spdk.tar.gz .
+}
+
+function host_2_create_share()
+{
+	# Copy & compile the sources for later use on the remote server.
+	ssh_remote $MGMT_INITIATOR_IP "uname -a"
+	ssh_remote $MGMT_INITIATOR_IP "mkdir -p $share_dir"
+	ssh_remote $MGMT_INITIATOR_IP "mkdir -p $VM_BASE_DIR"
+	ssh_remote $MGMT_INITIATOR_IP "sshfs -o ssh_command=\"ssh -i $SPDK_VHOST_SSH_KEY_FILE\" root@$MGMT_TARGET_IP:$VM_BASE_DIR $VM_BASE_DIR"
+	ssh_remote $MGMT_INITIATOR_IP "sshfs -o ssh_command=\"ssh -i $SPDK_VHOST_SSH_KEY_FILE\" root@$MGMT_TARGET_IP:$share_dir $share_dir"
+	ssh_remote $MGMT_INITIATOR_IP "mkdir -p $share_dir/spdk"
+	ssh_remote $MGMT_INITIATOR_IP "tar -zxf $share_dir/spdk.tar.gz -C $share_dir/spdk --strip-components=1"
+	ssh_remote $MGMT_INITIATOR_IP "cd $share_dir/spdk; make clean; ./configure --with-rdma --enable-debug; make -j40"
+}
+
+function host_2_start_vhost()
+{
+	ssh_remote $MGMT_INITIATOR_IP "nohup $share_dir/spdk/test/vhost/migration/migration.sh --test-cases=3b --work-dir=$TEST_DIR --os=$share_dir/migration.qcow2 &>$share_dir/output.log &"
+	notice "Waiting for remote to be done with vhost & VM setup..."
+	wait_for_remote
+}
+
+function setup_share()
+{
+	trap 'cleanup_share; error_exit "${FUNCNAME}" "${LINENO}"' INT ERR EXIT
+	host_1_create_share
+	host_2_create_share
+}
+
+function migration_tc3()
+{
+	check_rdma_connection
+	setup_share
+	host1_start_nvmf
+	host1_start_vhost
+	host_2_start_vhost
+
+	# Do migration
+	notice "Starting fio on local VM"
+	vm_check_scsi_location $incoming_vm
+
+	run_fio $fio_bin --job-file="$job_file" --local --vm="${incoming_vm}$(printf ':/dev/%s' $SCSI_DISK)"
+	sleep 5
+
+	if ! is_fio_running $incoming_vm; then
+		vm_ssh $incoming_vm "cat /root/$(basename ${job_file}).out"
+		error "Fio not running on local VM before starting migration!"
+	fi
+
+	vm_migrate $incoming_vm $RDMA_INITIATOR_IP
+	sleep 1
+
+	# Verify migration on remote host and clean up vhost
+	ssh_remote $MGMT_INITIATOR_IP "pkill -CONT -F $TEST_DIR/tc3b.pid"
+	notice "Waiting for remote to finish FIO on VM and clean up..."
+	wait_for_remote
+
+	# Clean up local stuff
+	host1_cleanup_vhost
+	cleanup_share
+}
+
+migration_tc3
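The `pkill -CONT -F $TEST_DIR/tc3b.pid` call above resumes migration-tc3b.sh, which stops itself after setting up the target vhost. The suspend/resume handshake between the two scripts reduces to the sketch below (PID file path is illustrative; the two halves run on different hosts):

```bash
# Remote side (as in migration-tc3b.sh): publish PID, set up, then stop itself.
set -m                          # job control is required for the "suspend" builtin
echo $$ > /tmp/sketch.pid       # let the other host know which PID to resume
suspend -f                      # stop here (SIGSTOP) until someone resumes us

# Local side (as in migration-tc3a.sh): wake the remote script up again.
pkill -CONT -F /tmp/sketch.pid  # SIGCONT makes it continue right after "suspend -f"
```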
diff --git a/test/vhost/migration/migration-tc3b.sh b/test/vhost/migration/migration-tc3b.sh
new file mode 100755
index 000000000..79e21c541
--- /dev/null
+++ b/test/vhost/migration/migration-tc3b.sh
@@ -0,0 +1,80 @@
+# The "set -m" option is needed to be able to use the "suspend" command,
+# as we are using a non-interactive session to connect to the remote host.
+# Without -m it would not be possible to suspend the process.
+set -m
+source $BASE_DIR/autotest.config
+
+RDMA_TARGET_IP="10.0.0.1"
+incoming_vm=1
+target_vm=2
+target_vm_ctrl=naa.VhostScsi0.$target_vm
+rpc="python $SPDK_BUILD_DIR/scripts/rpc.py -s $(get_vhost_dir 1)/rpc.sock"
+share_dir=$TEST_DIR/share
+
+function host_2_cleanup_vhost()
+{
+	notice "Shutting down VM $target_vm"
+	vm_kill $target_vm
+
+	notice "Removing bdev & controller from vhost 1 on remote server"
+	$rpc delete_bdev Nvme0n1
+	$rpc remove_vhost_controller $target_vm_ctrl
+
+	notice "Shutting down vhost app"
+	spdk_vhost_kill 1
+	sleep 1
+}
+
+function host_2_start_vhost()
+{
+	echo "BASE DIR $TEST_DIR"
+	vhost_work_dir=$TEST_DIR/vhost1
+	mkdir -p $vhost_work_dir
+	rm -f $vhost_work_dir/*
+
+	notice "Starting vhost 1 instance on remote server"
+	trap 'host_2_cleanup_vhost; error_exit "${FUNCNAME}" "${LINENO}"' INT ERR EXIT
+	spdk_vhost_run --conf-path=$BASE_DIR --vhost-num=1
+
+	$rpc construct_nvme_bdev -b Nvme0 -t rdma -f ipv4 -a $RDMA_TARGET_IP -s 4420 -n "nqn.2018-02.io.spdk:cnode1"
+	$rpc construct_vhost_scsi_controller $target_vm_ctrl
+	$rpc add_vhost_scsi_lun $target_vm_ctrl 0 Nvme0n1
+
+	vm_setup --os="$os_image" --force=$target_vm --disk-type=spdk_vhost_scsi --disks=VhostScsi0 \
+		--memory=512 --vhost-num=1 --incoming=$incoming_vm
+	vm_run $target_vm
+	sleep 1
+
+	# Use this file as a flag to notify the main script
+	# that setup on the remote server is done.
+	echo "DONE" > $share_dir/DONE
+}
+
+echo $$ > $TEST_DIR/tc3b.pid
+host_2_start_vhost
+suspend -f
+
+if ! vm_os_booted $target_vm; then
+	fail "VM$target_vm is not running!"
+fi
+
+if ! is_fio_running $target_vm; then
+	vm_ssh $target_vm "cat /root/migration-tc3.job.out"
+	error "FIO is not running on remote server after migration!"
+fi
+
+notice "Waiting for FIO to finish on remote server VM"
+timeout=40
+while is_fio_running $target_vm; do
+	sleep 1
+	echo -n "."
+	if (( timeout-- == 0 )); then
+		error "timeout while waiting for FIO!"
+	fi
+done
+
+notice "FIO result after migration:"
+vm_ssh $target_vm "cat /root/migration-tc3.job.out"
+
+host_2_cleanup_vhost
+echo "DONE" > $share_dir/DONE
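Besides the suspend/resume signal, the two scripts also synchronize through a DONE flag file on the shared directory: migration-tc3b.sh writes it after setup and again after cleanup, while wait_for_remote in migration-tc3a.sh polls for it and then consumes it. The handshake boils down to this sketch (share path is illustrative):

```bash
share=/mnt/share

# Writer (remote host) signals that a phase is finished:
echo "DONE" > $share/DONE

# Reader (local host) waits for the signal with a timeout, then consumes it:
timeout=40
while [[ ! -f $share/DONE ]]; do
	sleep 1
	if (( timeout-- == 0 )); then
		echo "timed out waiting for remote" >&2
		exit 1
	fi
done
rm -f $share/DONE
```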
diff --git a/test/vhost/migration/migration.sh b/test/vhost/migration/migration.sh
index 719625c05..5fb8cdd00 100755
--- a/test/vhost/migration/migration.sh
+++ b/test/vhost/migration/migration.sh
@@ -70,6 +70,11 @@ function vm_migrate()
 	local target_vm_dir="$(readlink -e $from_vm_dir/vm_migrate_to)"
 	local target_vm="$(basename $target_vm_dir)"
 	local target_vm_migration_port="$(cat $target_vm_dir/migration_port)"
+	if [[ -n "$2" ]]; then
+		local target_ip=$2
+	else
+		local target_ip="127.0.0.1"
+	fi
 
 	# Sanity check if target VM (QEMU) is configured to accept source VM (QEMU) migration
 	if [[ "$(readlink -e ${target_vm_dir}/vm_incoming)" != "$(readlink -e ${from_vm_dir})" ]]; then
@@ -80,7 +85,7 @@ function vm_migrate()
 	notice "Migrating VM $1 to VM "$(basename $target_vm_dir)
 	echo -e \
 		"migrate_set_speed 1g\n" \
-		"migrate tcp:127.0.0.1:$target_vm_migration_port\n" \
+		"migrate tcp:$target_ip:$target_vm_migration_port\n" \
 		"info migrate\n" \
 		"quit" | vm_monitor_send $1 "$from_vm_dir/migration_result"
 
@@ -90,9 +95,14 @@ function vm_migrate()
 		fail "Migration failed:\n"
 	fi
 
-	if ! vm_os_booted $target_vm; then
-		fail "VM$target_vm is not running"
-		cat $target_vm $target_vm_dir/cont_result
+	# Don't perform the following check if the target VM is on a remote server,
+	# as we won't have access to it.
+	# If you need this check, perform it on your own.
+	if [[ "$target_ip" == "127.0.0.1" ]]; then
+		if ! vm_os_booted $target_vm; then
+			fail "VM$target_vm is not running"
+			cat $target_vm $target_vm_dir/cont_result
+		fi
 	fi
 
 	notice "Migration complete"
@@ -114,9 +124,6 @@ function is_fio_running()
 	return $ret
 }
 
-# FIXME: this shoul'd not be needed
-vm_kill_all
-
 for test_case in ${test_cases//,/ }; do
 	assert_number "$test_case"
 	notice "==============================="
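With the new optional second argument, vm_migrate keeps its old single-host behavior and gains the cross-host case. A short usage sketch (the VM number and address mirror test case 3 but are otherwise illustrative):

```bash
vm_migrate 1            # local migration; monitor sends "migrate tcp:127.0.0.1:<port>"
vm_migrate 1 10.0.0.2   # cross-host migration to the QEMU listening on the remote RDMA IP
```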