diff --git a/lib/nvme/nvme_rdma.c b/lib/nvme/nvme_rdma.c
index 57fc8a738..fc0119d4a 100644
--- a/lib/nvme/nvme_rdma.c
+++ b/lib/nvme/nvme_rdma.c
@@ -244,6 +244,7 @@ struct nvme_rdma_qpair {
 	bool					in_connect_poll;
 
 	uint8_t					stale_conn_retry_count;
+	bool					need_destroy;
 };
 
 enum NVME_RDMA_COMPLETION_FLAGS {
@@ -509,6 +510,7 @@ nvme_rdma_qpair_process_cm_event(struct nvme_rdma_qpair *rqpair)
 		break;
 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
 		rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL;
+		rqpair->need_destroy = true;
 		break;
 	case RDMA_CM_EVENT_MULTICAST_JOIN:
 	case RDMA_CM_EVENT_MULTICAST_ERROR:
@@ -1889,9 +1891,6 @@ nvme_rdma_qpair_destroy(struct nvme_rdma_qpair *rqpair)
 			spdk_rdma_qp_destroy(rqpair->rdma_qp);
 			rqpair->rdma_qp = NULL;
 		}
-
-		rdma_destroy_id(rqpair->cm_id);
-		rqpair->cm_id = NULL;
 	}
 
 	if (rqpair->poller) {
@@ -1916,6 +1915,12 @@
 	nvme_rdma_free_reqs(rqpair);
 	nvme_rdma_free_rsps(rqpair->rsps);
 	rqpair->rsps = NULL;
+
+	/* destroy cm_id last so cma device will not be freed before we destroy the cq. */
+	if (rqpair->cm_id) {
+		rdma_destroy_id(rqpair->cm_id);
+		rqpair->cm_id = NULL;
+	}
 }
 
 static void nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);
@@ -1941,8 +1946,9 @@ nvme_rdma_qpair_disconnected(struct nvme_rdma_qpair *rqpair, int ret)
 		goto quiet;
 	}
 
-	if (rqpair->current_num_sends != 0 ||
-	    (!rqpair->srq && rqpair->rsps->current_num_recvs != 0)) {
+	if (rqpair->need_destroy ||
+	    (rqpair->current_num_sends != 0 ||
+	     (!rqpair->srq && rqpair->rsps->current_num_recvs != 0))) {
 		rqpair->state = NVME_RDMA_QPAIR_STATE_LINGERING;
 		rqpair->evt_timeout_ticks = (NVME_RDMA_DISCONNECTED_QPAIR_TIMEOUT_US * spdk_get_ticks_hz()) /
 					    SPDK_SEC_TO_USEC + spdk_get_ticks();
@@ -2570,13 +2576,13 @@ nvme_rdma_process_send_completion(struct nvme_rdma_poller *poller,
 	struct spdk_nvme_rdma_req *rdma_req;
 
 	rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvme_rdma_req, rdma_wr);
+	rqpair = rdma_req->req ? nvme_rdma_qpair(rdma_req->req->qpair) : NULL;
+	if (!rqpair) {
+		rqpair = rdma_qpair != NULL ? rdma_qpair : get_rdma_qpair_from_wc(poller->group, wc);
+	}
 
 	/* If we are flushing I/O */
 	if (wc->status) {
-		rqpair = rdma_req->req ? nvme_rdma_qpair(rdma_req->req->qpair) : NULL;
-		if (!rqpair) {
-			rqpair = rdma_qpair != NULL ? rdma_qpair : get_rdma_qpair_from_wc(poller->group, wc);
-		}
 		if (!rqpair) {
 			/* When poll_group is used, several qpairs share the same CQ and it is possible to
 			 * receive a completion with error (e.g. IBV_WC_WR_FLUSH_ERR) for already disconnected qpair
@@ -2598,9 +2604,19 @@
 	/* We do not support Soft Roce anymore. Other than Soft Roce's bug, we should not
 	 * receive a completion without error status after qpair is disconnected/destroyed.
 	 */
-	assert(rdma_req->req != NULL);
+	if (spdk_unlikely(rdma_req->req == NULL)) {
+		/*
+		 * Some infiniband drivers do not guarantee the previous assumption after we
+		 * received a RDMA_CM_EVENT_DEVICE_REMOVAL event.
+		 */
+		SPDK_ERRLOG("Received malformed completion: request 0x%"PRIx64" type %d\n", wc->wr_id,
+			    rdma_wr->type);
+		if (!rqpair || !rqpair->need_destroy) {
+			assert(0);
+		}
+		return -ENXIO;
+	}
 
-	rqpair = nvme_rdma_qpair(rdma_req->req->qpair);
 	rdma_req->completion_flags |= NVME_RDMA_SEND_COMPLETED;
 	assert(rqpair->current_num_sends > 0);
 	rqpair->current_num_sends--;
diff --git a/test/nvmf/target/device_removal.sh b/test/nvmf/target/device_removal.sh
index 7ce9d1049..18d7dce19 100755
--- a/test/nvmf/target/device_removal.sh
+++ b/test/nvmf/target/device_removal.sh
@@ -10,6 +10,11 @@ source $rootdir/test/setup/common.sh
 source $rootdir/test/common/autotest_common.sh
 source $rootdir/test/nvmf/common.sh
 
+tgt_core_mask='0x3'
+bdevperf_core_mask='0x4'
+bdevperf_rpc_sock=/var/tmp/bdevperf.sock
+bdevperf_rpc_pid=-1
+
 nvmftestinit
 
 function get_subsystem_nqn() {
@@ -33,15 +38,6 @@ function create_subsystem_and_connect_on_netdev() {
 	$rpc_py nvmf_subsystem_add_ns $nqn $malloc_name
 	$rpc_py nvmf_subsystem_add_listener $nqn -t $TEST_TRANSPORT -a $ip -s $NVMF_PORT
 
-	if ! nvme connect -t $TEST_TRANSPORT -n $nqn -a $ip -s $NVMF_PORT; then
-		exit 1
-	fi
-
-	waitforserial "$serial"
-	nvme_name=$(lsblk -l -o NAME,SERIAL | grep -oP "([\w]*)(?=\s+${serial})")
-	nvme_size=$(sec_size_to_bytes $nvme_name)
-
-	echo "${nvme_name}"
 	return 0
 }
 
@@ -87,16 +83,56 @@ function get_rdma_dev_count_in_nvmf_tgt() {
 	$rpc_py nvmf_get_stats | jq -r '.poll_groups[0].transports[].devices | length'
 }
 
+function generate_io_traffic_with_bdevperf() {
+	local dev_names=("$@")
+
+	mkdir -p $testdir
+	$rootdir/build/examples/bdevperf -m $bdevperf_core_mask -z -r $bdevperf_rpc_sock -q 128 -o 4096 -w verify -t 90 &> $testdir/try.txt &
+	bdevperf_pid=$!
+
+	trap 'process_shm --id $NVMF_APP_SHM_ID; cat $testdir/try.txt; rm -f $testdir/try.txt; kill -9 $bdevperf_pid; nvmftestfini; exit 1' SIGINT SIGTERM EXIT
+	waitforlisten $bdevperf_pid $bdevperf_rpc_sock
+
+	# Create a controller and set multipath behavior
+	# bdev_retry_count is set to -1 means infinite reconnects
+	$rpc_py -s $bdevperf_rpc_sock bdev_nvme_set_options -r -1
+
+	for dev_name in "${dev_names[@]}"; do
+		nqn=$(get_subsystem_nqn $dev_name)
+		tgt_ip=$(get_ip_address "$dev_name")
+
+		# -l -1 ctrlr_loss_timeout_sec -1 means infinite reconnects
+		# -o 1 reconnect_delay_sec time to delay a reconnect retry is limited to 1 sec
+		$rpc_py -s $bdevperf_rpc_sock bdev_nvme_attach_controller -b Nvme_$dev_name -t $TEST_TRANSPORT -a $tgt_ip -s $NVMF_PORT -f ipv4 -n $nqn -l -1 -o 1
+	done
+
+	$rootdir/examples/bdev/bdevperf/bdevperf.py -t 120 -s $bdevperf_rpc_sock perform_tests &
+	bdevperf_rpc_pid=$!
+
+	sleep 5
+}
+
+function stop_bdevperf() {
+	wait $bdevperf_rpc_pid
+
+	# NOTE: rdma-core <= v43.0 has memleak bug (fixed in commit 7720071f).
+	killprocess $bdevperf_pid || true
+	bdevperf_pid=
+
+	cat $testdir/try.txt
+
+	trap - SIGINT SIGTERM EXIT
+	rm -f $testdir/try.txt
+}
+
 function test_remove_and_rescan() {
-	nvmfappstart -m 0xF
+	nvmfappstart -m "$tgt_core_mask"
 
 	create_subsystem_and_connect "$@"
 
-	for net_dev in "${!netdev_nvme_dict[@]}"; do
-		$rootdir/scripts/fio-wrapper -p nvmf -i 4096 -d 1 -t randrw -r 40 &
-		fio_pid=$!
-		sleep 3
+	generate_io_traffic_with_bdevperf "${!netdev_nvme_dict[@]}"
 
+	for net_dev in "${!netdev_nvme_dict[@]}"; do
 		nvme_dev=${netdev_nvme_dict[$net_dev]}
 		rdma_dev_name=$(get_rdma_device_name $net_dev)
 		origin_ip=$(get_ip_address "$net_dev")
@@ -162,6 +198,8 @@ function test_remove_and_rescan() {
 		done
 	done
 
+	stop_bdevperf
+
 	# NOTE: rdma-core <= v43.0 has memleak bug (fixed in commit 7720071f).
 	killprocess $nvmfpid || true
 	nvmfpid=
@@ -229,7 +267,7 @@ function test_bonding_slaves_on_nics() {
 	# wait ib driver activated on bond device
 	sleep 5
 
-	nvmfappstart -m 0xF
+	nvmfappstart -m "$tgt_core_mask"
 
 	$rpc_py nvmf_create_transport $NVMF_TRANSPORT_OPTS -u 8192
 	create_subsystem_and_connect_on_netdev $BOND_NAME
@@ -237,8 +275,7 @@
 	ib_count=$(get_rdma_dev_count_in_nvmf_tgt)
 	echo "IB Count: " $ib_count
 
-	$rootdir/scripts/fio-wrapper -p nvmf -i 4096 -d 1 -t randrw -r 10 &
-	fio_pid=$!
+	generate_io_traffic_with_bdevperf $BOND_NAME
 
 	sleep 2
 	echo -$nic1 | sudo tee /sys/class/net/${BOND_NAME}/bonding/slaves
@@ -257,6 +294,8 @@ function test_bonding_slaves_on_nics() {
 		exit 1
 	fi
 
+	stop_bdevperf
+
 	# NOTE: rdma-core <= v43.0 has memleak bug (fixed in commit 7720071f).
 	killprocess $nvmfpid || true
 	nvmfpid=