nvme_rdma: handle DEVICE_REMOVAL event in RDMA initiator

When IBV_EVENT_DEVICE_FATAL and RDMA_CM_EVENT_DEVICE_REMOVAL occur,
destroy the qpair immediately and do not assume that no successful WQE
completions will be received after rdma_disconnect.

Signed-off-by: sijie.sun <sijie.sun@smartx.com>
Change-Id: I23e44dd32c8adea301e5251659b1be519f5dfdf7
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16314
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Community-CI: Mellanox Build Bot
Author:     sijie.sun
Date:       2023-01-17 03:35:43 +00:00
Committed:  Tomasz Zawadzki
Commit:     e44d631724 (parent 549be9ad81)
2 changed files with 83 additions and 28 deletions
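
For context before the diff: the core of the change is to flag a qpair for immediate destruction when the RDMA CM reports device removal, to tolerate completions that may still arrive (even with success status) afterwards, and to destroy the cm_id only after the CQ is no longer needed. The sketch below is a minimal illustration of that pattern against the plain librdmacm/libibverbs API; it is not SPDK code, and the struct and helper names (my_qpair, handle_cm_event, destroy_qpair) are hypothetical.

/* Minimal sketch, not SPDK code: only the librdmacm/libibverbs calls are
 * real; the types and helpers are hypothetical. */
#include <stdbool.h>
#include <rdma/rdma_cma.h>
#include <infiniband/verbs.h>

struct my_qpair {
    struct rdma_cm_id *cm_id;
    struct ibv_cq *cq;
    bool need_destroy;    /* set when the device is being removed */
};

/* Called for each event fetched with rdma_get_cm_event(). */
static void
handle_cm_event(struct my_qpair *qp, struct rdma_cm_event *event)
{
    if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
        /* The device is going away: tear the qpair down as soon as possible,
         * and do not assume that only flush errors will arrive after
         * rdma_disconnect(); some drivers still deliver successful
         * completions. */
        qp->need_destroy = true;
    }
    rdma_ack_cm_event(event);
}

/* Tear-down order matters: destroy the QP and CQ first, the cm_id last,
 * so the CMA device is not released while the CQ is still in use. */
static void
destroy_qpair(struct my_qpair *qp)
{
    if (qp->cm_id != NULL && qp->cm_id->qp != NULL) {
        rdma_destroy_qp(qp->cm_id);
    }
    if (qp->cq != NULL) {
        ibv_destroy_cq(qp->cq);
        qp->cq = NULL;
    }
    if (qp->cm_id != NULL) {
        rdma_destroy_id(qp->cm_id);
        qp->cm_id = NULL;
    }
}

In the diff that follows, the same idea shows up as the need_destroy flag on struct nvme_rdma_qpair, the relaxed assertion in the send-completion path, and the deferral of rdma_destroy_id() to the end of nvme_rdma_qpair_destroy().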


@@ -244,6 +244,7 @@ struct nvme_rdma_qpair {
 	bool in_connect_poll;
 	uint8_t stale_conn_retry_count;
+	bool need_destroy;
 };
 
 enum NVME_RDMA_COMPLETION_FLAGS {
@@ -509,6 +510,7 @@ nvme_rdma_qpair_process_cm_event(struct nvme_rdma_qpair *rqpair)
 		break;
 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
 		rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL;
+		rqpair->need_destroy = true;
 		break;
 	case RDMA_CM_EVENT_MULTICAST_JOIN:
 	case RDMA_CM_EVENT_MULTICAST_ERROR:
@@ -1889,9 +1891,6 @@ nvme_rdma_qpair_destroy(struct nvme_rdma_qpair *rqpair)
 			spdk_rdma_qp_destroy(rqpair->rdma_qp);
 			rqpair->rdma_qp = NULL;
 		}
-
-		rdma_destroy_id(rqpair->cm_id);
-		rqpair->cm_id = NULL;
 	}
 
 	if (rqpair->poller) {
@@ -1916,6 +1915,12 @@ nvme_rdma_qpair_destroy(struct nvme_rdma_qpair *rqpair)
 	nvme_rdma_free_reqs(rqpair);
 	nvme_rdma_free_rsps(rqpair->rsps);
 	rqpair->rsps = NULL;
+
+	/* destroy cm_id last so cma device will not be freed before we destroy the cq. */
+	if (rqpair->cm_id) {
+		rdma_destroy_id(rqpair->cm_id);
+		rqpair->cm_id = NULL;
+	}
 }
 
 static void nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);
@@ -1941,8 +1946,9 @@ nvme_rdma_qpair_disconnected(struct nvme_rdma_qpair *rqpair, int ret)
 		goto quiet;
 	}
 
-	if (rqpair->current_num_sends != 0 ||
-	    (!rqpair->srq && rqpair->rsps->current_num_recvs != 0)) {
+	if (rqpair->need_destroy ||
+	    (rqpair->current_num_sends != 0 ||
+	     (!rqpair->srq && rqpair->rsps->current_num_recvs != 0))) {
 		rqpair->state = NVME_RDMA_QPAIR_STATE_LINGERING;
 		rqpair->evt_timeout_ticks = (NVME_RDMA_DISCONNECTED_QPAIR_TIMEOUT_US * spdk_get_ticks_hz()) /
 					    SPDK_SEC_TO_USEC + spdk_get_ticks();
@@ -2570,13 +2576,13 @@ nvme_rdma_process_send_completion(struct nvme_rdma_poller *poller,
 	struct spdk_nvme_rdma_req *rdma_req;
 
 	rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvme_rdma_req, rdma_wr);
+	rqpair = rdma_req->req ? nvme_rdma_qpair(rdma_req->req->qpair) : NULL;
+	if (!rqpair) {
+		rqpair = rdma_qpair != NULL ? rdma_qpair : get_rdma_qpair_from_wc(poller->group, wc);
+	}
 
 	/* If we are flushing I/O */
 	if (wc->status) {
-		rqpair = rdma_req->req ? nvme_rdma_qpair(rdma_req->req->qpair) : NULL;
-		if (!rqpair) {
-			rqpair = rdma_qpair != NULL ? rdma_qpair : get_rdma_qpair_from_wc(poller->group, wc);
-		}
 		if (!rqpair) {
 			/* When poll_group is used, several qpairs share the same CQ and it is possible to
 			 * receive a completion with error (e.g. IBV_WC_WR_FLUSH_ERR) for already disconnected qpair
@@ -2598,9 +2604,19 @@ nvme_rdma_process_send_completion(struct nvme_rdma_poller *poller,
 	/* We do not support Soft Roce anymore. Other than Soft Roce's bug, we should not
 	 * receive a completion without error status after qpair is disconnected/destroyed.
 	 */
-	assert(rdma_req->req != NULL);
+	if (spdk_unlikely(rdma_req->req == NULL)) {
+		/*
+		 * Some infiniband drivers do not guarantee the previous assumption after we
+		 * received a RDMA_CM_EVENT_DEVICE_REMOVAL event.
+		 */
+		SPDK_ERRLOG("Received malformed completion: request 0x%"PRIx64" type %d\n", wc->wr_id,
+			    rdma_wr->type);
+		if (!rqpair || !rqpair->need_destroy) {
+			assert(0);
+		}
+		return -ENXIO;
+	}
 
-	rqpair = nvme_rdma_qpair(rdma_req->req->qpair);
 	rdma_req->completion_flags |= NVME_RDMA_SEND_COMPLETED;
 	assert(rqpair->current_num_sends > 0);
 	rqpair->current_num_sends--;


@@ -10,6 +10,11 @@ source $rootdir/test/setup/common.sh
 source $rootdir/test/common/autotest_common.sh
 source $rootdir/test/nvmf/common.sh
 
+tgt_core_mask='0x3'
+bdevperf_core_mask='0x4'
+bdevperf_rpc_sock=/var/tmp/bdevperf.sock
+bdevperf_rpc_pid=-1
+
 nvmftestinit
 
 function get_subsystem_nqn() {
@@ -33,15 +38,6 @@ function create_subsystem_and_connect_on_netdev() {
 	$rpc_py nvmf_subsystem_add_ns $nqn $malloc_name
 	$rpc_py nvmf_subsystem_add_listener $nqn -t $TEST_TRANSPORT -a $ip -s $NVMF_PORT
 
-	if ! nvme connect -t $TEST_TRANSPORT -n $nqn -a $ip -s $NVMF_PORT; then
-		exit 1
-	fi
-
-	waitforserial "$serial"
-
-	nvme_name=$(lsblk -l -o NAME,SERIAL | grep -oP "([\w]*)(?=\s+${serial})")
-	nvme_size=$(sec_size_to_bytes $nvme_name)
-	echo "${nvme_name}"
 	return 0
 }
@@ -87,16 +83,56 @@ function get_rdma_dev_count_in_nvmf_tgt() {
 	$rpc_py nvmf_get_stats | jq -r '.poll_groups[0].transports[].devices | length'
 }
 
+function generate_io_traffic_with_bdevperf() {
+	local dev_names=("$@")
+
+	mkdir -p $testdir
+	$rootdir/build/examples/bdevperf -m $bdevperf_core_mask -z -r $bdevperf_rpc_sock -q 128 -o 4096 -w verify -t 90 &> $testdir/try.txt &
+	bdevperf_pid=$!
+
+	trap 'process_shm --id $NVMF_APP_SHM_ID; cat $testdir/try.txt; rm -f $testdir/try.txt; kill -9 $bdevperf_pid; nvmftestfini; exit 1' SIGINT SIGTERM EXIT
+	waitforlisten $bdevperf_pid $bdevperf_rpc_sock
+
+	# Create a controller and set multipath behavior
+	# bdev_retry_count is set to -1 means infinite reconnects
+	$rpc_py -s $bdevperf_rpc_sock bdev_nvme_set_options -r -1
+
+	for dev_name in "${dev_names[@]}"; do
+		nqn=$(get_subsystem_nqn $dev_name)
+		tgt_ip=$(get_ip_address "$dev_name")
+
+		# -l -1 ctrlr_loss_timeout_sec -1 means infinite reconnects
+		# -o 1 reconnect_delay_sec time to delay a reconnect retry is limited to 1 sec
+		$rpc_py -s $bdevperf_rpc_sock bdev_nvme_attach_controller -b Nvme_$dev_name -t $TEST_TRANSPORT -a $tgt_ip -s $NVMF_PORT -f ipv4 -n $nqn -l -1 -o 1
+	done
+
+	$rootdir/examples/bdev/bdevperf/bdevperf.py -t 120 -s $bdevperf_rpc_sock perform_tests &
+	bdevperf_rpc_pid=$!
+	sleep 5
+}
+
+function stop_bdevperf() {
+	wait $bdevperf_rpc_pid
+
+	# NOTE: rdma-core <= v43.0 has memleak bug (fixed in commit 7720071f).
+	killprocess $bdevperf_pid || true
+	bdevperf_pid=
+
+	cat $testdir/try.txt
+
+	trap - SIGINT SIGTERM EXIT
+	rm -f $testdir/try.txt
+}
+
 function test_remove_and_rescan() {
-	nvmfappstart -m 0xF
+	nvmfappstart -m "$tgt_core_mask"
 
 	create_subsystem_and_connect "$@"
 
-	for net_dev in "${!netdev_nvme_dict[@]}"; do
-		$rootdir/scripts/fio-wrapper -p nvmf -i 4096 -d 1 -t randrw -r 40 &
-		fio_pid=$!
-		sleep 3
+	generate_io_traffic_with_bdevperf "${!netdev_nvme_dict[@]}"
 
+	for net_dev in "${!netdev_nvme_dict[@]}"; do
 		nvme_dev=${netdev_nvme_dict[$net_dev]}
 		rdma_dev_name=$(get_rdma_device_name $net_dev)
 		origin_ip=$(get_ip_address "$net_dev")
@@ -162,6 +198,8 @@ function test_remove_and_rescan() {
 		done
 	done
 
+	stop_bdevperf
+
 	# NOTE: rdma-core <= v43.0 has memleak bug (fixed in commit 7720071f).
 	killprocess $nvmfpid || true
 	nvmfpid=
@@ -229,7 +267,7 @@ function test_bonding_slaves_on_nics() {
 	# wait ib driver activated on bond device
 	sleep 5
 
-	nvmfappstart -m 0xF
+	nvmfappstart -m "$tgt_core_mask"
 
 	$rpc_py nvmf_create_transport $NVMF_TRANSPORT_OPTS -u 8192
 	create_subsystem_and_connect_on_netdev $BOND_NAME
@@ -237,8 +275,7 @@ function test_bonding_slaves_on_nics() {
 	ib_count=$(get_rdma_dev_count_in_nvmf_tgt)
 	echo "IB Count: " $ib_count
 
-	$rootdir/scripts/fio-wrapper -p nvmf -i 4096 -d 1 -t randrw -r 10 &
-	fio_pid=$!
+	generate_io_traffic_with_bdevperf $BOND_NAME
 	sleep 2
 
 	echo -$nic1 | sudo tee /sys/class/net/${BOND_NAME}/bonding/slaves
@@ -257,6 +294,8 @@ function test_bonding_slaves_on_nics() {
 		exit 1
 	fi
 
+	stop_bdevperf
+
 	# NOTE: rdma-core <= v43.0 has memleak bug (fixed in commit 7720071f).
 	killprocess $nvmfpid || true
 	nvmfpid=