nvme_rdma: handle DEVICE_REMOVAL event in RDMA initiator
When IBV_EVENT_DEVICE_FATAL & RDMA_CM_EVENT_DEVICE_REMOVAL occurs, destroy
the qpair immediately and do not assume that no successful WQE will be
received after rdma_disconnect.

Signed-off-by: sijie.sun <sijie.sun@smartx.com>
Change-Id: I23e44dd32c8adea301e5251659b1be519f5dfdf7
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16314
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Community-CI: Mellanox Build Bot
parent 549be9ad81
commit e44d631724
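For readers unfamiliar with the librdmacm flow, here is a minimal sketch (not SPDK code; the struct and helper names are invented for illustration) of how an initiator's CM-event loop observes device removal and marks a qpair for unconditional destruction, which is the pattern the diff below adds:

/* Minimal sketch, not SPDK code. Build against librdmacm (-lrdmacm). */
#include <stdbool.h>
#include <rdma/rdma_cma.h>

struct my_qpair {
	struct rdma_cm_id *cm_id;
	bool need_destroy;	/* set on device removal, checked at teardown */
};

static int
process_cm_event(struct my_qpair *qp, struct rdma_event_channel *ch)
{
	struct rdma_cm_event *event;

	if (rdma_get_cm_event(ch, &event) != 0) {
		return -1;
	}

	switch (event->event) {
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		/* The verbs device is going away. After this point even
		 * "successful" completions cannot be trusted, so do not wait
		 * for outstanding WQEs to drain cleanly. */
		qp->need_destroy = true;
		break;
	default:
		break;
	}

	return rdma_ack_cm_event(event);
}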
lib/nvme/nvme_rdma.c

@@ -244,6 +244,7 @@ struct nvme_rdma_qpair {
 	bool in_connect_poll;

 	uint8_t stale_conn_retry_count;
+	bool need_destroy;
 };

 enum NVME_RDMA_COMPLETION_FLAGS {
@@ -509,6 +510,7 @@ nvme_rdma_qpair_process_cm_event(struct nvme_rdma_qpair *rqpair)
 		break;
 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
 		rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL;
+		rqpair->need_destroy = true;
 		break;
 	case RDMA_CM_EVENT_MULTICAST_JOIN:
 	case RDMA_CM_EVENT_MULTICAST_ERROR:
@@ -1889,9 +1891,6 @@ nvme_rdma_qpair_destroy(struct nvme_rdma_qpair *rqpair)
 			spdk_rdma_qp_destroy(rqpair->rdma_qp);
 			rqpair->rdma_qp = NULL;
 		}
-
-		rdma_destroy_id(rqpair->cm_id);
-		rqpair->cm_id = NULL;
 	}

 	if (rqpair->poller) {
@@ -1916,6 +1915,12 @@ nvme_rdma_qpair_destroy(struct nvme_rdma_qpair *rqpair)
 	nvme_rdma_free_reqs(rqpair);
 	nvme_rdma_free_rsps(rqpair->rsps);
 	rqpair->rsps = NULL;
+
+	/* destroy cm_id last so cma device will not be freed before we destroy the cq. */
+	if (rqpair->cm_id) {
+		rdma_destroy_id(rqpair->cm_id);
+		rqpair->cm_id = NULL;
+	}
 }

 static void nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);
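The two hunks above move rdma_destroy_id() to the end of nvme_rdma_qpair_destroy(). The ordering constraint behind that comment, reduced to plain verbs calls (a sketch with invented names and no error handling, not SPDK's actual teardown):

/* The CM id holds the reference to the CMA device, so destroy it only after
 * every verbs object (QP, CQ) living on that device is gone. */
#include <rdma/rdma_cma.h>

static void
teardown_in_order(struct ibv_qp *qp, struct ibv_cq *cq, struct rdma_cm_id *cm_id)
{
	ibv_destroy_qp(qp);	/* 1. no new completions after this */
	ibv_destroy_cq(cq);	/* 2. CQ freed while the device still exists */
	rdma_destroy_id(cm_id);	/* 3. last: may drop the device reference */
}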
@@ -1941,8 +1946,9 @@ nvme_rdma_qpair_disconnected(struct nvme_rdma_qpair *rqpair, int ret)
 		goto quiet;
 	}

-	if (rqpair->current_num_sends != 0 ||
-	    (!rqpair->srq && rqpair->rsps->current_num_recvs != 0)) {
+	if (rqpair->need_destroy ||
+	    (rqpair->current_num_sends != 0 ||
+	     (!rqpair->srq && rqpair->rsps->current_num_recvs != 0))) {
 		rqpair->state = NVME_RDMA_QPAIR_STATE_LINGERING;
 		rqpair->evt_timeout_ticks = (NVME_RDMA_DISCONNECTED_QPAIR_TIMEOUT_US * spdk_get_ticks_hz()) /
 					    SPDK_SEC_TO_USEC + spdk_get_ticks();
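With need_destroy set, the qpair now always takes the lingering path above and is parked until evt_timeout_ticks expires. A worked sketch of that tick arithmetic, with an assumed timeout value (the real NVME_RDMA_DISCONNECTED_QPAIR_TIMEOUT_US in SPDK may differ):

#include <stdint.h>

#define DISCONNECTED_QPAIR_TIMEOUT_US	50	/* assumption for illustration */
#define SEC_TO_USEC			1000000ULL

/* Convert a microsecond linger timeout into an absolute tick deadline.
 * E.g. at 2 GHz: 50 * 2000000000 / 1000000 = 100000 ticks from "now". */
static uint64_t
linger_deadline(uint64_t now_ticks, uint64_t ticks_per_sec)
{
	return now_ticks + DISCONNECTED_QPAIR_TIMEOUT_US * ticks_per_sec / SEC_TO_USEC;
}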
@@ -2570,13 +2576,13 @@ nvme_rdma_process_send_completion(struct nvme_rdma_poller *poller,
 	struct spdk_nvme_rdma_req *rdma_req;

 	rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvme_rdma_req, rdma_wr);
-
-	/* If we are flushing I/O */
-	if (wc->status) {
-		rqpair = rdma_qpair != NULL ? rdma_qpair : get_rdma_qpair_from_wc(poller->group, wc);
+	rqpair = rdma_req->req ? nvme_rdma_qpair(rdma_req->req->qpair) : NULL;
+	if (!rqpair) {
+		rqpair = rdma_qpair != NULL ? rdma_qpair : get_rdma_qpair_from_wc(poller->group, wc);
+	}
+
+	/* If we are flushing I/O */
+	if (wc->status) {
 		if (!rqpair) {
 			/* When poll_group is used, several qpairs share the same CQ and it is possible to
 			 * receive a completion with error (e.g. IBV_WC_WR_FLUSH_ERR) for already disconnected qpair
@@ -2598,9 +2604,19 @@
 	/* We do not support Soft Roce anymore. Other than Soft Roce's bug, we should not
 	 * receive a completion without error status after qpair is disconnected/destroyed.
 	 */
-	assert(rdma_req->req != NULL);
+	if (spdk_unlikely(rdma_req->req == NULL)) {
+		/*
+		 * Some infiniband drivers do not guarantee the previous assumption after we
+		 * received a RDMA_CM_EVENT_DEVICE_REMOVAL event.
+		 */
+		SPDK_ERRLOG("Received malformed completion: request 0x%"PRIx64" type %d\n", wc->wr_id,
+			    rdma_wr->type);
+		if (!rqpair || !rqpair->need_destroy) {
+			assert(0);
+		}
+		return -ENXIO;
+	}

-	rqpair = nvme_rdma_qpair(rdma_req->req->qpair);
 	rdma_req->completion_flags |= NVME_RDMA_SEND_COMPLETED;
 	assert(rqpair->current_num_sends > 0);
 	rqpair->current_num_sends--;
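The new branch above trades a hard assert for a recoverable error when the device has been hot-removed. The defensive pattern, reduced to its essentials (hypothetical names, not SPDK's API):

/* A NULL back-pointer on a "successful" send completion is a bug, unless the
 * device was yanked, in which case some InfiniBand drivers can hand back
 * stale WQEs; then log and fail the call instead of dereferencing NULL. */
#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct request { void *ctx; };

static int
validate_send_completion(struct request *req, bool device_removed)
{
	if (req == NULL) {
		fprintf(stderr, "malformed completion\n");
		if (!device_removed) {
			assert(0);	/* debug builds: this is a real bug */
		}
		return -ENXIO;		/* release builds: fail gracefully */
	}
	return 0;
}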
test/nvmf/host/device_removal.sh

@@ -10,6 +10,11 @@ source $rootdir/test/setup/common.sh
 source $rootdir/test/common/autotest_common.sh
 source $rootdir/test/nvmf/common.sh

+tgt_core_mask='0x3'
+bdevperf_core_mask='0x4'
+bdevperf_rpc_sock=/var/tmp/bdevperf.sock
+bdevperf_rpc_pid=-1
+
 nvmftestinit

 function get_subsystem_nqn() {
@@ -33,15 +38,6 @@ function create_subsystem_and_connect_on_netdev() {
 	$rpc_py nvmf_subsystem_add_ns $nqn $malloc_name
 	$rpc_py nvmf_subsystem_add_listener $nqn -t $TEST_TRANSPORT -a $ip -s $NVMF_PORT

-	if ! nvme connect -t $TEST_TRANSPORT -n $nqn -a $ip -s $NVMF_PORT; then
-		exit 1
-	fi
-
-	waitforserial "$serial"
-	nvme_name=$(lsblk -l -o NAME,SERIAL | grep -oP "([\w]*)(?=\s+${serial})")
-	nvme_size=$(sec_size_to_bytes $nvme_name)
-
-	echo "${nvme_name}"
 	return 0
 }

@@ -87,16 +83,56 @@ function get_rdma_dev_count_in_nvmf_tgt() {
 	$rpc_py nvmf_get_stats | jq -r '.poll_groups[0].transports[].devices | length'
 }

+function generate_io_traffic_with_bdevperf() {
+	local dev_names=("$@")
+
+	mkdir -p $testdir
+	$rootdir/build/examples/bdevperf -m $bdevperf_core_mask -z -r $bdevperf_rpc_sock -q 128 -o 4096 -w verify -t 90 &> $testdir/try.txt &
+	bdevperf_pid=$!
+
+	trap 'process_shm --id $NVMF_APP_SHM_ID; cat $testdir/try.txt; rm -f $testdir/try.txt; kill -9 $bdevperf_pid; nvmftestfini; exit 1' SIGINT SIGTERM EXIT
+	waitforlisten $bdevperf_pid $bdevperf_rpc_sock
+
+	# Create a controller and set multipath behavior
+	# bdev_retry_count is set to -1 means infinite reconnects
+	$rpc_py -s $bdevperf_rpc_sock bdev_nvme_set_options -r -1
+
+	for dev_name in "${dev_names[@]}"; do
+		nqn=$(get_subsystem_nqn $dev_name)
+		tgt_ip=$(get_ip_address "$dev_name")
+
+		# -l -1 ctrlr_loss_timeout_sec -1 means infinite reconnects
+		# -o 1 reconnect_delay_sec time to delay a reconnect retry is limited to 1 sec
+		$rpc_py -s $bdevperf_rpc_sock bdev_nvme_attach_controller -b Nvme_$dev_name -t $TEST_TRANSPORT -a $tgt_ip -s $NVMF_PORT -f ipv4 -n $nqn -l -1 -o 1
+	done
+
+	$rootdir/examples/bdev/bdevperf/bdevperf.py -t 120 -s $bdevperf_rpc_sock perform_tests &
+	bdevperf_rpc_pid=$!
+
+	sleep 5
+}
+
+function stop_bdevperf() {
+	wait $bdevperf_rpc_pid
+
+	# NOTE: rdma-core <= v43.0 has memleak bug (fixed in commit 7720071f).
+	killprocess $bdevperf_pid || true
+	bdevperf_pid=
+
+	cat $testdir/try.txt
+
+	trap - SIGINT SIGTERM EXIT
+	rm -f $testdir/try.txt
+}
+
 function test_remove_and_rescan() {
-	nvmfappstart -m 0xF
+	nvmfappstart -m "$tgt_core_mask"

 	create_subsystem_and_connect "$@"

-	for net_dev in "${!netdev_nvme_dict[@]}"; do
-		$rootdir/scripts/fio-wrapper -p nvmf -i 4096 -d 1 -t randrw -r 40 &
-		fio_pid=$!
-		sleep 3
+	generate_io_traffic_with_bdevperf "${!netdev_nvme_dict[@]}"

+	for net_dev in "${!netdev_nvme_dict[@]}"; do
 		nvme_dev=${netdev_nvme_dict[$net_dev]}
 		rdma_dev_name=$(get_rdma_device_name $net_dev)
 		origin_ip=$(get_ip_address "$net_dev")
@@ -162,6 +198,8 @@ function test_remove_and_rescan() {
 		done
 	done

+	stop_bdevperf
+
 	# NOTE: rdma-core <= v43.0 has memleak bug (fixed in commit 7720071f).
 	killprocess $nvmfpid || true
 	nvmfpid=
@@ -229,7 +267,7 @@ function test_bonding_slaves_on_nics() {
 	# wait ib driver activated on bond device
 	sleep 5

-	nvmfappstart -m 0xF
+	nvmfappstart -m "$tgt_core_mask"
 	$rpc_py nvmf_create_transport $NVMF_TRANSPORT_OPTS -u 8192

 	create_subsystem_and_connect_on_netdev $BOND_NAME
@@ -237,8 +275,7 @@
 	ib_count=$(get_rdma_dev_count_in_nvmf_tgt)
 	echo "IB Count: " $ib_count

-	$rootdir/scripts/fio-wrapper -p nvmf -i 4096 -d 1 -t randrw -r 10 &
-	fio_pid=$!
+	generate_io_traffic_with_bdevperf $BOND_NAME

 	sleep 2
 	echo -$nic1 | sudo tee /sys/class/net/${BOND_NAME}/bonding/slaves
@@ -257,6 +294,8 @@
 		exit 1
 	fi

+	stop_bdevperf
+
 	# NOTE: rdma-core <= v43.0 has memleak bug (fixed in commit 7720071f).
 	killprocess $nvmfpid || true
 	nvmfpid=