nvme_rdma: Handle stale connection asynchronously
Incorporate delayed disconnect/connect retries, bounded by a finite retry count, into the state machine of asynchronous qpair connection. We do not need to call back into the common transport layer, but we do need to do the following: clear rqpair->cq before starting the disconnect if the qpair uses a poll group, and clear qpair->transport_failure_reason after the disconnect completes. Additionally, place the new state STALE_CONN before INITIALIZING because the cq is not yet usable for the admin qpair while the state is STALE_CONN.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: Ibc779a2b772be9506ffd8226d5f64d6d12102ff2
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/11690
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com>
parent 77c4657140
commit 20cf90801e
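The heart of the change is a bounded, delayed retry: each -ESTALE connect failure tears the qpair down, arms a deadline NVME_RDMA_STALE_CONN_RETRY_DELAY_US in the future, and gives up after NVME_RDMA_STALE_CONN_RETRY_MAX attempts, with the delay polled from the connect state machine (returning -EAGAIN) rather than slept. Below is a minimal, self-contained sketch of that pattern, not SPDK code: clock_gettime() stands in for spdk_get_ticks()/spdk_get_ticks_hz(), and the names retry_ctx, stale_conn_schedule_retry, and stale_conn_poll are illustrative only.

#define _POSIX_C_SOURCE 199309L

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define RETRY_MAX      5       /* mirrors NVME_RDMA_STALE_CONN_RETRY_MAX */
#define RETRY_DELAY_US 10000   /* mirrors NVME_RDMA_STALE_CONN_RETRY_DELAY_US */

/* Hypothetical retry bookkeeping; the driver keeps two similar fields on the qpair. */
struct retry_ctx {
	uint8_t  retry_count;
	uint64_t deadline_ns;   /* absolute time of the next allowed connect attempt */
};

static uint64_t
now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}

/* Called when a connect attempt fails with -ESTALE: bump the count, arm the delay. */
static int
stale_conn_schedule_retry(struct retry_ctx *ctx)
{
	if (ctx->retry_count >= RETRY_MAX) {
		return -ESTALE;  /* give up and surface the error to the caller */
	}
	ctx->retry_count++;
	ctx->deadline_ns = now_ns() + (uint64_t)RETRY_DELAY_US * 1000;
	return 0;
}

/* Polled from the connect state machine: -EAGAIN until the delay has elapsed. */
static int
stale_conn_poll(struct retry_ctx *ctx)
{
	if (now_ns() < ctx->deadline_ns) {
		return -EAGAIN;
	}
	return 0;  /* delay elapsed; the caller re-issues the connect */
}

int
main(void)
{
	struct retry_ctx ctx = {0};

	/* Pretend every reconnect fails with -ESTALE again, to show the bounded loop. */
	while (stale_conn_schedule_retry(&ctx) == 0) {
		while (stale_conn_poll(&ctx) == -EAGAIN) {
			/* the real driver polls other work here instead of spinning */
		}
		printf("retry %u issued\n", (unsigned)ctx.retry_count);
	}
	printf("gave up after %u retries\n", (unsigned)ctx.retry_count);
	return 0;
}

Keeping the delay as a polled deadline rather than a sleep is what lets the retry live inside nvme_rdma_ctrlr_connect_qpair_poll() without blocking the polling thread.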
@@ -74,6 +74,13 @@
 /* The default size for a shared rdma completion queue. */
 #define DEFAULT_NVME_RDMA_CQ_SIZE	4096
 
+/*
+ * In the special case of a stale connection we don't expose a mechanism
+ * for the user to retry the connection so we need to handle it internally.
+ */
+#define NVME_RDMA_STALE_CONN_RETRY_MAX		5
+#define NVME_RDMA_STALE_CONN_RETRY_DELAY_US	10000
+
 /*
  * Maximum value of transport_retry_count used by RDMA controller
  */
@@ -186,6 +193,7 @@ union nvme_rdma_mr {
 
 enum nvme_rdma_qpair_state {
 	NVME_RDMA_QPAIR_STATE_INVALID = 0,
+	NVME_RDMA_QPAIR_STATE_STALE_CONN,
 	NVME_RDMA_QPAIR_STATE_INITIALIZING,
 	NVME_RDMA_QPAIR_STATE_FABRIC_CONNECT_SEND,
 	NVME_RDMA_QPAIR_STATE_FABRIC_CONNECT_POLL,
@@ -259,6 +267,8 @@ struct nvme_rdma_qpair {
 
 	/* Used by poll group to keep the qpair around until it is ready to remove it. */
 	bool			defer_deletion_to_pg;
+
+	uint8_t			stale_conn_retry_count;
 };
 
 enum NVME_RDMA_COMPLETION_FLAGS {
@@ -1225,10 +1235,14 @@ nvme_rdma_resolve_addr(struct nvme_rdma_qpair *rqpair,
 				       nvme_rdma_addr_resolved);
 }
 
+static int nvme_rdma_stale_conn_retry(struct nvme_rdma_qpair *rqpair);
+
 static int
 nvme_rdma_connect_established(struct nvme_rdma_qpair *rqpair, int ret)
 {
-	if (ret) {
+	if (ret == -ESTALE) {
+		return nvme_rdma_stale_conn_retry(rqpair);
+	} else if (ret) {
 		SPDK_ERRLOG("RDMA connect error %d\n", ret);
 		return ret;
 	}
@@ -1406,6 +1420,18 @@ nvme_rdma_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qp
 	return 0;
 }
 
+static int
+nvme_rdma_stale_conn_reconnect(struct nvme_rdma_qpair *rqpair)
+{
+	struct spdk_nvme_qpair *qpair = &rqpair->qpair;
+
+	if (spdk_get_ticks() < rqpair->evt_timeout_ticks) {
+		return -EAGAIN;
+	}
+
+	return nvme_rdma_ctrlr_connect_qpair(qpair->ctrlr, qpair);
+}
+
 static int
 nvme_rdma_ctrlr_connect_qpair_poll(struct spdk_nvme_ctrlr *ctrlr,
 				   struct spdk_nvme_qpair *qpair)
@@ -1425,6 +1451,7 @@ nvme_rdma_ctrlr_connect_qpair_poll(struct spdk_nvme_ctrlr *ctrlr,
 		break;
 
 	case NVME_RDMA_QPAIR_STATE_INITIALIZING:
+	case NVME_RDMA_QPAIR_STATE_EXITING:
 		if (!nvme_qpair_is_admin_queue(qpair)) {
 			nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
 		}
@@ -1442,6 +1469,12 @@ nvme_rdma_ctrlr_connect_qpair_poll(struct spdk_nvme_ctrlr *ctrlr,
 
 		return rc;
 
+	case NVME_RDMA_QPAIR_STATE_STALE_CONN:
+		rc = nvme_rdma_stale_conn_reconnect(rqpair);
+		if (rc == 0) {
+			rc = -EAGAIN;
+		}
+		break;
 	case NVME_RDMA_QPAIR_STATE_FABRIC_CONNECT_SEND:
 		rc = nvme_fabric_qpair_connect_async(qpair, rqpair->num_entries + 1);
 		if (rc == 0) {
@@ -2068,6 +2101,52 @@ nvme_rdma_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme
 	}
 }
 
+static int
+nvme_rdma_stale_conn_disconnected(struct nvme_rdma_qpair *rqpair, int ret)
+{
+	struct spdk_nvme_qpair *qpair = &rqpair->qpair;
+
+	if (ret) {
+		SPDK_DEBUGLOG(nvme, "Target did not respond to qpair disconnect.\n");
+	}
+
+	nvme_rdma_qpair_destroy(rqpair);
+
+	qpair->last_transport_failure_reason = qpair->transport_failure_reason;
+	qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_NONE;
+
+	rqpair->state = NVME_RDMA_QPAIR_STATE_STALE_CONN;
+	rqpair->evt_timeout_ticks = (NVME_RDMA_STALE_CONN_RETRY_DELAY_US * spdk_get_ticks_hz()) /
+				    SPDK_SEC_TO_USEC + spdk_get_ticks();
+
+	return 0;
+}
+
+static int
+nvme_rdma_stale_conn_retry(struct nvme_rdma_qpair *rqpair)
+{
+	struct spdk_nvme_qpair *qpair = &rqpair->qpair;
+
+	if (rqpair->stale_conn_retry_count >= NVME_RDMA_STALE_CONN_RETRY_MAX) {
+		SPDK_ERRLOG("Retry failed %d times, give up stale connection to qpair (cntlid:%u, qid:%u).\n",
+			    NVME_RDMA_STALE_CONN_RETRY_MAX, qpair->ctrlr->cntlid, qpair->id);
+		return -ESTALE;
+	}
+
+	rqpair->stale_conn_retry_count++;
+
+	SPDK_NOTICELOG("%d times, retry stale connnection to qpair (cntlid:%u, qid:%u).\n",
+		       rqpair->stale_conn_retry_count, qpair->ctrlr->cntlid, qpair->id);
+
+	if (qpair->poll_group) {
+		rqpair->cq = NULL;
+	}
+
+	_nvme_rdma_ctrlr_disconnect_qpair(qpair->ctrlr, qpair, nvme_rdma_stale_conn_disconnected);
+
+	return 0;
+}
+
 static void nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);
 
 static int