From 6d9d3f87e3bde952cb49ea58b5412aaedeb5577c Mon Sep 17 00:00:00 2001 From: Jim Harris Date: Wed, 9 Dec 2020 23:11:08 +0000 Subject: [PATCH] bdev_nvme: don't try to read csts on fabrics cmd timeout It is recommended to read CSTS when there is a timeout. If CSTS.CFS (Controller Fatal Status) is set, we should reset the controller. But if an admin command on a fabrics controller times out, reading CSTS submits another fabrics command that could also timeout. Even worse, we are recursively polling the admin queue for completions in this case. Fixes issue #1716. Signed-off-by: Jim Harris Change-Id: I23d31f6302375c52eba6f4370748d622fbd25ca7 Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/5513 Tested-by: SPDK CI Jenkins Reviewed-by: Changpeng Liu Reviewed-by: Shuhei Matsumoto Reviewed-by: Aleksey Marchuk --- module/bdev/nvme/bdev_nvme.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/module/bdev/nvme/bdev_nvme.c b/module/bdev/nvme/bdev_nvme.c index dd9da6947..16ae5d123 100644 --- a/module/bdev/nvme/bdev_nvme.c +++ b/module/bdev/nvme/bdev_nvme.c @@ -1231,11 +1231,18 @@ timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); - csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); - if (csts.bits.cfs) { - SPDK_ERRLOG("Controller Fatal Status, reset required\n"); - _bdev_nvme_reset(nvme_bdev_ctrlr, NULL); - return; + /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O + * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we + * would submit another fabrics cmd on the admin queue to read CSTS and check for its + * completion recursively. + */ + if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { + csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); + if (csts.bits.cfs) { + SPDK_ERRLOG("Controller Fatal Status, reset required\n"); + _bdev_nvme_reset(nvme_bdev_ctrlr, NULL); + return; + } } switch (g_opts.action_on_timeout) {