From 6fac5e5b23c52c8f0f037fcb571da4b80ed87160 Mon Sep 17 00:00:00 2001 From: Changpeng Liu Date: Mon, 6 Aug 2018 20:17:42 -0400 Subject: [PATCH] bdev/nvme: detect Controller Fatal Status when timeout happens If the controller encounters a serious error and sets the Controller Fatal Status (CFS) field to 1, the host driver is not notified of the error. When a timeout happens, check CFS and, if it is set, reset the controller to recover from the fatal status. Change-Id: I9fa5b263b34edc52d0f359d874b2920f7570d1f3 Signed-off-by: Changpeng Liu Reviewed-on: https://review.gerrithub.io/417622 Chandler-Test-Pool: SPDK Automated Test System Tested-by: SPDK CI Jenkins Reviewed-by: Ben Walker Reviewed-by: Shuhei Matsumoto Reviewed-by: Jim Harris --- include/spdk/nvme.h | 5 ++++- lib/bdev/nvme/bdev_nvme.c | 11 +++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/spdk/nvme.h b/include/spdk/nvme.h index 9f0f2fee0..10f333416 100644 --- a/include/spdk/nvme.h +++ b/include/spdk/nvme.h @@ -669,7 +669,10 @@ struct spdk_nvme_qpair; * request. * * For timeouts detected on the admin queue pair, the qpair returned here will - * be NULL. + * be NULL. If the controller has a serious error condition and is unable to + * communicate with driver via completion queue, the controller can set Controller + * Fatal Status field to 1, then reset is required to recover from such error. + * Users may detect Controller Fatal Status when timeout happens. * * \param cb_arg Argument passed to callback funciton. * \param ctrlr Opaque handle to NVMe controller. diff --git a/lib/bdev/nvme/bdev_nvme.c b/lib/bdev/nvme/bdev_nvme.c index 53b62d666..a5099af46 100644 --- a/lib/bdev/nvme/bdev_nvme.c +++ b/lib/bdev/nvme/bdev_nvme.c @@ -851,9 +851,20 @@ timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, uint16_t cid) { int rc; + union spdk_nvme_csts_register csts; SPDK_WARNLOG("Warning: Detected a timeout. 
ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); + csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); + if (csts.bits.cfs) { + SPDK_ERRLOG("Controller Fatal Status, reset required\n"); + rc = spdk_nvme_ctrlr_reset(ctrlr); + if (rc) { + SPDK_ERRLOG("Resetting controller failed.\n"); + } + return; + } + switch (g_opts.action_on_timeout) { case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: if (qpair) {