nvme: capture ticks for timeout before checking state

In some extreme use cases, an SPDK process could get
swapped out for a long period of time just after
we checked the state but before we called spdk_get_ticks(),
resulting in a spurious initialization timeout.
So now we capture the ticks on entry and will only time out
if the timer had expired before we checked the state *and*
the state did not advance.

It's possible we could instead just move the timeout check
ahead of the ctrlr->state switch, but I was
hesitant to change the control flow for this case.

Fixes issue #1720.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I95b1db3365b5d2d8a65e528f53c302a724d44460

Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/5596
Community-CI: Broadcom CI
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com>
This commit is contained in:
Jim Harris 2020-12-16 10:50:14 -07:00 committed by Tomasz Zawadzki
parent 0dc567eb2d
commit 3c2190c214

View File

@ -2936,15 +2936,18 @@ nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr)
union spdk_nvme_cc_register cc; union spdk_nvme_cc_register cc;
union spdk_nvme_csts_register csts; union spdk_nvme_csts_register csts;
uint32_t ready_timeout_in_ms; uint32_t ready_timeout_in_ms;
uint64_t ticks;
int rc = 0; int rc = 0;
ticks = spdk_get_ticks();
/* /*
* May need to avoid accessing any register on the target controller * May need to avoid accessing any register on the target controller
* for a while. Return early without touching the FSM. * for a while. Return early without touching the FSM.
* Check sleep_timeout_tsc > 0 for unit test. * Check sleep_timeout_tsc > 0 for unit test.
*/ */
if ((ctrlr->sleep_timeout_tsc > 0) && if ((ctrlr->sleep_timeout_tsc > 0) &&
(spdk_get_ticks() <= ctrlr->sleep_timeout_tsc)) { (ticks <= ctrlr->sleep_timeout_tsc)) {
return 0; return 0;
} }
ctrlr->sleep_timeout_tsc = 0; ctrlr->sleep_timeout_tsc = 0;
@ -2980,7 +2983,7 @@ nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr)
* - directly after a VFIO reset. * - directly after a VFIO reset.
*/ */
SPDK_DEBUGLOG(nvme, "Adding 2 second delay before initializing the controller\n"); SPDK_DEBUGLOG(nvme, "Adding 2 second delay before initializing the controller\n");
ctrlr->sleep_timeout_tsc = spdk_get_ticks() + (2000 * spdk_get_ticks_hz() / 1000); ctrlr->sleep_timeout_tsc = ticks + (2000 * spdk_get_ticks_hz() / 1000);
} }
break; break;
@ -3015,7 +3018,7 @@ nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr)
*/ */
if (ctrlr->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) { if (ctrlr->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) {
SPDK_DEBUGLOG(nvme, "Applying quirk: delay 2.5 seconds before reading registers\n"); SPDK_DEBUGLOG(nvme, "Applying quirk: delay 2.5 seconds before reading registers\n");
ctrlr->sleep_timeout_tsc = spdk_get_ticks() + (2500 * spdk_get_ticks_hz() / 1000); ctrlr->sleep_timeout_tsc = ticks + (2500 * spdk_get_ticks_hz() / 1000);
} }
return 0; return 0;
} else { } else {
@ -3177,8 +3180,14 @@ nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr)
} }
init_timeout: init_timeout:
/* Note: we use the ticks captured when we entered this function.
* This covers environments where the SPDK process gets swapped out after
* we tried to advance the state but before we check the timeout here.
* It is not normal for this to happen, but harmless to handle it in this
* way.
*/
if (ctrlr->state_timeout_tsc != NVME_TIMEOUT_INFINITE && if (ctrlr->state_timeout_tsc != NVME_TIMEOUT_INFINITE &&
spdk_get_ticks() > ctrlr->state_timeout_tsc) { ticks > ctrlr->state_timeout_tsc) {
SPDK_ERRLOG("Initialization timed out in state %d\n", ctrlr->state); SPDK_ERRLOG("Initialization timed out in state %d\n", ctrlr->state);
return -1; return -1;
} }