nvmf/vfio-user: only process SQs in VFIO_USER_CTRLR_RUNNING state

While we are quiesced, we're not allowed to access guest memory via the
SGL APIs. Refuse to process any commands unless we're in RUNNING state.

We need to synchronize with each poll group via a message before we can
call vfu_device_quiesced(), otherwise we could still be processing
commands via nvmf_vfio_user_sq_poll().

In interrupt mode, this means we might miss commands whose interrupt
callback fired while we were quiesced, so make sure we process them when
we return to the RUNNING state.

Signed-off-by: John Levon <john.levon@nutanix.com>
Change-Id: Ieae5a9ae8d9de722e0bdf4bb8d61e7e678159f1f
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12912
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
John Levon 2022-06-06 18:33:00 +01:00 committed by Tomasz Zawadzki
parent 667809a4ae
commit 0a153e8af4


@@ -576,6 +576,12 @@ ctrlr_to_poll_group(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
 			      group);
 }
 
+static inline struct spdk_thread *
+poll_group_to_thread(struct nvmf_vfio_user_poll_group *vu_pg)
+{
+	return vu_pg->group.group->thread;
+}
+
 static inline size_t
 vfio_user_migr_data_len(void)
 {
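
The new poll_group_to_thread() helper simply unwraps the SPDK thread that owns a poll group; the quiesce path further down uses it to hand work to each poll group's thread in turn. A minimal usage sketch (the callback and its argument here are hypothetical, and the transport-internal types are assumed to be in scope):

/* Hypothetical callback: runs on the thread that polls vu_group. */
static void
do_work_on_pg_thread(void *ctx)
{
	struct nvmf_vfio_user_poll_group *vu_group = ctx;

	(void)vu_group;	/* per-poll-group work would go here */
}

/* ...with some poll group vu_group in hand... */
spdk_thread_send_msg(poll_group_to_thread(vu_group), do_work_on_pg_thread, vu_group);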
@@ -2896,45 +2902,48 @@ init_pci_config_space(vfu_pci_config_space_t *p)
 	p->hdr.intr.ipin = 0x1;
 }
 
-struct subsystem_pause_ctx {
+struct ctrlr_quiesce_ctx {
 	struct nvmf_vfio_user_ctrlr *ctrlr;
+	struct nvmf_vfio_user_poll_group *group;
 	int status;
 };
 
 static void
-vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem,
-			   void *cb_arg, int status);
+ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr);
 
 static void
 _vfio_user_endpoint_resume_done_msg(void *ctx)
 {
 	struct nvmf_vfio_user_endpoint *endpoint = ctx;
-	struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem;
 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
-	int ret;
 
 	endpoint->need_resume = false;
-	vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
 
 	if (!vu_ctrlr->queued_quiesce) {
+		vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
+
+		/*
+		 * We might have ignored new SQ entries while we were quiesced:
+		 * kick ourselves so we'll definitely check again while in
+		 * VFIO_USER_CTRLR_RUNNING state.
+		 */
+		ctrlr_kick(vu_ctrlr);
 		return;
 	}
 
-	/* Basically, once we call `vfu_device_quiesced` the device is unquiesced from
-	 * libvfio-user's perspective so from the moment `vfio_user_dev_quiesce_done` returns
-	 * libvfio-user might quiesce the device again. However, because the NVMf subsytem is
-	 * an asynchronous operation, this quiesce might come _before_ the NVMf subsystem has
-	 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we need to check
-	 * whether a quiesce was requested.
+	/*
+	 * Basically, once we call `vfu_device_quiesced` the device is
+	 * unquiesced from libvfio-user's perspective so from the moment
+	 * `vfio_user_quiesce_done` returns libvfio-user might quiesce the device
+	 * again. However, because the NVMf subsytem is an asynchronous
+	 * operation, this quiesce might come _before_ the NVMf subsystem has
+	 * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we
+	 * need to check whether a quiesce was requested.
 	 */
-	SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, pause again\n", ctrlr_id(vu_ctrlr));
-	vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING;
-	ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG,
-					vfio_user_dev_quiesce_done, vu_ctrlr);
-	if (ret < 0) {
-		vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
-		SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret);
-	}
+	SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, quiesce again\n",
+		      ctrlr_id(vu_ctrlr));
+	ctrlr_quiesce(vu_ctrlr);
 }
 
 static void
@@ -2954,18 +2963,20 @@ vfio_user_endpoint_resume_done(struct spdk_nvmf_subsystem *subsystem,
 }
 
 static void
-_vfio_user_dev_quiesce_done_msg(void *ctx)
+vfio_user_quiesce_done(void *ctx)
 {
-	struct subsystem_pause_ctx *subsystem_ctx = ctx;
-	struct nvmf_vfio_user_ctrlr *vu_ctrlr = subsystem_ctx->ctrlr;
+	struct ctrlr_quiesce_ctx *quiesce_ctx = ctx;
+	struct nvmf_vfio_user_ctrlr *vu_ctrlr = quiesce_ctx->ctrlr;
 	struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint;
 	int ret;
 
+	SPDK_DEBUGLOG(nvmf_vfio, "%s device quiesced\n", ctrlr_id(vu_ctrlr));
+
 	assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING);
 	vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED;
-	vfu_device_quiesced(endpoint->vfu_ctx, subsystem_ctx->status);
+	vfu_device_quiesced(endpoint->vfu_ctx, quiesce_ctx->status);
 	vu_ctrlr->queued_quiesce = false;
-	free(subsystem_ctx);
+	free(quiesce_ctx);
 
 	/* `vfu_device_quiesced` can change the migration state,
 	 * so we need to re-check `vu_ctrlr->state`.
@@ -2986,25 +2997,79 @@ _vfio_user_dev_quiesce_done_msg(void *ctx)
 }
 
 static void
-vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem,
-			   void *cb_arg, int status)
+vfio_user_pause_done(struct spdk_nvmf_subsystem *subsystem,
+		     void *ctx, int status)
 {
-	struct nvmf_vfio_user_ctrlr *vu_ctrlr = cb_arg;
-	struct subsystem_pause_ctx *ctx;
+	struct ctrlr_quiesce_ctx *quiesce_ctx = ctx;
 
-	SPDK_DEBUGLOG(nvmf_vfio, "%s paused done with status %d\n", ctrlr_id(vu_ctrlr), status);
+	quiesce_ctx->status = status;
 
-	ctx = calloc(1, sizeof(*ctx));
-	if (!ctx) {
+	SPDK_DEBUGLOG(nvmf_vfio, "%s pause done with status %d\n",
+		      ctrlr_id(quiesce_ctx->ctrlr), status);
+
+	spdk_thread_send_msg(quiesce_ctx->ctrlr->thread,
+			     vfio_user_quiesce_done, ctx);
+}
+
+/*
+ * Ensure that, for this PG, we've stopped running in nvmf_vfio_user_sq_poll();
+ * we've already set ctrlr->state, so we won't process new entries, but we need
+ * to ensure that this PG is quiesced. This only works because there's no
+ * callback context set up between polling the SQ and spdk_nvmf_request_exec().
+ *
+ * Once we've walked all PGs, we need to pause any submitted I/O via
+ * spdk_nvmf_subsystem_pause(SPDK_NVME_GLOBAL_NS_TAG).
+ */
+static void
+vfio_user_quiesce_pg(void *ctx)
+{
+	struct ctrlr_quiesce_ctx *quiesce_ctx = ctx;
+	struct nvmf_vfio_user_ctrlr *vu_ctrlr = quiesce_ctx->ctrlr;
+	struct spdk_nvmf_subsystem *subsystem = vu_ctrlr->endpoint->subsystem;
+	struct nvmf_vfio_user_poll_group *vu_group = quiesce_ctx->group;
+	int ret;
+
+	SPDK_DEBUGLOG(nvmf_vfio, "quiesced pg:%p\n", vu_group);
+
+	quiesce_ctx->group = TAILQ_NEXT(vu_group, link);
+	if (quiesce_ctx->group != NULL) {
+		spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group),
+				     vfio_user_quiesce_pg, quiesce_ctx);
+		return;
+	}
+
+	ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG,
+					vfio_user_pause_done, quiesce_ctx);
+	if (ret < 0) {
+		SPDK_ERRLOG("%s: failed to pause, ret=%d\n",
+			    endpoint_id(vu_ctrlr->endpoint), ret);
+		vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
+		fail_ctrlr(vu_ctrlr);
+		free(quiesce_ctx);
+	}
+}
+
+static void
+ctrlr_quiesce(struct nvmf_vfio_user_ctrlr *vu_ctrlr)
+{
+	struct ctrlr_quiesce_ctx *quiesce_ctx;
+
+	vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING;
+
+	quiesce_ctx = calloc(1, sizeof(*quiesce_ctx));
+	if (!quiesce_ctx) {
 		SPDK_ERRLOG("Failed to allocate subsystem pause context\n");
 		assert(false);
 		return;
 	}
 
-	ctx->ctrlr = vu_ctrlr;
-	ctx->status = status;
+	quiesce_ctx->ctrlr = vu_ctrlr;
+	quiesce_ctx->status = 0;
+	quiesce_ctx->group = TAILQ_FIRST(&vu_ctrlr->transport->poll_groups);
 
-	spdk_thread_send_msg(vu_ctrlr->thread, _vfio_user_dev_quiesce_done_msg, ctx);
+	spdk_thread_send_msg(poll_group_to_thread(quiesce_ctx->group),
+			     vfio_user_quiesce_pg, quiesce_ctx);
 }
 
 static int
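
Taken together, the new quiesce path is a chain of thread messages: ctrlr_quiesce() posts vfio_user_quiesce_pg() to the first poll group's thread, each invocation re-posts itself to the next poll group, and only after the last poll group has run does it call spdk_nvmf_subsystem_pause(), whose completion hops back to the controller thread via vfio_user_pause_done()/vfio_user_quiesce_done(). Below is a minimal, self-contained sketch of that chained-message pattern; struct pg, visit_pg() and walk_all_pgs() are hypothetical stand-ins, not part of SPDK or this patch:

/*
 * Hypothetical, simplified restatement of the pattern above: visit every
 * poll group on its own SPDK thread by chaining spdk_thread_send_msg()
 * calls, then invoke a completion callback exactly once after the last hop.
 */
#include <stdlib.h>

#include "spdk/queue.h"
#include "spdk/thread.h"

struct pg {
	struct spdk_thread *thread;	/* thread that owns this poll group */
	TAILQ_ENTRY(pg) link;
};
TAILQ_HEAD(pg_list, pg);

struct walk_ctx {
	struct pg *cur;			/* poll group currently being visited */
	void (*done)(void *arg);	/* runs once, after the last hop */
	void *done_arg;
};

static void
visit_pg(void *ctx)
{
	struct walk_ctx *w = ctx;

	/* per-poll-group work happens here, on w->cur->thread */

	w->cur = TAILQ_NEXT(w->cur, link);
	if (w->cur != NULL) {
		/* hop to the next poll group's thread and repeat */
		spdk_thread_send_msg(w->cur->thread, visit_pg, w);
		return;
	}

	w->done(w->done_arg);
	free(w);
}

/* Assumes at least one poll group exists, as the code above does. */
static void
walk_all_pgs(struct pg_list *pgs, void (*done)(void *), void *done_arg)
{
	struct walk_ctx *w = calloc(1, sizeof(*w));

	if (w == NULL) {
		return;
	}
	w->cur = TAILQ_FIRST(pgs);
	w->done = done;
	w->done_arg = done_arg;
	spdk_thread_send_msg(w->cur->thread, visit_pg, w);
}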
@@ -3014,7 +3079,6 @@ vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx)
 	struct spdk_nvmf_subsystem *subsystem = endpoint->subsystem;
 	struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr;
-	int ret;
 
 	if (!vu_ctrlr) {
 		return 0;
@@ -3045,14 +3109,7 @@ vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx)
 	case VFIO_USER_CTRLR_MIGRATING:
 		return 0;
 	case VFIO_USER_CTRLR_RUNNING:
-		vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING;
-		ret = spdk_nvmf_subsystem_pause(subsystem, SPDK_NVME_GLOBAL_NS_TAG,
-						vfio_user_dev_quiesce_done, vu_ctrlr);
-		if (ret < 0) {
-			vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING;
-			SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret);
-			return 0;
-		}
+		ctrlr_quiesce(vu_ctrlr);
 		break;
 	case VFIO_USER_CTRLR_RESUMING:
 		vu_ctrlr->queued_quiesce = true;
@@ -5195,6 +5252,14 @@ nvmf_vfio_user_sq_poll(struct nvmf_vfio_user_sq *sq)
 
 	ctrlr = sq->ctrlr;
 
+	/*
+	 * A quiesced, or migrating, controller should never process new
+	 * commands.
+	 */
+	if (ctrlr->state != VFIO_USER_CTRLR_RUNNING) {
+		return SPDK_POLLER_IDLE;
+	}
+
 	if (ctrlr->adaptive_irqs_enabled) {
 		handle_suppressed_irq(ctrlr, sq);
 	}
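
The early return added above uses the standard SPDK poller return values: SPDK_POLLER_IDLE reports that this invocation did no work, SPDK_POLLER_BUSY that it did. A hypothetical, stripped-down poller showing the same guard in isolation (example_sq_poll() is not part of the patch; the vfio-user types are assumed to be in scope):

#include "spdk/thread.h"

/* Hypothetical: skip all SQ processing unless the controller is RUNNING. */
static int
example_sq_poll(void *arg)
{
	struct nvmf_vfio_user_sq *sq = arg;

	if (sq->ctrlr->state != VFIO_USER_CTRLR_RUNNING) {
		return SPDK_POLLER_IDLE;	/* nothing processed this pass */
	}

	/* ...consume new SQ entries and hand them to spdk_nvmf_request_exec()... */
	return SPDK_POLLER_BUSY;
}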