From 1d44693f66841e764213371339853948d5d29efc Mon Sep 17 00:00:00 2001 From: Changpeng Liu Date: Mon, 6 Dec 2021 20:49:14 +0800 Subject: [PATCH] nvmf/vfio-user: implement device quiesce APIs libvfio-user will call quiesce callback when there are memory region add/remove and device state change requests from client, and in the quiesce callback, we will pause the subsystem so that it's safe to do everything after it, then after quiesce callback, we will resume the subsystem. The quiesce callback is also used in live migration, each device state change will quiesce the device first. Change-Id: I3a6a0320ad76c6b2d1d65c754b9f79cce5c9c683 Signed-off-by: Changpeng Liu Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/10620 Community-CI: Broadcom CI Community-CI: Mellanox Build Bot Tested-by: SPDK CI Jenkins Reviewed-by: Ben Walker Reviewed-by: Jim Harris --- lib/nvmf/vfio_user.c | 131 +++++++++++++++++- libvfio-user | 2 +- test/unit/lib/nvmf/vfio_user.c/vfio_user_ut.c | 6 + 3 files changed, 133 insertions(+), 6 deletions(-) diff --git a/lib/nvmf/vfio_user.c b/lib/nvmf/vfio_user.c index e019c969b..1b0f25339 100644 --- a/lib/nvmf/vfio_user.c +++ b/lib/nvmf/vfio_user.c @@ -214,6 +214,8 @@ struct nvmf_vfio_user_ctrlr { struct spdk_thread *thread; struct spdk_poller *vfu_ctx_poller; + bool queued_quiesce; + bool reset_shn; uint16_t cntlid; @@ -1622,7 +1624,7 @@ memory_region_add_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) pthread_mutex_unlock(&endpoint->lock); } -static int +static void memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) { struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); @@ -1632,7 +1634,7 @@ memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) int ret = 0; if (!info->vaddr) { - return 0; + return; } if (((uintptr_t)info->mapping.iov_base & MASK_2MB) || @@ -1640,7 +1642,7 @@ memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) SPDK_DEBUGLOG(nvmf_vfio, "Invalid memory region vaddr %p, IOVA 
%#lx-%#lx\n", info->vaddr, (uintptr_t)info->mapping.iov_base, (uintptr_t)info->mapping.iov_base + info->mapping.iov_len); - return 0; + return; } assert(endpoint != NULL); @@ -1678,8 +1680,6 @@ memory_region_remove_cb(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) ret); } } - - return 0; } static int @@ -1943,6 +1943,125 @@ init_pci_config_space(vfu_pci_config_space_t *p) p->hdr.intr.ipin = 0x1; } +static void +vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status); + +static void +vfio_user_dev_quiesce_resume_done(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_vfio_user_ctrlr *vu_ctrlr = cb_arg; + struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; + int ret; + + SPDK_DEBUGLOG(nvmf_vfio, "%s resumed done with status %d\n", ctrlr_id(vu_ctrlr), status); + + vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; + + /* Basically, once we call `vfu_device_quiesced` the device is unquiesced from + * libvfio-user's perspective so from the moment `vfio_user_dev_quiesce_done` returns + * libvfio-user might quiesce the device again. However, because the NVMf subsystem is + * an asynchronous operation, this quiesce might come _before_ the NVMf subsystem has + * been resumed, so in the callback of `spdk_nvmf_subsystem_resume` we need to check + * whether a quiesce was requested. 
+ */ + if (vu_ctrlr->queued_quiesce) { + SPDK_DEBUGLOG(nvmf_vfio, "%s has queued quiesce event, pause again\n", ctrlr_id(vu_ctrlr)); + vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; + ret = spdk_nvmf_subsystem_pause((struct spdk_nvmf_subsystem *)endpoint->subsystem, 0, + vfio_user_dev_quiesce_done, vu_ctrlr); + if (ret < 0) { + vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; + SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret); + } + } +} + +static void +vfio_user_dev_quiesce_done(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_vfio_user_ctrlr *vu_ctrlr = cb_arg; + struct nvmf_vfio_user_endpoint *endpoint = vu_ctrlr->endpoint; + int ret; + + SPDK_DEBUGLOG(nvmf_vfio, "%s paused done with status %d\n", ctrlr_id(vu_ctrlr), status); + + assert(vu_ctrlr->state == VFIO_USER_CTRLR_PAUSING); + vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; + vfu_device_quiesced(endpoint->vfu_ctx, status); + vu_ctrlr->queued_quiesce = false; + + SPDK_DEBUGLOG(nvmf_vfio, "%s start to resume\n", ctrlr_id(vu_ctrlr)); + vu_ctrlr->state = VFIO_USER_CTRLR_RESUMING; + ret = spdk_nvmf_subsystem_resume((struct spdk_nvmf_subsystem *)endpoint->subsystem, + vfio_user_dev_quiesce_resume_done, vu_ctrlr); + if (ret < 0) { + vu_ctrlr->state = VFIO_USER_CTRLR_PAUSED; + SPDK_ERRLOG("%s: failed to resume, ret=%d\n", endpoint_id(endpoint), ret); + } +} + +static int +vfio_user_dev_quiesce_cb(vfu_ctx_t *vfu_ctx) +{ + struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx); + struct nvmf_vfio_user_ctrlr *vu_ctrlr = endpoint->ctrlr; + int ret; + + if (!vu_ctrlr) { + return 0; + } + + /* NVMf library will destruct controller when no + * connected queue pairs. 
+ */ + if (!nvmf_subsystem_get_ctrlr((struct spdk_nvmf_subsystem *)endpoint->subsystem, + vu_ctrlr->cntlid)) { + return 0; + } + + SPDK_DEBUGLOG(nvmf_vfio, "%s starts to quiesce\n", ctrlr_id(vu_ctrlr)); + + /* There is no race condition here as device quiesce callback + * and nvmf_prop_set_cc() are running in the same thread context. + */ + if (!vu_ctrlr->ctrlr->vcprop.cc.bits.en) { + return 0; + } else if (!vu_ctrlr->ctrlr->vcprop.csts.bits.rdy) { + return 0; + } else if (vu_ctrlr->ctrlr->vcprop.csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { + return 0; + } + + switch (vu_ctrlr->state) { + case VFIO_USER_CTRLR_PAUSED: + return 0; + case VFIO_USER_CTRLR_RUNNING: + vu_ctrlr->state = VFIO_USER_CTRLR_PAUSING; + ret = spdk_nvmf_subsystem_pause((struct spdk_nvmf_subsystem *)endpoint->subsystem, 0, + vfio_user_dev_quiesce_done, vu_ctrlr); + if (ret < 0) { + vu_ctrlr->state = VFIO_USER_CTRLR_RUNNING; + SPDK_ERRLOG("%s: failed to pause, ret=%d\n", endpoint_id(endpoint), ret); + return 0; + } + break; + case VFIO_USER_CTRLR_RESUMING: + vu_ctrlr->queued_quiesce = true; + SPDK_DEBUGLOG(nvmf_vfio, "%s is busy to quiesce, current state %u\n", ctrlr_id(vu_ctrlr), + vu_ctrlr->state); + break; + default: + assert(vu_ctrlr->state != VFIO_USER_CTRLR_PAUSING); + break; + } + + errno = EBUSY; + return -1; +} + static int vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, struct nvmf_vfio_user_endpoint *endpoint) @@ -2058,6 +2177,8 @@ vfio_user_dev_info_fill(struct nvmf_vfio_user_transport *vu_transport, return ret; } + vfu_setup_device_quiesce_cb(vfu_ctx, vfio_user_dev_quiesce_cb); + ret = vfu_realize_ctx(vfu_ctx); if (ret < 0) { SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx); diff --git a/libvfio-user b/libvfio-user index 568429675..17769cf1a 160000 --- a/libvfio-user +++ b/libvfio-user @@ -1 +1 @@ -Subproject commit 56842967566dcf4b89514949a6e88ba89eeaf268 +Subproject commit 17769cf1af093dfb4b9bc3347ae39324029989ac diff --git 
a/test/unit/lib/nvmf/vfio_user.c/vfio_user_ut.c b/test/unit/lib/nvmf/vfio_user.c/vfio_user_ut.c index f0fa7ce75..d469a3f36 100644 --- a/test/unit/lib/nvmf/vfio_user.c/vfio_user_ut.c +++ b/test/unit/lib/nvmf/vfio_user.c/vfio_user_ut.c @@ -50,6 +50,10 @@ DEFINE_STUB(spdk_nvmf_qpair_disconnect, int, (struct spdk_nvmf_qpair *qpair, DEFINE_STUB(spdk_nvmf_subsystem_get_nqn, const char *, (const struct spdk_nvmf_subsystem *subsystem), NULL); DEFINE_STUB(spdk_bdev_get_block_size, uint32_t, (const struct spdk_bdev *bdev), 512); +DEFINE_STUB(spdk_nvmf_subsystem_pause, int, (struct spdk_nvmf_subsystem *subsystem, + uint32_t nsid, spdk_nvmf_subsystem_state_change_done cb_fn, void *cb_arg), 0); +DEFINE_STUB(spdk_nvmf_subsystem_resume, int, (struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_subsystem_state_change_done cb_fn, void *cb_arg), 0); DEFINE_STUB_V(nvmf_ctrlr_abort_aer, (struct spdk_nvmf_ctrlr *ctrlr)); DEFINE_STUB(nvmf_ctrlr_async_event_error_event, int, (struct spdk_nvmf_ctrlr *ctrlr, union spdk_nvme_async_event_completion event), 0); @@ -58,6 +62,8 @@ DEFINE_STUB(spdk_nvmf_qpair_get_listen_trid, int, (struct spdk_nvmf_qpair *qpair struct spdk_nvme_transport_id *trid), 0); DEFINE_STUB(spdk_nvme_transport_id_compare, int, (const struct spdk_nvme_transport_id *trid1, const struct spdk_nvme_transport_id *trid2), 0); +DEFINE_STUB(nvmf_subsystem_get_ctrlr, struct spdk_nvmf_ctrlr *, + (struct spdk_nvmf_subsystem *subsystem, uint16_t cntlid), NULL); static void * gpa_to_vva(void *prv, uint64_t addr, uint64_t len, int prot)