From 81523d9dd23dd8d808cc66c2c3e3da7441270b5e Mon Sep 17 00:00:00 2001 From: Darek Stojaczyk Date: Sat, 23 Mar 2019 22:38:14 +0100 Subject: [PATCH] env/dpdk: register VFIO hotremove callback This is an attempt to fix device hotremove with VFIO. A soft device hotremove request through sysfs [1] would currently just block until the SPDK process manually releases that device - e.g. upon an RPC request. VFIO won't get unbound from the device untill userspace releases all its resources. VFIO can signal a pending hotremove request by kicking any file descriptor provided by the userspace - and DPDK does provide such descriptor - but SPDK does not listen on it. DPDK does offer handy API to listen and in this patch we make use of it inside our env/pci layer. Within a DPDK callback we set an internal per-device hotremove flag, which upper-layer SPDK drivers can poll with a new env API - spdk_pci_device_is_removed(). The VFIO hotremove event will be sent to primary processes only, so that's where we listen. We make use of this new API in the NVMe hotplug poller, which will process it just like any other supported hotremove event. Fixes #595 Fixes #690 [1] # echo 1 > /sys/bus/pci/devices//remove Change-Id: I03d88271c2089c740e232056d9340e5a640d442c Signed-off-by: Darek Stojaczyk Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/448927 Tested-by: SPDK CI Jenkins Reviewed-by: Shuhei Matsumoto Reviewed-by: Jim Harris --- CHANGELOG.md | 6 +++++ include/spdk/env.h | 14 +++++++++++ lib/env_dpdk/pci.c | 55 ++++++++++++++++++++++++++++++++++++++++++++ lib/nvme/nvme_pcie.c | 42 ++++++++++++++++++++++----------- 4 files changed, 103 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f32e4e542..552c11654 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,9 @@ performance. New API spdk_nvme_ctrlr_get_flags() was added. +NVMe hotplug poller is now able to detach devices hotremoved from the system +via `/sys/bus/pci/devices//remove` and `/sys/bus/pci/devices//driver/unbind`. + ### raid Added new strip_size_kb rpc param on create to replace the more ambiguous @@ -60,6 +63,9 @@ Default size is 4096. The `phys_addr` parameter in spdk_malloc() and spdk_zmalloc() has been deprecated. For retrieving physical addresses, spdk_vtophys() should be used instead. +spdk_pci_device_is_removed() has been added to let the upper-layer SPDK drivers know +that device has a pending external hotremove request. + ### DPDK Dropped support for DPDK 17.07 and earlier, which SPDK won't even compile with right now. diff --git a/include/spdk/env.h b/include/spdk/env.h index 534983d78..8273f1e32 100644 --- a/include/spdk/env.h +++ b/include/spdk/env.h @@ -653,6 +653,7 @@ struct spdk_pci_device { struct _spdk_pci_device_internal { struct spdk_pci_driver *driver; bool attached; + bool pending_removal; TAILQ_ENTRY(spdk_pci_device) tailq; } internal; }; @@ -977,6 +978,19 @@ int spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uin */ int spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset); +/** + * Check if device was requested to be removed from the process. This can be + * caused either by physical device hotremoval or OS-triggered removal. In the + * latter case, the device may continue to function properly even if this + * function returns \c true . The upper-layer driver may check this function + * periodically and eventually detach the device. + * + * \param dev PCI device. + * + * \return if device was requested to be removed + */ +bool spdk_pci_device_is_removed(struct spdk_pci_device *dev); + /** * Compare two PCI addresses. * diff --git a/lib/env_dpdk/pci.c b/lib/env_dpdk/pci.c index 657d95ecd..124735ae2 100644 --- a/lib/env_dpdk/pci.c +++ b/lib/env_dpdk/pci.c @@ -120,6 +120,42 @@ spdk_pci_driver_register(struct spdk_pci_driver *driver) TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq); } +#if RTE_VERSION >= RTE_VERSION_NUM(18, 5, 0, 0) +static void +spdk_pci_device_rte_hotremove(const char *device_name, + enum rte_dev_event_type event, + void *cb_arg) +{ + struct spdk_pci_device *dev; + + if (event != RTE_DEV_EVENT_REMOVE) { + return; + } + + pthread_mutex_lock(&g_pci_mutex); + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + struct rte_pci_device *rte_dev = dev->dev_handle; + + if (strcmp(rte_dev->name, device_name) == 0) { + if (!dev->internal.pending_removal && + !dev->internal.attached) { + /* if device is not attached, we + * can remove it right away. + */ + spdk_detach_rte(dev); + } else { + /* otherwise we let the upper layers + * detach it first. + */ + dev->internal.pending_removal = true; + } + break; + } + } + pthread_mutex_unlock(&g_pci_mutex); +} +#endif + void spdk_pci_init(void) { @@ -145,6 +181,13 @@ spdk_pci_init(void) rte_pci_register(&driver->driver); } #endif + +#if RTE_VERSION >= RTE_VERSION_NUM(18, 5, 0, 0) + /* Register a single hotremove callback for all devices. */ + if (spdk_process_is_primary()) { + rte_dev_event_callback_register(NULL, spdk_pci_device_rte_hotremove, NULL); + } +#endif } void @@ -159,6 +202,12 @@ spdk_pci_fini(void) fprintf(stderr, "Device %s is still attached at shutdown!\n", bdf); } } + +#if RTE_VERSION >= RTE_VERSION_NUM(18, 5, 0, 0) + if (spdk_process_is_primary()) { + rte_dev_event_callback_unregister(NULL, spdk_pci_device_rte_hotremove, NULL); + } +#endif } int @@ -535,6 +584,12 @@ spdk_pci_device_get_addr(struct spdk_pci_device *dev) return dev->addr; } +bool +spdk_pci_device_is_removed(struct spdk_pci_device *dev) +{ + return dev->internal.pending_removal; +} + int spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2) { diff --git a/lib/nvme/nvme_pcie.c b/lib/nvme/nvme_pcie.c index 21459716a..97ec78631 100644 --- a/lib/nvme/nvme_pcie.c +++ b/lib/nvme/nvme_pcie.c @@ -250,6 +250,13 @@ nvme_pcie_ctrlr_setup_signal(void) sigaction(SIGBUS, &sa, NULL); } +static inline struct nvme_pcie_ctrlr * +nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr) +{ + assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE); + return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr); +} + static int _nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx) { @@ -298,30 +305,37 @@ _nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx) /* This is a work around for vfio-attached device hot remove detection. */ TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) { - /* NVMe controller BAR must be mapped to secondary process space before any access. */ + bool do_remove = false; + + if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + if (spdk_pci_device_is_removed(pctrlr->devhandle)) { + do_remove = true; + } + } + + /* NVMe controller BAR must be mapped in the current process before any access. */ proc = spdk_nvme_ctrlr_get_current_process(ctrlr); if (proc) { csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); if (csts.raw == 0xffffffffU) { - nvme_ctrlr_fail(ctrlr, true); - if (probe_ctx->remove_cb) { - nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); - probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr); - nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); - } + do_remove = true; + } + } + + if (do_remove) { + nvme_ctrlr_fail(ctrlr, true); + if (probe_ctx->remove_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); } } } return 0; } -static inline struct nvme_pcie_ctrlr * -nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr) -{ - assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE); - return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr); -} - static inline struct nvme_pcie_qpair * nvme_pcie_qpair(struct spdk_nvme_qpair *qpair) {