env/dpdk: register VFIO hotremove callback

This is an attempt to fix device hotremove with VFIO.

A soft device hotremove request through sysfs [1] would
currently just block until the SPDK process manually
releases that device - e.g. upon an RPC request.

VFIO won't get unbound from the device untill userspace
releases all its resources. VFIO can signal a pending
hotremove request by kicking any file descriptor provided
by the userspace - and DPDK does provide such descriptor -
but SPDK does not listen on it.

DPDK does offer handy API to listen and in this patch
we make use of it inside our env/pci layer. Within
a DPDK callback we set an internal per-device hotremove
flag, which upper-layer SPDK drivers can poll with a new
env API - spdk_pci_device_is_removed().

The VFIO hotremove event will be sent to primary
processes only, so that's where we listen.

We make use of this new API in the NVMe hotplug poller,
which will process it just like any other supported
hotremove event.

Fixes #595
Fixes #690

[1] # echo 1 > /sys/bus/pci/devices/<bdf>/remove

Change-Id: I03d88271c2089c740e232056d9340e5a640d442c
Signed-off-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/448927
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
This commit is contained in:
Darek Stojaczyk 2019-03-23 22:38:14 +01:00 committed by Jim Harris
parent 468c6c18bd
commit 81523d9dd2
4 changed files with 103 additions and 14 deletions

View File

@ -25,6 +25,9 @@ performance.
New API spdk_nvme_ctrlr_get_flags() was added.
NVMe hotplug poller is now able to detach devices hotremoved from the system
via `/sys/bus/pci/devices/<bdf>/remove` and `/sys/bus/pci/devices/<bdf>/driver/unbind`.
### raid
Added new strip_size_kb rpc param on create to replace the more ambiguous
@ -60,6 +63,9 @@ Default size is 4096.
The `phys_addr` parameter in spdk_malloc() and spdk_zmalloc() has been deprecated.
For retrieving physical addresses, spdk_vtophys() should be used instead.
spdk_pci_device_is_removed() has been added to let the upper-layer SPDK drivers know
that device has a pending external hotremove request.
### DPDK
Dropped support for DPDK 17.07 and earlier, which SPDK won't even compile with right now.

View File

@ -653,6 +653,7 @@ struct spdk_pci_device {
struct _spdk_pci_device_internal {
struct spdk_pci_driver *driver;
bool attached;
bool pending_removal;
TAILQ_ENTRY(spdk_pci_device) tailq;
} internal;
};
@ -977,6 +978,19 @@ int spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uin
*/
int spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset);
/**
* Check if device was requested to be removed from the process. This can be
* caused either by physical device hotremoval or OS-triggered removal. In the
* latter case, the device may continue to function properly even if this
* function returns \c true . The upper-layer driver may check this function
* periodically and eventually detach the device.
*
* \param dev PCI device.
*
* \return if device was requested to be removed
*/
bool spdk_pci_device_is_removed(struct spdk_pci_device *dev);
/**
* Compare two PCI addresses.
*

View File

@ -120,6 +120,42 @@ spdk_pci_driver_register(struct spdk_pci_driver *driver)
TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
}
#if RTE_VERSION >= RTE_VERSION_NUM(18, 5, 0, 0)
static void
spdk_pci_device_rte_hotremove(const char *device_name,
enum rte_dev_event_type event,
void *cb_arg)
{
struct spdk_pci_device *dev;
if (event != RTE_DEV_EVENT_REMOVE) {
return;
}
pthread_mutex_lock(&g_pci_mutex);
TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
struct rte_pci_device *rte_dev = dev->dev_handle;
if (strcmp(rte_dev->name, device_name) == 0) {
if (!dev->internal.pending_removal &&
!dev->internal.attached) {
/* if device is not attached, we
* can remove it right away.
*/
spdk_detach_rte(dev);
} else {
/* otherwise we let the upper layers
* detach it first.
*/
dev->internal.pending_removal = true;
}
break;
}
}
pthread_mutex_unlock(&g_pci_mutex);
}
#endif
void
spdk_pci_init(void)
{
@ -145,6 +181,13 @@ spdk_pci_init(void)
rte_pci_register(&driver->driver);
}
#endif
#if RTE_VERSION >= RTE_VERSION_NUM(18, 5, 0, 0)
/* Register a single hotremove callback for all devices. */
if (spdk_process_is_primary()) {
rte_dev_event_callback_register(NULL, spdk_pci_device_rte_hotremove, NULL);
}
#endif
}
void
@ -159,6 +202,12 @@ spdk_pci_fini(void)
fprintf(stderr, "Device %s is still attached at shutdown!\n", bdf);
}
}
#if RTE_VERSION >= RTE_VERSION_NUM(18, 5, 0, 0)
if (spdk_process_is_primary()) {
rte_dev_event_callback_unregister(NULL, spdk_pci_device_rte_hotremove, NULL);
}
#endif
}
int
@ -535,6 +584,12 @@ spdk_pci_device_get_addr(struct spdk_pci_device *dev)
return dev->addr;
}
bool
spdk_pci_device_is_removed(struct spdk_pci_device *dev)
{
return dev->internal.pending_removal;
}
int
spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
{

View File

@ -250,6 +250,13 @@ nvme_pcie_ctrlr_setup_signal(void)
sigaction(SIGBUS, &sa, NULL);
}
static inline struct nvme_pcie_ctrlr *
nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
{
assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE);
return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr);
}
static int
_nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx)
{
@ -298,30 +305,37 @@ _nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx)
/* This is a work around for vfio-attached device hot remove detection. */
TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) {
/* NVMe controller BAR must be mapped to secondary process space before any access. */
bool do_remove = false;
if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
if (spdk_pci_device_is_removed(pctrlr->devhandle)) {
do_remove = true;
}
}
/* NVMe controller BAR must be mapped in the current process before any access. */
proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
if (proc) {
csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
if (csts.raw == 0xffffffffU) {
nvme_ctrlr_fail(ctrlr, true);
if (probe_ctx->remove_cb) {
nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr);
nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
}
do_remove = true;
}
}
if (do_remove) {
nvme_ctrlr_fail(ctrlr, true);
if (probe_ctx->remove_cb) {
nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
probe_ctx->remove_cb(probe_ctx->cb_ctx, ctrlr);
nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
}
}
}
return 0;
}
static inline struct nvme_pcie_ctrlr *
nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
{
assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE);
return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr);
}
static inline struct nvme_pcie_qpair *
nvme_pcie_qpair(struct spdk_nvme_qpair *qpair)
{