env_dpdk/pci: delay device initialization on hotplug
A workaround for kernel deadlocks surfaced in #1275. DPDK basically offers two APIs for hotplugging all PCI devices: rte_bus_scan() and rte_bus_probe(). Scan iterates through /sys/bus/pci/devices/* and creates corresponding rte_pci_device-s, then rte_bus_probe() tries to initialize each device with the supporting driver. Previously we did scan and probe together, one after another, now we'll have an intermediate step. After scanning the bus, we'll iterate through all rte_pci_device-s and temporarily blacklist any newly detected devices. We'll use devargs->data field to a store a timeout value (integer) after which the device can be un-blacklisted and initialized. devargs->data is documented in DPDK as "Device string storage" and it's a char*, but it's not referenced anywhere in DPDK. rte_bus_probe() respects the blacklist and doesn't do absolutely anything with blacklisted ones. The timeout value is 2 seconds, which should be plenty enough for an NVMe device to reset, leave the critical lock sections in kernel, and let us initialize it safely. Note that direct attach by BDF doesn't respect the blacklist, so an NVMe attach RPC won't be delayed in any way, it will continue to work as it always did. Only the automatic discovery & enumeration is deferred. Change-Id: I62b719271bd0755bc2882331ea33f69897b1e5e5 Signed-off-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com> Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/1733 Community-CI: Mellanox Build Bot Reviewed-by: Jim Harris <james.r.harris@intel.com> Reviewed-by: Ben Walker <benjamin.walker@intel.com> Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
This commit is contained in:
parent
701d17f6d6
commit
814072fa4e
@ -34,6 +34,7 @@
|
||||
#include "env_internal.h"
|
||||
|
||||
#include <rte_alarm.h>
|
||||
#include <rte_devargs.h>
|
||||
#include "spdk/env.h"
|
||||
|
||||
#define SYSFS_PCI_DRIVERS "/sys/bus/pci/drivers"
|
||||
@ -235,6 +236,8 @@ cleanup_pci_devices(void)
|
||||
pthread_mutex_unlock(&g_pci_mutex);
|
||||
}
|
||||
|
||||
static int scan_pci_bus(bool delay_init);
|
||||
|
||||
void
|
||||
pci_env_init(void)
|
||||
{
|
||||
@ -259,6 +262,12 @@ pci_env_init(void)
|
||||
rte_pci_register(&driver->driver);
|
||||
}
|
||||
|
||||
/* We assume devices were present on the bus for more than 2 seconds
|
||||
* before initializing SPDK and there's no need to wait more. We scan
|
||||
* the bus, but we don't blacklist any devices.
|
||||
*/
|
||||
scan_pci_bus(false);
|
||||
|
||||
/* Register a single hotremove callback for all devices. */
|
||||
if (spdk_process_is_primary()) {
|
||||
rte_dev_event_callback_register(NULL, pci_device_rte_hotremove, NULL);
|
||||
@ -352,6 +361,11 @@ pci_device_fini(struct rte_pci_device *_dev)
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* remove our whitelist_at option */
|
||||
if (_dev->device.devargs) {
|
||||
_dev->device.devargs->data = NULL;
|
||||
}
|
||||
|
||||
assert(!dev->internal.removed);
|
||||
dev->internal.removed = true;
|
||||
pthread_mutex_unlock(&g_pci_mutex);
|
||||
@ -374,12 +388,76 @@ spdk_pci_device_detach(struct spdk_pci_device *dev)
|
||||
cleanup_pci_devices();
|
||||
}
|
||||
|
||||
static int
|
||||
scan_pci_bus(bool delay_init)
|
||||
{
|
||||
struct spdk_pci_driver *driver;
|
||||
struct rte_pci_device *rte_dev;
|
||||
uint64_t now;
|
||||
|
||||
rte_bus_scan();
|
||||
now = spdk_get_ticks();
|
||||
|
||||
driver = TAILQ_FIRST(&g_pci_drivers);
|
||||
if (!driver) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
TAILQ_FOREACH(rte_dev, &driver->driver.bus->device_list, next) {
|
||||
struct rte_devargs *da;
|
||||
|
||||
da = rte_dev->device.devargs;
|
||||
if (!da) {
|
||||
char devargs_str[128];
|
||||
|
||||
/* the device was never blacklisted or whitelisted */
|
||||
da = calloc(1, sizeof(*da));
|
||||
if (!da) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
snprintf(devargs_str, sizeof(devargs_str), "pci:%s", rte_dev->device.name);
|
||||
if (rte_devargs_parse(da, devargs_str) != 0) {
|
||||
free(da);
|
||||
return -1;
|
||||
}
|
||||
|
||||
rte_devargs_insert(&da);
|
||||
rte_dev->device.devargs = da;
|
||||
}
|
||||
|
||||
if (da->data) {
|
||||
uint64_t whitelist_at = (uint64_t)(uintptr_t)da->data;
|
||||
|
||||
/* this device was seen by spdk before... */
|
||||
if (da->policy == RTE_DEV_BLACKLISTED && whitelist_at <= now) {
|
||||
da->policy = RTE_DEV_WHITELISTED;
|
||||
}
|
||||
} else if ((driver->driver.bus->bus.conf.scan_mode == RTE_BUS_SCAN_WHITELIST &&
|
||||
da->policy == RTE_DEV_WHITELISTED) || da->policy != RTE_DEV_BLACKLISTED) {
|
||||
/* override the policy only if not permanently blacklisted */
|
||||
|
||||
if (delay_init) {
|
||||
da->policy = RTE_DEV_BLACKLISTED;
|
||||
da->data = (void *)(now + 2 * spdk_get_ticks_hz());
|
||||
} else {
|
||||
da->policy = RTE_DEV_WHITELISTED;
|
||||
da->data = (void *)(uintptr_t)now;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
spdk_pci_device_attach(struct spdk_pci_driver *driver,
|
||||
spdk_pci_enum_cb enum_cb,
|
||||
void *enum_ctx, struct spdk_pci_addr *pci_address)
|
||||
{
|
||||
struct spdk_pci_device *dev;
|
||||
struct rte_pci_device *rte_dev;
|
||||
struct rte_devargs *da;
|
||||
int rc;
|
||||
char bdf[32];
|
||||
|
||||
@ -433,7 +511,29 @@ spdk_pci_device_attach(struct spdk_pci_driver *driver,
|
||||
driver->cb_fn = NULL;
|
||||
|
||||
cleanup_pci_devices();
|
||||
return rc == 0 ? 0 : -1;
|
||||
|
||||
if (rc != 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* explicit attach ignores the whitelist, so if we blacklisted this
|
||||
* device before let's enable it now - just for clarity.
|
||||
*/
|
||||
TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
|
||||
if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert(dev != NULL);
|
||||
|
||||
rte_dev = dev->dev_handle;
|
||||
da = rte_dev->device.devargs;
|
||||
if (da && da->data) {
|
||||
da->data = (void *)(uintptr_t)spdk_get_ticks();
|
||||
da->policy = RTE_DEV_WHITELISTED;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Note: You can call spdk_pci_enumerate from more than one thread
|
||||
@ -473,7 +573,9 @@ spdk_pci_enumerate(struct spdk_pci_driver *driver,
|
||||
rte_pci_register(&driver->driver);
|
||||
}
|
||||
|
||||
rte_bus_scan();
|
||||
if (scan_pci_bus(true) != 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
driver->cb_fn = enum_cb;
|
||||
driver->cb_arg = enum_ctx;
|
||||
|
@ -112,11 +112,11 @@ timing_enter hotplug_test
|
||||
ssh_vm "LD_LIBRARY_PATH=/root//build/lib:/root/dpdk/build/lib:$LD_LIBRARY_PATH build/examples/hotplug -i 0 -t 25 -n 4 -r 8" &
|
||||
example_pid=$!
|
||||
|
||||
sleep 4
|
||||
sleep 6
|
||||
remove_devices
|
||||
sleep 4
|
||||
insert_devices
|
||||
sleep 4
|
||||
sleep 6
|
||||
remove_devices
|
||||
devices_delete
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user