env_dpdk/pci: delay device initialization on hotplug

A workaround for kernel deadlocks surfaced in #1275.

DPDK basically offers two APIs for hotplugging all PCI devices:
rte_bus_scan() and rte_bus_probe(). Scan iterates through
/sys/bus/pci/devices/* and creates corresponding rte_pci_device-s,
then rte_bus_probe() tries to initialize each device with the
supporting driver.

Previously we did scan and probe together, one after another, now
we'll have an intermediate step. After scanning the bus, we'll
iterate through all rte_pci_device-s and temporarily blacklist any
newly detected devices. We'll use devargs->data field to a store
a timeout value (integer) after which the device can be un-blacklisted
and initialized. devargs->data is documented in DPDK as "Device
string storage" and it's a char*, but it's not referenced anywhere
in DPDK. rte_bus_probe() respects the blacklist and doesn't do
absolutely anything with blacklisted ones.

The timeout value is 2 seconds, which should be plenty enough
for an NVMe device to reset, leave the critical lock sections in
kernel, and let us initialize it safely.

Note that direct attach by BDF doesn't respect the blacklist,
so an NVMe attach RPC won't be delayed in any way, it will continue
to work as it always did. Only the automatic discovery & enumeration
is deferred.

Change-Id: I62b719271bd0755bc2882331ea33f69897b1e5e5
Signed-off-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/1733
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
This commit is contained in:
Darek Stojaczyk 2020-04-07 19:21:54 +02:00 committed by Jim Harris
parent 701d17f6d6
commit 814072fa4e
2 changed files with 106 additions and 4 deletions

View File

@ -34,6 +34,7 @@
#include "env_internal.h"
#include <rte_alarm.h>
#include <rte_devargs.h>
#include "spdk/env.h"
#define SYSFS_PCI_DRIVERS "/sys/bus/pci/drivers"
@ -235,6 +236,8 @@ cleanup_pci_devices(void)
pthread_mutex_unlock(&g_pci_mutex);
}
static int scan_pci_bus(bool delay_init);
void
pci_env_init(void)
{
@ -259,6 +262,12 @@ pci_env_init(void)
rte_pci_register(&driver->driver);
}
/* We assume devices were present on the bus for more than 2 seconds
* before initializing SPDK and there's no need to wait more. We scan
* the bus, but we don't blacklist any devices.
*/
scan_pci_bus(false);
/* Register a single hotremove callback for all devices. */
if (spdk_process_is_primary()) {
rte_dev_event_callback_register(NULL, pci_device_rte_hotremove, NULL);
@ -352,6 +361,11 @@ pci_device_fini(struct rte_pci_device *_dev)
return -1;
}
/* remove our whitelist_at option */
if (_dev->device.devargs) {
_dev->device.devargs->data = NULL;
}
assert(!dev->internal.removed);
dev->internal.removed = true;
pthread_mutex_unlock(&g_pci_mutex);
@ -374,12 +388,76 @@ spdk_pci_device_detach(struct spdk_pci_device *dev)
cleanup_pci_devices();
}
static int
scan_pci_bus(bool delay_init)
{
struct spdk_pci_driver *driver;
struct rte_pci_device *rte_dev;
uint64_t now;
rte_bus_scan();
now = spdk_get_ticks();
driver = TAILQ_FIRST(&g_pci_drivers);
if (!driver) {
return 0;
}
TAILQ_FOREACH(rte_dev, &driver->driver.bus->device_list, next) {
struct rte_devargs *da;
da = rte_dev->device.devargs;
if (!da) {
char devargs_str[128];
/* the device was never blacklisted or whitelisted */
da = calloc(1, sizeof(*da));
if (!da) {
return -1;
}
snprintf(devargs_str, sizeof(devargs_str), "pci:%s", rte_dev->device.name);
if (rte_devargs_parse(da, devargs_str) != 0) {
free(da);
return -1;
}
rte_devargs_insert(&da);
rte_dev->device.devargs = da;
}
if (da->data) {
uint64_t whitelist_at = (uint64_t)(uintptr_t)da->data;
/* this device was seen by spdk before... */
if (da->policy == RTE_DEV_BLACKLISTED && whitelist_at <= now) {
da->policy = RTE_DEV_WHITELISTED;
}
} else if ((driver->driver.bus->bus.conf.scan_mode == RTE_BUS_SCAN_WHITELIST &&
da->policy == RTE_DEV_WHITELISTED) || da->policy != RTE_DEV_BLACKLISTED) {
/* override the policy only if not permanently blacklisted */
if (delay_init) {
da->policy = RTE_DEV_BLACKLISTED;
da->data = (void *)(now + 2 * spdk_get_ticks_hz());
} else {
da->policy = RTE_DEV_WHITELISTED;
da->data = (void *)(uintptr_t)now;
}
}
}
return 0;
}
int
spdk_pci_device_attach(struct spdk_pci_driver *driver,
spdk_pci_enum_cb enum_cb,
void *enum_ctx, struct spdk_pci_addr *pci_address)
{
struct spdk_pci_device *dev;
struct rte_pci_device *rte_dev;
struct rte_devargs *da;
int rc;
char bdf[32];
@ -433,7 +511,29 @@ spdk_pci_device_attach(struct spdk_pci_driver *driver,
driver->cb_fn = NULL;
cleanup_pci_devices();
return rc == 0 ? 0 : -1;
if (rc != 0) {
return -1;
}
/* explicit attach ignores the whitelist, so if we blacklisted this
* device before let's enable it now - just for clarity.
*/
TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
break;
}
}
assert(dev != NULL);
rte_dev = dev->dev_handle;
da = rte_dev->device.devargs;
if (da && da->data) {
da->data = (void *)(uintptr_t)spdk_get_ticks();
da->policy = RTE_DEV_WHITELISTED;
}
return 0;
}
/* Note: You can call spdk_pci_enumerate from more than one thread
@ -473,7 +573,9 @@ spdk_pci_enumerate(struct spdk_pci_driver *driver,
rte_pci_register(&driver->driver);
}
rte_bus_scan();
if (scan_pci_bus(true) != 0) {
return -1;
}
driver->cb_fn = enum_cb;
driver->cb_arg = enum_ctx;

View File

@ -112,11 +112,11 @@ timing_enter hotplug_test
ssh_vm "LD_LIBRARY_PATH=/root//build/lib:/root/dpdk/build/lib:$LD_LIBRARY_PATH build/examples/hotplug -i 0 -t 25 -n 4 -r 8" &
example_pid=$!
sleep 4
sleep 6
remove_devices
sleep 4
insert_devices
sleep 4
sleep 6
remove_devices
devices_delete