From 814072fa4e5b957b028a4592666f72ec2e365b1d Mon Sep 17 00:00:00 2001 From: Darek Stojaczyk Date: Tue, 7 Apr 2020 19:21:54 +0200 Subject: [PATCH] env_dpdk/pci: delay device initialization on hotplug A workaround for kernel deadlocks surfaced in #1275. DPDK basically offers two APIs for hotplugging all PCI devices: rte_bus_scan() and rte_bus_probe(). Scan iterates through /sys/bus/pci/devices/* and creates corresponding rte_pci_device-s, then rte_bus_probe() tries to initialize each device with the supporting driver. Previously we did scan and probe together, one after another, now we'll have an intermediate step. After scanning the bus, we'll iterate through all rte_pci_device-s and temporarily blacklist any newly detected devices. We'll use devargs->data field to a store a timeout value (integer) after which the device can be un-blacklisted and initialized. devargs->data is documented in DPDK as "Device string storage" and it's a char*, but it's not referenced anywhere in DPDK. rte_bus_probe() respects the blacklist and doesn't do absolutely anything with blacklisted ones. The timeout value is 2 seconds, which should be plenty enough for an NVMe device to reset, leave the critical lock sections in kernel, and let us initialize it safely. Note that direct attach by BDF doesn't respect the blacklist, so an NVMe attach RPC won't be delayed in any way, it will continue to work as it always did. Only the automatic discovery & enumeration is deferred. Change-Id: I62b719271bd0755bc2882331ea33f69897b1e5e5 Signed-off-by: Darek Stojaczyk Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/1733 Community-CI: Mellanox Build Bot Reviewed-by: Jim Harris Reviewed-by: Ben Walker Tested-by: SPDK CI Jenkins --- lib/env_dpdk/pci.c | 106 ++++++++++++++++++++++++++++++++++++++++++- test/nvme/hotplug.sh | 4 +- 2 files changed, 106 insertions(+), 4 deletions(-) diff --git a/lib/env_dpdk/pci.c b/lib/env_dpdk/pci.c index fe5ceb306..06cb5dbd0 100644 --- a/lib/env_dpdk/pci.c +++ b/lib/env_dpdk/pci.c @@ -34,6 +34,7 @@ #include "env_internal.h" #include +#include #include "spdk/env.h" #define SYSFS_PCI_DRIVERS "/sys/bus/pci/drivers" @@ -235,6 +236,8 @@ cleanup_pci_devices(void) pthread_mutex_unlock(&g_pci_mutex); } +static int scan_pci_bus(bool delay_init); + void pci_env_init(void) { @@ -259,6 +262,12 @@ pci_env_init(void) rte_pci_register(&driver->driver); } + /* We assume devices were present on the bus for more than 2 seconds + * before initializing SPDK and there's no need to wait more. We scan + * the bus, but we don't blacklist any devices. + */ + scan_pci_bus(false); + /* Register a single hotremove callback for all devices. */ if (spdk_process_is_primary()) { rte_dev_event_callback_register(NULL, pci_device_rte_hotremove, NULL); @@ -352,6 +361,11 @@ pci_device_fini(struct rte_pci_device *_dev) return -1; } + /* remove our whitelist_at option */ + if (_dev->device.devargs) { + _dev->device.devargs->data = NULL; + } + assert(!dev->internal.removed); dev->internal.removed = true; pthread_mutex_unlock(&g_pci_mutex); @@ -374,12 +388,76 @@ spdk_pci_device_detach(struct spdk_pci_device *dev) cleanup_pci_devices(); } +static int +scan_pci_bus(bool delay_init) +{ + struct spdk_pci_driver *driver; + struct rte_pci_device *rte_dev; + uint64_t now; + + rte_bus_scan(); + now = spdk_get_ticks(); + + driver = TAILQ_FIRST(&g_pci_drivers); + if (!driver) { + return 0; + } + + TAILQ_FOREACH(rte_dev, &driver->driver.bus->device_list, next) { + struct rte_devargs *da; + + da = rte_dev->device.devargs; + if (!da) { + char devargs_str[128]; + + /* the device was never blacklisted or whitelisted */ + da = calloc(1, sizeof(*da)); + if (!da) { + return -1; + } + + snprintf(devargs_str, sizeof(devargs_str), "pci:%s", rte_dev->device.name); + if (rte_devargs_parse(da, devargs_str) != 0) { + free(da); + return -1; + } + + rte_devargs_insert(&da); + rte_dev->device.devargs = da; + } + + if (da->data) { + uint64_t whitelist_at = (uint64_t)(uintptr_t)da->data; + + /* this device was seen by spdk before... */ + if (da->policy == RTE_DEV_BLACKLISTED && whitelist_at <= now) { + da->policy = RTE_DEV_WHITELISTED; + } + } else if ((driver->driver.bus->bus.conf.scan_mode == RTE_BUS_SCAN_WHITELIST && + da->policy == RTE_DEV_WHITELISTED) || da->policy != RTE_DEV_BLACKLISTED) { + /* override the policy only if not permanently blacklisted */ + + if (delay_init) { + da->policy = RTE_DEV_BLACKLISTED; + da->data = (void *)(now + 2 * spdk_get_ticks_hz()); + } else { + da->policy = RTE_DEV_WHITELISTED; + da->data = (void *)(uintptr_t)now; + } + } + } + + return 0; +} + int spdk_pci_device_attach(struct spdk_pci_driver *driver, spdk_pci_enum_cb enum_cb, void *enum_ctx, struct spdk_pci_addr *pci_address) { struct spdk_pci_device *dev; + struct rte_pci_device *rte_dev; + struct rte_devargs *da; int rc; char bdf[32]; @@ -433,7 +511,29 @@ spdk_pci_device_attach(struct spdk_pci_driver *driver, driver->cb_fn = NULL; cleanup_pci_devices(); - return rc == 0 ? 0 : -1; + + if (rc != 0) { + return -1; + } + + /* explicit attach ignores the whitelist, so if we blacklisted this + * device before let's enable it now - just for clarity. + */ + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) { + break; + } + } + assert(dev != NULL); + + rte_dev = dev->dev_handle; + da = rte_dev->device.devargs; + if (da && da->data) { + da->data = (void *)(uintptr_t)spdk_get_ticks(); + da->policy = RTE_DEV_WHITELISTED; + } + + return 0; } /* Note: You can call spdk_pci_enumerate from more than one thread @@ -473,7 +573,9 @@ spdk_pci_enumerate(struct spdk_pci_driver *driver, rte_pci_register(&driver->driver); } - rte_bus_scan(); + if (scan_pci_bus(true) != 0) { + return -1; + } driver->cb_fn = enum_cb; driver->cb_arg = enum_ctx; diff --git a/test/nvme/hotplug.sh b/test/nvme/hotplug.sh index af20b2699..13011e193 100755 --- a/test/nvme/hotplug.sh +++ b/test/nvme/hotplug.sh @@ -112,11 +112,11 @@ timing_enter hotplug_test ssh_vm "LD_LIBRARY_PATH=/root//build/lib:/root/dpdk/build/lib:$LD_LIBRARY_PATH build/examples/hotplug -i 0 -t 25 -n 4 -r 8" & example_pid=$! -sleep 4 +sleep 6 remove_devices sleep 4 insert_devices -sleep 4 +sleep 6 remove_devices devices_delete