From 55df83ceb6203cccb266cec93dba3315315e2f96 Mon Sep 17 00:00:00 2001 From: Michael Haeuptle Date: Mon, 30 Mar 2020 21:28:31 +0000 Subject: [PATCH] ENV_DPDK/VFIO: Increase PCI tear down timeout When removing a large number of devices (>8) in parallel, the 20ms timeout is not long enough. As part of spdk_detach_cb, DPDK calls into the VFIO driver which may get delayed due to multiple hot removes being processed by pciehp driver (pciehp IRQ thread function is handling the actual removal of a device in parallel, but all of the IRQ thread functions compete for a global mutex increasing processing time and race conditions). Signed-off-by: Michael Haeuptle Change-Id: I470fbbee92dac9677082c873781efe41e2941cd5 Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/1588 Tested-by: SPDK CI Jenkins Reviewed-by: Darek Stojaczyk Reviewed-by: Ben Walker --- lib/env_dpdk/pci.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/env_dpdk/pci.c b/lib/env_dpdk/pci.c index 5cfa63619..7cf9be9b0 100644 --- a/lib/env_dpdk/pci.c +++ b/lib/env_dpdk/pci.c @@ -133,8 +133,8 @@ spdk_detach_rte(struct spdk_pci_device *dev) dev->internal.pending_removal = true; if (spdk_process_is_primary() && !pthread_equal(g_dpdk_tid, pthread_self())) { rte_eal_alarm_set(1, spdk_detach_rte_cb, rte_dev); - /* wait up to 20ms for the cb to start executing */ - for (i = 20; i > 0; i--) { + /* wait up to 2s for the cb to finish executing */ + for (i = 2000; i > 0; i--) { spdk_delay_us(1000); pthread_mutex_lock(&g_pci_mutex); @@ -149,7 +149,7 @@ spdk_detach_rte(struct spdk_pci_device *dev) /* besides checking the removed flag, we also need to wait * for the dpdk detach function to unwind, as it's doing some * operations even after calling our detach callback. Simply - * cancell the alarm - if it started executing already, this + * cancel the alarm - if it started executing already, this * call will block and wait for it to finish. 
*/ rte_eal_alarm_cancel(spdk_detach_rte_cb, rte_dev); @@ -163,6 +163,8 @@ spdk_detach_rte(struct spdk_pci_device *dev) if (!removed) { fprintf(stderr, "Timeout waiting for DPDK to remove PCI device %s.\n", rte_dev->name); + /* If we reach this state, then the device couldn't be removed and most likely + a subsequent hot add of a device in the same BDF will fail */ } } else { spdk_detach_rte_cb(rte_dev);