env: Register external memory with DPDK

DPDK has added APIs for registering externally allocated
memory regions. Use them instead of doing our own thing.
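
For reference, the flow these DPDK APIs enable boils down to the sketch
below. This is an illustration written for this note, not code from the
patch: the helper name register_extmem_with_dpdk, the local SKETCH_2MB
constant, and the assumption of a single contiguous IOVA range are
invented; rte_extmem_register(), rte_dev_dma_map() and RTE_DEV_FOREACH()
are the real DPDK calls also used in the diff below.

    /* Illustrative sketch only (not SPDK code): register an externally
     * allocated, 2 MB-aligned region with DPDK and map it for DMA on
     * every PCI device DPDK knows about. Assumes a contiguous IOVA range. */
    #include <errno.h>
    #include <stdlib.h>
    #include <rte_dev.h>
    #include <rte_memory.h>

    #define SKETCH_2MB (2ULL * 1024 * 1024)

    static int
    register_extmem_with_dpdk(void *vaddr, size_t len, rte_iova_t first_iova)
    {
        unsigned int num_pages = len / SKETCH_2MB, i;
        rte_iova_t *iovas = calloc(num_pages, sizeof(rte_iova_t));
        struct rte_dev_iterator dev_iter;
        struct rte_device *dev;
        int rc;

        if (iovas == NULL) {
            return -ENOMEM;
        }
        for (i = 0; i < num_pages; i++) {
            iovas[i] = first_iova + (rte_iova_t)i * SKETCH_2MB;
        }

        /* Tell DPDK about the external memory and its per-page IOVAs. */
        rc = rte_extmem_register(vaddr, len, iovas, num_pages, SKETCH_2MB);
        if (rc == 0) {
            /* Program the DMA mapping for each PCI device. */
            RTE_DEV_FOREACH(dev, "bus=pci", &dev_iter) {
                rte_dev_dma_map(dev, vaddr, first_iova, len);
            }
        }
        free(iovas);
        return rc;
    }

Teardown mirrors this: rte_dev_dma_unmap() per device, then
rte_extmem_unregister(), as the unregister path in the diff does.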

We have to postpone the spdk_mem_unregister() call in
memory_hotplug_cb() because the SPDK mutex (g_spdk_mem_map_mutex)
and the DPDK mutex (memory_hotplug_lock) can be acquired in
opposite orders and deadlock: one thread calling spdk_free()
locks memory_hotplug_lock first and then tries to lock
g_spdk_mem_map_mutex, while another thread calling
vhost_session_mem_unregister() locks g_spdk_mem_map_mutex
first and then tries to lock memory_hotplug_lock.
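
A minimal sketch of that deferral pattern, with hypothetical names
(on_mem_free_event, drain_deferred_unregisters) standing in for the
actual SPDK symbols, assuming the DPDK event callback runs under
memory_hotplug_lock while the drain runs only after rte_free() returns:

    /* Sketch only; hypothetical names. The event callback, invoked by DPDK
     * under memory_hotplug_lock, merely queues the region. The drain runs
     * later, outside that lock, so taking the SPDK map mutex cannot deadlock. */
    #include <pthread.h>
    #include <stdlib.h>
    #include <sys/queue.h>

    struct deferred_region {
        void *addr;
        size_t len;
        TAILQ_ENTRY(deferred_region) tailq;
    };

    static TAILQ_HEAD(, deferred_region) g_deferred = TAILQ_HEAD_INITIALIZER(g_deferred);
    static pthread_mutex_t g_deferred_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* DPDK mem-event callback path: only record what to unregister. */
    static void
    on_mem_free_event(void *addr, size_t len)
    {
        struct deferred_region *r = calloc(1, sizeof(*r));

        if (r == NULL) {
            return; /* real code needs a better answer here */
        }
        r->addr = addr;
        r->len = len;
        pthread_mutex_lock(&g_deferred_mutex);
        TAILQ_INSERT_TAIL(&g_deferred, r, tailq);
        pthread_mutex_unlock(&g_deferred_mutex);
    }

    /* Called from spdk_free() after rte_free() returns, i.e. after DPDK
     * has released memory_hotplug_lock. */
    static void
    drain_deferred_unregisters(void (*unregister_fn)(void *addr, size_t len))
    {
        struct deferred_region *r;

        pthread_mutex_lock(&g_deferred_mutex);
        while ((r = TAILQ_FIRST(&g_deferred)) != NULL) {
            TAILQ_REMOVE(&g_deferred, r, tailq);
            unregister_fn(r->addr, r->len); /* e.g. spdk_mem_unregister */
            free(r);
        }
        pthread_mutex_unlock(&g_deferred_mutex);
    }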


Change-Id: I547b4ffc3987ef088a1b659addba1456ad760a71
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Signed-off-by: Maciej Szwed <maciej.szwed@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/3560
Community-CI: Broadcom CI
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Author: Maciej Szwed, 2020-10-02 12:23:46 +02:00 (committed by Tomasz Zawadzki)
parent 09a1028e0f
commit aaac48880d
4 changed files with 136 additions and 162 deletions

@@ -98,7 +98,19 @@ spdk_realloc(void *buf, size_t size, size_t align)
 void
 spdk_free(void *buf)
 {
+	struct spdk_mem_region *region, *tmp;
+
 	rte_free(buf);
+
+	pthread_mutex_lock(&g_spdk_mem_region_mutex);
+	g_spdk_mem_do_not_notify = true;
+	/* Perform memory unregister previously postponed in memory_hotplug_cb() */
+	TAILQ_FOREACH_SAFE(region, &g_spdk_mem_regions, tailq, tmp) {
+		spdk_mem_unregister(region->addr, region->len);
+		TAILQ_REMOVE(&g_spdk_mem_regions, region, tailq);
+		free(region);
+	}
+	g_spdk_mem_do_not_notify = false;
+	pthread_mutex_unlock(&g_spdk_mem_region_mutex);
 }
 
 void *

@@ -50,6 +50,18 @@
 #error RTE_VERSION is too old! Minimum 19.11 is required.
 #endif
 
+extern __thread bool g_spdk_mem_do_not_notify;
+
+extern struct spdk_mem_region_head g_spdk_mem_regions;
+extern pthread_mutex_t g_spdk_mem_region_mutex;
+
+struct spdk_mem_region {
+	void *addr;
+	size_t len;
+	TAILQ_ENTRY(spdk_mem_region) tailq;
+};
+
+TAILQ_HEAD(spdk_mem_region_head, spdk_mem_region);
+
 /* x86-64 and ARM userspace virtual addresses use only the low 48 bits [0..47],
  * which is enough to cover 256 TB.
  */

@@ -146,7 +146,10 @@ struct spdk_mem_map {
 static struct spdk_mem_map *g_mem_reg_map;
 static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
 	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
+struct spdk_mem_region_head g_spdk_mem_regions = TAILQ_HEAD_INITIALIZER(g_spdk_mem_regions);
 static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t g_spdk_mem_region_mutex = PTHREAD_MUTEX_INITIALIZER;
+__thread bool g_spdk_mem_do_not_notify;
 
 static bool g_legacy_mem;
@@ -732,7 +735,19 @@ memory_hotplug_cb(enum rte_mem_event event_type,
 			len -= seg->hugepage_sz;
 		}
 	} else if (event_type == RTE_MEM_EVENT_FREE) {
-		spdk_mem_unregister((void *)addr, len);
+		/* We need to postpone spdk_mem_unregister() call here to avoid
+		 * double lock of SPDK mutex (g_spdk_mem_map_mutex) and DPDK mutex
+		 * (memory_hotplug_lock) which may happen when one thread is calling
+		 * spdk_free() and another one is calling vhost_session_mem_unregister() */
+		struct spdk_mem_region *region;
+
+		region = calloc(1, sizeof(*region));
+		assert(region != NULL);
+		region->addr = (void *)addr;
+		region->len = len;
+
+		pthread_mutex_lock(&g_spdk_mem_region_mutex);
+		TAILQ_INSERT_TAIL(&g_spdk_mem_regions, region, tailq);
+		pthread_mutex_unlock(&g_spdk_mem_region_mutex);
 	}
 }
@@ -785,126 +800,6 @@ static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
 static struct spdk_mem_map *g_vtophys_map;
 static struct spdk_mem_map *g_phys_ref_map;
 
-#if VFIO_ENABLED
-static int
-vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
-{
-	struct spdk_vfio_dma_map *dma_map;
-	uint64_t refcount;
-	int ret;
-
-	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
-	assert(refcount < UINT64_MAX);
-	if (refcount > 0) {
-		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
-		return 0;
-	}
-
-	dma_map = calloc(1, sizeof(*dma_map));
-	if (dma_map == NULL) {
-		return -ENOMEM;
-	}
-
-	dma_map->map.argsz = sizeof(dma_map->map);
-	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
-	dma_map->map.vaddr = vaddr;
-	dma_map->map.iova = iova;
-	dma_map->map.size = size;
-
-	pthread_mutex_lock(&g_vfio.mutex);
-	if (g_vfio.device_ref == 0) {
-		/* VFIO requires at least one device (IOMMU group) to be added to
-		 * a VFIO container before it is possible to perform any IOMMU
-		 * operations on that container. This memory will be mapped once
-		 * the first device (IOMMU group) is hotplugged.
-		 *
-		 * Since the vfio container is managed internally by DPDK, it is
-		 * also possible that some device is already in that container, but
-		 * it's not managed by SPDK - e.g. an NIC attached internally
-		 * inside DPDK. We could map the memory straight away in such
-		 * scenario, but there's no need to do it. DPDK devices clearly
-		 * don't need our mappings and hence we defer the mapping
-		 * unconditionally until the first SPDK-managed device is
-		 * hotplugged.
-		 */
-		goto out_insert;
-	}
-
-	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
-	if (ret) {
-		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
-		pthread_mutex_unlock(&g_vfio.mutex);
-		free(dma_map);
-		return ret;
-	}
-
-out_insert:
-	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
-	pthread_mutex_unlock(&g_vfio.mutex);
-	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
-	return 0;
-}
-
-static int
-vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
-{
-	struct spdk_vfio_dma_map *dma_map;
-	uint64_t refcount;
-	int ret;
-	struct vfio_iommu_type1_dma_unmap unmap = {};
-
-	pthread_mutex_lock(&g_vfio.mutex);
-	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
-		if (dma_map->map.iova == iova) {
-			break;
-		}
-	}
-
-	if (dma_map == NULL) {
-		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
-		pthread_mutex_unlock(&g_vfio.mutex);
-		return -ENXIO;
-	}
-
-	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
-	assert(refcount < UINT64_MAX);
-	if (refcount > 0) {
-		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
-	}
-
-	/* We still have outstanding references, don't clear it. */
-	if (refcount > 1) {
-		pthread_mutex_unlock(&g_vfio.mutex);
-		return 0;
-	}
-
-	/** don't support partial or multiple-page unmap for now */
-	assert(dma_map->map.size == size);
-
-	if (g_vfio.device_ref == 0) {
-		/* Memory is not mapped anymore, just remove it's references */
-		goto out_remove;
-	}
-
-	unmap.argsz = sizeof(unmap);
-	unmap.flags = 0;
-	unmap.iova = dma_map->map.iova;
-	unmap.size = dma_map->map.size;
-	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
-	if (ret) {
-		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
-		pthread_mutex_unlock(&g_vfio.mutex);
-		return ret;
-	}
-
-out_remove:
-	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
-	pthread_mutex_unlock(&g_vfio.mutex);
-	free(dma_map);
-	return 0;
-}
-#endif
-
 static uint64_t
 vtophys_get_paddr_memseg(uint64_t vaddr)
 {
@@ -988,6 +883,14 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
 {
 	int rc = 0, pci_phys = 0;
 	uint64_t paddr;
+#if VFIO_ENABLED
+	uint32_t num_pages, i;
+	rte_iova_t *iovas = NULL;
+	rte_iova_t iova;
+	struct rte_dev_iterator dev_iter;
+	struct rte_device *dev;
+	const char *devstr = "bus=pci";
+#endif
 
 	if ((uintptr_t)vaddr & ~MASK_256TB) {
 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
@@ -1006,7 +909,9 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
 	switch (action) {
 	case SPDK_MEM_MAP_NOTIFY_REGISTER:
 		if (paddr == SPDK_VTOPHYS_ERROR) {
-			/* This is not an address that DPDK is managing. */
+			/* This is not an address that DPDK is managing currently.
+			 * Let's register it. */
+
 #if VFIO_ENABLED
 			enum rte_iova_mode iova_mode;
@@ -1015,19 +920,38 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
 			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
 				/* We'll use the virtual address as the iova to match DPDK. */
 				paddr = (uint64_t)vaddr;
-				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
-				if (rc) {
-					return -EFAULT;
-				}
-
-				while (len > 0) {
-					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
-					if (rc != 0) {
-						return rc;
-					}
-					vaddr += VALUE_2MB;
-					paddr += VALUE_2MB;
-					len -= VALUE_2MB;
-				}
+				num_pages = len / VALUE_2MB;
+				iovas = calloc(num_pages, sizeof(rte_iova_t));
+				if (iovas == NULL) {
+					return -ENOMEM;
+				}
+
+				for (i = 0; i < num_pages; i++) {
+					iovas[i] = paddr;
+					paddr += VALUE_2MB;
+				}
+
+				rc = rte_extmem_register(vaddr, len, iovas, num_pages, VALUE_2MB);
+				if (rc != 0) {
+					goto error;
+				}
+
+				/* For each device, map the memory */
+				RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
+					rte_dev_dma_map(dev, vaddr, (uint64_t)vaddr, len);
+				}
+
+				for (i = 0; i < num_pages; i++) {
+					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, iovas[i]);
+					if (rc != 0) {
+						goto error;
+					}
+					vaddr += VALUE_2MB;
+				}
+
+				free(iovas);
 			} else
 #endif
 			{
@@ -1038,7 +962,8 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
 				paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
 				if (paddr == SPDK_VTOPHYS_ERROR) {
 					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
-					return -EFAULT;
+					rc = -EFAULT;
+					goto error;
 				}
 
 				/* The beginning of this address range points to a PCI resource,
 				 * so the rest must point to a PCI resource as well.
@@ -1057,29 +982,39 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
 				if (paddr == SPDK_VTOPHYS_ERROR) {
 					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
-					return -EFAULT;
+					rc = -EFAULT;
+					goto error;
 				}
 				/* Since PCI paddr can break the 2MiB physical alignment skip this check for that. */
 				if (!pci_phys && (paddr & MASK_2MB)) {
 					DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
-					return -EINVAL;
+					rc = -EINVAL;
+					goto error;
 				}
 #if VFIO_ENABLED
 				/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
 				 * with the IOMMU using the physical address to match. */
 				if (spdk_iommu_is_enabled()) {
-					rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
-					if (rc) {
-						DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
-						return -EFAULT;
-					}
+					iova = paddr;
+
+					rc = rte_extmem_register(vaddr, VALUE_2MB, &iova, 1, VALUE_2MB);
+					if (rc != 0) {
+						goto error;
+					}
+
+					/* For each device, map the memory */
+					RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
+						rte_dev_dma_map(dev, vaddr, paddr, len);
+					}
 				}
 #endif
 				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
 				if (rc != 0) {
-					return rc;
+					goto error;
 				}
 
 				vaddr += VALUE_2MB;
@@ -1108,11 +1043,8 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
 			break;
 	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
 #if VFIO_ENABLED
-		if (paddr == SPDK_VTOPHYS_ERROR) {
-			/*
-			 * This is not an address that DPDK is managing. If vfio is enabled,
-			 * we need to unmap the range from the IOMMU
-			 */
+		if (g_spdk_mem_do_not_notify == false) {
+			/* If vfio is enabled, we need to unmap the range from the IOMMU */
 			if (spdk_iommu_is_enabled()) {
 				uint64_t buffer_len = len;
 				uint8_t *va = vaddr;
@@ -1131,29 +1063,33 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
 							    va, len, paddr, buffer_len);
 						return -EINVAL;
 					}
-					rc = vtophys_iommu_unmap_dma(paddr, len);
-					if (rc) {
-						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
-						return -EFAULT;
-					}
+
+					/* For each device, map the memory */
+					RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
+						rte_dev_dma_unmap(dev, vaddr, (uint64_t)vaddr, len);
+					}
+
+					rc = rte_extmem_unregister(vaddr, len);
+					if (rc != 0) {
+						return rc;
+					}
 				} else if (iova_mode == RTE_IOVA_PA) {
 					/* Get paddr for each 2MB chunk in this address range */
-					while (buffer_len > 0) {
-						paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
-						if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
-							DEBUG_PRINT("could not get phys addr for %p\n", va);
-							return -EFAULT;
-						}
-						rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
-						if (rc) {
-							DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
-							return -EFAULT;
-						}
-						va += VALUE_2MB;
-						buffer_len -= VALUE_2MB;
-					}
+					paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
+					if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
+						DEBUG_PRINT("could not get phys addr for %p\n", va);
+						return -EFAULT;
+					}
+
+					/* For each device, map the memory */
+					RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
+						rte_dev_dma_unmap(dev, vaddr, (uint64_t)vaddr, len);
+					}
+
+					rc = rte_extmem_unregister(vaddr, len);
+					if (rc != 0) {
+						return rc;
+					}
 				}
 			}
@@ -1174,6 +1110,13 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
 		SPDK_UNREACHABLE();
 	}
 
+	return rc;
+
+error:
+#if VFIO_ENABLED
+	free(iovas);
+#endif
 	return rc;
 }

@@ -65,6 +65,13 @@ DEFINE_STUB(rte_dev_event_callback_unregister, int, (const char *device_name,
 		rte_dev_event_cb_fn cb_fn, void *cb_arg), 0);
 DEFINE_STUB(rte_dev_event_monitor_start, int, (void), 0);
 DEFINE_STUB(rte_dev_event_monitor_stop, int, (void), 0);
+DEFINE_STUB(rte_extmem_register, int, (void *va_addr, size_t len, rte_iova_t iova_addrs[],
+		unsigned int n_pages, size_t page_sz), 0);
+DEFINE_STUB(rte_extmem_unregister, int, (void *va_addr, size_t len), 0);
+DEFINE_STUB(rte_dev_dma_map, int, (struct rte_device *dev, void *addr, uint64_t iova,
+		size_t len), 0);
+DEFINE_STUB(rte_dev_dma_unmap, int, (struct rte_device *dev, void *addr, uint64_t iova,
+		size_t len), 0);
 
 static int
 test_mem_map_notify(void *cb_ctx, struct spdk_mem_map *map,
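
The four new DEFINE_STUB lines above let the memory unit test link
without DPDK's external-memory implementation; each stub returns the
default given as the last argument, and SPDK's mock framework can also
override that value at run time. A hand-rolled equivalent of one stub,
shown only for illustration, would be roughly:

    #include <stddef.h>

    /* Roughly what the rte_extmem_unregister stub provides to the linker. */
    int
    rte_extmem_unregister(void *va_addr, size_t len)
    {
        (void)va_addr;
        (void)len;
        return 0; /* the default value from the DEFINE_STUB line */
    }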