diff --git a/lib/env_dpdk/env.c b/lib/env_dpdk/env.c
index ceacf3c79..c2801da2e 100644
--- a/lib/env_dpdk/env.c
+++ b/lib/env_dpdk/env.c
@@ -98,19 +98,7 @@ spdk_realloc(void *buf, size_t size, size_t align)
 void
 spdk_free(void *buf)
 {
-	struct spdk_mem_region *region, *tmp;
-
 	rte_free(buf);
-	pthread_mutex_lock(&g_spdk_mem_region_mutex);
-	g_spdk_mem_do_not_notify = true;
-	/* Perform memory unregister previously postponed in memory_hotplug_cb() */
-	TAILQ_FOREACH_SAFE(region, &g_spdk_mem_regions, tailq, tmp) {
-		spdk_mem_unregister(region->addr, region->len);
-		TAILQ_REMOVE(&g_spdk_mem_regions, region, tailq);
-		free(region);
-	}
-	g_spdk_mem_do_not_notify = false;
-	pthread_mutex_unlock(&g_spdk_mem_region_mutex);
 }
 
 void *
diff --git a/lib/env_dpdk/env_internal.h b/lib/env_dpdk/env_internal.h
index 513e9dcd7..670b8cc39 100644
--- a/lib/env_dpdk/env_internal.h
+++ b/lib/env_dpdk/env_internal.h
@@ -50,18 +50,6 @@
 #error RTE_VERSION is too old! Minimum 19.11 is required.
 #endif
 
-extern __thread bool g_spdk_mem_do_not_notify;
-extern struct spdk_mem_region_head g_spdk_mem_regions;
-extern pthread_mutex_t g_spdk_mem_region_mutex;
-
-struct spdk_mem_region {
-	void *addr;
-	size_t len;
-	TAILQ_ENTRY(spdk_mem_region) tailq;
-};
-
-TAILQ_HEAD(spdk_mem_region_head, spdk_mem_region);
-
 /* x86-64 and ARM userspace virtual addresses use only the low 48 bits [0..47],
  * which is enough to cover 256 TB.
  */
diff --git a/lib/env_dpdk/memory.c b/lib/env_dpdk/memory.c
index e5b77ccc5..dd30c79c1 100644
--- a/lib/env_dpdk/memory.c
+++ b/lib/env_dpdk/memory.c
@@ -146,10 +146,7 @@ struct spdk_mem_map {
 static struct spdk_mem_map *g_mem_reg_map;
 static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
 	TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
-struct spdk_mem_region_head g_spdk_mem_regions = TAILQ_HEAD_INITIALIZER(g_spdk_mem_regions);
 static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
-pthread_mutex_t g_spdk_mem_region_mutex = PTHREAD_MUTEX_INITIALIZER;
-__thread bool g_spdk_mem_do_not_notify;
 
 static bool g_legacy_mem;
 
@@ -735,19 +732,7 @@ memory_hotplug_cb(enum rte_mem_event event_type,
 			len -= seg->hugepage_sz;
 		}
 	} else if (event_type == RTE_MEM_EVENT_FREE) {
-		/* We need to postpone spdk_mem_unregister() call here to avoid
-		 * double lock of SPDK mutex (g_spdk_mem_map_mutex) and DPDK mutex
-		 * (memory_hotplug_lock) which may happen when one thread is calling
-		 * spdk_free() and another one is calling vhost_session_mem_unregister() */
-		struct spdk_mem_region *region;
-
-		region = calloc(1, sizeof(*region));
-		assert(region != NULL);
-		region->addr = (void *)addr;
-		region->len = len;
-		pthread_mutex_lock(&g_spdk_mem_region_mutex);
-		TAILQ_INSERT_TAIL(&g_spdk_mem_regions, region, tailq);
-		pthread_mutex_unlock(&g_spdk_mem_region_mutex);
+		spdk_mem_unregister((void *)addr, len);
 	}
 }
 
@@ -800,6 +785,126 @@ static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
 static struct spdk_mem_map *g_vtophys_map;
 static struct spdk_mem_map *g_phys_ref_map;
 
+#if VFIO_ENABLED
+static int
+vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
+{
+	struct spdk_vfio_dma_map *dma_map;
+	uint64_t refcount;
+	int ret;
+
+	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
+	assert(refcount < UINT64_MAX);
+	if (refcount > 0) {
+		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
+		return 0;
+	}
+
+	dma_map = calloc(1, sizeof(*dma_map));
+	if (dma_map == NULL) {
+		return -ENOMEM;
+	}
+
+	dma_map->map.argsz = sizeof(dma_map->map);
+	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+	dma_map->map.vaddr = vaddr;
+	dma_map->map.iova = iova;
+	dma_map->map.size = size;
+
+	pthread_mutex_lock(&g_vfio.mutex);
+	if (g_vfio.device_ref == 0) {
+		/* VFIO requires at least one device (IOMMU group) to be added to
+		 * a VFIO container before it is possible to perform any IOMMU
+		 * operations on that container. This memory will be mapped once
+		 * the first device (IOMMU group) is hotplugged.
+		 *
+		 * Since the vfio container is managed internally by DPDK, it is
+		 * also possible that some device is already in that container, but
+		 * it's not managed by SPDK - e.g. an NIC attached internally
+		 * inside DPDK. We could map the memory straight away in such
+		 * scenario, but there's no need to do it. DPDK devices clearly
+		 * don't need our mappings and hence we defer the mapping
+		 * unconditionally until the first SPDK-managed device is
+		 * hotplugged.
+		 */
+		goto out_insert;
+	}
+
+	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
+	if (ret) {
+		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
+		pthread_mutex_unlock(&g_vfio.mutex);
+		free(dma_map);
+		return ret;
+	}
+
+out_insert:
+	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
+	pthread_mutex_unlock(&g_vfio.mutex);
+	spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
+	return 0;
+}
+
+static int
+vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
+{
+	struct spdk_vfio_dma_map *dma_map;
+	uint64_t refcount;
+	int ret;
+	struct vfio_iommu_type1_dma_unmap unmap = {};
+
+	pthread_mutex_lock(&g_vfio.mutex);
+	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+		if (dma_map->map.iova == iova) {
+			break;
+		}
+	}
+
+	if (dma_map == NULL) {
+		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
+		pthread_mutex_unlock(&g_vfio.mutex);
+		return -ENXIO;
+	}
+
+	refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
+	assert(refcount < UINT64_MAX);
+	if (refcount > 0) {
+		spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
+	}
+
+	/* We still have outstanding references, don't clear it. */
+	if (refcount > 1) {
+		pthread_mutex_unlock(&g_vfio.mutex);
+		return 0;
+	}
+
+	/** don't support partial or multiple-page unmap for now */
+	assert(dma_map->map.size == size);
+
+	if (g_vfio.device_ref == 0) {
+		/* Memory is not mapped anymore, just remove its references */
+		goto out_remove;
+	}
+
+	unmap.argsz = sizeof(unmap);
+	unmap.flags = 0;
+	unmap.iova = dma_map->map.iova;
+	unmap.size = dma_map->map.size;
+	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
+	if (ret) {
+		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
+		pthread_mutex_unlock(&g_vfio.mutex);
+		return ret;
+	}
+
+out_remove:
+	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
+	pthread_mutex_unlock(&g_vfio.mutex);
+	free(dma_map);
+	return 0;
+}
+#endif
+
 static uint64_t
 vtophys_get_paddr_memseg(uint64_t vaddr)
 {
@@ -883,14 +988,6 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
 {
 	int rc = 0, pci_phys = 0;
 	uint64_t paddr;
-#if VFIO_ENABLED
-	uint32_t num_pages, i;
-	rte_iova_t *iovas = NULL;
-	rte_iova_t iova;
-	struct rte_dev_iterator dev_iter;
-	struct rte_device *dev;
-	const char *devstr = "bus=pci";
-#endif
 
 	if ((uintptr_t)vaddr & ~MASK_256TB) {
 		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
@@ -909,9 +1006,7 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
 	switch (action) {
 	case SPDK_MEM_MAP_NOTIFY_REGISTER:
 		if (paddr == SPDK_VTOPHYS_ERROR) {
-			/* This is not an address that DPDK is managing currently.
-			 * Let's register it. */
-
+			/* This is not an address that DPDK is managing. */
 #if VFIO_ENABLED
 			enum rte_iova_mode iova_mode;
 
@@ -920,38 +1015,19 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
 			if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
 				/* We'll use the virtual address as the iova to match DPDK. */
 				paddr = (uint64_t)vaddr;
-
-				num_pages = len / VALUE_2MB;
-				iovas = calloc(num_pages, sizeof(rte_iova_t));
-				if (iovas == NULL) {
-					return -ENOMEM;
+				rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
+				if (rc) {
+					return -EFAULT;
 				}
-
-				for (i = 0; i < num_pages; i++) {
-					iovas[i] = paddr;
-					paddr += VALUE_2MB;
-				}
-
-				rc = rte_extmem_register(vaddr, len, iovas, num_pages, VALUE_2MB);
-				if (rc != 0) {
-					goto error;
-				}
-
-				/* For each device, map the memory */
-				RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
-					rte_dev_dma_map(dev, vaddr, (uint64_t)vaddr, len);
-				}
-
-				for (i = 0; i < num_pages; i++) {
-					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, iovas[i]);
+				while (len > 0) {
+					rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
 					if (rc != 0) {
-						goto error;
+						return rc;
 					}
-
 					vaddr += VALUE_2MB;
+					paddr += VALUE_2MB;
+					len -= VALUE_2MB;
 				}
-
-				free(iovas);
 			} else
 #endif
 			{
@@ -962,8 +1038,7 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
 				paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
 				if (paddr == SPDK_VTOPHYS_ERROR) {
 					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
-					rc = -EFAULT;
-					goto error;
+					return -EFAULT;
 				}
 				/* The beginning of this address range points to a PCI resource,
 				 * so the rest must point to a PCI resource as well.
@@ -982,39 +1057,29 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
 
 				if (paddr == SPDK_VTOPHYS_ERROR) {
 					DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
-					rc = -EFAULT;
-					goto error;
-
+					return -EFAULT;
 				}
 
 				/* Since PCI paddr can break the 2MiB physical alignment skip this check for that. */
 				if (!pci_phys && (paddr & MASK_2MB)) {
 					DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
-					rc = -EINVAL;
-					goto error;
-
+					return -EINVAL;
 				}
#if VFIO_ENABLED
 				/* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
 				 * with the IOMMU using the physical address to match. */
 				if (spdk_iommu_is_enabled()) {
-					iova = paddr;
-
-					rc = rte_extmem_register(vaddr, VALUE_2MB, &iova, 1, VALUE_2MB);
-					if (rc != 0) {
-						goto error;
-					}
-
-					/* For each device, map the memory */
-					RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
-						rte_dev_dma_map(dev, vaddr, paddr, len);
+					rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
+					if (rc) {
+						DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
+						return -EFAULT;
 					}
 				}
#endif
 
 				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
 				if (rc != 0) {
-					goto error;
+					return rc;
 				}
 
 				vaddr += VALUE_2MB;
@@ -1043,8 +1108,11 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
 		break;
 	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
#if VFIO_ENABLED
-		if (g_spdk_mem_do_not_notify == false) {
-			/* If vfio is enabled, we need to unmap the range from the IOMMU */
+		if (paddr == SPDK_VTOPHYS_ERROR) {
+			/*
+			 * This is not an address that DPDK is managing. If vfio is enabled,
+			 * we need to unmap the range from the IOMMU
+			 */
 			if (spdk_iommu_is_enabled()) {
 				uint64_t buffer_len = len;
 				uint8_t *va = vaddr;
@@ -1063,33 +1131,29 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
 						    va, len, paddr, buffer_len);
 					return -EINVAL;
 				}
-
-				/* For each device, map the memory */
-				RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
-					rte_dev_dma_unmap(dev, vaddr, (uint64_t)vaddr, len);
-				}
-
-				rc = rte_extmem_unregister(vaddr, len);
-				if (rc != 0) {
-					return rc;
+				rc = vtophys_iommu_unmap_dma(paddr, len);
+				if (rc) {
+					DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
+					return -EFAULT;
 				}
 			} else if (iova_mode == RTE_IOVA_PA) {
 				/* Get paddr for each 2MB chunk in this address range */
-				paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
+				while (buffer_len > 0) {
+					paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
 
-				if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
-					DEBUG_PRINT("could not get phys addr for %p\n", va);
-					return -EFAULT;
-				}
+					if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
+						DEBUG_PRINT("could not get phys addr for %p\n", va);
+						return -EFAULT;
+					}
 
-				/* For each device, map the memory */
-				RTE_DEV_FOREACH(dev, devstr, &dev_iter) {
-					rte_dev_dma_unmap(dev, vaddr, (uint64_t)vaddr, len);
-				}
+					rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
+					if (rc) {
+						DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
+						return -EFAULT;
+					}
 
-				rc = rte_extmem_unregister(vaddr, len);
-				if (rc != 0) {
-					return rc;
+					va += VALUE_2MB;
+					buffer_len -= VALUE_2MB;
 				}
 			}
 		}
@@ -1110,13 +1174,6 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
 		SPDK_UNREACHABLE();
 	}
 
-	return rc;
-
-error:
-#if VFIO_ENABLED
-	free(iovas);
-#endif
-
 	return rc;
 }
diff --git a/test/env/memory/memory_ut.c b/test/env/memory/memory_ut.c
index f43232f3c..696cc7c1a 100644
--- a/test/env/memory/memory_ut.c
+++ b/test/env/memory/memory_ut.c
@@ -65,13 +65,6 @@ DEFINE_STUB(rte_dev_event_callback_unregister, int, (const char *device_name,
 		rte_dev_event_cb_fn cb_fn, void *cb_arg), 0);
 DEFINE_STUB(rte_dev_event_monitor_start, int, (void), 0);
 DEFINE_STUB(rte_dev_event_monitor_stop, int, (void), 0);
-DEFINE_STUB(rte_extmem_register, int, (void *va_addr, size_t len, rte_iova_t iova_addrs[],
-		unsigned int n_pages, size_t page_sz), 0);
-DEFINE_STUB(rte_extmem_unregister, int, (void *va_addr, size_t len), 0);
-DEFINE_STUB(rte_dev_dma_map, int, (struct rte_device *dev, void *addr, uint64_t iova,
-		size_t len), 0);
-DEFINE_STUB(rte_dev_dma_unmap, int, (struct rte_device *dev, void *addr, uint64_t iova,
-		size_t len), 0);
 
 static int
 test_mem_map_notify(void *cb_ctx, struct spdk_mem_map *map,