From a36bc251dfde6c53765da3e331df65044c92fe93 Mon Sep 17 00:00:00 2001 From: Ben Walker Date: Wed, 3 Aug 2022 13:28:23 -0700 Subject: [PATCH] env_dpdk: Automatically map PCI BARs into VFIO By doing the registration immediately upon mapping the BAR instead of when the memory is inserted into the spdk_mem_map, we're able to register BARs that are not 2MB multiples in size and alignment. The SPDK API for registering a BAR already returns the physical/io address in the map call, and it can be used directly without a call to spdk_mem_register(). If the user does elect to later register the BAR using spdk_mem_register(), we attempt to insert the 2MB aligned segments we can into the spdk_mem_map. Users may still need to register memory for a few reasons, such as making spdk_vtophys() work, or for setting up the BAR as a target for RDMA. These cases still require 2MB aligned and sized segments. Change-Id: I395ae8803ec4bf22703f6f76db54200949e82532 Signed-off-by: Ben Walker Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14017 Reviewed-by: Jim Harris Reviewed-by: Konrad Sztyber Reviewed-by: Shuhei Matsumoto Tested-by: SPDK CI Jenkins Community-CI: Mellanox Build Bot --- include/spdk/memory.h | 11 ++ lib/env_dpdk/env_internal.h | 3 + lib/env_dpdk/memory.c | 207 +++++++++++++++++++++++++---------- lib/env_dpdk/pci.c | 45 +++++++- lib/env_dpdk/pci_dpdk.c | 4 +- lib/env_dpdk/pci_dpdk.h | 4 +- lib/env_dpdk/pci_dpdk_2207.c | 23 +++- test/env/memory/memory_ut.c | 3 +- 8 files changed, 231 insertions(+), 69 deletions(-) diff --git a/include/spdk/memory.h b/include/spdk/memory.h index 218b37cb6..7d67b4a7e 100644 --- a/include/spdk/memory.h +++ b/include/spdk/memory.h @@ -8,6 +8,17 @@ #include "spdk/stdinc.h" +#ifndef __linux__ +#define VFIO_ENABLED 0 +#else +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) +#define VFIO_ENABLED 1 +#else +#define VFIO_ENABLED 0 +#endif +#endif + #ifdef __cplusplus extern "C" { #endif diff --git a/lib/env_dpdk/env_internal.h b/lib/env_dpdk/env_internal.h index 41774afd6..3516c0d05 100644 --- a/lib/env_dpdk/env_internal.h +++ b/lib/env_dpdk/env_internal.h @@ -33,6 +33,9 @@ void pci_env_fini(void); int mem_map_init(bool legacy_mem); int vtophys_init(void); +int vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size); +int vtophys_iommu_unmap_dma_bar(uint64_t vaddr); + struct rte_pci_device; /** diff --git a/lib/env_dpdk/memory.c b/lib/env_dpdk/memory.c index 208a6a763..3628e638d 100644 --- a/lib/env_dpdk/memory.c +++ b/lib/env_dpdk/memory.c @@ -22,12 +22,9 @@ #include "spdk/env_dpdk.h" #include "spdk/log.h" -#ifndef __linux__ -#define VFIO_ENABLED 0 -#else +#ifdef __linux__ #include #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) -#define VFIO_ENABLED 1 #include #include @@ -53,9 +50,6 @@ static struct vfio_cfg g_vfio = { .maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps), .mutex = PTHREAD_MUTEX_INITIALIZER }; - -#else -#define VFIO_ENABLED 0 #endif #endif @@ -765,19 +759,11 @@ static struct spdk_mem_map *g_phys_ref_map; #if VFIO_ENABLED static int -vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size) +_vfio_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size) { struct spdk_vfio_dma_map *dma_map; - uint64_t refcount; int ret; - refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL); - assert(refcount < UINT64_MAX); - if (refcount > 0) { - spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1); - return 0; - } - dma_map = calloc(1, sizeof(*dma_map)); if (dma_map == NULL) { return -ENOMEM; @@ -789,7 +775,6 @@ vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size) dma_map->map.iova = iova; dma_map->map.size = size; - pthread_mutex_lock(&g_vfio.mutex); if (g_vfio.device_ref == 0) { /* VFIO requires at least one device (IOMMU group) to be added to * a VFIO container before it is possible to perform any IOMMU @@ -816,18 +801,78 @@ vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size) out_insert: TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq); + return 0; +} + + +static int +vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size) +{ + uint64_t refcount; + int ret; + + refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL); + assert(refcount < UINT64_MAX); + if (refcount > 0) { + spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1); + return 0; + } + + pthread_mutex_lock(&g_vfio.mutex); + ret = _vfio_iommu_map_dma(vaddr, iova, size); pthread_mutex_unlock(&g_vfio.mutex); + if (ret) { + return ret; + } + spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1); return 0; } +int +vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size) +{ + int ret; + + pthread_mutex_lock(&g_vfio.mutex); + ret = _vfio_iommu_map_dma(vaddr, iova, size); + pthread_mutex_unlock(&g_vfio.mutex); + + return ret; +} + +static int +_vfio_iommu_unmap_dma(struct spdk_vfio_dma_map *dma_map) +{ + struct vfio_iommu_type1_dma_unmap unmap = {}; + int ret; + + if (g_vfio.device_ref == 0) { + /* Memory is not mapped anymore, just remove it's references */ + goto out_remove; + } + + unmap.argsz = sizeof(unmap); + unmap.flags = 0; + unmap.iova = dma_map->map.iova; + unmap.size = dma_map->map.size; + ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap); + if (ret) { + SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno); + } + +out_remove: + TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq); + free(dma_map); + return 0; +} + static int vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size) { struct spdk_vfio_dma_map *dma_map; uint64_t refcount; int ret; - struct vfio_iommu_type1_dma_unmap unmap = {}; pthread_mutex_lock(&g_vfio.mutex); TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) { @@ -857,25 +902,34 @@ vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size) /** don't support partial or multiple-page unmap for now */ assert(dma_map->map.size == size); - if (g_vfio.device_ref == 0) { - /* Memory is not mapped anymore, just remove it's references */ - goto out_remove; - } - - unmap.argsz = sizeof(unmap); - unmap.flags = 0; - unmap.iova = dma_map->map.iova; - unmap.size = dma_map->map.size; - ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap); - if (ret) { - SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno); - } - -out_remove: - TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq); + ret = _vfio_iommu_unmap_dma(dma_map); pthread_mutex_unlock(&g_vfio.mutex); - free(dma_map); - return 0; + + return ret; +} + +int +vtophys_iommu_unmap_dma_bar(uint64_t vaddr) +{ + struct spdk_vfio_dma_map *dma_map; + int ret; + + pthread_mutex_lock(&g_vfio.mutex); + TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) { + if (dma_map->map.vaddr == vaddr) { + break; + } + } + + if (dma_map == NULL) { + DEBUG_PRINT("Cannot clear DMA mapping for address %"PRIx64" - it's not mapped\n", vaddr); + pthread_mutex_unlock(&g_vfio.mutex); + return -ENXIO; + } + + ret = _vfio_iommu_unmap_dma(dma_map); + pthread_mutex_unlock(&g_vfio.mutex); + return ret; } #endif @@ -926,7 +980,7 @@ vtophys_get_paddr_pagemap(uint64_t vaddr) /* Try to get the paddr from pci devices */ static uint64_t -vtophys_get_paddr_pci(uint64_t vaddr) +vtophys_get_paddr_pci(uint64_t vaddr, size_t len) { struct spdk_vtophys_pci_device *vtophys_dev; uintptr_t paddr; @@ -935,7 +989,7 @@ vtophys_get_paddr_pci(uint64_t vaddr) pthread_mutex_lock(&g_vtophys_pci_devices_mutex); TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) { dev = vtophys_dev->pci_device; - paddr = dpdk_pci_device_vtophys(dev, vaddr); + paddr = dpdk_pci_device_vtophys(dev, vaddr, len); if (paddr != SPDK_VTOPHYS_ERROR) { pthread_mutex_unlock(&g_vtophys_pci_devices_mutex); return paddr; @@ -943,7 +997,7 @@ vtophys_get_paddr_pci(uint64_t vaddr) } pthread_mutex_unlock(&g_vtophys_pci_devices_mutex); - return SPDK_VTOPHYS_ERROR; + return SPDK_VTOPHYS_ERROR; } static int @@ -951,7 +1005,7 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map, enum spdk_mem_map_notify_action action, void *vaddr, size_t len) { - int rc = 0, pci_phys = 0; + int rc = 0; uint64_t paddr; if ((uintptr_t)vaddr & ~MASK_256TB) { @@ -972,6 +1026,30 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map, case SPDK_MEM_MAP_NOTIFY_REGISTER: if (paddr == SPDK_VTOPHYS_ERROR) { /* This is not an address that DPDK is managing. */ + + /* Check if this is a PCI BAR. They need special handling */ + paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len); + if (paddr != SPDK_VTOPHYS_ERROR) { + /* Get paddr for each 2MB chunk in this address range */ + while (len > 0) { + paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB); + if (paddr == SPDK_VTOPHYS_ERROR) { + DEBUG_PRINT("could not get phys addr for %p\n", vaddr); + return -EFAULT; + } + + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr); + if (rc != 0) { + return rc; + } + + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + + return 0; + } + #if VFIO_ENABLED enum rte_iova_mode iova_mode; @@ -999,34 +1077,21 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map, /* Get the physical address from /proc/self/pagemap. */ paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr); if (paddr == SPDK_VTOPHYS_ERROR) { - /* Get the physical address from PCI devices */ - paddr = vtophys_get_paddr_pci((uint64_t)vaddr); - if (paddr == SPDK_VTOPHYS_ERROR) { - DEBUG_PRINT("could not get phys addr for %p\n", vaddr); - return -EFAULT; - } - /* The beginning of this address range points to a PCI resource, - * so the rest must point to a PCI resource as well. - */ - pci_phys = 1; + DEBUG_PRINT("could not get phys addr for %p\n", vaddr); + return -EFAULT; } /* Get paddr for each 2MB chunk in this address range */ while (len > 0) { /* Get the physical address from /proc/self/pagemap. */ - if (pci_phys) { - paddr = vtophys_get_paddr_pci((uint64_t)vaddr); - } else { - paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr); - } + paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr); if (paddr == SPDK_VTOPHYS_ERROR) { DEBUG_PRINT("could not get phys addr for %p\n", vaddr); return -EFAULT; } - /* Since PCI paddr can break the 2MiB physical alignment skip this check for that. */ - if (!pci_phys && (paddr & MASK_2MB)) { + if (paddr & MASK_2MB) { DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr); return -EINVAL; } @@ -1075,7 +1140,33 @@ vtophys_notify(void *cb_ctx, struct spdk_mem_map *map, #if VFIO_ENABLED if (paddr == SPDK_VTOPHYS_ERROR) { /* - * This is not an address that DPDK is managing. If vfio is enabled, + * This is not an address that DPDK is managing. + */ + + /* Check if this is a PCI BAR. They need special handling */ + paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len); + if (paddr != SPDK_VTOPHYS_ERROR) { + /* Get paddr for each 2MB chunk in this address range */ + while (len > 0) { + paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB); + if (paddr == SPDK_VTOPHYS_ERROR) { + DEBUG_PRINT("could not get phys addr for %p\n", vaddr); + return -EFAULT; + } + + rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB); + if (rc != 0) { + return rc; + } + + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + + return 0; + } + + /* If vfio is enabled, * we need to unmap the range from the IOMMU */ if (spdk_iommu_is_enabled()) { diff --git a/lib/env_dpdk/pci.c b/lib/env_dpdk/pci.c index 71c0d8fc3..14813704e 100644 --- a/lib/env_dpdk/pci.c +++ b/lib/env_dpdk/pci.c @@ -11,6 +11,7 @@ #include "spdk/env.h" #include "spdk/log.h" #include "spdk/string.h" +#include "spdk/memory.h" #define SYSFS_PCI_DRIVERS "/sys/bus/pci/drivers" @@ -712,12 +713,54 @@ int spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar, void **mapped_addr, uint64_t *phys_addr, uint64_t *size) { - return dev->map_bar(dev, bar, mapped_addr, phys_addr, size); + int rc; + + rc = dev->map_bar(dev, bar, mapped_addr, phys_addr, size); + if (rc) { + return rc; + } + +#if VFIO_ENABLED + /* Automatically map the BAR to the IOMMU */ + if (!spdk_iommu_is_enabled()) { + return 0; + } + + if (rte_eal_iova_mode() == RTE_IOVA_VA) { + /* We'll use the virtual address as the iova to match DPDK. */ + rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), (uint64_t) * mapped_addr, *size); + if (rc) { + dev->unmap_bar(dev, bar, *mapped_addr); + return -EFAULT; + } + + *phys_addr = (uint64_t)(*mapped_addr); + } else { + /* We'll use the physical address as the iova to match DPDK. */ + rc = vtophys_iommu_map_dma_bar((uint64_t)(*mapped_addr), *phys_addr, *size); + if (rc) { + dev->unmap_bar(dev, bar, *mapped_addr); + return -EFAULT; + } + } +#endif + return rc; } int spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr) { +#if VFIO_ENABLED + int rc; + + if (spdk_iommu_is_enabled()) { + rc = vtophys_iommu_unmap_dma_bar((uint64_t)addr); + if (rc) { + return -EFAULT; + } + } +#endif + return dev->unmap_bar(dev, bar, addr); } diff --git a/lib/env_dpdk/pci_dpdk.c b/lib/env_dpdk/pci_dpdk.c index 48c087e5b..2ba855db2 100644 --- a/lib/env_dpdk/pci_dpdk.c +++ b/lib/env_dpdk/pci_dpdk.c @@ -44,9 +44,9 @@ dpdk_pci_init(void) } uint64_t -dpdk_pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr) +dpdk_pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr, size_t len) { - return g_dpdk_fn_table->pci_device_vtophys(dev, vaddr); + return g_dpdk_fn_table->pci_device_vtophys(dev, vaddr, len); } const char * diff --git a/lib/env_dpdk/pci_dpdk.h b/lib/env_dpdk/pci_dpdk.h index 414ea6709..300ccce2a 100644 --- a/lib/env_dpdk/pci_dpdk.h +++ b/lib/env_dpdk/pci_dpdk.h @@ -26,7 +26,7 @@ struct rte_pci_driver; struct rte_device; struct dpdk_fn_table { - uint64_t (*pci_device_vtophys)(struct rte_pci_device *dev, uint64_t vaddr); + uint64_t (*pci_device_vtophys)(struct rte_pci_device *dev, uint64_t vaddr, size_t len); const char *(*pci_device_get_name)(struct rte_pci_device *); struct rte_devargs *(*pci_device_get_devargs)(struct rte_pci_device *); void (*pci_device_copy_identifiers)(struct rte_pci_device *_dev, struct spdk_pci_device *dev); @@ -52,7 +52,7 @@ struct dpdk_fn_table { int dpdk_pci_init(void); -uint64_t dpdk_pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr); +uint64_t dpdk_pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr, size_t len); const char *dpdk_pci_device_get_name(struct rte_pci_device *); struct rte_devargs *dpdk_pci_device_get_devargs(struct rte_pci_device *); void dpdk_pci_device_copy_identifiers(struct rte_pci_device *_dev, struct spdk_pci_device *dev); diff --git a/lib/env_dpdk/pci_dpdk_2207.c b/lib/env_dpdk/pci_dpdk_2207.c index b0b7af34d..b238517f2 100644 --- a/lib/env_dpdk/pci_dpdk_2207.c +++ b/lib/env_dpdk/pci_dpdk_2207.c @@ -14,7 +14,7 @@ SPDK_STATIC_ASSERT(offsetof(struct spdk_pci_driver, driver) >= sizeof(struct rte "driver_buf not big enough"); static uint64_t -pci_device_vtophys_2207(struct rte_pci_device *dev, uint64_t vaddr) +pci_device_vtophys_2207(struct rte_pci_device *dev, uint64_t vaddr, size_t len) { struct rte_mem_resource *res; uint64_t paddr; @@ -22,11 +22,24 @@ pci_device_vtophys_2207(struct rte_pci_device *dev, uint64_t vaddr) for (r = 0; r < PCI_MAX_RESOURCE; r++) { res = &dev->mem_resource[r]; - if (res->phys_addr && vaddr >= (uint64_t)res->addr && - vaddr < (uint64_t)res->addr + res->len) { - paddr = res->phys_addr + (vaddr - (uint64_t)res->addr); - return paddr; + + if (res->phys_addr == 0 || vaddr < (uint64_t)res->addr || + (vaddr + len) >= (uint64_t)res->addr + res->len) { + continue; } + +#if VFIO_ENABLED + if (spdk_iommu_is_enabled() && rte_eal_iova_mode() == RTE_IOVA_VA) { + /* + * The IOMMU is on and we're using IOVA == VA. The BAR was + * automatically registered when it was mapped, so just return + * the virtual address here. + */ + return vaddr; + } +#endif + paddr = res->phys_addr + (vaddr - (uint64_t)res->addr); + return paddr; } return SPDK_VTOPHYS_ERROR; diff --git a/test/env/memory/memory_ut.c b/test/env/memory/memory_ut.c index dd0b07367..45d15cb6a 100644 --- a/test/env/memory/memory_ut.c +++ b/test/env/memory/memory_ut.c @@ -29,7 +29,8 @@ DEFINE_STUB(rte_vfio_noiommu_is_enabled, int, (void), 0); DEFINE_STUB(rte_memseg_get_fd_thread_unsafe, int, (const struct rte_memseg *ms), 0); DEFINE_STUB(rte_memseg_get_fd_offset_thread_unsafe, int, (const struct rte_memseg *ms, size_t *offset), 0); -DEFINE_STUB(dpdk_pci_device_vtophys, uint64_t, (struct rte_pci_device *dev, uint64_t vaddr), 0); +DEFINE_STUB(dpdk_pci_device_vtophys, uint64_t, (struct rte_pci_device *dev, uint64_t vaddr, + size_t len), 0); static int test_mem_map_notify(void *cb_ctx, struct spdk_mem_map *map,