vtophys: remap vfio dma memory when necessary
VFIO requires at least one IOMMU group to be added to the VFIO container
before any IOMMU operations can be performed on that container. [1] Without
any groups added, VFIO_IOMMU_MAP_DMA always fails with errno 22 (Invalid
argument). Also, if the last IOMMU group is removed from the container
(device hot-remove), all IOMMU mappings are lost. In both cases we need to
remap vfio memory as soon as the first IOMMU group is attached. The attach
is done inside DPDK during device attach and we can't hook into it directly.
Instead, this patch hooks into our PCI init/fini callbacks. There is now a
PCI device reference counter in our vfio manager and a history of all
registered memory pages. When the refcount is increased from 0 to 1, vtophys
remaps all vfio DMA memory.

[1] https://www.kernel.org/doc/Documentation/vfio.txt
    "On its own, the container provides little functionality, with all but
    a couple version and extension query interfaces locked away. The user
    needs to add a group into the container for the next level of
    functionality. [...] With a group (or groups) attached to a container,
    the remaining ioctls become available, enabling access to the VFIO
    IOMMU interfaces."

Change-Id: I744e07043dbe7ffd433fc95d604dad39647675f4
Signed-off-by: Dariusz Stojaczyk <dariuszx.stojaczyk@intel.com>
Reviewed-on: https://review.gerrithub.io/390655
Tested-by: SPDK Automated Test System <sys_sgsw@intel.com>
Reviewed-by: Pawel Wodkowski <pawelx.wodkowski@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
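For illustration, here is a minimal sketch of the deferral pattern described
above, not the actual SPDK implementation: DMA mapping requests are recorded
up front and replayed against the VFIO container once the first device (IOMMU
group) is attached, because VFIO_IOMMU_MAP_DMA fails with EINVAL on a
group-less container. The helper names record_dma_map() and device_attached(),
the fixed-size g_maps array, and the externally opened container_fd are
assumptions made only for this sketch; locking is omitted for brevity.

/* Sketch only: record mappings, replay them when the first device attaches. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

#define MAX_MAPS 64

static struct vfio_iommu_type1_dma_map g_maps[MAX_MAPS];
static unsigned g_nmaps;
static unsigned g_device_ref; /* number of attached DMA-capable devices */

/* Remember a requested mapping. The ioctl is only issued if a device is
 * already attached, because VFIO_IOMMU_MAP_DMA returns EINVAL (errno 22)
 * on a container that has no IOMMU groups yet. */
static int
record_dma_map(int container_fd, uint64_t vaddr, uint64_t iova, uint64_t size)
{
	struct vfio_iommu_type1_dma_map *m;

	if (g_nmaps == MAX_MAPS) {
		return -ENOMEM;
	}

	m = &g_maps[g_nmaps++];
	memset(m, 0, sizeof(*m));
	m->argsz = sizeof(*m);
	m->flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	m->vaddr = vaddr;
	m->iova = iova;
	m->size = size;

	if (g_device_ref == 0) {
		return 0; /* defer until the first device (IOMMU group) attaches */
	}

	return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, m) ? -errno : 0;
}

/* Call from the device-attach path. On the 0 -> 1 refcount transition the
 * container has just gained its first IOMMU group, so every recorded
 * mapping is replayed now. */
static void
device_attached(int container_fd)
{
	unsigned i;

	if (g_device_ref++ != 0) {
		return;
	}

	for (i = 0; i < g_nmaps; i++) {
		if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &g_maps[i]) != 0) {
			fprintf(stderr, "remap failed: %s\n", strerror(errno));
		}
	}
}

The patch below keeps the same bookkeeping in a mutex-protected TAILQ and
additionally unmaps the recorded memory when the last device detaches.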
@@ -87,4 +87,16 @@ int spdk_pci_device_attach(struct spdk_pci_enum_ctx *ctx, spdk_pci_enum_cb enum_
 void spdk_mem_map_init(void);
 void spdk_vtophys_init(void);
+
+/**
+ * Increase the refcount of active DMA-capable devices managed by SPDK.
+ * This must be called after a `rte_pci_device` is created.
+ */
+void spdk_vtophys_get_ref(void);
+
+/**
+ * Decrease the refcount of active DMA-capable devices managed by SPDK.
+ * This must be called before a `rte_pci_device` is destroyed.
+ */
+void spdk_vtophys_put_ref(void);
 
 #endif
@@ -45,6 +45,7 @@ spdk_pci_device_init(struct rte_pci_driver *driver,
 		     struct rte_pci_device *device)
 {
 	struct spdk_pci_enum_ctx *ctx = (struct spdk_pci_enum_ctx *)driver;
+	int rc;
 
 	if (!ctx->cb_fn) {
 #if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4)
@@ -66,12 +67,19 @@ spdk_pci_device_init(struct rte_pci_driver *driver,
 		usleep(500 * 1000);
 	}
 
-	return ctx->cb_fn(ctx->cb_arg, (struct spdk_pci_device *)device);
+	rc = ctx->cb_fn(ctx->cb_arg, (struct spdk_pci_device *)device);
+	if (rc != 0) {
+		return rc;
+	}
+
+	spdk_vtophys_get_ref();
+	return 0;
 }
 
 int
 spdk_pci_device_fini(struct rte_pci_device *device)
 {
+	spdk_vtophys_put_ref();
 	return 0;
 }
 
@@ -56,15 +56,27 @@
 /* Internal DPDK function forward declaration */
 int pci_vfio_is_enabled(void);
 
+struct spdk_vfio_dma_map {
+	struct vfio_iommu_type1_dma_map map;
+	TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
+};
+
 struct vfio_cfg {
 	int fd;
 	bool enabled;
+	unsigned device_ref;
+	TAILQ_HEAD(, spdk_vfio_dma_map) maps;
+	pthread_mutex_t mutex;
 };
 
 static struct vfio_cfg g_vfio = {
 	.fd = -1,
-	.enabled = false
+	.enabled = false,
+	.device_ref = 0,
+	.maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
+	.mutex = PTHREAD_MUTEX_INITIALIZER
 };
 
 #else
 #define SPDK_VFIO_ENABLED 0
 #endif
@@ -82,42 +94,98 @@ static struct spdk_mem_map *g_vtophys_map;
 static int
 vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
 {
-	struct vfio_iommu_type1_dma_map dma_map;
+	struct spdk_vfio_dma_map *dma_map;
 	int ret;
 
-	dma_map.argsz = sizeof(dma_map);
-	dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
-	dma_map.vaddr = vaddr;
-	dma_map.iova = iova;
-	dma_map.size = size;
-
-	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map);
-
-	if (ret) {
-		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
+	dma_map = calloc(1, sizeof(*dma_map));
+	if (dma_map == NULL) {
+		return -ENOMEM;
 	}
 
-	return ret;
+	dma_map->map.argsz = sizeof(dma_map->map);
+	dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+	dma_map->map.vaddr = vaddr;
+	dma_map->map.iova = iova;
+	dma_map->map.size = size;
+
+	pthread_mutex_lock(&g_vfio.mutex);
+	if (g_vfio.device_ref == 0) {
+		/* VFIO requires at least one device (IOMMU group) to be added to
+		 * a VFIO container before it is possible to perform any IOMMU
+		 * operations on that container. This memory will be mapped once
+		 * the first device (IOMMU group) is hotplugged.
+		 *
+		 * Since the vfio container is managed internally by DPDK, it is
+		 * also possible that some device is already in that container, but
+		 * it's not managed by SPDK - e.g. a NIC attached internally
+		 * inside DPDK. We could map the memory straight away in such
+		 * scenario, but there's no need to do it. DPDK devices clearly
+		 * don't need our mappings and hence we defer the mapping
+		 * unconditionally until the first SPDK-managed device is
+		 * hotplugged.
+		 */
+		goto out_insert;
+	}
+
+	ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
+	if (ret) {
+		DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
+		pthread_mutex_unlock(&g_vfio.mutex);
+		free(dma_map);
+		return ret;
+	}
+
+out_insert:
+	TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
+	pthread_mutex_unlock(&g_vfio.mutex);
+	return 0;
 }
 
 static int
 vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
 {
 	struct vfio_iommu_type1_dma_unmap dma_unmap;
+	struct spdk_vfio_dma_map *dma_map;
 	int ret;
 
+	pthread_mutex_lock(&g_vfio.mutex);
+	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+		if (dma_map->map.iova == iova) {
+			break;
+		}
+	}
+
+	if (dma_map == NULL) {
+		DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
+		pthread_mutex_unlock(&g_vfio.mutex);
+		return -ENXIO;
+	}
+
+	/** don't support partial or multiple-page unmap for now */
+	assert(dma_map->map.size == size);
+
+	if (g_vfio.device_ref == 0) {
+		/* Memory is not mapped anymore, just remove its references */
+		goto out_remove;
+	}
+
 	dma_unmap.argsz = sizeof(dma_unmap);
 	dma_unmap.flags = 0;
 	dma_unmap.iova = iova;
 	dma_unmap.size = size;
 
 	ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
 
 	if (ret) {
 		DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
+		pthread_mutex_unlock(&g_vfio.mutex);
+		return ret;
 	}
 
-	return ret;
+out_remove:
+	TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
+	pthread_mutex_unlock(&g_vfio.mutex);
+	free(dma_map);
+	return 0;
 }
 #endif
 
@@ -312,6 +380,75 @@ spdk_vtophys_iommu_init(void)
 }
 #endif
 
+void
+spdk_vtophys_get_ref(void)
+{
+#if SPDK_VFIO_ENABLED
+	struct spdk_vfio_dma_map *dma_map;
+	int ret;
+
+	if (!g_vfio.enabled) {
+		return;
+	}
+
+	pthread_mutex_lock(&g_vfio.mutex);
+	g_vfio.device_ref++;
+	if (g_vfio.device_ref > 1) {
+		pthread_mutex_unlock(&g_vfio.mutex);
+		return;
+	}
+
+	/* This is the first SPDK device using DPDK vfio. This means that the first
+	 * IOMMU group might have just been added to the DPDK vfio container.
+	 * From this point it is certain that the memory can be mapped now.
+	 */
+	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+		ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
+		if (ret) {
+			DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
+			break;
+		}
+	}
+	pthread_mutex_unlock(&g_vfio.mutex);
+#endif
+}
+
+void
+spdk_vtophys_put_ref(void)
+{
+#if SPDK_VFIO_ENABLED
+	struct spdk_vfio_dma_map *dma_map;
+	int ret;
+
+	if (!g_vfio.enabled) {
+		return;
+	}
+
+	pthread_mutex_lock(&g_vfio.mutex);
+	assert(g_vfio.device_ref > 0);
+	g_vfio.device_ref--;
+	if (g_vfio.device_ref > 0) {
+		pthread_mutex_unlock(&g_vfio.mutex);
+		return;
+	}
+
+	/* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
+	 * any additional devices using its vfio container, all the mappings
+	 * will be automatically removed by the Linux vfio driver. We unmap
+	 * the memory manually to be able to easily re-map it later regardless
+	 * of other, external factors.
+	 */
+	TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+		ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->map);
+		if (ret) {
+			DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
+			break;
+		}
+	}
+	pthread_mutex_unlock(&g_vfio.mutex);
+#endif
+}
+
 void
 spdk_vtophys_init(void)
 {