diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e84124da..9b08be594 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,6 +54,9 @@ dwords. Added a new custom transport `SPDK_NVME_TRANSPORT_VFIOUSER` to enable NVMe driver running with NVMe over vfio-user target. +Added the vfio-user custom transport implementation in NVMe driver which can connect +to NVMe over vfio-user target via vfio-user transport. + ### event The pci_whitelist and pci_blacklist members of struct spdk_app_opts have been @@ -102,6 +105,9 @@ An `opts_size`element was added in the `spdk_nvmf_transport_opts` structure to solve the ABI compatiblity issue between different SPDK version. And also add `opts_size` parameter in spdk_nvmf_transport_opts_init function. +Added a new custom vfio-user transport implementation in NVMf which can provide +emulated NVMe devices to QEMU and SPDK NVMe driver. + ### json A new API `spdk_jsonrpc_send_bool_response` was added to allow sending response for diff --git a/CONFIG b/CONFIG index 6f6f34799..2d82e27bb 100644 --- a/CONFIG +++ b/CONFIG @@ -117,7 +117,7 @@ CONFIG_VHOST=y # Build vhost initiator (Virtio) driver. CONFIG_VIRTIO=y -# Build NVMf custom vfio-user target. +# Build custom vfio-user transport for NVMf target and NVMe initiator. CONFIG_VFIO_USER=y # Build with PMDK backends diff --git a/configure b/configure index 403c0fda5..3bd19fcd9 100755 --- a/configure +++ b/configure @@ -62,7 +62,7 @@ function usage() echo " No path required." echo " virtio Build vhost initiator and virtio-pci bdev modules." echo " No path required." - echo " vfio-user Build NVMf custom vfio-user target." + echo " vfio-user Build custom vfio-user transport for NVMf target and NVMe initiator." echo " No path required." echo " pmdk Build persistent memory bdev." 
echo " example: /usr/share/pmdk" diff --git a/examples/nvme/perf/perf.c b/examples/nvme/perf/perf.c index 59c9f62bd..18251fdb9 100644 --- a/examples/nvme/perf/perf.c +++ b/examples/nvme/perf/perf.c @@ -1011,6 +1011,9 @@ build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr) case SPDK_NVME_TRANSPORT_TCP: res = snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); break; + case SPDK_NVME_TRANSPORT_VFIOUSER: + res = snprintf(name, length, "VFIOUSER (%s)", trid->traddr); + break; case SPDK_NVME_TRANSPORT_CUSTOM: res = snprintf(name, length, "CUSTOM (%s)", trid->traddr); break; diff --git a/examples/nvme/reconnect/reconnect.c b/examples/nvme/reconnect/reconnect.c index 9c8abb651..f0349509d 100644 --- a/examples/nvme/reconnect/reconnect.c +++ b/examples/nvme/reconnect/reconnect.c @@ -266,6 +266,9 @@ build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr) case SPDK_NVME_TRANSPORT_TCP: snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); break; + case SPDK_NVME_TRANSPORT_VFIOUSER: + snprintf(name, length, "VFIOUSER (%s)", trid->traddr); + break; case SPDK_NVME_TRANSPORT_CUSTOM: snprintf(name, length, "CUSTOM (%s)", trid->traddr); break; diff --git a/include/spdk/vfio_user_pci.h b/include/spdk/vfio_user_pci.h new file mode 100644 index 000000000..7aecdb9aa --- /dev/null +++ b/include/spdk/vfio_user_pci.h @@ -0,0 +1,59 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _SPDK_VFIO_USER_PCI_H +#define _SPDK_VFIO_USER_PCI_H + +#include "spdk/stdinc.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct vfio_device; + +int spdk_vfio_user_pci_bar_access(struct vfio_device *dev, uint32_t index, + uint64_t offset, size_t len, void *buf, + bool is_write); + +void *spdk_vfio_user_get_bar_addr(struct vfio_device *dev, uint32_t index, + uint64_t offset, uint32_t len); + +struct vfio_device *spdk_vfio_user_setup(const char *path); + +void spdk_vfio_user_release(struct vfio_device *dev); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/spdk/vfio_user_spec.h b/include/spdk/vfio_user_spec.h new file mode 100644 index 000000000..ac87ba181 --- /dev/null +++ b/include/spdk/vfio_user_spec.h @@ -0,0 +1,110 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VFIO_USER_SPEC_H +#define _VFIO_USER_SPEC_H + +#include "spdk/stdinc.h" + +enum vfio_user_command { + VFIO_USER_VERSION = 1, + VFIO_USER_DMA_MAP = 2, + VFIO_USER_DMA_UNMAP = 3, + VFIO_USER_DEVICE_GET_INFO = 4, + VFIO_USER_DEVICE_GET_REGION_INFO = 5, + VFIO_USER_DEVICE_GET_IRQ_INFO = 6, + VFIO_USER_DEVICE_SET_IRQS = 7, + VFIO_USER_REGION_READ = 8, + VFIO_USER_REGION_WRITE = 9, + VFIO_USER_DMA_READ = 10, + VFIO_USER_DMA_WRITE = 11, + VFIO_USER_VM_INTERRUPT = 12, + VFIO_USER_DEVICE_RESET = 13, + VFIO_USER_MAX, +}; + +enum vfio_user_message_type { + VFIO_USER_MESSAGE_COMMAND = 0, + VFIO_USER_MESSAGE_REPLY = 1, +}; + +#define VFIO_USER_FLAGS_NO_REPLY (0x1) + +struct vfio_user_header { + uint16_t msg_id; + uint16_t cmd; + uint32_t msg_size; + struct { + uint32_t type : 4; +#define VFIO_USER_F_TYPE_COMMAND 0 +#define VFIO_USER_F_TYPE_REPLY 1 + uint32_t no_reply : 1; + uint32_t error : 1; + uint32_t resvd : 26; + } flags; + uint32_t error_no; +} __attribute__((packed)); + +struct vfio_user_version { + uint16_t major; + uint16_t minor; + uint8_t data[]; +} __attribute__((packed)); + +struct vfio_user_dma_region { + uint64_t addr; + uint64_t size; + uint64_t offset; + uint32_t prot; + uint32_t flags; +#define VFIO_USER_F_DMA_REGION_MAPPABLE (1 << 0) +} __attribute__((packed)); + +struct vfio_user_region_access { + uint64_t offset; + uint32_t region; + uint32_t count; + uint8_t data[]; +} __attribute__((packed)); + +struct 
vfio_user_dma_region_access { + uint64_t addr; + uint32_t count; + uint8_t data[]; +} __attribute__((packed)); + +struct vfio_user_irq_info { + uint32_t subindex; +} __attribute__((packed)); + +#endif diff --git a/lib/Makefile b/lib/Makefile index 57f5e49e3..eab297ed0 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -48,6 +48,7 @@ DIRS-$(CONFIG_VHOST) += vhost DIRS-$(CONFIG_VIRTIO) += virtio DIRS-$(CONFIG_REDUCE) += reduce DIRS-$(CONFIG_RDMA) += rdma +DIRS-$(CONFIG_VFIO_USER) += vfio_user # If CONFIG_ENV is pointing at a directory in lib, build it. # Out-of-tree env implementations must be built separately by the user. diff --git a/lib/nvme/Makefile b/lib/nvme/Makefile index 888c99843..fcc9d2923 100644 --- a/lib/nvme/Makefile +++ b/lib/nvme/Makefile @@ -39,6 +39,7 @@ SO_MINOR := 2 C_SRCS = nvme_ctrlr_cmd.c nvme_ctrlr.c nvme_fabric.c nvme_ns_cmd.c nvme_ns.c nvme_pcie_common.c nvme_pcie.c nvme_qpair.c nvme.c nvme_quirks.c nvme_transport.c nvme_uevent.c \ nvme_ctrlr_ocssd_cmd.c nvme_ns_ocssd_cmd.c nvme_tcp.c nvme_opal.c nvme_io_msg.c nvme_poll_group.c nvme_zns.c +C_SRCS-$(CONFIG_VFIO_USER) += nvme_vfio_user.c C_SRCS-$(CONFIG_RDMA) += nvme_rdma.c C_SRCS-$(CONFIG_NVME_CUSE) += nvme_cuse.c diff --git a/lib/nvme/nvme_vfio_user.c b/lib/nvme/nvme_vfio_user.c new file mode 100644 index 000000000..d137ff892 --- /dev/null +++ b/lib/nvme/nvme_vfio_user.c @@ -0,0 +1,939 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+/* VFIO transport extensions for spdk_nvme_ctrlr */
+
+#include "spdk/stdinc.h"
+#include "spdk/env.h"
+#include "spdk/likely.h"
+#include "spdk/string.h"
+#include "spdk/vfio_user_pci.h"
+#include "nvme_internal.h"
+#include "nvme_pcie_internal.h"
+
+#include <linux/vfio.h> /* VFIO_PCI_BAR0_REGION_INDEX, VFIO_PCI_CONFIG_REGION_INDEX */
+
+#define NVME_MAX_XFER_SIZE (131072) /* value returned by nvme_vfio_ctrlr_get_max_xfer_size() */
+#define NVME_MAX_SGES (1) /* value returned by nvme_vfio_ctrlr_get_max_sges() */
+
+struct nvme_vfio_ctrlr { /* vfio-user controller; embeds the PCIe controller so the PCIe helpers can be reused */
+	struct nvme_pcie_ctrlr pctrlr;
+
+	volatile uint32_t *doorbell_base; /* mmap()ed doorbell page, set by nvme_vfio_setup_bar0() */
+	int bar0_fd; /* descriptor backing the doorbell mapping */
+	struct vfio_device *dev; /* handle returned by spdk_vfio_user_setup() */
+};
+
+static inline uint64_t /* identity translation: the virtual address is returned unchanged */
+vfio_vtophys(const void *vaddr, uint64_t *size) /* size is accepted but never read or written */
+{
+	return (uint64_t)(uintptr_t)vaddr;
+}
+
+static inline struct nvme_vfio_ctrlr * /* recover the enclosing nvme_vfio_ctrlr from a generic ctrlr */
+nvme_vfio_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
+{
+	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+
+	return SPDK_CONTAINEROF(pctrlr, struct nvme_vfio_ctrlr, pctrlr);
+}
+
+static int /* write a 4-byte register at offset in the BAR0 region; returns the access helper's result */
+nvme_vfio_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
+{
+	struct nvme_vfio_ctrlr *vctrlr = nvme_vfio_ctrlr(ctrlr);
+
+	assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
+	SPDK_DEBUGLOG(nvme_vfio, "ctrlr %s: offset 0x%x, value 0x%x\n", ctrlr->trid.traddr, offset, value);
+
+	return spdk_vfio_user_pci_bar_access(vctrlr->dev, VFIO_PCI_BAR0_REGION_INDEX,
+					     offset, 4, &value, true);
+}
+
+static int /* write an 8-byte register at offset in the BAR0 region; returns the access helper's result */
+nvme_vfio_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
+{
+	struct nvme_vfio_ctrlr *vctrlr = nvme_vfio_ctrlr(ctrlr);
+
+	assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
+	SPDK_DEBUGLOG(nvme_vfio, "ctrlr %s: offset 0x%x, value 0x%"PRIx64"\n", ctrlr->trid.traddr, offset,
+		      value);
+
+	return spdk_vfio_user_pci_bar_access(vctrlr->dev, VFIO_PCI_BAR0_REGION_INDEX,
+					     offset, 8, &value, true);
+}
+
+static int /* read a 4-byte register at offset in the BAR0 region into *value */
+nvme_vfio_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
+{
+	struct nvme_vfio_ctrlr *vctrlr = nvme_vfio_ctrlr(ctrlr);
+	int ret;
+
+	assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
+
+	ret =
spdk_vfio_user_pci_bar_access(vctrlr->dev, VFIO_PCI_BAR0_REGION_INDEX, + offset, 4, value, false); + if (ret != 0) { + SPDK_ERRLOG("ctrlr %p, offset %x\n", ctrlr, offset); + return ret; + } + + SPDK_DEBUGLOG(nvme_vfio, "ctrlr %s: offset 0x%x, value 0x%x\n", ctrlr->trid.traddr, offset, *value); + + return 0; +} + +static int +nvme_vfio_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) +{ + struct nvme_vfio_ctrlr *vctrlr = nvme_vfio_ctrlr(ctrlr); + int ret; + + assert(offset <= sizeof(struct spdk_nvme_registers) - 8); + + ret = spdk_vfio_user_pci_bar_access(vctrlr->dev, VFIO_PCI_BAR0_REGION_INDEX, + offset, 8, value, false); + if (ret != 0) { + SPDK_ERRLOG("ctrlr %p, offset %x\n", ctrlr, offset); + return ret; + } + + SPDK_DEBUGLOG(nvme_vfio, "ctrlr %s: offset 0x%x, value 0x%"PRIx64"\n", ctrlr->trid.traddr, offset, + *value); + + return 0; +} + +static int +nvme_vfio_ctrlr_set_asq(struct spdk_nvme_ctrlr *ctrlr, uint64_t value) +{ + return nvme_vfio_ctrlr_set_reg_8(ctrlr, offsetof(struct spdk_nvme_registers, asq), + value); +} + +static int +nvme_vfio_ctrlr_set_acq(struct spdk_nvme_ctrlr *ctrlr, uint64_t value) +{ + return nvme_vfio_ctrlr_set_reg_8(ctrlr, offsetof(struct spdk_nvme_registers, acq), + value); +} + +static int +nvme_vfio_ctrlr_set_aqa(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_aqa_register *aqa) +{ + return nvme_vfio_ctrlr_set_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw), + aqa->raw); +} + +/* Instead of using path as the bar0 file descriptor, we can also use + * SPARSE MMAP to get the doorbell mmaped address. 
+ */
+static int
+nvme_vfio_setup_bar0(struct nvme_vfio_ctrlr *vctrlr, const char *path)
+{
+	volatile uint32_t *doorbell;
+	int fd;
+
+	fd = open(path, O_RDWR);
+	if (fd < 0) {
+		SPDK_ERRLOG("Failed to open file %s\n", path);
+		return fd;
+	}
+
+	doorbell = mmap(NULL, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0x1000); /* 0x1000 bytes at file offset 0x1000 */
+	if (doorbell == MAP_FAILED) {
+		SPDK_ERRLOG("Failed to mmap file %s\n", path);
+		close(fd);
+		return -EFAULT;
+	}
+
+	vctrlr->bar0_fd = fd; /* kept open until nvme_vfio_bar0_destruct() */
+	vctrlr->doorbell_base = doorbell;
+
+	return 0;
+}
+
+static void /* undo nvme_vfio_setup_bar0(): unmap the doorbell page and close the descriptor */
+nvme_vfio_bar0_destruct(struct nvme_vfio_ctrlr *vctrlr)
+{
+	if (vctrlr->doorbell_base) {
+		munmap((void *)vctrlr->doorbell_base, 0x1000);
+	}
+
+	close(vctrlr->bar0_fd);
+}
+
+static struct spdk_nvme_ctrlr * /* returns the new controller, or NULL after releasing everything acquired so far */
+nvme_vfio_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
+			  const struct spdk_nvme_ctrlr_opts *opts,
+			  void *devhandle)
+{
+	struct nvme_vfio_ctrlr *vctrlr;
+	struct nvme_pcie_ctrlr *pctrlr;
+	uint16_t cmd_reg;
+	union spdk_nvme_cap_register cap;
+	union spdk_nvme_vs_register vs;
+	int ret;
+	char ctrlr_path[PATH_MAX];
+	char ctrlr_bar0[PATH_MAX];
+
+	snprintf(ctrlr_path, sizeof(ctrlr_path), "%s/cntrl", trid->traddr); /* path handed to spdk_vfio_user_setup() */
+	snprintf(ctrlr_bar0, sizeof(ctrlr_bar0), "%s/bar0", trid->traddr); /* file whose doorbell page gets mmap()ed */
+
+	ret = access(ctrlr_path, F_OK);
+	if (ret != 0) {
+		SPDK_ERRLOG("Access path %s failed\n", ctrlr_path);
+		return NULL;
+	}
+
+	ret = access(ctrlr_bar0, F_OK);
+	if (ret != 0) {
+		SPDK_ERRLOG("Access path %s failed\n", ctrlr_bar0);
+		return NULL;
+	}
+
+	vctrlr = calloc(1, sizeof(*vctrlr));
+	if (!vctrlr) {
+		return NULL;
+	}
+
+	ret = nvme_vfio_setup_bar0(vctrlr, ctrlr_bar0);
+	if (ret != 0) {
+		free(vctrlr);
+		return NULL;
+	}
+
+	vctrlr->dev = spdk_vfio_user_setup(ctrlr_path);
+	if (!vctrlr->dev) {
+		SPDK_ERRLOG("Error to setup vfio device\n");
+		nvme_vfio_bar0_destruct(vctrlr);
+		free(vctrlr);
+		return NULL;
+	}
+
+	pctrlr = &vctrlr->pctrlr;
+	pctrlr->doorbell_base = vctrlr->doorbell_base; /* lets the shared PCIe queue code use the mapped doorbells */
+	pctrlr->ctrlr.is_removed = false;
+	pctrlr->ctrlr.opts = *opts;
+	pctrlr->ctrlr.trid = *trid;
+	pctrlr->ctrlr.opts.use_cmb_sqs = false;
+
+	ret = nvme_ctrlr_construct(&pctrlr->ctrlr);
+	if (ret != 0) {
+		goto exit;
+	}
+
+	/* Enable PCI busmaster and disable INTx */
+	ret = spdk_vfio_user_pci_bar_access(vctrlr->dev, VFIO_PCI_CONFIG_REGION_INDEX, 4, 2,
+					    &cmd_reg, false);
+	if (ret != 0) {
+		SPDK_ERRLOG("Read PCI CMD REG failed\n");
+		goto exit;
+	}
+	cmd_reg |= 0x404; /* bits 2 and 10 of the 2-byte value read at config offset 4 */
+	ret = spdk_vfio_user_pci_bar_access(vctrlr->dev, VFIO_PCI_CONFIG_REGION_INDEX, 4, 2,
+					    &cmd_reg, true);
+	if (ret != 0) {
+		SPDK_ERRLOG("Write PCI CMD REG failed\n");
+		goto exit;
+	}
+
+	if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) {
+		SPDK_ERRLOG("get_cap() failed\n");
+		goto exit;
+	}
+
+	if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) {
+		SPDK_ERRLOG("get_vs() failed\n");
+		goto exit;
+	}
+
+	nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs);
+	/* Doorbell stride is 2 ^ (dstrd + 2),
+	 * but we want multiples of 4, so drop the + 2 */
+	pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd;
+
+	ret = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr, pctrlr->ctrlr.opts.admin_queue_size);
+	if (ret != 0) { /* NOTE(review): assumes nvme_ctrlr_destruct() ends in vfio_ops.ctrlr_destruct, which frees vctrlr - confirm */
+		nvme_ctrlr_destruct(&pctrlr->ctrlr);
+		return NULL; /* not "goto exit": that would unmap, release and free vctrlr a second time */
+	}
+
+	/* Construct the primary process properties */
+	ret = nvme_ctrlr_add_process(&pctrlr->ctrlr, 0);
+	if (ret != 0) { /* same ownership rule as the admin-qpair failure above */
+		nvme_ctrlr_destruct(&pctrlr->ctrlr);
+		return NULL; /* vctrlr must not be touched after nvme_ctrlr_destruct() */
+	}
+
+	return &pctrlr->ctrlr;
+
+exit: /* only for failures where nvme_ctrlr_destruct() has not been called */
+	nvme_vfio_bar0_destruct(vctrlr);
+	spdk_vfio_user_release(vctrlr->dev);
+	free(vctrlr);
+	return NULL;
+}
+
+static int /* probe the single controller named by probe_ctx->trid.traddr */
+nvme_vfio_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx,
+		     bool direct_connect)
+{
+	int ret;
+
+	if (probe_ctx->trid.trtype != SPDK_NVME_TRANSPORT_VFIOUSER) {
+		SPDK_ERRLOG("Can only use SPDK_NVME_TRANSPORT_VFIOUSER\n");
+		return -EINVAL;
+	}
+
+	ret = access(probe_ctx->trid.traddr, F_OK);
+	if (ret != 0) {
+		SPDK_ERRLOG("Error to access file %s\n", probe_ctx->trid.traddr);
+		return ret;
+	}
+	SPDK_NOTICELOG("Scan controller : %s\n",
probe_ctx->trid.traddr); + + return nvme_ctrlr_probe(&probe_ctx->trid, probe_ctx, NULL); +} + +static int +nvme_vfio_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_pcie_qpair *vadminq = nvme_pcie_qpair(ctrlr->adminq); + union spdk_nvme_aqa_register aqa; + + if (nvme_vfio_ctrlr_set_asq(ctrlr, vadminq->cmd_bus_addr)) { + SPDK_ERRLOG("set_asq() failed\n"); + return -EIO; + } + + if (nvme_vfio_ctrlr_set_acq(ctrlr, vadminq->cpl_bus_addr)) { + SPDK_ERRLOG("set_acq() failed\n"); + return -EIO; + } + + aqa.raw = 0; + /* acqs and asqs are 0-based. */ + aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1; + aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1; + + if (nvme_vfio_ctrlr_set_aqa(ctrlr, &aqa)) { + SPDK_ERRLOG("set_aqa() failed\n"); + return -EIO; + } + + return 0; +} + +static int +nvme_vfio_qpair_destroy(struct spdk_nvme_qpair *qpair); + +static int +nvme_vfio_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_vfio_ctrlr *vctrlr = nvme_vfio_ctrlr(ctrlr); + + if (ctrlr->adminq) { + nvme_vfio_qpair_destroy(ctrlr->adminq); + } + + nvme_ctrlr_destruct_finish(ctrlr); + + nvme_ctrlr_free_processes(ctrlr); + + nvme_vfio_bar0_destruct(vctrlr); + spdk_vfio_user_release(vctrlr->dev); + free(vctrlr); + + return 0; +} + +static uint32_t +nvme_vfio_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) +{ + return NVME_MAX_XFER_SIZE; +} + +static uint16_t +nvme_vfio_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) +{ + return NVME_MAX_SGES; +} + +static struct spdk_nvme_qpair * +nvme_vfio_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, + const struct spdk_nvme_io_qpair_opts *opts) +{ + struct nvme_pcie_qpair *vqpair; + struct spdk_nvme_qpair *qpair; + int rc; + + assert(ctrlr != NULL); + + vqpair = spdk_zmalloc(sizeof(*vqpair), 64, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (vqpair == NULL) { + return NULL; + } + + vqpair->num_entries = opts->io_queue_size; + vqpair->flags.delay_cmd_submit = 
opts->delay_cmd_submit; + + qpair = &vqpair->qpair; + + rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests); + if (rc != 0) { + nvme_vfio_qpair_destroy(qpair); + return NULL; + } + + rc = nvme_pcie_qpair_construct(qpair, opts); + + if (rc != 0) { + nvme_vfio_qpair_destroy(qpair); + return NULL; + } + + return qpair; +} + +static void +nvme_vfio_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr); + +static int +nvme_vfio_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_completion_poll_status *status; + int rc; + + assert(ctrlr != NULL); + + if (ctrlr->is_removed) { + goto free; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + /* Delete the I/O submission queue */ + rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc); + free(status); + return rc; + } + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + return -1; + } + + memset(status, 0, sizeof(*status)); + /* Delete the completion queue */ + rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc); + free(status); + return rc; + } + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + return -1; + } + free(status); + +free: + if (qpair->no_deletion_notification_needed == 0) { + /* Abort the rest of the I/O */ + nvme_vfio_qpair_abort_trackers(qpair, 1); + } + + nvme_vfio_qpair_destroy(qpair); + return 0; +} + +static inline void +nvme_vfio_qpair_ring_sq_doorbell(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *vqpair = nvme_pcie_qpair(qpair); + + if (qpair->first_fused_submitted) 
{ + /* This is first cmd of two fused commands - don't ring doorbell */ + qpair->first_fused_submitted = 0; + return; + } + + spdk_wmb(); + spdk_mmio_write_4(vqpair->sq_tdbl, vqpair->sq_tail); +} + +static inline void +nvme_vfio_qpair_ring_cq_doorbell(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *vqpair = nvme_pcie_qpair(qpair); + + spdk_mmio_write_4(vqpair->cq_hdbl, vqpair->cq_head); +} + +static void +nvme_vfio_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) +{ + struct nvme_request *req; + struct nvme_pcie_qpair *vqpair = nvme_pcie_qpair(qpair); + + req = tr->req; + assert(req != NULL); + + if (req->cmd.fuse == SPDK_NVME_IO_FLAGS_FUSE_FIRST) { + /* This is first cmd of two fused commands - don't ring doorbell */ + qpair->first_fused_submitted = 1; + } + + vqpair->cmd[vqpair->sq_tail] = req->cmd; + + if (spdk_unlikely(++vqpair->sq_tail == vqpair->num_entries)) { + vqpair->sq_tail = 0; + } + + if (spdk_unlikely(vqpair->sq_tail == vqpair->sq_head)) { + SPDK_ERRLOG("sq_tail is passing sq_head!\n"); + } + + nvme_vfio_qpair_ring_sq_doorbell(qpair); +} + +static void +nvme_vfio_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr, + struct spdk_nvme_cpl *cpl, bool print_on_error) +{ + struct nvme_pcie_qpair *vqpair = nvme_pcie_qpair(qpair); + struct nvme_request *req; + bool retry, error; + bool req_from_current_proc = true; + + req = tr->req; + + assert(req != NULL); + + error = spdk_nvme_cpl_is_error(cpl); + retry = error && nvme_completion_is_retry(cpl) && + req->retries < vqpair->retry_count; + + if (error && print_on_error && !qpair->ctrlr->opts.disable_error_logging) { + spdk_nvme_qpair_print_command(qpair, &req->cmd); + spdk_nvme_qpair_print_completion(qpair, cpl); + } + + assert(cpl->cid == req->cmd.cid); + + if (retry) { + req->retries++; + nvme_vfio_qpair_submit_tracker(qpair, tr); + } else { + /* Only check admin requests from different processes. 
*/ + if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) { + req_from_current_proc = false; + nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl); + } else { + nvme_complete_request(tr->cb_fn, tr->cb_arg, qpair, req, cpl); + } + + if (req_from_current_proc == true) { + nvme_qpair_free_request(qpair, req); + } + + tr->req = NULL; + + TAILQ_REMOVE(&vqpair->outstanding_tr, tr, tq_list); + TAILQ_INSERT_HEAD(&vqpair->free_tr, tr, tq_list); + } +} + +static void +nvme_vfio_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair, + struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr, + bool print_on_error) +{ + struct spdk_nvme_cpl cpl; + + memset(&cpl, 0, sizeof(cpl)); + cpl.sqid = qpair->id; + cpl.cid = tr->cid; + cpl.status.sct = sct; + cpl.status.sc = sc; + cpl.status.dnr = dnr; + nvme_vfio_qpair_complete_tracker(qpair, tr, &cpl, print_on_error); +} + +static void +nvme_vfio_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr, *temp, *last; + + last = TAILQ_LAST(&pqpair->outstanding_tr, nvme_outstanding_tr_head); + + /* Abort previously submitted (outstanding) trs */ + TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) { + if (!qpair->ctrlr->opts.disable_error_logging) { + SPDK_ERRLOG("aborting outstanding command\n"); + } + nvme_vfio_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true); + + if (tr == last) { + break; + } + } +} + +static void +nvme_vfio_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + nvme_vfio_qpair_abort_trackers(qpair, dnr); +} + +static void +nvme_vfio_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *vqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr; + + tr = TAILQ_FIRST(&vqpair->outstanding_tr); + while (tr != NULL) { + assert(tr->req != NULL); + if (tr->req->cmd.opc == 
SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { + nvme_vfio_qpair_manual_complete_tracker(qpair, tr, + SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0, + false); + tr = TAILQ_FIRST(&vqpair->outstanding_tr); + } else { + tr = TAILQ_NEXT(tr, tq_list); + } + } +} + +static void +nvme_vfio_admin_qpair_destroy(struct spdk_nvme_qpair *qpair) +{ + nvme_vfio_admin_qpair_abort_aers(qpair); +} + +static int +nvme_vfio_qpair_destroy(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *vqpair = nvme_pcie_qpair(qpair); + + if (nvme_qpair_is_admin_queue(qpair)) { + nvme_vfio_admin_qpair_destroy(qpair); + } + + spdk_free(vqpair->cmd); + spdk_free(vqpair->cpl); + + if (vqpair->tr) { + spdk_free(vqpair->tr); + } + + nvme_qpair_deinit(qpair); + + spdk_free(vqpair); + + return 0; +} + +static inline int +nvme_vfio_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len, + uint32_t page_size) +{ + struct spdk_nvme_cmd *cmd = &tr->req->cmd; + uintptr_t page_mask = page_size - 1; + uint64_t phys_addr; + uint32_t i; + + SPDK_DEBUGLOG(nvme_vfio, "prp_index:%u virt_addr:%p len:%u\n", + *prp_index, virt_addr, (uint32_t)len); + + if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) { + SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr); + return -EFAULT; + } + + i = *prp_index; + while (len) { + uint32_t seg_len; + + /* + * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array, + * so prp_index == count is valid. 
+ */ + if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) { + SPDK_ERRLOG("out of PRP entries\n"); + return -EFAULT; + } + + phys_addr = vfio_vtophys(virt_addr, NULL); + + if (i == 0) { + SPDK_DEBUGLOG(nvme_vfio, "prp1 = %p\n", (void *)phys_addr); + cmd->dptr.prp.prp1 = phys_addr; + seg_len = page_size - ((uintptr_t)virt_addr & page_mask); + } else { + if ((phys_addr & page_mask) != 0) { + SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr); + return -EFAULT; + } + + SPDK_DEBUGLOG(nvme_vfio, "prp[%u] = %p\n", i - 1, (void *)phys_addr); + tr->u.prp[i - 1] = phys_addr; + seg_len = page_size; + } + + seg_len = spdk_min(seg_len, len); + virt_addr += seg_len; + len -= seg_len; + i++; + } + + cmd->psdt = SPDK_NVME_PSDT_PRP; + if (i <= 1) { + cmd->dptr.prp.prp2 = 0; + } else if (i == 2) { + cmd->dptr.prp.prp2 = tr->u.prp[0]; + SPDK_DEBUGLOG(nvme_vfio, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2); + } else { + cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr; + SPDK_DEBUGLOG(nvme_vfio, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2); + } + + *prp_index = i; + return 0; +} + +static int +nvme_vfio_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, + struct nvme_tracker *tr, bool dword_aligned) +{ + uint32_t prp_index = 0; + int rc; + + rc = nvme_vfio_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset, + req->payload_size, qpair->ctrlr->page_size); + if (rc) { + nvme_vfio_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_INVALID_FIELD, + 1 /* do not retry */, true); + } + + return rc; +} + +static int +nvme_vfio_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + struct nvme_tracker *tr; + int rc = 0; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_pcie_qpair *vqpair = nvme_pcie_qpair(qpair); + + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + } + + tr = 
TAILQ_FIRST(&vqpair->free_tr); + + if (tr == NULL) { + /* Inform the upper layer to try again later. */ + rc = -EAGAIN; + goto exit; + } + + TAILQ_REMOVE(&vqpair->free_tr, tr, tq_list); /* remove tr from free_tr */ + TAILQ_INSERT_TAIL(&vqpair->outstanding_tr, tr, tq_list); + tr->req = req; + tr->cb_fn = req->cb_fn; + tr->cb_arg = req->cb_arg; + req->cmd.cid = tr->cid; + + if (req->payload_size != 0) { + rc = nvme_vfio_qpair_build_contig_request(qpair, req, tr, true); + if (rc) { + goto exit; + } + } + + nvme_vfio_qpair_submit_tracker(qpair, tr); + +exit: + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + } + + return rc; +} + +static int32_t +nvme_vfio_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) +{ + struct nvme_pcie_qpair *vqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr; + struct spdk_nvme_cpl *cpl, *next_cpl; + uint32_t num_completions = 0; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + uint16_t next_cq_head; + uint8_t next_phase; + bool next_is_valid = false; + + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + } + + if (max_completions == 0 || max_completions > vqpair->max_completions_cap) { + /* + * max_completions == 0 means unlimited, but complete at most + * max_completions_cap batch of I/O at a time so that the completion + * queue doorbells don't wrap around. 
+ */ + max_completions = vqpair->max_completions_cap; + } + + while (1) { + cpl = &vqpair->cpl[vqpair->cq_head]; + + if (!next_is_valid && cpl->status.p != vqpair->flags.phase) { + break; + } + + if (spdk_likely(vqpair->cq_head + 1 != vqpair->num_entries)) { + next_cq_head = vqpair->cq_head + 1; + next_phase = vqpair->flags.phase; + } else { + next_cq_head = 0; + next_phase = !vqpair->flags.phase; + } + next_cpl = &vqpair->cpl[next_cq_head]; + next_is_valid = (next_cpl->status.p == next_phase); + if (next_is_valid) { + __builtin_prefetch(&vqpair->tr[next_cpl->cid]); + } + + if (spdk_unlikely(++vqpair->cq_head == vqpair->num_entries)) { + vqpair->cq_head = 0; + vqpair->flags.phase = !vqpair->flags.phase; + } + + tr = &vqpair->tr[cpl->cid]; + /* Prefetch the req's STAILQ_ENTRY since we'll need to access it + * as part of putting the req back on the qpair's free list. + */ + __builtin_prefetch(&tr->req->stailq); + vqpair->sq_head = cpl->sqhd; + + if (tr->req) { + nvme_vfio_qpair_complete_tracker(qpair, tr, cpl, true); + } else { + SPDK_ERRLOG("cpl does not map to outstanding cmd\n"); + spdk_nvme_qpair_print_completion(qpair, cpl); + assert(0); + } + + if (++num_completions == max_completions) { + break; + } + } + + if (num_completions > 0) { + nvme_vfio_qpair_ring_cq_doorbell(qpair); + } + + if (vqpair->flags.delay_cmd_submit) { + if (vqpair->last_sq_tail != vqpair->sq_tail) { + nvme_vfio_qpair_ring_sq_doorbell(qpair); + vqpair->last_sq_tail = vqpair->sq_tail; + } + } + + /* Before returning, complete any pending admin request. 
*/ + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_pcie_qpair_complete_pending_admin_request(qpair); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + } + + return num_completions; +} + +const struct spdk_nvme_transport_ops vfio_ops = { + .name = "VFIOUSER", + .type = SPDK_NVME_TRANSPORT_VFIOUSER, + .ctrlr_construct = nvme_vfio_ctrlr_construct, + .ctrlr_scan = nvme_vfio_ctrlr_scan, + .ctrlr_destruct = nvme_vfio_ctrlr_destruct, + .ctrlr_enable = nvme_vfio_ctrlr_enable, + + .ctrlr_set_reg_4 = nvme_vfio_ctrlr_set_reg_4, + .ctrlr_set_reg_8 = nvme_vfio_ctrlr_set_reg_8, + .ctrlr_get_reg_4 = nvme_vfio_ctrlr_get_reg_4, + .ctrlr_get_reg_8 = nvme_vfio_ctrlr_get_reg_8, + + .ctrlr_get_max_xfer_size = nvme_vfio_ctrlr_get_max_xfer_size, + .ctrlr_get_max_sges = nvme_vfio_ctrlr_get_max_sges, + + .ctrlr_create_io_qpair = nvme_vfio_ctrlr_create_io_qpair, + .ctrlr_delete_io_qpair = nvme_vfio_ctrlr_delete_io_qpair, + .ctrlr_connect_qpair = nvme_pcie_ctrlr_connect_qpair, + .ctrlr_disconnect_qpair = nvme_pcie_ctrlr_disconnect_qpair, + .admin_qpair_abort_aers = nvme_vfio_admin_qpair_abort_aers, + + .qpair_reset = nvme_pcie_qpair_reset, + .qpair_abort_reqs = nvme_vfio_qpair_abort_reqs, + .qpair_submit_request = nvme_vfio_qpair_submit_request, + .qpair_process_completions = nvme_vfio_qpair_process_completions, + + .poll_group_create = nvme_pcie_poll_group_create, + .poll_group_connect_qpair = nvme_pcie_poll_group_connect_qpair, + .poll_group_disconnect_qpair = nvme_pcie_poll_group_disconnect_qpair, + .poll_group_add = nvme_pcie_poll_group_add, + .poll_group_remove = nvme_pcie_poll_group_remove, + .poll_group_process_completions = nvme_pcie_poll_group_process_completions, + .poll_group_destroy = nvme_pcie_poll_group_destroy, +}; + +SPDK_NVME_TRANSPORT_REGISTER(vfio, &vfio_ops); + +SPDK_LOG_REGISTER_COMPONENT(nvme_vfio) diff --git a/lib/vfio_user/Makefile b/lib/vfio_user/Makefile new file mode 100644 index 000000000..60009c434 --- /dev/null +++ b/lib/vfio_user/Makefile @@ 
-0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 1 +SO_MINOR := 0 + +C_SRCS = vfio_user_pci.c vfio_user.c +LIBNAME = vfio_user + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_vfio_user.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/lib/vfio_user/spdk_vfio_user.map b/lib/vfio_user/spdk_vfio_user.map new file mode 100644 index 000000000..f3db7619e --- /dev/null +++ b/lib/vfio_user/spdk_vfio_user.map @@ -0,0 +1,11 @@ +{ + global: + + # public functions from vfio_user_pci.h + spdk_vfio_user_pci_bar_access; + spdk_vfio_user_get_bar_addr; + spdk_vfio_user_setup; + spdk_vfio_user_release; + + local: *; +}; diff --git a/lib/vfio_user/vfio_user.c b/lib/vfio_user/vfio_user.c new file mode 100644 index 000000000..9d9ccb0a9 --- /dev/null +++ b/lib/vfio_user/vfio_user.c @@ -0,0 +1,398 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * vfio-user client socket messages. + */ + +#include "spdk/stdinc.h" +#include "spdk/queue.h" +#include "spdk/util.h" +#include "spdk/log.h" +#include "spdk/vfio_user_spec.h" + +#include "vfio_user_internal.h" + +struct vfio_user_request { + struct vfio_user_header hdr; +#define VFIO_USER_MAX_PAYLOAD_SIZE (4096) + uint8_t payload[VFIO_USER_MAX_PAYLOAD_SIZE]; + int fds[VFIO_MAXIMUM_SPARSE_MMAP_REGISONS]; + int fd_num; +}; + +static const char *vfio_user_message_str[VFIO_USER_MAX] = { + [VFIO_USER_VERSION] = "VFIO_USER_VERSION", + [VFIO_USER_DMA_MAP] = "VFIO_USER_DMA_MAP", + [VFIO_USER_DMA_UNMAP] = "VFIO_USER_DMA_UNMAP", + [VFIO_USER_DEVICE_GET_INFO] = "VFIO_USER_DEVICE_GET_INFO", + [VFIO_USER_DEVICE_GET_REGION_INFO] = "VFIO_USER_DEVICE_GET_REGION_INFO", + [VFIO_USER_DEVICE_GET_IRQ_INFO] = "VFIO_USER_DEVICE_GET_IRQ_INFO", + [VFIO_USER_DEVICE_SET_IRQS] = "VFIO_USER_DEVICE_SET_IRQS", + [VFIO_USER_REGION_READ] = "VFIO_USER_REGION_READ", + [VFIO_USER_REGION_WRITE] = "VFIO_USER_REGION_WRITE", + [VFIO_USER_DMA_READ] = "VFIO_USER_DMA_READ", + [VFIO_USER_DMA_WRITE] = "VFIO_USER_DMA_WRITE", + [VFIO_USER_DEVICE_RESET] = "VFIO_USER_DEVICE_RESET", +}; + +static int +vfio_user_write(int fd, void *buf, int len, int *fds, int num_fds) +{ + int r; + struct msghdr msgh; + struct iovec iov; + size_t fd_size = num_fds * sizeof(int); + char control[CMSG_SPACE(VFIO_MAXIMUM_SPARSE_MMAP_REGISONS * sizeof(int))]; + struct cmsghdr *cmsg; + + 
memset(&msgh, 0, sizeof(msgh)); + memset(control, 0, sizeof(control)); + + iov.iov_base = (uint8_t *)buf; + iov.iov_len = len; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + assert(num_fds <= VFIO_MAXIMUM_SPARSE_MMAP_REGISONS); + + if (fds && num_fds) { + msgh.msg_control = control; + msgh.msg_controllen = CMSG_SPACE(fd_size); + cmsg = CMSG_FIRSTHDR(&msgh); + assert(cmsg != NULL); + cmsg->cmsg_len = CMSG_LEN(fd_size); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), fds, fd_size); + } else { + msgh.msg_control = NULL; + msgh.msg_controllen = 0; + } + + do { + r = sendmsg(fd, &msgh, MSG_NOSIGNAL); + } while (r < 0 && errno == EINTR); + + if (r == -1) { + return -errno; + } + + return 0; +} + +static int +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int *fd_num) +{ + struct iovec iov; + struct msghdr msgh; + char control[CMSG_SPACE(VFIO_MAXIMUM_SPARSE_MMAP_REGISONS * sizeof(int))]; + struct cmsghdr *cmsg; + int got_fds = 0; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + ret = recvmsg(sockfd, &msgh, 0); + if (ret <= 0) { + return ret; + } + + if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + return -ENOTSUP; + } + + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + if ((cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_RIGHTS)) { + got_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + *fd_num = got_fds; + assert(got_fds <= VFIO_MAXIMUM_SPARSE_MMAP_REGISONS); + memcpy(fds, CMSG_DATA(cmsg), got_fds * sizeof(int)); + break; + } + } + + return ret; +} + +static int +vfio_user_read(int fd, struct vfio_user_request *req) +{ + int ret; + size_t sz_payload; + + ret = read_fd_message(fd, (char *)req, sizeof(struct vfio_user_header), req->fds, &req->fd_num); + if (ret <= 0) { + return ret; + } + + 
if (req->hdr.flags.error) { + SPDK_ERRLOG("Command %u return failure\n", req->hdr.cmd); + errno = req->hdr.error_no; + return -EFAULT; + } + + if (req->hdr.msg_size > sizeof(struct vfio_user_header)) { + sz_payload = req->hdr.msg_size - sizeof(struct vfio_user_header); + ret = read(fd, req->payload, sz_payload); + if (ret <= 0) { + return ret; + } + } + + return 0; +} + +static int +vfio_user_dev_send_request(struct vfio_device *dev, enum vfio_user_command command, + void *arg, size_t arg_len, size_t buf_len, int *fds, int max_fds) +{ + struct vfio_user_request req = {}; + size_t sz_payload; + int ret; + bool fds_write = false; + + if (arg_len > VFIO_USER_MAX_PAYLOAD_SIZE) { + SPDK_ERRLOG("Oversized argument length, command %u\n", command); + return -EINVAL; + } + + req.hdr.cmd = command; + req.hdr.msg_size = sizeof(struct vfio_user_header) + arg_len; + memcpy(req.payload, arg, arg_len); + + if (command == VFIO_USER_DMA_MAP || command == VFIO_USER_DMA_UNMAP) { + fds_write = true; + } + + SPDK_DEBUGLOG(vfio_user, "[I] Command %s, msg size %u, fds %p, max_fds %d\n", + vfio_user_message_str[command], req.hdr.msg_size, fds, max_fds); + + if (fds_write && fds) { + ret = vfio_user_write(dev->fd, (void *)&req, req.hdr.msg_size, fds, max_fds); + } else { + ret = vfio_user_write(dev->fd, (void *)&req, req.hdr.msg_size, NULL, 0); + } + + if (ret) { + return ret; + } + + /* a reply is mandatory */ + memset(&req, 0, sizeof(req)); + ret = vfio_user_read(dev->fd, &req); + if (ret) { + return ret; + } + + SPDK_DEBUGLOG(vfio_user, "[I] Command %s response, msg size %u\n", + vfio_user_message_str[req.hdr.cmd], req.hdr.msg_size); + + assert(req.hdr.flags.type == VFIO_USER_MESSAGE_REPLY); + sz_payload = req.hdr.msg_size - sizeof(struct vfio_user_header); + if (!sz_payload) { + return 0; + } + + if (!fds_write) { + if (sz_payload > buf_len) { + SPDK_ERRLOG("Payload size error sz %zd, buf_len %zd\n", sz_payload, buf_len); + return -EIO; + } + memcpy(arg, req.payload, sz_payload); + /* 
VFIO_USER_DEVICE_GET_REGION_INFO may contains BAR fd */ + if (fds && req.fd_num) { + assert(req.fd_num < max_fds); + memcpy(fds, req.fds, sizeof(int) * req.fd_num); + } + } + + return 0; +} + +static int +vfio_user_check_version(struct vfio_device *dev) +{ + int ret; + struct vfio_user_request req = {}; + struct vfio_user_version *version = (struct vfio_user_version *)req.payload; + + version->major = VFIO_USER_MAJOR_VER; + version->minor = VFIO_USER_MINOR_VER; + + ret = vfio_user_dev_send_request(dev, VFIO_USER_VERSION, req.payload, + sizeof(struct vfio_user_version), sizeof(req.payload), NULL, 0); + if (ret < 0) { + return ret; + } + + SPDK_NOTICELOG("%s Negotiate version %u.%u\n", vfio_user_message_str[VFIO_USER_VERSION], + version->major, version->minor); + + return 0; +} + +int +vfio_user_get_dev_region_info(struct vfio_device *dev, struct vfio_region_info *region_info, + size_t buf_len, int *fds, int num_fds) +{ + assert(buf_len > sizeof(struct vfio_region_info)); + region_info->argsz = buf_len - sizeof(struct vfio_region_info); + return vfio_user_dev_send_request(dev, VFIO_USER_DEVICE_GET_REGION_INFO, + region_info, region_info->argsz, buf_len, fds, num_fds); +} + +int +vfio_user_get_dev_info(struct vfio_device *dev, struct vfio_device_info *dev_info, + size_t buf_len) +{ + dev_info->argsz = sizeof(struct vfio_device_info); + return vfio_user_dev_send_request(dev, VFIO_USER_DEVICE_GET_INFO, + dev_info, dev_info->argsz, buf_len, NULL, 0); +} + +int +vfio_user_dev_dma_map_unmap(struct vfio_device *dev, struct vfio_memory_region *mr, bool map) +{ + struct vfio_user_dma_region region; + + region.addr = mr->iova; + region.size = mr->size; + region.offset = mr->offset; + if (map) { + region.flags = VFIO_USER_F_DMA_REGION_MAPPABLE; + region.prot = PROT_READ | PROT_WRITE; + } + + return vfio_user_dev_send_request(dev, map ? 
VFIO_USER_DMA_MAP : VFIO_USER_DMA_UNMAP, + ®ion, sizeof(region), sizeof(region), &mr->fd, 1); +} + +int +vfio_user_dev_mmio_access(struct vfio_device *dev, uint32_t index, uint64_t offset, + size_t len, void *buf, bool is_write) +{ + struct vfio_user_region_access *access; + size_t arg_len; + int ret; + + arg_len = sizeof(*access) + len; + access = calloc(1, arg_len); + if (!access) { + return -ENOMEM; + } + + access->offset = offset; + access->region = index; + access->count = len; + if (is_write) { + memcpy(access->data, buf, len); + ret = vfio_user_dev_send_request(dev, VFIO_USER_REGION_WRITE, + access, arg_len, arg_len, NULL, 0); + } else { + ret = vfio_user_dev_send_request(dev, VFIO_USER_REGION_READ, + access, sizeof(*access), arg_len, NULL, 0); + } + + if (ret) { + free(access); + return ret; + } + + if (!is_write) { + memcpy(buf, (void *)access->data, len); + } + + free(access); + return 0; +} + +int +vfio_user_dev_setup(struct vfio_device *dev) +{ + int fd; + int flag; + struct sockaddr_un un; + ssize_t rc; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) { + SPDK_ERRLOG("socket() error\n"); + return -errno; + } + + flag = fcntl(fd, F_GETFD); + if (fcntl(fd, F_SETFD, flag | FD_CLOEXEC) < 0) { + SPDK_ERRLOG("fcntl failed\n"); + } + + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + rc = snprintf(un.sun_path, sizeof(un.sun_path), "%s", dev->path); + if (rc < 0 || (size_t)rc >= sizeof(un.sun_path)) { + SPDK_ERRLOG("socket path too long\n"); + close(fd); + if (rc < 0) { + return -errno; + } else { + return -EINVAL; + } + } + if (connect(fd, (struct sockaddr *)&un, sizeof(un)) < 0) { + SPDK_ERRLOG("connect error\n"); + close(fd); + return -errno; + } + + dev->fd = fd; + + if (vfio_user_check_version(dev)) { + SPDK_ERRLOG("Check VFIO_USER_VERSION message failed\n"); + close(fd); + return -EFAULT; + } + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT(vfio_user) diff --git a/lib/vfio_user/vfio_user_internal.h b/lib/vfio_user/vfio_user_internal.h new 
file mode 100644 index 000000000..f14cc87a4 --- /dev/null +++ b/lib/vfio_user/vfio_user_internal.h @@ -0,0 +1,96 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _VFIO_INTERNAL_H +#define _VFIO_INTERNAL_H + +#include + +#define VFIO_USER_MAJOR_VER 0 +#define VFIO_USER_MINOR_VER 1 + +/* Maximum memory regions supported */ +#define VFIO_MAXIMUM_MEMORY_REGIONS 128 +/* Maximum sparse memory regions in one BAR region */ +#define VFIO_MAXIMUM_SPARSE_MMAP_REGISONS 8 + +struct vfio_memory_region { + uint64_t iova; + uint64_t size; /* bytes */ + uint64_t vaddr; + uint64_t offset; + int fd; + TAILQ_ENTRY(vfio_memory_region) link; +}; + +struct vfio_sparse_mmaps { + void *mem; + uint64_t offset; + size_t size; +}; + +struct vfio_pci_region { + uint64_t offset; + size_t size; + uint64_t flags; + uint32_t nr_mmaps; + struct vfio_sparse_mmaps mmaps[VFIO_MAXIMUM_SPARSE_MMAP_REGISONS]; +}; + +struct vfio_device { + int fd; + + char name[64]; + char path[PATH_MAX]; + + TAILQ_ENTRY(vfio_device) link; + + /* PCI Regions */ + uint32_t pci_regions; + struct vfio_pci_region regions[VFIO_PCI_NUM_REGIONS + 1]; + uint64_t flags; + + struct spdk_mem_map *map; + TAILQ_HEAD(, vfio_memory_region) mrs_head; + uint32_t nr_mrs; +}; + +int vfio_user_dev_setup(struct vfio_device *dev); +int vfio_user_get_dev_info(struct vfio_device *dev, struct vfio_device_info *dev_info, + size_t buf_len); +int vfio_user_get_dev_region_info(struct vfio_device *dev, struct vfio_region_info *region_info, + size_t buf_len, int *fds, int num_fds); +int vfio_user_dev_dma_map_unmap(struct vfio_device *dev, struct vfio_memory_region *mr, bool map); +int vfio_user_dev_mmio_access(struct vfio_device *dev, uint32_t index, uint64_t offset, size_t len, + void *buf, bool is_write); + +#endif diff --git a/lib/vfio_user/vfio_user_pci.c b/lib/vfio_user/vfio_user_pci.c new file mode 100644 index 000000000..25fab78cc --- /dev/null +++ b/lib/vfio_user/vfio_user_pci.c @@ -0,0 +1,455 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * vfio-user transport for PCI devices. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/log.h" +#include "spdk/env.h" +#include "spdk/queue.h" +#include "spdk/vfio_user_pci.h" + +#include "vfio_user_internal.h" + +static TAILQ_HEAD(, vfio_device) g_vfio_devices = TAILQ_HEAD_INITIALIZER(g_vfio_devices); +static uint32_t g_vfio_dev_id; + +int +spdk_vfio_user_pci_bar_access(struct vfio_device *dev, uint32_t index, uint64_t offset, + size_t len, void *buf, bool is_write) +{ + struct vfio_pci_region *region = &dev->regions[index]; + uint32_t i; + + if (offset + len > region->size) { + return -EINVAL; + } + + if (!region->nr_mmaps || (offset < region->mmaps[0].offset)) { + return vfio_user_dev_mmio_access(dev, index, offset, len, buf, is_write); + } + + /* SPARSE MMAP */ + for (i = 0; i < region->nr_mmaps; i++) { + if ((offset >= region->mmaps[i].offset) && + (offset + len <= region->mmaps[i].offset + region->mmaps[i].size)) { + assert(region->mmaps[i].mem != NULL); + void *bar_addr = region->mmaps[i].mem + offset; + if (is_write) { + memcpy(bar_addr, buf, len); + } else { + memcpy(buf, bar_addr, len); + } + return 0; + } + } + + return -EFAULT; +} + +static int +vfio_add_mr(struct vfio_device *dev, struct vfio_memory_region *mr) +{ + if (dev->nr_mrs == VFIO_MAXIMUM_MEMORY_REGIONS) { + SPDK_ERRLOG("Maximum supported memory regions %d\n", VFIO_MAXIMUM_MEMORY_REGIONS); + return -EINVAL; + } + + TAILQ_INSERT_TAIL(&dev->mrs_head, mr, link); + dev->nr_mrs++; + + SPDK_DEBUGLOG(vfio_pci, "Add memory region: FD %d, VADDR 0x%lx, IOVA 0x%lx, Size 0x%lx\n", + mr->fd, mr->vaddr, mr->iova, mr->size); + + return 0; +} + +static struct vfio_memory_region * +vfio_get_mr(struct vfio_device *dev, uint64_t addr, size_t len) +{ + struct vfio_memory_region *mr, *tmp_mr; + + if (dev->nr_mrs == 0) { + return false; + } + + TAILQ_FOREACH_SAFE(mr, &dev->mrs_head, link, tmp_mr) { + if ((mr->vaddr == addr) || (mr->iova == addr)) { + return mr; + } + } + + return false; +} + +static void +vfio_remove_mr(struct vfio_device *dev, uint64_t 
addr, size_t len) +{ + struct vfio_memory_region *mr, *tmp_mr; + + TAILQ_FOREACH_SAFE(mr, &dev->mrs_head, link, tmp_mr) { + if ((mr->vaddr == addr) || (mr->iova == addr)) { + SPDK_DEBUGLOG(vfio_pci, "Remove memory region: FD %d, VADDR 0x%lx, IOVA 0x%lx, Size 0x%lx\n", + mr->fd, mr->vaddr, mr->iova, mr->size); + TAILQ_REMOVE(&dev->mrs_head, mr, link); + assert(dev->nr_mrs > 0); + dev->nr_mrs--; + free(mr); + return; + } + } +} + +static int +vfio_mr_map_notify(void *cb_ctx, struct spdk_mem_map *map, + enum spdk_mem_map_notify_action action, + void *vaddr, size_t size) +{ + int ret; + struct vfio_device *dev = cb_ctx; + struct vfio_memory_region *mr; + uint64_t offset; + + mr = vfio_get_mr(dev, (uint64_t)vaddr, size); + if (action == SPDK_MEM_MAP_NOTIFY_UNREGISTER) { + if (!mr) { + SPDK_ERRLOG("Memory region VADDR %p doesn't exist\n", vaddr); + return -EEXIST; + } + + ret = vfio_user_dev_dma_map_unmap(dev, mr, false); + /* remove the memory region */ + vfio_remove_mr(dev, (uint64_t)vaddr, size); + return ret; + } + + /* SPDK_MEM_MAP_NOTIFY_REGISTER */ + if (mr != NULL) { + SPDK_ERRLOG("Memory region VADDR 0x%lx already exist\n", mr->vaddr); + return -EEXIST; + } + + mr = calloc(1, sizeof(*mr)); + if (mr == NULL) { + return -ENOMEM; + } + mr->vaddr = (uint64_t)(uintptr_t)vaddr; + mr->iova = mr->vaddr; + mr->size = size; + mr->fd = spdk_mem_get_fd_and_offset(vaddr, &offset); + if (mr->fd < 0) { + SPDK_ERRLOG("Error to get the memory map offset\n"); + free(mr); + return -EFAULT; + } + mr->offset = offset; + + ret = vfio_add_mr(dev, mr); + if (ret) { + free(mr); + return ret; + } + + return vfio_user_dev_dma_map_unmap(dev, mr, true); +} + +static int +vfio_device_dma_map(struct vfio_device *device) +{ + const struct spdk_mem_map_ops vfio_map_ops = { + .notify_cb = vfio_mr_map_notify, + .are_contiguous = NULL, + }; + + device->map = spdk_mem_map_alloc((uint64_t)NULL, &vfio_map_ops, device); + if (device->map == NULL) { + SPDK_ERRLOG("Failed to allocate memory map 
structure\n"); + return -EFAULT; + } + + return 0; +} + +static struct vfio_info_cap_header * +vfio_device_get_info_cap(struct vfio_region_info *info, int cap) +{ + struct vfio_info_cap_header *h; + size_t offset; + + if ((info->flags & VFIO_REGION_INFO_FLAG_CAPS) == 0) { + return NULL; + } + + offset = info->cap_offset; + while (offset != 0) { + h = (struct vfio_info_cap_header *)((uintptr_t)info + offset); + if (h->id == cap) { + return h; + } + offset = h->next; + } + + return NULL; +} + +static int +vfio_device_setup_sparse_mmaps(struct vfio_device *device, int index, + struct vfio_region_info *info, int *fds) +{ + struct vfio_info_cap_header *hdr; + struct vfio_region_info_cap_sparse_mmap *sparse; + struct vfio_pci_region *region = &device->regions[index]; + uint32_t i, j = 0; + int prot = 0; + + hdr = vfio_device_get_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP); + if (!hdr) { + SPDK_NOTICELOG("Device doesn't have sparse mmap\n"); + return -EEXIST; + } + + sparse = SPDK_CONTAINEROF(hdr, struct vfio_region_info_cap_sparse_mmap, header); + for (i = 0; i < sparse->nr_areas; i++) { + if (sparse->areas[i].size) { + region->mmaps[j].offset = sparse->areas[i].offset; + region->mmaps[j].size = sparse->areas[i].size; + prot |= info->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0; + prot |= info->flags & VFIO_REGION_INFO_FLAG_WRITE ? 
PROT_WRITE : 0; + if (*fds) { + region->mmaps[j].mem = mmap(NULL, region->mmaps[j].size, prot, MAP_SHARED, + fds[i], region->offset + region->mmaps[j].offset); + if (region->mmaps[j].mem == MAP_FAILED) { + SPDK_ERRLOG("Device SPARSE MMAP failed\n"); + return -EIO; + } + } else { + SPDK_NOTICELOG("No valid fd, skip mmap for bar %d region %u\n", index, i); + } + SPDK_NOTICELOG("Sparse region %u, Size 0x%llx, Offset 0x%llx, Map addr %p\n", + i, sparse->areas[i].size, sparse->areas[i].offset, + region->mmaps[j].mem); + j++; + } + } + device->regions[index].nr_mmaps = j; + + return 0; +} + +static int +vfio_device_map_region(struct vfio_device *device, struct vfio_pci_region *region, int fd) +{ + int prot = 0; + + prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0; + prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0; + + region->mmaps[0].offset = 0; + region->mmaps[0].size = region->size; + + region->mmaps[0].mem = mmap(NULL, region->size, prot, MAP_SHARED, + fd, region->offset); + if (region->mmaps[0].mem == MAP_FAILED) { + SPDK_ERRLOG("Device Region MMAP failed\n"); + return -EFAULT; + } + SPDK_DEBUGLOG(vfio_pci, "Memory mapped to %p\n", region->mmaps[0].mem); + region->nr_mmaps = 1; + + return 0; +} + +static int +vfio_device_map_bars_and_config_region(struct vfio_device *device) +{ + uint32_t i; + int ret; + size_t len = 4096; + int fds[VFIO_MAXIMUM_SPARSE_MMAP_REGISONS]; + struct vfio_region_info *info; + uint8_t *buf; + + buf = calloc(1, len); + if (!buf) { + return -ENOMEM; + } + + info = (struct vfio_region_info *)buf; + for (i = 0; i < device->pci_regions; i++) { + memset(info, 0, len); + memset(fds, 0, sizeof(fds)); + + info->index = i; + ret = vfio_user_get_dev_region_info(device, info, len, fds, VFIO_MAXIMUM_SPARSE_MMAP_REGISONS); + if (ret) { + SPDK_ERRLOG("Device setup bar %d failed\n", ret); + free(buf); + return ret; + } + + device->regions[i].size = info->size; + device->regions[i].offset = info->offset; + 
device->regions[i].flags = info->flags; + + SPDK_DEBUGLOG(vfio_pci, "Bar %d, Size 0x%llx, Offset 0x%llx, Flags 0x%x, Cap offset %u\n", + i, info->size, info->offset, info->flags, info->cap_offset); + + /* Setup MMAP if any */ + if (info->size && (info->flags & VFIO_REGION_INFO_FLAG_MMAP)) { + /* try to map sparse memory region first */ + ret = vfio_device_setup_sparse_mmaps(device, i, info, fds); + if (ret < 0) { + ret = vfio_device_map_region(device, &device->regions[i], fds[0]); + } + + if (ret != 0) { + SPDK_ERRLOG("Setup Device %s region %d failed\n", device->name, i); + free(buf); + return ret; + } + } + } + + free(buf); + return 0; +} + +static void +vfio_device_unmap_bars(struct vfio_device *dev) +{ + uint32_t i, j; + struct vfio_pci_region *region; + + for (i = 0; i < dev->pci_regions; i++) { + region = &dev->regions[i]; + for (j = 0; j < region->nr_mmaps; j++) { + if (region->mmaps[j].mem) { + munmap(region->mmaps[j].mem, region->mmaps[j].size); + } + } + } + memset(dev->regions, 0, sizeof(dev->regions)); +} + +struct vfio_device * +spdk_vfio_user_setup(const char *path) +{ + int ret; + struct vfio_device *device = NULL; + struct vfio_device_info dev_info = {}; + + device = calloc(1, sizeof(*device)); + if (!device) { + return NULL; + } + TAILQ_INIT(&device->mrs_head); + snprintf(device->path, PATH_MAX, "%s", path); + snprintf(device->name, sizeof(device->name), "vfio-user%u", g_vfio_dev_id++); + + ret = vfio_user_dev_setup(device); + if (ret) { + free(device); + SPDK_ERRLOG("Error to setup vfio-user via path %s\n", path); + return NULL; + } + + ret = vfio_user_get_dev_info(device, &dev_info, sizeof(dev_info)); + if (ret) { + SPDK_ERRLOG("Device get info failed\n"); + close(device->fd); + goto cleanup; + } + device->pci_regions = dev_info.num_regions; + device->flags = dev_info.flags; + + ret = vfio_device_map_bars_and_config_region(device); + if (ret) { + close(device->fd); + goto cleanup; + } + + /* Register DMA Region */ + ret = 
vfio_device_dma_map(device); + if (ret) { + SPDK_ERRLOG("Container DMA map failed\n"); + close(device->fd); + goto cleanup; + } + + SPDK_NOTICELOG("Device %s, Path %s Setup Successfully\n", device->name, device->path); + TAILQ_INSERT_TAIL(&g_vfio_devices, device, link); + + return device; + +cleanup: + free(device); + return NULL; +} + +void +spdk_vfio_user_release(struct vfio_device *dev) +{ + SPDK_NOTICELOG("Release file %s\n", dev->path); + + vfio_device_unmap_bars(dev); + if (dev->map) { + spdk_mem_map_free(&dev->map); + } + close(dev->fd); + + free(dev); +} + +void * +spdk_vfio_user_get_bar_addr(struct vfio_device *dev, uint32_t index, uint64_t offset, uint32_t len) +{ + struct vfio_pci_region *region = &dev->regions[index]; + uint32_t i; + + if (!region->size || !(region->flags & VFIO_REGION_INFO_FLAG_MMAP)) { + return NULL; + } + + for (i = 0; i < region->nr_mmaps; i++) { + if (region->mmaps[i].mem && (region->mmaps[i].offset <= offset) && + ((offset + len) <= (region->mmaps[i].offset + region->mmaps[i].size))) { + return (void *)((uintptr_t)region->mmaps[i].mem + offset - region->mmaps[i].offset); + } + } + + return NULL; +} + +SPDK_LOG_REGISTER_COMPONENT(vfio_pci) diff --git a/mk/spdk.lib_deps.mk b/mk/spdk.lib_deps.mk index 7a70fe43e..a6c7825b7 100644 --- a/mk/spdk.lib_deps.mk +++ b/mk/spdk.lib_deps.mk @@ -49,6 +49,9 @@ DEPDIRS-idxd := log util DEPDIRS-sock := log $(JSON_LIBS) DEPDIRS-util := log DEPDIRS-vmd := log +ifeq ($(CONFIG_VFIO_USER),y) +DEPDIRS-vfio_user := log +endif DEPDIRS-conf := log util DEPDIRS-json := log util @@ -60,6 +63,9 @@ DEPDIRS-nvme := log sock util ifeq ($(CONFIG_RDMA),y) DEPDIRS-nvme += rdma endif +ifeq ($(CONFIG_VFIO_USER),y) +DEPDIRS-nvme += vfio_user +endif DEPDIRS-blob := log util thread DEPDIRS-accel := log util thread json diff --git a/mk/spdk.modules.mk b/mk/spdk.modules.mk index c100fb434..415a3b2d7 100644 --- a/mk/spdk.modules.mk +++ b/mk/spdk.modules.mk @@ -41,6 +41,10 @@ INTR_BLOCKDEV_MODULES_LIST = bdev_malloc 
bdev_passthru bdev_error bdev_gpt bdev_ # Logical volume, blobstore and blobfs can directly run in both interrupt mode and poll mode. INTR_BLOCKDEV_MODULES_LIST += bdev_lvol blobfs blobfs_bdev blob_bdev blob lvol +ifeq ($(CONFIG_VFIO_USER),y) +BLOCKDEV_MODULES_LIST += vfio_user +endif + ifeq ($(CONFIG_CRYPTO),y) BLOCKDEV_MODULES_LIST += bdev_crypto endif diff --git a/test/nvmf/nvmf.sh b/test/nvmf/nvmf.sh index be4197719..fc4509cd0 100755 --- a/test/nvmf/nvmf.sh +++ b/test/nvmf/nvmf.sh @@ -40,6 +40,10 @@ run_test "nvmf_bdevio" test/nvmf/target/bdevio.sh "${TEST_ARGS[@]}" run_test "nvmf_invalid" test/nvmf/target/invalid.sh "${TEST_ARGS[@]}" run_test "nvmf_abort" test/nvmf/target/abort.sh "${TEST_ARGS[@]}" +if grep -q '#define SPDK_CONFIG_VFIO_USER 1' $rootdir/include/spdk/config.h; then + run_test "nvmf_vfio_user" test/nvmf/target/nvmf_vfio_user.sh "${TEST_ARGS[@]}" +fi + if ! check_ip_is_soft_roce $NVMF_FIRST_TARGET_IP; then # Soft-RoCE will return invalid values in the WC field after a qp has been # destroyed which lead to NULL pointer references not seen in real hardware. diff --git a/test/nvmf/target/nvmf_vfio_user.sh b/test/nvmf/target/nvmf_vfio_user.sh new file mode 100755 index 000000000..4c52a0e25 --- /dev/null +++ b/test/nvmf/target/nvmf_vfio_user.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash + +testdir=$(readlink -f $(dirname $0)) +rootdir=$(readlink -f $testdir/../../..) +source $rootdir/test/common/autotest_common.sh +source $rootdir/test/nvmf/common.sh + +MALLOC_BDEV_SIZE=512 +MALLOC_BLOCK_SIZE=512 + +rpc_py="$rootdir/scripts/rpc.py" + +export TEST_TRANSPORT=VFIOUSER + +rm -rf /var/run/muser +rm -rf /dev/shm/muser + +mkdir -p /var/run/muser +mkdir -p /var/run/muser/iommu_group +mkdir -p /var/run/muser/domain/muser0/8 +mkdir -p /dev/shm/muser/muser0 + +# Start the target +"${NVMF_APP[@]}" -m 0x1 & +nvmfpid=$! 
+echo "Process pid: $nvmfpid" + +trap 'killprocess $nvmfpid; exit 1' SIGINT SIGTERM EXIT +waitforlisten $nvmfpid + +sleep 1 + +$rpc_py nvmf_create_transport -t VFIOUSER + +$rpc_py bdev_malloc_create $MALLOC_BDEV_SIZE $MALLOC_BLOCK_SIZE -b Malloc0 +$rpc_py nvmf_create_subsystem nqn.2019-07.io.spdk:cnode0 -a -s SPDK0 +$rpc_py nvmf_subsystem_add_ns nqn.2019-07.io.spdk:cnode0 Malloc0 +$rpc_py nvmf_subsystem_add_listener nqn.2019-07.io.spdk:cnode0 -t VFIOUSER -a "/var/run/muser/domain/muser0/8" -s 0 + +ln -s /var/run/muser/domain/muser0/8 /var/run/muser/domain/muser0/8/iommu_group +ln -s /var/run/muser/domain/muser0/8 /var/run/muser/iommu_group/8 +ln -s /var/run/muser/domain/muser0/8/bar0 /dev/shm/muser/muser0/bar0 + +$SPDK_EXAMPLE_DIR/identify -r 'trtype:VFIOUSER traddr:/var/run/muser/domain/muser0/8' -g -L nvme -L nvme_vfio -L vfio_pci +sleep 1 +$SPDK_EXAMPLE_DIR/perf -r 'trtype:VFIOUSER traddr:/var/run/muser/domain/muser0/8' -s 256 -g -q 128 -o 4096 -w read -t 10 -c 0x2 +sleep 1 +$SPDK_EXAMPLE_DIR/perf -r 'trtype:VFIOUSER traddr:/var/run/muser/domain/muser0/8' -s 256 -g -q 128 -o 4096 -w write -t 10 -c 0x2 +sleep 1 +$SPDK_EXAMPLE_DIR/reconnect -r 'trtype:VFIOUSER traddr:/var/run/muser/domain/muser0/8' -g -q 32 -o 4096 -w randrw -M 50 -t 10 -c 0xE +sleep 1 + +killprocess $nvmfpid + +rm -rf /var/run/muser +rm -rf /dev/shm/muser + +trap - SIGINT SIGTERM EXIT