Spdk/lib/nvmf/vfio_user.c
Changpeng Liu addfab0101 nvmf/vfio-user: start an MMIO poller to poll MMIO access
Previously we polled the MMIO callbacks in the context of the admin queue's
poll group. This change starts a dedicated poller for MMIO handling, so the
group poll only processes NVMe commands while the MMIO poller services MMIO
accesses.

This is useful when doing live migration: because the migration region
defined by VFIO is a BAR region, we should stop polling the queue pairs but
still acknowledge MMIO accesses during the migration.

Change-Id: I63bac44889cbe0c31d47599810aab8335dfd4ff5
Signed-off-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/7251
Community-CI: Broadcom CI
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: <dongx.yi@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2021-04-15 11:23:47 +00:00

/*-
* BSD LICENSE
* Copyright (c) Intel Corporation. All rights reserved.
* Copyright (c) 2019, Nutanix Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* NVMe over vfio-user transport
*/
#include <vfio-user/libvfio-user.h>
#include <vfio-user/pci_defs.h>
#include "spdk/barrier.h"
#include "spdk/stdinc.h"
#include "spdk/assert.h"
#include "spdk/thread.h"
#include "spdk/nvmf_transport.h"
#include "spdk/sock.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/log.h"
#include "transport.h"
#include "nvmf_internal.h"
#define NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH 256
#define NVMF_VFIO_USER_DEFAULT_AQ_DEPTH 32
#define NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR 64
#define NVMF_VFIO_USER_DEFAULT_IN_CAPSULE_DATA_SIZE 0
#define NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE 131072
#define NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE 131072
#define NVMF_VFIO_USER_DEFAULT_NUM_SHARED_BUFFERS 512 /* internal buf size */
#define NVMF_VFIO_USER_DEFAULT_BUFFER_CACHE_SIZE 0
#define NVMF_VFIO_USER_DOORBELLS_OFFSET 0x1000
#define NVMF_VFIO_USER_DOORBELLS_SIZE 0x1000
#define NVME_REG_CFG_SIZE 0x1000
#define NVME_REG_BAR0_SIZE 0x4000
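/*
* BAR0 layout: the NVMe register set occupies the first 4KiB and the
* doorbells the second 4KiB (NVMF_VFIO_USER_DOORBELLS_OFFSET/SIZE); the
* doorbell page is also exposed to the client as a sparse mmap region backed
* by the per-endpoint bar0 file.
*/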
#define NVME_IRQ_INTX_NUM 1
#define NVME_IRQ_MSIX_NUM NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR
struct nvmf_vfio_user_req;
struct nvmf_vfio_user_qpair;
typedef int (*nvmf_vfio_user_req_cb_fn)(struct nvmf_vfio_user_req *req, void *cb_arg);
#define NVMF_VFIO_USER_MDTS 32
#define NVMF_VFIO_USER_MAX_IOVECS (NVMF_VFIO_USER_MDTS + 1)
struct nvmf_vfio_user_req {
struct spdk_nvmf_request req;
struct spdk_nvme_cpl rsp;
struct spdk_nvme_cmd cmd;
uint16_t cid;
nvmf_vfio_user_req_cb_fn cb_fn;
void *cb_arg;
dma_sg_t sg[NVMF_VFIO_USER_MAX_IOVECS];
struct iovec iov[NVMF_VFIO_USER_MAX_IOVECS];
uint8_t iovcnt;
TAILQ_ENTRY(nvmf_vfio_user_req) link;
};
/*
* An NVMe queue.
*/
struct nvme_q {
bool is_cq;
void *addr;
dma_sg_t sg;
struct iovec iov;
uint32_t size;
uint64_t prp1;
union {
struct {
uint32_t head;
/* multiple SQs can be mapped to the same CQ */
uint16_t cqid;
};
struct {
uint32_t tail;
uint16_t iv;
bool ien;
};
};
};
enum nvmf_vfio_user_qpair_state {
VFIO_USER_QPAIR_UNINITIALIZED = 0,
VFIO_USER_QPAIR_ACTIVE,
VFIO_USER_QPAIR_DELETED,
VFIO_USER_QPAIR_INACTIVE,
VFIO_USER_QPAIR_ERROR,
};
struct nvmf_vfio_user_qpair {
struct spdk_nvmf_qpair qpair;
struct spdk_nvmf_transport_poll_group *group;
struct nvmf_vfio_user_ctrlr *ctrlr;
struct nvmf_vfio_user_req *reqs_internal;
uint16_t qsize;
struct nvme_q cq;
struct nvme_q sq;
enum nvmf_vfio_user_qpair_state state;
TAILQ_HEAD(, nvmf_vfio_user_req) reqs;
TAILQ_ENTRY(nvmf_vfio_user_qpair) link;
};
struct nvmf_vfio_user_poll_group {
struct spdk_nvmf_transport_poll_group group;
TAILQ_HEAD(, nvmf_vfio_user_qpair) qps;
};
struct nvmf_vfio_user_ctrlr {
struct nvmf_vfio_user_endpoint *endpoint;
struct nvmf_vfio_user_transport *transport;
/* True when the socket connection is active */
bool ready;
/* Number of connected queue pairs */
uint32_t num_connected_qps;
struct spdk_thread *thread;
struct spdk_poller *mmio_poller;
uint16_t cntlid;
struct nvmf_vfio_user_qpair *qp[NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR];
TAILQ_ENTRY(nvmf_vfio_user_ctrlr) link;
volatile uint32_t *doorbells;
/* internal CSTS.CFS register for vfio-user fatal errors */
uint32_t cfs : 1;
};
struct nvmf_vfio_user_endpoint {
vfu_ctx_t *vfu_ctx;
struct msixcap *msix;
vfu_pci_config_space_t *pci_config_space;
int fd;
volatile uint32_t *doorbells;
struct spdk_nvme_transport_id trid;
const struct spdk_nvmf_subsystem *subsystem;
struct nvmf_vfio_user_ctrlr *ctrlr;
pthread_mutex_t lock;
TAILQ_ENTRY(nvmf_vfio_user_endpoint) link;
};
struct nvmf_vfio_user_transport {
struct spdk_nvmf_transport transport;
pthread_mutex_t lock;
TAILQ_HEAD(, nvmf_vfio_user_endpoint) endpoints;
TAILQ_HEAD(, nvmf_vfio_user_qpair) new_qps;
};
/*
* function prototypes
*/
static volatile uint32_t *
hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q);
static volatile uint32_t *
tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q);
static int
nvmf_vfio_user_req_free(struct spdk_nvmf_request *req);
static struct nvmf_vfio_user_req *
get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair);
static int
post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
struct nvme_q *cq, uint32_t cdw0, uint16_t sc,
uint16_t sct);
static char *
endpoint_id(struct nvmf_vfio_user_endpoint *endpoint)
{
return endpoint->trid.traddr;
}
static char *
ctrlr_id(struct nvmf_vfio_user_ctrlr *ctrlr)
{
if (!ctrlr || !ctrlr->endpoint) {
return "Null Ctrlr";
}
return endpoint_id(ctrlr->endpoint);
}
static uint16_t
io_q_id(struct nvme_q *q)
{
struct nvmf_vfio_user_qpair *vfio_user_qpair;
assert(q);
if (q->is_cq) {
vfio_user_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, cq);
} else {
vfio_user_qpair = SPDK_CONTAINEROF(q, struct nvmf_vfio_user_qpair, sq);
}
assert(vfio_user_qpair);
return vfio_user_qpair->qpair.qid;
}
static void
fail_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
{
assert(ctrlr != NULL);
if (ctrlr->cfs == 0) {
SPDK_ERRLOG(":%s failing controller\n", ctrlr_id(ctrlr));
}
ctrlr->ready = false;
ctrlr->cfs = 1U;
}
static bool
ctrlr_interrupt_enabled(struct nvmf_vfio_user_ctrlr *ctrlr)
{
assert(ctrlr != NULL);
assert(ctrlr->endpoint != NULL);
vfu_pci_config_space_t *pci = ctrlr->endpoint->pci_config_space;
return (!pci->hdr.cmd.id || ctrlr->endpoint->msix->mxc.mxe);
}
static void
nvmf_vfio_user_destroy_endpoint(struct nvmf_vfio_user_endpoint *endpoint)
{
if (endpoint->doorbells) {
munmap((void *)endpoint->doorbells, NVMF_VFIO_USER_DOORBELLS_SIZE);
}
if (endpoint->fd > 0) {
close(endpoint->fd);
}
vfu_destroy_ctx(endpoint->vfu_ctx);
pthread_mutex_destroy(&endpoint->lock);
free(endpoint);
}
/* called when process exits */
static int
nvmf_vfio_user_destroy(struct spdk_nvmf_transport *transport,
spdk_nvmf_transport_destroy_done_cb cb_fn, void *cb_arg)
{
struct nvmf_vfio_user_transport *vu_transport;
struct nvmf_vfio_user_endpoint *endpoint, *tmp;
SPDK_DEBUGLOG(nvmf_vfio, "destroy transport\n");
vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
transport);
(void)pthread_mutex_destroy(&vu_transport->lock);
TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
nvmf_vfio_user_destroy_endpoint(endpoint);
}
free(vu_transport);
if (cb_fn) {
cb_fn(cb_arg);
}
return 0;
}
static struct spdk_nvmf_transport *
nvmf_vfio_user_create(struct spdk_nvmf_transport_opts *opts)
{
struct nvmf_vfio_user_transport *vu_transport;
int err;
vu_transport = calloc(1, sizeof(*vu_transport));
if (vu_transport == NULL) {
SPDK_ERRLOG("Transport alloc fail: %m\n");
return NULL;
}
err = pthread_mutex_init(&vu_transport->lock, NULL);
if (err != 0) {
SPDK_ERRLOG("Pthread initialisation failed (%d)\n", err);
goto err;
}
TAILQ_INIT(&vu_transport->endpoints);
TAILQ_INIT(&vu_transport->new_qps);
return &vu_transport->transport;
err:
free(vu_transport);
return NULL;
}
static uint16_t
max_queue_size(struct nvmf_vfio_user_ctrlr const *ctrlr)
{
assert(ctrlr != NULL);
assert(ctrlr->qp[0] != NULL);
assert(ctrlr->qp[0]->qpair.ctrlr != NULL);
return ctrlr->qp[0]->qpair.ctrlr->vcprop.cap.bits.mqes + 1;
}
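/*
* Maps a single guest physical address range into the local process via the
* libvfio-user DMA API: the address is translated to one scatter-gather entry
* and then mapped, returning the resulting virtual address or NULL on failure.
*/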
static void *
map_one(vfu_ctx_t *ctx, uint64_t addr, uint64_t len, dma_sg_t *sg, struct iovec *iov)
{
int ret;
assert(ctx != NULL);
assert(sg != NULL);
assert(iov != NULL);
ret = vfu_addr_to_sg(ctx, addr, len, sg, 1, PROT_READ | PROT_WRITE);
if (ret != 1) {
return NULL;
}
ret = vfu_map_sg(ctx, sg, iov, 1);
if (ret != 0) {
return NULL;
}
assert(iov->iov_base != NULL);
return iov->iov_base;
}
static uint32_t
sq_head(struct nvmf_vfio_user_qpair *qpair)
{
assert(qpair != NULL);
return qpair->sq.head;
}
static void
sqhd_advance(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair)
{
assert(ctrlr != NULL);
assert(qpair != NULL);
qpair->sq.head = (qpair->sq.head + 1) % qpair->sq.size;
}
static void
insert_queue(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q,
const bool is_cq, const uint16_t id)
{
struct nvme_q *_q;
struct nvmf_vfio_user_qpair *qpair;
assert(ctrlr != NULL);
assert(q != NULL);
qpair = ctrlr->qp[id];
q->is_cq = is_cq;
if (is_cq) {
_q = &qpair->cq;
*_q = *q;
*hdbl(ctrlr, _q) = 0;
} else {
_q = &qpair->sq;
*_q = *q;
*tdbl(ctrlr, _q) = 0;
}
}
static int
asq_map(struct nvmf_vfio_user_ctrlr *ctrlr)
{
struct nvme_q q = {};
const struct spdk_nvmf_registers *regs;
assert(ctrlr != NULL);
assert(ctrlr->qp[0] != NULL);
assert(ctrlr->qp[0]->sq.addr == NULL);
/* XXX ctrlr->asq == 0 is a valid memory address */
regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
q.size = regs->aqa.bits.asqs + 1;
q.head = ctrlr->doorbells[0] = 0;
q.cqid = 0;
q.addr = map_one(ctrlr->endpoint->vfu_ctx, regs->asq,
q.size * sizeof(struct spdk_nvme_cmd), &q.sg, &q.iov);
if (q.addr == NULL) {
return -1;
}
memset(q.addr, 0, q.size * sizeof(struct spdk_nvme_cmd));
insert_queue(ctrlr, &q, false, 0);
return 0;
}
static uint16_t
cq_next(struct nvme_q *q)
{
assert(q != NULL);
assert(q->is_cq);
return (q->tail + 1) % q->size;
}
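/*
* Index of a queue's doorbell in the doorbell array: with DSTRD fixed to 0
* for NVMf, the SQ tail doorbell of queue pair `qid` lives at index 2*qid and
* the CQ head doorbell at 2*qid + 1.
*/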
static int
queue_index(uint16_t qid, int is_cq)
{
return (qid * 2) + is_cq;
}
static volatile uint32_t *
tdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
{
assert(ctrlr != NULL);
assert(q != NULL);
assert(!q->is_cq);
return &ctrlr->doorbells[queue_index(io_q_id(q), false)];
}
static volatile uint32_t *
hdbl(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
{
assert(ctrlr != NULL);
assert(q != NULL);
assert(q->is_cq);
return &ctrlr->doorbells[queue_index(io_q_id(q), true)];
}
static bool
cq_is_full(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvme_q *q)
{
assert(ctrlr != NULL);
assert(q != NULL);
return cq_next(q) == *hdbl(ctrlr, q);
}
static void
cq_tail_advance(struct nvme_q *q)
{
assert(q != NULL);
q->tail = cq_next(q);
}
static int
acq_map(struct nvmf_vfio_user_ctrlr *ctrlr)
{
struct nvme_q q = {};
const struct spdk_nvmf_registers *regs;
assert(ctrlr != NULL);
assert(ctrlr->qp[0] != NULL);
assert(ctrlr->qp[0]->cq.addr == NULL);
regs = spdk_nvmf_ctrlr_get_regs(ctrlr->qp[0]->qpair.ctrlr);
assert(regs != NULL);
q.size = regs->aqa.bits.acqs + 1;
q.tail = 0;
q.addr = map_one(ctrlr->endpoint->vfu_ctx, regs->acq,
q.size * sizeof(struct spdk_nvme_cpl), &q.sg, &q.iov);
if (q.addr == NULL) {
return -1;
}
memset(q.addr, 0, q.size * sizeof(struct spdk_nvme_cpl));
q.is_cq = true;
q.ien = true;
insert_queue(ctrlr, &q, true, 0);
return 0;
}
static void *
_map_one(void *prv, uint64_t addr, uint64_t len)
{
struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)prv;
struct spdk_nvmf_qpair *qpair;
struct nvmf_vfio_user_req *vu_req;
struct nvmf_vfio_user_qpair *vu_qpair;
void *ret;
assert(req != NULL);
qpair = req->qpair;
vu_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
assert(vu_req->iovcnt < NVMF_VFIO_USER_MAX_IOVECS);
ret = map_one(vu_qpair->ctrlr->endpoint->vfu_ctx, addr, len,
&vu_req->sg[vu_req->iovcnt],
&vu_req->iov[vu_req->iovcnt]);
if (spdk_likely(ret != NULL)) {
vu_req->iovcnt++;
}
return ret;
}
static int
vfio_user_map_prps(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req,
struct iovec *iov, uint32_t length)
{
/* Map the PRP list from guest physical memory to
* local virtual memory addresses.
*/
return spdk_nvme_map_prps(req, &req->cmd->nvme_cmd, iov, length,
4096, _map_one);
}
static struct spdk_nvmf_request *
get_nvmf_req(struct nvmf_vfio_user_qpair *qp);
static int
handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
struct spdk_nvmf_request *req);
/*
* Posts a CQE in the completion queue.
*
* @ctrlr: the vfio-user controller
* @cmd: the NVMe command for which the completion is posted
* @cq: the completion queue
* @cdw0: cdw0 as reported by NVMf (only for SPDK_NVME_OPC_GET/SET_FEATURES)
* @sc: the NVMe CQE status code
* @sct: the NVMe CQE status code type
*/
static int
post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
struct nvme_q *cq, uint32_t cdw0, uint16_t sc,
uint16_t sct)
{
struct spdk_nvme_cpl *cpl;
uint16_t qid;
int err;
assert(ctrlr != NULL);
assert(cmd != NULL);
qid = io_q_id(cq);
if (ctrlr->qp[0]->qpair.ctrlr->vcprop.csts.bits.shst != SPDK_NVME_SHST_NORMAL) {
SPDK_DEBUGLOG(nvmf_vfio,
"%s: ignore completion SQ%d cid=%d status=%#x\n",
ctrlr_id(ctrlr), qid, cmd->cid, sc);
return 0;
}
if (cq_is_full(ctrlr, cq)) {
SPDK_ERRLOG("%s: CQ%d full (tail=%d, head=%d)\n",
ctrlr_id(ctrlr), qid, cq->tail, *hdbl(ctrlr, cq));
return -1;
}
cpl = ((struct spdk_nvme_cpl *)cq->addr) + cq->tail;
SPDK_DEBUGLOG(nvmf_vfio,
"%s: request complete SQ%d cid=%d status=%#x SQ head=%#x CQ tail=%#x\n",
ctrlr_id(ctrlr), qid, cmd->cid, sc, ctrlr->qp[qid]->sq.head,
cq->tail);
if (qid == 0) {
switch (cmd->opc) {
case SPDK_NVME_OPC_SET_FEATURES:
case SPDK_NVME_OPC_GET_FEATURES:
cpl->cdw0 = cdw0;
break;
}
}
assert(ctrlr->qp[qid] != NULL);
cpl->sqhd = ctrlr->qp[qid]->sq.head;
cpl->cid = cmd->cid;
cpl->status.dnr = 0x0;
cpl->status.m = 0x0;
cpl->status.sct = sct;
cpl->status.p = ~cpl->status.p;
cpl->status.sc = sc;
cq_tail_advance(cq);
/*
* This function now executes in SPDK thread context, but we
* might be triggering interrupts from vfio-user thread context, so
* check for race conditions.
*/
if (ctrlr_interrupt_enabled(ctrlr) && cq->ien) {
err = vfu_irq_trigger(ctrlr->endpoint->vfu_ctx, cq->iv);
if (err != 0) {
SPDK_ERRLOG("%s: failed to trigger interrupt: %m\n",
ctrlr_id(ctrlr));
return err;
}
}
return 0;
}
static struct nvme_q *
lookup_io_q(struct nvmf_vfio_user_ctrlr *ctrlr, const uint16_t qid, const bool is_cq)
{
struct nvme_q *q;
assert(ctrlr != NULL);
if (qid > NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR) {
return NULL;
}
if (ctrlr->qp[qid] == NULL) {
return NULL;
}
if (is_cq) {
q = &ctrlr->qp[qid]->cq;
} else {
q = &ctrlr->qp[qid]->sq;
}
if (q->addr == NULL) {
return NULL;
}
return q;
}
static void
unmap_qp(struct nvmf_vfio_user_qpair *qp)
{
struct nvmf_vfio_user_ctrlr *ctrlr;
if (qp->ctrlr == NULL) {
return;
}
ctrlr = qp->ctrlr;
SPDK_DEBUGLOG(nvmf_vfio, "%s: destroy I/O QP%d\n",
ctrlr_id(ctrlr), qp->qpair.qid);
if (qp->sq.addr != NULL) {
vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, &qp->sq.sg, &qp->sq.iov, 1);
qp->sq.addr = NULL;
}
if (qp->cq.addr != NULL) {
vfu_unmap_sg(ctrlr->endpoint->vfu_ctx, &qp->cq.sg, &qp->cq.iov, 1);
qp->cq.addr = NULL;
}
}
/*
* TODO we can immediately remove the QP from the list because this function
* is now executed by the SPDK thread.
*/
static void
destroy_qp(struct nvmf_vfio_user_ctrlr *ctrlr, uint16_t qid)
{
struct nvmf_vfio_user_qpair *qpair;
if (ctrlr == NULL) {
return;
}
qpair = ctrlr->qp[qid];
if (qpair == NULL) {
return;
}
SPDK_DEBUGLOG(nvmf_vfio, "%s: destroy QP%d=%p\n", ctrlr_id(ctrlr),
qid, qpair);
unmap_qp(qpair);
free(qpair->reqs_internal);
free(qpair);
ctrlr->qp[qid] = NULL;
}
/* This function can only fail because of memory allocation errors. */
static int
init_qp(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_transport *transport,
const uint16_t qsize, const uint16_t id)
{
int err = 0, i;
struct nvmf_vfio_user_qpair *qpair;
struct nvmf_vfio_user_req *vu_req;
struct spdk_nvmf_request *req;
assert(ctrlr != NULL);
assert(transport != NULL);
qpair = calloc(1, sizeof(*qpair));
if (qpair == NULL) {
return -ENOMEM;
}
qpair->qpair.qid = id;
qpair->qpair.transport = transport;
qpair->ctrlr = ctrlr;
qpair->qsize = qsize;
TAILQ_INIT(&qpair->reqs);
qpair->reqs_internal = calloc(qsize, sizeof(struct nvmf_vfio_user_req));
if (qpair->reqs_internal == NULL) {
SPDK_ERRLOG("%s: error allocating reqs: %m\n", ctrlr_id(ctrlr));
err = -ENOMEM;
goto out;
}
for (i = 0; i < qsize; i++) {
vu_req = &qpair->reqs_internal[i];
req = &vu_req->req;
vu_req->cid = i;
req->qpair = &qpair->qpair;
req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp;
req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd;
TAILQ_INSERT_TAIL(&qpair->reqs, vu_req, link);
}
ctrlr->qp[id] = qpair;
out:
if (err != 0) {
free(qpair);
}
return err;
}
/*
* Creates a completion or submission I/O queue. Returns 0 on success, -errno
* on error.
*
* XXX SPDK thread context.
*/
static int
handle_create_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
struct spdk_nvme_cmd *cmd, const bool is_cq)
{
size_t entry_size;
uint16_t sc = SPDK_NVME_SC_SUCCESS;
uint16_t sct = SPDK_NVME_SCT_GENERIC;
int err = 0;
struct nvme_q io_q = {};
assert(ctrlr != NULL);
assert(cmd != NULL);
SPDK_DEBUGLOG(nvmf_vfio,
"%s: create I/O %cQ%d: QSIZE=%#x\n", ctrlr_id(ctrlr),
is_cq ? 'C' : 'S', cmd->cdw10_bits.create_io_q.qid,
cmd->cdw10_bits.create_io_q.qsize);
if (cmd->cdw10_bits.create_io_q.qid >= NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR) {
SPDK_ERRLOG("%s: invalid QID=%d, max=%d\n", ctrlr_id(ctrlr),
cmd->cdw10_bits.create_io_q.qid,
NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR);
sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
goto out;
}
if (lookup_io_q(ctrlr, cmd->cdw10_bits.create_io_q.qid, is_cq)) {
SPDK_ERRLOG("%s: %cQ%d already exists\n", ctrlr_id(ctrlr),
is_cq ? 'C' : 'S', cmd->cdw10_bits.create_io_q.qid);
sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
goto out;
}
/* TODO break rest of this function into smaller functions */
if (is_cq) {
entry_size = sizeof(struct spdk_nvme_cpl);
if (cmd->cdw11_bits.create_io_cq.pc != 0x1) {
/*
* TODO CAP.CMBS is currently set to zero, however we
* should zero it out explicitly when CAP is read.
* Support for CAP.CMBS is not mentioned in the NVMf
* spec.
*/
SPDK_ERRLOG("%s: non-PC CQ not supporred\n", ctrlr_id(ctrlr));
sc = SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF;
goto out;
}
io_q.ien = cmd->cdw11_bits.create_io_cq.ien;
io_q.iv = cmd->cdw11_bits.create_io_cq.iv;
} else {
/* CQ must be created before SQ */
if (!lookup_io_q(ctrlr, cmd->cdw11_bits.create_io_sq.cqid, true)) {
SPDK_ERRLOG("%s: CQ%d does not exist\n", ctrlr_id(ctrlr),
cmd->cdw11_bits.create_io_sq.cqid);
sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
sc = SPDK_NVME_SC_COMPLETION_QUEUE_INVALID;
goto out;
}
entry_size = sizeof(struct spdk_nvme_cmd);
if (cmd->cdw11_bits.create_io_sq.pc != 0x1) {
SPDK_ERRLOG("%s: non-PC SQ not supported\n", ctrlr_id(ctrlr));
sc = SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF;
goto out;
}
io_q.cqid = cmd->cdw11_bits.create_io_sq.cqid;
SPDK_DEBUGLOG(nvmf_vfio, "%s: SQ%d CQID=%d\n", ctrlr_id(ctrlr),
cmd->cdw10_bits.create_io_q.qid, io_q.cqid);
}
io_q.size = cmd->cdw10_bits.create_io_q.qsize + 1;
if (io_q.size > max_queue_size(ctrlr)) {
SPDK_ERRLOG("%s: queue too big, want=%d, max=%d\n", ctrlr_id(ctrlr),
io_q.size, max_queue_size(ctrlr));
sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
sc = SPDK_NVME_SC_INVALID_QUEUE_SIZE;
goto out;
}
io_q.addr = map_one(ctrlr->endpoint->vfu_ctx, cmd->dptr.prp.prp1,
io_q.size * entry_size, &io_q.sg, &io_q.iov);
if (io_q.addr == NULL) {
sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
SPDK_ERRLOG("%s: failed to map I/O queue: %m\n", ctrlr_id(ctrlr));
goto out;
}
io_q.prp1 = cmd->dptr.prp.prp1;
memset(io_q.addr, 0, io_q.size * entry_size);
SPDK_DEBUGLOG(nvmf_vfio, "%s: mapped %cQ%d IOVA=%#lx vaddr=%#llx\n",
ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
cmd->cdw10_bits.create_io_q.qid, cmd->dptr.prp.prp1,
(unsigned long long)io_q.addr);
if (is_cq) {
err = init_qp(ctrlr, ctrlr->qp[0]->qpair.transport, io_q.size,
cmd->cdw10_bits.create_io_q.qid);
if (err != 0) {
sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
goto out;
}
} else {
/*
* After we return from the nvmf_vfio_user_poll_group_poll thread, once
* nvmf_vfio_user_accept executes it will pick up this QP and will eventually
* call nvmf_vfio_user_poll_group_add. The rest of the operation needed to
* complete the addition of the queue will be continued at the
* completion callback.
*/
TAILQ_INSERT_TAIL(&ctrlr->transport->new_qps, ctrlr->qp[cmd->cdw10_bits.create_io_q.qid], link);
}
insert_queue(ctrlr, &io_q, is_cq, cmd->cdw10_bits.create_io_q.qid);
out:
return post_completion(ctrlr, cmd, &ctrlr->qp[0]->cq, 0, sc, sct);
}
/*
* Deletes a completion or submission I/O queue.
*/
static int
handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
struct spdk_nvme_cmd *cmd, const bool is_cq)
{
uint16_t sct = SPDK_NVME_SCT_GENERIC;
uint16_t sc = SPDK_NVME_SC_SUCCESS;
SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cQ: QID=%d\n",
ctrlr_id(ctrlr), is_cq ? 'C' : 'S',
cmd->cdw10_bits.delete_io_q.qid);
if (lookup_io_q(ctrlr, cmd->cdw10_bits.delete_io_q.qid, is_cq) == NULL) {
SPDK_ERRLOG("%s: %cQ%d does not exist\n", ctrlr_id(ctrlr),
is_cq ? 'C' : 'S', cmd->cdw10_bits.delete_io_q.qid);
sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
goto out;
}
if (is_cq) {
/* SQ must have been deleted first */
if (ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid]->state != VFIO_USER_QPAIR_DELETED) {
SPDK_ERRLOG("%s: the associated SQ must be deleted first\n", ctrlr_id(ctrlr));
sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
sc = SPDK_NVME_SC_INVALID_QUEUE_DELETION;
goto out;
}
} else {
/*
* This doesn't actually delete the I/O queue, and we can't
* do that anyway because NVMf doesn't support it. We're merely
* telling the poll_group_poll function to skip checking this
* queue. The only workflow in which this works is when CC.EN is
* cleared to 0 and we're stopping the subsystem, so we know that the
* relevant callbacks to destroy the queues will be called.
*/
assert(ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid]->state == VFIO_USER_QPAIR_ACTIVE);
ctrlr->qp[cmd->cdw10_bits.delete_io_q.qid]->state = VFIO_USER_QPAIR_DELETED;
}
out:
return post_completion(ctrlr, cmd, &ctrlr->qp[0]->cq, 0, sc, sct);
}
/*
* Returns 0 on success and -errno on error.
*
* XXX SPDK thread context
*/
static int
consume_admin_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd)
{
assert(ctrlr != NULL);
assert(cmd != NULL);
SPDK_DEBUGLOG(nvmf_vfio, "%s: handle admin req opc=%#x cid=%d\n",
ctrlr_id(ctrlr), cmd->opc, cmd->cid);
switch (cmd->opc) {
case SPDK_NVME_OPC_CREATE_IO_CQ:
case SPDK_NVME_OPC_CREATE_IO_SQ:
return handle_create_io_q(ctrlr, cmd,
cmd->opc == SPDK_NVME_OPC_CREATE_IO_CQ);
case SPDK_NVME_OPC_DELETE_IO_SQ:
case SPDK_NVME_OPC_DELETE_IO_CQ:
return handle_del_io_q(ctrlr, cmd,
cmd->opc == SPDK_NVME_OPC_DELETE_IO_CQ);
default:
return handle_cmd_req(ctrlr, cmd, get_nvmf_req(ctrlr->qp[0]));
}
}
static int
handle_cmd_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
{
struct nvmf_vfio_user_qpair *qpair = cb_arg;
assert(qpair != NULL);
assert(req != NULL);
vfu_unmap_sg(qpair->ctrlr->endpoint->vfu_ctx, req->sg, req->iov, req->iovcnt);
return post_completion(qpair->ctrlr, &req->req.cmd->nvme_cmd,
&qpair->ctrlr->qp[req->req.qpair->qid]->cq,
req->req.rsp->nvme_cpl.cdw0,
req->req.rsp->nvme_cpl.status.sc,
req->req.rsp->nvme_cpl.status.sct);
}
static int
consume_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_qpair *qpair,
struct spdk_nvme_cmd *cmd)
{
assert(qpair != NULL);
if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
return consume_admin_cmd(ctrlr, cmd);
}
return handle_cmd_req(ctrlr, cmd, get_nvmf_req(qpair));
}
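/*
* Consumes all SQ entries between the current head and the new tail doorbell
* value. SQHD is advanced before each command is consumed so that the
* completion we post carries the updated head pointer.
*/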
static ssize_t
handle_sq_tdbl_write(struct nvmf_vfio_user_ctrlr *ctrlr, const uint32_t new_tail,
struct nvmf_vfio_user_qpair *qpair)
{
struct spdk_nvme_cmd *queue;
assert(ctrlr != NULL);
assert(qpair != NULL);
queue = qpair->sq.addr;
while (sq_head(qpair) != new_tail) {
int err;
struct spdk_nvme_cmd *cmd = &queue[sq_head(qpair)];
/*
* SQHD must contain the new head pointer, so we must increase
* it before we generate a completion.
*/
sqhd_advance(ctrlr, qpair);
err = consume_cmd(ctrlr, qpair, cmd);
if (err != 0) {
return err;
}
}
return 0;
}
static int
map_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr)
{
int err;
assert(ctrlr != NULL);
err = acq_map(ctrlr);
if (err != 0) {
return err;
}
err = asq_map(ctrlr);
if (err != 0) {
return err;
}
return 0;
}
static void
unmap_admin_queue(struct nvmf_vfio_user_ctrlr *ctrlr)
{
assert(ctrlr->qp[0] != NULL);
unmap_qp(ctrlr->qp[0]);
}
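/*
* libvfio-user DMA register callback: when the client adds a memory region,
* try to remap any queue pairs that were marked VFIO_USER_QPAIR_INACTIVE
* because their backing memory had been removed.
*/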
static void
memory_region_add_cb(vfu_ctx_t *vfu_ctx, uint64_t iova, uint64_t len, uint32_t prot)
{
struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
struct nvmf_vfio_user_ctrlr *ctrlr;
struct nvmf_vfio_user_qpair *qpair;
int i, ret;
assert(endpoint != NULL);
if (endpoint->ctrlr == NULL) {
return;
}
ctrlr = endpoint->ctrlr;
SPDK_DEBUGLOG(nvmf_vfio, "%s: map IOVA %#lx-%#lx\n",
ctrlr_id(ctrlr), iova, iova + len);
for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
qpair = ctrlr->qp[i];
if (qpair == NULL) {
continue;
}
if (qpair->state != VFIO_USER_QPAIR_INACTIVE) {
continue;
}
if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
ret = map_admin_queue(ctrlr);
if (ret) {
SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap Admin queue\n");
continue;
}
qpair->state = VFIO_USER_QPAIR_ACTIVE;
} else {
struct nvme_q *sq = &qpair->sq;
struct nvme_q *cq = &qpair->cq;
sq->addr = map_one(ctrlr->endpoint->vfu_ctx, sq->prp1, sq->size * 64, &sq->sg, &sq->iov);
if (!sq->addr) {
SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap SQID %d %#lx-%#lx\n",
i, sq->prp1, sq->prp1 + sq->size * 64);
continue;
}
cq->addr = map_one(ctrlr->endpoint->vfu_ctx, cq->prp1, cq->size * 16, &cq->sg, &cq->iov);
if (!cq->addr) {
SPDK_DEBUGLOG(nvmf_vfio, "Memory isn't ready to remap CQID %d %#lx-%#lx\n",
i, cq->prp1, cq->prp1 + cq->size * 16);
continue;
}
qpair->state = VFIO_USER_QPAIR_ACTIVE;
}
}
}
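/*
* libvfio-user DMA unregister callback: unmap and deactivate every queue pair
* whose SQ or CQ falls inside the IOVA range being removed.
*/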
static int
memory_region_remove_cb(vfu_ctx_t *vfu_ctx, uint64_t iova, uint64_t len)
{
struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
struct nvmf_vfio_user_ctrlr *ctrlr;
struct nvmf_vfio_user_qpair *qpair;
int i;
assert(endpoint != NULL);
if (endpoint->ctrlr == NULL) {
return 0;
}
ctrlr = endpoint->ctrlr;
SPDK_DEBUGLOG(nvmf_vfio, "%s: unmap IOVA %#lx-%#lx\n",
ctrlr_id(ctrlr), iova, iova + len);
for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
qpair = ctrlr->qp[i];
if (qpair == NULL) {
continue;
}
if ((qpair->cq.sg.dma_addr >= iova && qpair->cq.sg.dma_addr < iova + len) ||
(qpair->sq.sg.dma_addr >= iova && qpair->sq.sg.dma_addr < iova + len)) {
unmap_qp(qpair);
qpair->state = VFIO_USER_QPAIR_INACTIVE;
}
}
return 0;
}
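/*
* Completion callback for the property get/set fabrics command built in
* access_bar0_fn. Property gets copy the returned value into the BAR0 read
* buffer; a property set of CC additionally maps or unmaps the admin queue
* depending on the new EN/SHN values.
*/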
static int
nvmf_vfio_user_prop_req_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
{
struct nvmf_vfio_user_qpair *qpair = cb_arg;
int ret;
assert(qpair != NULL);
assert(req != NULL);
if (req->req.cmd->prop_get_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET) {
assert(qpair->ctrlr != NULL);
assert(req != NULL);
memcpy(req->req.data,
&req->req.rsp->prop_get_rsp.value.u64,
req->req.length);
} else {
assert(req->req.cmd->prop_set_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET);
assert(qpair->ctrlr != NULL);
if (req->req.cmd->prop_set_cmd.ofst == offsetof(struct spdk_nvme_registers, cc)) {
union spdk_nvme_cc_register *cc;
cc = (union spdk_nvme_cc_register *)&req->req.cmd->prop_set_cmd.value.u64;
if (cc->bits.en == 1 && cc->bits.shn == 0) {
SPDK_DEBUGLOG(nvmf_vfio,
"%s: MAP Admin queue\n",
ctrlr_id(qpair->ctrlr));
ret = map_admin_queue(qpair->ctrlr);
if (ret) {
SPDK_ERRLOG("%s: failed to map Admin queue\n", ctrlr_id(qpair->ctrlr));
return ret;
}
qpair->state = VFIO_USER_QPAIR_ACTIVE;
} else if ((cc->bits.en == 0 && cc->bits.shn == 0) ||
(cc->bits.en == 1 && cc->bits.shn != 0)) {
SPDK_DEBUGLOG(nvmf_vfio,
"%s: UNMAP Admin queue\n",
ctrlr_id(qpair->ctrlr));
unmap_admin_queue(qpair->ctrlr);
qpair->state = VFIO_USER_QPAIR_INACTIVE;
}
}
}
return 0;
}
/*
* XXX Do NOT remove, see comment in access_bar0_fn.
*
* Handles a doorbell access (read or write) at offset 0x1000 or more.
*
* DSTRD is set to fixed value 0 for NVMf.
*
*/
static int
handle_dbl_access(struct nvmf_vfio_user_ctrlr *ctrlr, uint32_t *buf,
const size_t count, loff_t pos, const bool is_write)
{
assert(ctrlr != NULL);
assert(buf != NULL);
if (count != sizeof(uint32_t)) {
SPDK_ERRLOG("%s: bad doorbell buffer size %ld\n",
ctrlr_id(ctrlr), count);
return -EINVAL;
}
pos -= NVMF_VFIO_USER_DOORBELLS_OFFSET;
/* pos must be dword aligned */
if ((pos & 0x3) != 0) {
SPDK_ERRLOG("%s: bad doorbell offset %#lx\n", ctrlr_id(ctrlr), pos);
return -EINVAL;
}
/* convert byte offset to array index */
pos >>= 2;
if (pos > NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR * 2) {
/*
* TODO: need to emit a "Write to Invalid Doorbell Register"
* asynchronous event
*/
SPDK_ERRLOG("%s: bad doorbell index %#lx\n", ctrlr_id(ctrlr), pos);
return -EINVAL;
}
if (is_write) {
ctrlr->doorbells[pos] = *buf;
spdk_wmb();
} else {
spdk_rmb();
*buf = ctrlr->doorbells[pos];
}
return 0;
}
static ssize_t
access_bar0_fn(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t pos,
bool is_write)
{
struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
struct nvmf_vfio_user_ctrlr *ctrlr;
struct nvmf_vfio_user_req *req;
int ret;
ctrlr = endpoint->ctrlr;
SPDK_DEBUGLOG(nvmf_vfio,
"%s: bar0 %s ctrlr: %p, count=%zu, pos=%"PRIX64"\n",
endpoint_id(endpoint), is_write ? "write" : "read",
ctrlr, count, pos);
if (pos >= NVMF_VFIO_USER_DOORBELLS_OFFSET) {
/*
* XXX The fact that the doorbells can be memory mapped doesn't
* mean that the client (VFIO in QEMU) is obliged to memory
* map them; it might still elect to access them via regular
* read/write.
*/
ret = handle_dbl_access(ctrlr, (uint32_t *)buf, count,
pos, is_write);
if (ret == 0) {
return count;
}
assert(ret < 0);
return ret;
}
/* Construct a Fabric Property Get/Set command and send it */
req = get_nvmf_vfio_user_req(ctrlr->qp[0]);
if (req == NULL) {
return -1;
}
req->cb_fn = nvmf_vfio_user_prop_req_rsp;
req->cb_arg = ctrlr->qp[0];
req->req.cmd->prop_set_cmd.opcode = SPDK_NVME_OPC_FABRIC;
req->req.cmd->prop_set_cmd.cid = 0;
req->req.cmd->prop_set_cmd.attrib.size = (count / 4) - 1;
req->req.cmd->prop_set_cmd.ofst = pos;
if (is_write) {
req->req.cmd->prop_set_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET;
if (req->req.cmd->prop_set_cmd.attrib.size) {
req->req.cmd->prop_set_cmd.value.u64 = *(uint64_t *)buf;
} else {
req->req.cmd->prop_set_cmd.value.u32.high = 0;
req->req.cmd->prop_set_cmd.value.u32.low = *(uint32_t *)buf;
}
} else {
req->req.cmd->prop_get_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET;
}
req->req.length = count;
req->req.data = buf;
spdk_nvmf_request_exec_fabrics(&req->req);
return count;
}
/*
* NVMe driver reads 4096 bytes, which is the extended PCI configuration space
* available on PCI-X 2.0 and PCI Express buses
*/
static ssize_t
access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset,
bool is_write)
{
struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
if (is_write) {
SPDK_ERRLOG("%s: write %#lx-%#lx not supported\n",
endpoint_id(endpoint), offset, offset + count);
return -EINVAL;
}
if (offset + count > PCI_CFG_SPACE_EXP_SIZE) {
SPDK_ERRLOG("%s: access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n",
endpoint_id(endpoint), offset, count,
PCI_CFG_SPACE_EXP_SIZE);
return -ERANGE;
}
memcpy(buf, ((unsigned char *)endpoint->pci_config_space) + offset, count);
return count;
}
static void
vfio_user_log(vfu_ctx_t *vfu_ctx, int level, char const *msg)
{
struct nvmf_vfio_user_endpoint *endpoint = vfu_get_private(vfu_ctx);
if (level >= LOG_DEBUG) {
SPDK_DEBUGLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg);
} else if (level >= LOG_INFO) {
SPDK_INFOLOG(nvmf_vfio, "%s: %s\n", endpoint_id(endpoint), msg);
} else if (level >= LOG_NOTICE) {
SPDK_NOTICELOG("%s: %s\n", endpoint_id(endpoint), msg);
} else if (level >= LOG_WARNING) {
SPDK_WARNLOG("%s: %s\n", endpoint_id(endpoint), msg);
} else {
SPDK_ERRLOG("%s: %s\n", endpoint_id(endpoint), msg);
}
}
static void
init_pci_config_space(vfu_pci_config_space_t *p)
{
/* MLBAR */
p->hdr.bars[0].raw = 0x0;
/* MUBAR */
p->hdr.bars[1].raw = 0x0;
/* vendor specific, let's set them to zero for now */
p->hdr.bars[3].raw = 0x0;
p->hdr.bars[4].raw = 0x0;
p->hdr.bars[5].raw = 0x0;
/* enable INTx */
p->hdr.intr.ipin = 0x1;
}
static int
vfio_user_dev_info_fill(struct nvmf_vfio_user_endpoint *endpoint)
{
int ret;
ssize_t cap_offset;
vfu_ctx_t *vfu_ctx = endpoint->vfu_ctx;
struct pmcap pmcap = { .hdr.id = PCI_CAP_ID_PM, .pmcs.nsfrst = 0x1 };
struct pxcap pxcap = {
.hdr.id = PCI_CAP_ID_EXP,
.pxcaps.ver = 0x2,
.pxdcap = {.per = 0x1, .flrc = 0x1},
.pxdcap2.ctds = 0x1
};
struct msixcap msixcap = {
.hdr.id = PCI_CAP_ID_MSIX,
.mxc.ts = NVME_IRQ_MSIX_NUM - 1,
.mtab = {.tbir = 0x4, .to = 0x0},
.mpba = {.pbir = 0x5, .pbao = 0x0}
};
static struct iovec sparse_mmap[] = {
{
.iov_base = (void *)NVMF_VFIO_USER_DOORBELLS_OFFSET,
.iov_len = NVMF_VFIO_USER_DOORBELLS_SIZE,
},
};
ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_EXPRESS, PCI_HEADER_TYPE_NORMAL, 0);
if (ret < 0) {
SPDK_ERRLOG("vfu_ctx %p failed to initialize PCI\n", vfu_ctx);
return ret;
}
vfu_pci_set_id(vfu_ctx, 0x4e58, 0x0001, 0, 0);
/*
* 0x02, controller uses the NVM Express programming interface
* 0x08, non-volatile memory controller
* 0x01, mass storage controller
*/
vfu_pci_set_class(vfu_ctx, 0x01, 0x08, 0x02);
cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pmcap);
if (cap_offset < 0) {
SPDK_ERRLOG("vfu_ctx %p failed add pmcap\n", vfu_ctx);
return ret;
}
cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &pxcap);
if (cap_offset < 0) {
SPDK_ERRLOG("vfu_ctx %p failed add pxcap\n", vfu_ctx);
return ret;
}
cap_offset = vfu_pci_add_capability(vfu_ctx, 0, 0, &msixcap);
if (cap_offset < 0) {
SPDK_ERRLOG("vfu_ctx %p failed add msixcap\n", vfu_ctx);
return ret;
}
ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, NVME_REG_CFG_SIZE,
access_pci_config, VFU_REGION_FLAG_RW, NULL, 0, -1);
if (ret < 0) {
SPDK_ERRLOG("vfu_ctx %p failed to setup cfg\n", vfu_ctx);
return ret;
}
ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, NVME_REG_BAR0_SIZE,
access_bar0_fn, VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM,
sparse_mmap, 1, endpoint->fd);
if (ret < 0) {
SPDK_ERRLOG("vfu_ctx %p failed to setup bar 0\n", vfu_ctx);
return ret;
}
ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR4_REGION_IDX, PAGE_SIZE,
NULL, VFU_REGION_FLAG_RW, NULL, 0, -1);
if (ret < 0) {
SPDK_ERRLOG("vfu_ctx %p failed to setup bar 4\n", vfu_ctx);
return ret;
}
ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR5_REGION_IDX, PAGE_SIZE,
NULL, VFU_REGION_FLAG_RW, NULL, 0, -1);
if (ret < 0) {
SPDK_ERRLOG("vfu_ctx %p failed to setup bar 5\n", vfu_ctx);
return ret;
}
ret = vfu_setup_device_dma_cb(vfu_ctx, memory_region_add_cb, memory_region_remove_cb);
if (ret < 0) {
SPDK_ERRLOG("vfu_ctx %p failed to setup dma callback\n", vfu_ctx);
return ret;
}
ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
if (ret < 0) {
SPDK_ERRLOG("vfu_ctx %p failed to setup INTX\n", vfu_ctx);
return ret;
}
ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, NVME_IRQ_MSIX_NUM);
if (ret < 0) {
SPDK_ERRLOG("vfu_ctx %p failed to setup MSIX\n", vfu_ctx);
return ret;
}
ret = vfu_realize_ctx(vfu_ctx);
if (ret < 0) {
SPDK_ERRLOG("vfu_ctx %p failed to realize\n", vfu_ctx);
return ret;
}
endpoint->pci_config_space = vfu_pci_get_config_space(endpoint->vfu_ctx);
assert(endpoint->pci_config_space != NULL);
init_pci_config_space(endpoint->pci_config_space);
assert(cap_offset != 0);
endpoint->msix = (struct msixcap *)((uint8_t *)endpoint->pci_config_space + cap_offset);
return 0;
}
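/*
* Tears down all queue pairs, detaches the controller from its endpoint and
* unregisters the MMIO poller; destroy_ctrlr() makes sure this runs on the
* controller's SPDK thread.
*/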
static void
_destroy_ctrlr(void *ctx)
{
struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
int i;
for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
destroy_qp(ctrlr, i);
}
if (ctrlr->endpoint) {
ctrlr->endpoint->ctrlr = NULL;
}
spdk_poller_unregister(&ctrlr->mmio_poller);
free(ctrlr);
}
static int
destroy_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
{
assert(ctrlr != NULL);
SPDK_NOTICELOG("destroy %s\n", ctrlr_id(ctrlr));
if (ctrlr->thread == spdk_get_thread()) {
_destroy_ctrlr(ctrlr);
} else {
spdk_thread_send_msg(ctrlr->thread, _destroy_ctrlr, ctrlr);
}
return 0;
}
static void
nvmf_vfio_user_create_ctrlr(struct nvmf_vfio_user_transport *transport,
struct nvmf_vfio_user_endpoint *endpoint)
{
struct nvmf_vfio_user_ctrlr *ctrlr;
int err;
/* First, construct a vfio-user CUSTOM transport controller */
ctrlr = calloc(1, sizeof(*ctrlr));
if (ctrlr == NULL) {
err = -ENOMEM;
goto out;
}
ctrlr->cntlid = 0xffff;
ctrlr->transport = transport;
ctrlr->endpoint = endpoint;
ctrlr->doorbells = endpoint->doorbells;
/* Then, construct an admin queue pair */
err = init_qp(ctrlr, &transport->transport, NVMF_VFIO_USER_DEFAULT_AQ_DEPTH, 0);
if (err != 0) {
goto out;
}
endpoint->ctrlr = ctrlr;
ctrlr->ready = true;
/* Notify the generic layer about the new admin queue pair */
TAILQ_INSERT_TAIL(&ctrlr->transport->new_qps, ctrlr->qp[0], link);
out:
if (err != 0) {
SPDK_ERRLOG("%s: failed to create vfio-user controller: %s\n",
endpoint_id(endpoint), strerror(-err));
if (destroy_ctrlr(ctrlr) != 0) {
SPDK_ERRLOG("%s: failed to clean up\n",
endpoint_id(endpoint));
}
}
}
static int
nvmf_vfio_user_listen(struct spdk_nvmf_transport *transport,
const struct spdk_nvme_transport_id *trid,
struct spdk_nvmf_listen_opts *listen_opts)
{
struct nvmf_vfio_user_transport *vu_transport;
struct nvmf_vfio_user_endpoint *endpoint, *tmp;
char *path = NULL;
char uuid[PATH_MAX] = {};
int fd;
int err;
vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
transport);
TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
/* Only compare traddr */
if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
return -EEXIST;
}
}
endpoint = calloc(1, sizeof(*endpoint));
if (!endpoint) {
return -ENOMEM;
}
endpoint->fd = -1;
memcpy(&endpoint->trid, trid, sizeof(endpoint->trid));
err = asprintf(&path, "%s/bar0", endpoint_id(endpoint));
if (err == -1) {
goto out;
}
fd = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
if (fd == -1) {
SPDK_ERRLOG("%s: failed to open device memory at %s: %m\n",
endpoint_id(endpoint), path);
err = fd;
free(path);
goto out;
}
free(path);
err = ftruncate(fd, NVMF_VFIO_USER_DOORBELLS_OFFSET + NVMF_VFIO_USER_DOORBELLS_SIZE);
if (err != 0) {
goto out;
}
endpoint->doorbells = mmap(NULL, NVMF_VFIO_USER_DOORBELLS_SIZE,
PROT_READ | PROT_WRITE, MAP_SHARED, fd, NVMF_VFIO_USER_DOORBELLS_OFFSET);
if (endpoint->doorbells == MAP_FAILED) {
endpoint->doorbells = NULL;
err = -errno;
goto out;
}
endpoint->fd = fd;
snprintf(uuid, PATH_MAX, "%s/cntrl", endpoint_id(endpoint));
endpoint->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, uuid, LIBVFIO_USER_FLAG_ATTACH_NB,
endpoint, VFU_DEV_TYPE_PCI);
if (endpoint->vfu_ctx == NULL) {
SPDK_ERRLOG("%s: error creating libmuser context: %m\n",
endpoint_id(endpoint));
err = -1;
goto out;
}
vfu_setup_log(endpoint->vfu_ctx, vfio_user_log,
SPDK_DEBUGLOG_FLAG_ENABLED("nvmf_vfio") ? LOG_DEBUG : LOG_ERR);
err = vfio_user_dev_info_fill(endpoint);
if (err < 0) {
goto out;
}
pthread_mutex_init(&endpoint->lock, NULL);
TAILQ_INSERT_TAIL(&vu_transport->endpoints, endpoint, link);
SPDK_NOTICELOG("%s: doorbells %p\n", uuid, endpoint->doorbells);
out:
if (err != 0) {
nvmf_vfio_user_destroy_endpoint(endpoint);
}
return err;
}
static void
nvmf_vfio_user_stop_listen(struct spdk_nvmf_transport *transport,
const struct spdk_nvme_transport_id *trid)
{
struct nvmf_vfio_user_transport *vu_transport;
struct nvmf_vfio_user_endpoint *endpoint, *tmp;
int err;
assert(trid != NULL);
assert(trid->traddr != NULL);
SPDK_DEBUGLOG(nvmf_vfio, "%s: stop listen\n", trid->traddr);
vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
transport);
pthread_mutex_lock(&vu_transport->lock);
TAILQ_FOREACH_SAFE(endpoint, &vu_transport->endpoints, link, tmp) {
if (strcmp(trid->traddr, endpoint->trid.traddr) == 0) {
TAILQ_REMOVE(&vu_transport->endpoints, endpoint, link);
if (endpoint->ctrlr) {
err = destroy_ctrlr(endpoint->ctrlr);
if (err != 0) {
SPDK_ERRLOG("%s: failed destroy controller: %s\n",
endpoint_id(endpoint), strerror(-err));
}
}
nvmf_vfio_user_destroy_endpoint(endpoint);
pthread_mutex_unlock(&vu_transport->lock);
return;
}
}
pthread_mutex_unlock(&vu_transport->lock);
SPDK_DEBUGLOG(nvmf_vfio, "%s: not found\n", trid->traddr);
}
static void
nvmf_vfio_user_cdata_init(struct spdk_nvmf_transport *transport,
struct spdk_nvmf_subsystem *subsystem,
struct spdk_nvmf_ctrlr_data *cdata)
{
cdata->sgls.supported = SPDK_NVME_SGLS_NOT_SUPPORTED;
}
static int
nvmf_vfio_user_listen_associate(struct spdk_nvmf_transport *transport,
const struct spdk_nvmf_subsystem *subsystem,
const struct spdk_nvme_transport_id *trid)
{
struct nvmf_vfio_user_transport *vu_transport;
struct nvmf_vfio_user_endpoint *endpoint;
vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport, transport);
TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
if (strncmp(endpoint->trid.traddr, trid->traddr, sizeof(endpoint->trid.traddr)) == 0) {
break;
}
}
if (endpoint == NULL) {
return -ENOENT;
}
endpoint->subsystem = subsystem;
return 0;
}
/*
* Executed periodically: attaches vfio-user contexts for endpoints that
* don't yet have an active controller and hands any newly created queue
* pairs to the NVMf target.
*
* XXX SPDK thread context.
*/
static uint32_t
nvmf_vfio_user_accept(struct spdk_nvmf_transport *transport)
{
int err;
struct nvmf_vfio_user_transport *vu_transport;
struct nvmf_vfio_user_qpair *qp, *tmp_qp;
struct nvmf_vfio_user_endpoint *endpoint;
vu_transport = SPDK_CONTAINEROF(transport, struct nvmf_vfio_user_transport,
transport);
pthread_mutex_lock(&vu_transport->lock);
TAILQ_FOREACH(endpoint, &vu_transport->endpoints, link) {
/* we need to try to attach the controller again after a reset or shutdown */
if (endpoint->ctrlr != NULL && endpoint->ctrlr->ready) {
continue;
}
err = vfu_attach_ctx(endpoint->vfu_ctx);
if (err != 0) {
if (errno == EAGAIN || errno == EWOULDBLOCK) {
continue;
}
pthread_mutex_unlock(&vu_transport->lock);
return -EFAULT;
}
/* Construct a controller */
nvmf_vfio_user_create_ctrlr(vu_transport, endpoint);
}
TAILQ_FOREACH_SAFE(qp, &vu_transport->new_qps, link, tmp_qp) {
TAILQ_REMOVE(&vu_transport->new_qps, qp, link);
spdk_nvmf_tgt_new_qpair(transport->tgt, &qp->qpair);
}
pthread_mutex_unlock(&vu_transport->lock);
return 0;
}
static void
nvmf_vfio_user_discover(struct spdk_nvmf_transport *transport,
struct spdk_nvme_transport_id *trid,
struct spdk_nvmf_discovery_log_page_entry *entry)
{ }
static struct spdk_nvmf_transport_poll_group *
nvmf_vfio_user_poll_group_create(struct spdk_nvmf_transport *transport)
{
struct nvmf_vfio_user_poll_group *vu_group;
SPDK_DEBUGLOG(nvmf_vfio, "create poll group\n");
vu_group = calloc(1, sizeof(*vu_group));
if (vu_group == NULL) {
SPDK_ERRLOG("Error allocating poll group: %m");
return NULL;
}
TAILQ_INIT(&vu_group->qps);
return &vu_group->group;
}
/* called when process exits */
static void
nvmf_vfio_user_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
{
struct nvmf_vfio_user_poll_group *vu_group;
SPDK_DEBUGLOG(nvmf_vfio, "destroy poll group\n");
vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
free(vu_group);
}
static void
vfio_user_qpair_disconnect_cb(void *ctx)
{
struct nvmf_vfio_user_endpoint *endpoint = ctx;
struct nvmf_vfio_user_ctrlr *ctrlr;
pthread_mutex_lock(&endpoint->lock);
ctrlr = endpoint->ctrlr;
if (!ctrlr) {
pthread_mutex_unlock(&endpoint->lock);
return;
}
if (!ctrlr->num_connected_qps) {
destroy_ctrlr(ctrlr);
pthread_mutex_unlock(&endpoint->lock);
return;
}
pthread_mutex_unlock(&endpoint->lock);
}
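/*
* Called when the initiator disconnects: mark the controller not ready and
* disconnect every queue pair; the controller itself is destroyed from the
* disconnect callback above once the last queue pair is gone.
*/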
static int
vfio_user_stop_ctrlr(struct nvmf_vfio_user_ctrlr *ctrlr)
{
uint32_t i;
struct nvmf_vfio_user_qpair *qpair;
struct nvmf_vfio_user_endpoint *endpoint;
SPDK_DEBUGLOG(nvmf_vfio, "%s stop processing\n", ctrlr_id(ctrlr));
ctrlr->ready = false;
endpoint = ctrlr->endpoint;
assert(endpoint != NULL);
for (i = 0; i < NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR; i++) {
qpair = ctrlr->qp[i];
if (qpair == NULL) {
continue;
}
spdk_nvmf_qpair_disconnect(&qpair->qpair, vfio_user_qpair_disconnect_cb, endpoint);
}
return 0;
}
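/*
* Per-controller MMIO poller, registered on the admin queue pair's SPDK
* thread in handle_queue_connect_rsp(). It drives vfu_run_ctx() so that
* non-mmap'd BAR0 accesses and other vfio-user messages are serviced
* independently of queue pair polling: ENOTCONN means the initiator went
* away and the controller is stopped, any other error fails the controller.
*/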
static int
vfio_user_poll_mmio(void *ctx)
{
struct nvmf_vfio_user_ctrlr *ctrlr = ctx;
int ret;
assert(ctrlr != NULL);
/* This will call access_bar0_fn() if there are any writes
* to the portion of the BAR that is not mmap'd */
ret = vfu_run_ctx(ctrlr->endpoint->vfu_ctx);
if (spdk_unlikely(ret != 0)) {
spdk_poller_unregister(&ctrlr->mmio_poller);
/* initiator shutdown or reset, waiting for another re-connect */
if (errno == ENOTCONN) {
vfio_user_stop_ctrlr(ctrlr);
return SPDK_POLLER_BUSY;
}
fail_ctrlr(ctrlr);
}
return SPDK_POLLER_BUSY;
}
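/*
* Completion callback for the internal CONNECT fabrics command issued in
* nvmf_vfio_user_poll_group_add(). On success the queue pair is added to its
* poll group; for the admin queue this is also where the controller ID is
* recorded and the MMIO poller is started.
*/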
static int
handle_queue_connect_rsp(struct nvmf_vfio_user_req *req, void *cb_arg)
{
struct nvmf_vfio_user_poll_group *vu_group;
struct nvmf_vfio_user_qpair *qpair = cb_arg;
struct nvmf_vfio_user_ctrlr *ctrlr;
struct nvmf_vfio_user_endpoint *endpoint;
assert(qpair != NULL);
assert(req != NULL);
ctrlr = qpair->ctrlr;
endpoint = ctrlr->endpoint;
assert(ctrlr != NULL);
assert(endpoint != NULL);
if (spdk_nvme_cpl_is_error(&req->req.rsp->nvme_cpl)) {
SPDK_ERRLOG("SC %u, SCT %u\n", req->req.rsp->nvme_cpl.status.sc, req->req.rsp->nvme_cpl.status.sct);
destroy_qp(ctrlr, qpair->qpair.qid);
destroy_ctrlr(ctrlr);
return -1;
}
vu_group = SPDK_CONTAINEROF(qpair->group, struct nvmf_vfio_user_poll_group, group);
TAILQ_INSERT_TAIL(&vu_group->qps, qpair, link);
qpair->state = VFIO_USER_QPAIR_ACTIVE;
pthread_mutex_lock(&endpoint->lock);
if (nvmf_qpair_is_admin_queue(&qpair->qpair)) {
ctrlr->cntlid = qpair->qpair.ctrlr->cntlid;
ctrlr->thread = spdk_get_thread();
ctrlr->mmio_poller = SPDK_POLLER_REGISTER(vfio_user_poll_mmio, ctrlr, 0);
}
ctrlr->num_connected_qps++;
pthread_mutex_unlock(&endpoint->lock);
free(req->req.data);
req->req.data = NULL;
return 0;
}
/*
* Called by spdk_nvmf_transport_poll_group_add.
*/
static int
nvmf_vfio_user_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
struct spdk_nvmf_qpair *qpair)
{
struct nvmf_vfio_user_qpair *vu_qpair;
struct nvmf_vfio_user_req *vu_req;
struct nvmf_vfio_user_ctrlr *ctrlr;
struct spdk_nvmf_request *req;
struct spdk_nvmf_fabric_connect_data *data;
bool admin;
vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
vu_qpair->group = group;
ctrlr = vu_qpair->ctrlr;
SPDK_DEBUGLOG(nvmf_vfio, "%s: add QP%d=%p(%p) to poll_group=%p\n",
ctrlr_id(ctrlr), vu_qpair->qpair.qid,
vu_qpair, qpair, group);
admin = nvmf_qpair_is_admin_queue(&vu_qpair->qpair);
vu_req = get_nvmf_vfio_user_req(vu_qpair);
if (vu_req == NULL) {
return -1;
}
req = &vu_req->req;
req->cmd->connect_cmd.opcode = SPDK_NVME_OPC_FABRIC;
req->cmd->connect_cmd.cid = vu_req->cid;
req->cmd->connect_cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT;
req->cmd->connect_cmd.recfmt = 0;
req->cmd->connect_cmd.sqsize = vu_qpair->qsize - 1;
req->cmd->connect_cmd.qid = admin ? 0 : qpair->qid;
req->length = sizeof(struct spdk_nvmf_fabric_connect_data);
req->data = calloc(1, req->length);
if (req->data == NULL) {
nvmf_vfio_user_req_free(req);
return -ENOMEM;
}
data = (struct spdk_nvmf_fabric_connect_data *)req->data;
data->cntlid = admin ? 0xFFFF : ctrlr->cntlid;
snprintf(data->subnqn, sizeof(data->subnqn), "%s",
spdk_nvmf_subsystem_get_nqn(ctrlr->endpoint->subsystem));
vu_req->cb_fn = handle_queue_connect_rsp;
vu_req->cb_arg = vu_qpair;
SPDK_DEBUGLOG(nvmf_vfio,
"%s: sending connect fabrics command for QID=%#x cntlid=%#x\n",
ctrlr_id(ctrlr), qpair->qid, data->cntlid);
spdk_nvmf_request_exec_fabrics(req);
return 0;
}
static int
nvmf_vfio_user_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
struct spdk_nvmf_qpair *qpair)
{
struct nvmf_vfio_user_qpair *vu_qpair;
struct nvmf_vfio_user_ctrlr *vu_ctrlr;
struct nvmf_vfio_user_endpoint *endpoint;
struct nvmf_vfio_user_poll_group *vu_group;
vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
vu_ctrlr = vu_qpair->ctrlr;
endpoint = vu_ctrlr->endpoint;
SPDK_DEBUGLOG(nvmf_vfio,
"%s: remove NVMf QP%d=%p from NVMf poll_group=%p\n",
ctrlr_id(vu_qpair->ctrlr), qpair->qid, qpair, group);
vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
TAILQ_REMOVE(&vu_group->qps, vu_qpair, link);
pthread_mutex_lock(&endpoint->lock);
assert(vu_ctrlr->num_connected_qps);
vu_ctrlr->num_connected_qps--;
pthread_mutex_unlock(&endpoint->lock);
return 0;
}
static int
nvmf_vfio_user_req_free(struct spdk_nvmf_request *req)
{
struct nvmf_vfio_user_qpair *qpair;
struct nvmf_vfio_user_req *vfio_user_req;
assert(req != NULL);
vfio_user_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
qpair = SPDK_CONTAINEROF(vfio_user_req->req.qpair, struct nvmf_vfio_user_qpair, qpair);
TAILQ_INSERT_TAIL(&qpair->reqs, vfio_user_req, link);
return 0;
}
static int
nvmf_vfio_user_req_complete(struct spdk_nvmf_request *req)
{
struct nvmf_vfio_user_qpair *qpair;
struct nvmf_vfio_user_req *vfio_user_req;
assert(req != NULL);
vfio_user_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
qpair = SPDK_CONTAINEROF(vfio_user_req->req.qpair, struct nvmf_vfio_user_qpair, qpair);
if (vfio_user_req->cb_fn != NULL) {
if (vfio_user_req->cb_fn(vfio_user_req, vfio_user_req->cb_arg) != 0) {
fail_ctrlr(qpair->ctrlr);
}
}
TAILQ_INSERT_TAIL(&qpair->reqs, vfio_user_req, link);
return 0;
}
static void
nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair,
spdk_nvmf_transport_qpair_fini_cb cb_fn, void *cb_arg)
{
struct nvmf_vfio_user_qpair *vu_qpair;
assert(qpair != NULL);
vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
destroy_qp(vu_qpair->ctrlr, qpair->qid);
if (cb_fn) {
cb_fn(cb_arg);
}
}
/**
* Returns a preallocated spdk_nvmf_request or NULL if there isn't one available.
*/
static struct nvmf_vfio_user_req *
get_nvmf_vfio_user_req(struct nvmf_vfio_user_qpair *qpair)
{
struct nvmf_vfio_user_req *req;
assert(qpair != NULL);
if (TAILQ_EMPTY(&qpair->reqs)) {
return NULL;
}
req = TAILQ_FIRST(&qpair->reqs);
TAILQ_REMOVE(&qpair->reqs, req, link);
memset(&req->cmd, 0, sizeof(req->cmd));
memset(&req->rsp, 0, sizeof(req->rsp));
req->iovcnt = 0;
return req;
}
static struct spdk_nvmf_request *
get_nvmf_req(struct nvmf_vfio_user_qpair *qpair)
{
struct nvmf_vfio_user_req *req = get_nvmf_vfio_user_req(qpair);
if (req == NULL) {
return NULL;
}
return &req->req;
}
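/*
* Computes the data transfer length of an I/O command: number of ranges times
* sizeof(struct spdk_nvme_dsm_range) for Dataset Management, otherwise the
* zero-based NLB field times the namespace block size.
*/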
static int
get_nvmf_io_req_length(struct spdk_nvmf_request *req)
{
uint16_t nlb, nr;
uint32_t nsid;
struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
struct spdk_nvmf_ns *ns;
nsid = cmd->nsid;
ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid);
if (ns == NULL || ns->bdev == NULL) {
SPDK_ERRLOG("unsuccessful query for nsid %u\n", cmd->nsid);
return -EINVAL;
}
if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
nr = cmd->cdw10_bits.dsm.nr + 1;
return nr * sizeof(struct spdk_nvme_dsm_range);
}
nlb = (cmd->cdw12 & 0x0000ffffu) + 1;
return nlb * spdk_bdev_get_block_size(ns->bdev);
}
static int
map_admin_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
{
struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
uint32_t len = 0;
int iovcnt;
req->xfer = cmd->opc & 0x3;
req->length = 0;
req->data = NULL;
switch (cmd->opc) {
case SPDK_NVME_OPC_IDENTIFY:
len = 4096; /* TODO: there should be a define somewhere for this */
break;
case SPDK_NVME_OPC_GET_LOG_PAGE:
len = (cmd->cdw10_bits.get_log_page.numdl + 1) * 4;
break;
}
if (!cmd->dptr.prp.prp1 || !len) {
return 0;
}
iovcnt = vfio_user_map_prps(ctrlr, req, req->iov, len);
if (iovcnt < 0) {
SPDK_ERRLOG("%s: map Admin Opc %x failed\n",
ctrlr_id(ctrlr), cmd->opc);
return -1;
}
req->length = len;
req->data = req->iov[0].iov_base;
return 0;
}
/*
* Handles an I/O command.
*
* Returns 0 on success and -errno on failure. Sets @submit on whether or not
* the request must be forwarded to NVMf.
*/
static int
map_io_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *req)
{
int err = 0;
struct spdk_nvme_cmd *cmd;
assert(ctrlr != NULL);
assert(req != NULL);
cmd = &req->cmd->nvme_cmd;
req->xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
if (spdk_unlikely(req->xfer == SPDK_NVME_DATA_NONE)) {
return 0;
}
/* SGL isn't supported now */
assert(req->cmd->nvme_cmd.psdt == 0);
err = get_nvmf_io_req_length(req);
if (err < 0) {
return -EINVAL;
}
req->length = err;
err = vfio_user_map_prps(ctrlr, req, req->iov, req->length);
if (err < 0) {
SPDK_ERRLOG("%s: failed to map PRP: %d\n", ctrlr_id(ctrlr), err);
return -EFAULT;
}
req->data = req->iov[0].iov_base;
req->iovcnt = err;
return 0;
}
static int
handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
struct spdk_nvmf_request *req)
{
int err;
struct nvmf_vfio_user_req *vfio_user_req;
assert(ctrlr != NULL);
assert(cmd != NULL);
/*
* TODO: this means that there are no free requests available,
* returning -1 will fail the controller. Theoretically this error can
* be avoided completely by ensuring we have as many requests as slots
* in the SQ, plus one for the property request.
*/
if (spdk_unlikely(req == NULL)) {
return -1;
}
vfio_user_req = SPDK_CONTAINEROF(req, struct nvmf_vfio_user_req, req);
vfio_user_req->cb_fn = handle_cmd_rsp;
vfio_user_req->cb_arg = SPDK_CONTAINEROF(req->qpair, struct nvmf_vfio_user_qpair, qpair);
req->cmd->nvme_cmd = *cmd;
if (nvmf_qpair_is_admin_queue(req->qpair)) {
err = map_admin_cmd_req(ctrlr, req);
} else {
err = map_io_cmd_req(ctrlr, req);
}
if (spdk_unlikely(err < 0)) {
SPDK_ERRLOG("%s: map NVMe command opc 0x%x failed\n",
ctrlr_id(ctrlr), cmd->opc);
req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
return handle_cmd_rsp(vfio_user_req, vfio_user_req->cb_arg);
}
spdk_nvmf_request_exec(req);
return 0;
}
static void
nvmf_vfio_user_qpair_poll(struct nvmf_vfio_user_qpair *qpair)
{
struct nvmf_vfio_user_ctrlr *ctrlr;
uint32_t new_tail;
assert(qpair != NULL);
ctrlr = qpair->ctrlr;
new_tail = *tdbl(ctrlr, &qpair->sq);
if (sq_head(qpair) != new_tail) {
int err = handle_sq_tdbl_write(ctrlr, new_tail, qpair);
if (err != 0) {
fail_ctrlr(ctrlr);
return;
}
}
}
/*
* Called unconditionally, periodically, and very frequently from SPDK to ask
* whether there's work to be done. Now that the MMIO poller handles BAR0
* accesses, this function only consumes new commands by looking at the
* submission queue doorbells of every active queue pair in the group.
*/
static int
nvmf_vfio_user_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
{
struct nvmf_vfio_user_poll_group *vu_group;
struct nvmf_vfio_user_qpair *vu_qpair, *tmp;
assert(group != NULL);
spdk_rmb();
vu_group = SPDK_CONTAINEROF(group, struct nvmf_vfio_user_poll_group, group);
TAILQ_FOREACH_SAFE(vu_qpair, &vu_group->qps, link, tmp) {
if (spdk_unlikely(vu_qpair->state != VFIO_USER_QPAIR_ACTIVE || !vu_qpair->sq.size)) {
continue;
}
nvmf_vfio_user_qpair_poll(vu_qpair);
}
return 0;
}
static int
nvmf_vfio_user_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
struct spdk_nvme_transport_id *trid)
{
struct nvmf_vfio_user_qpair *vu_qpair;
struct nvmf_vfio_user_ctrlr *ctrlr;
vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
ctrlr = vu_qpair->ctrlr;
memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
return 0;
}
static int
nvmf_vfio_user_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
struct spdk_nvme_transport_id *trid)
{
return 0;
}
static int
nvmf_vfio_user_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
struct spdk_nvme_transport_id *trid)
{
struct nvmf_vfio_user_qpair *vu_qpair;
struct nvmf_vfio_user_ctrlr *ctrlr;
vu_qpair = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_qpair, qpair);
ctrlr = vu_qpair->ctrlr;
memcpy(trid, &ctrlr->endpoint->trid, sizeof(*trid));
return 0;
}
static void
nvmf_vfio_user_opts_init(struct spdk_nvmf_transport_opts *opts)
{
opts->max_queue_depth = NVMF_VFIO_USER_DEFAULT_MAX_QUEUE_DEPTH;
opts->max_qpairs_per_ctrlr = NVMF_VFIO_USER_DEFAULT_MAX_QPAIRS_PER_CTRLR;
opts->in_capsule_data_size = NVMF_VFIO_USER_DEFAULT_IN_CAPSULE_DATA_SIZE;
opts->max_io_size = NVMF_VFIO_USER_DEFAULT_MAX_IO_SIZE;
opts->io_unit_size = NVMF_VFIO_USER_DEFAULT_IO_UNIT_SIZE;
opts->max_aq_depth = NVMF_VFIO_USER_DEFAULT_AQ_DEPTH;
opts->num_shared_buffers = NVMF_VFIO_USER_DEFAULT_NUM_SHARED_BUFFERS;
opts->buf_cache_size = NVMF_VFIO_USER_DEFAULT_BUFFER_CACHE_SIZE;
}
const struct spdk_nvmf_transport_ops spdk_nvmf_transport_vfio_user = {
.name = "VFIOUSER",
.type = SPDK_NVME_TRANSPORT_VFIOUSER,
.opts_init = nvmf_vfio_user_opts_init,
.create = nvmf_vfio_user_create,
.destroy = nvmf_vfio_user_destroy,
.listen = nvmf_vfio_user_listen,
.stop_listen = nvmf_vfio_user_stop_listen,
.accept = nvmf_vfio_user_accept,
.cdata_init = nvmf_vfio_user_cdata_init,
.listen_associate = nvmf_vfio_user_listen_associate,
.listener_discover = nvmf_vfio_user_discover,
.poll_group_create = nvmf_vfio_user_poll_group_create,
.poll_group_destroy = nvmf_vfio_user_poll_group_destroy,
.poll_group_add = nvmf_vfio_user_poll_group_add,
.poll_group_remove = nvmf_vfio_user_poll_group_remove,
.poll_group_poll = nvmf_vfio_user_poll_group_poll,
.req_free = nvmf_vfio_user_req_free,
.req_complete = nvmf_vfio_user_req_complete,
.qpair_fini = nvmf_vfio_user_close_qpair,
.qpair_get_local_trid = nvmf_vfio_user_qpair_get_local_trid,
.qpair_get_peer_trid = nvmf_vfio_user_qpair_get_peer_trid,
.qpair_get_listen_trid = nvmf_vfio_user_qpair_get_listen_trid,
};
SPDK_NVMF_TRANSPORT_REGISTER(muser, &spdk_nvmf_transport_vfio_user);
SPDK_LOG_REGISTER_COMPONENT(nvmf_vfio)