vhost/nvme: add shared BAR space to support old guest kernels

Some older Linux guest kernels do not enable the NVMe 1.3 shadow
doorbell buffer feature. To support them, create a dummy BAR region
inside the slave target: when the guest submits a new request, the
doorbell value is written to this memory shared between the guest
and the vhost target, so the existing vhost target can serve both
new Linux guest kernels (4.12 and newer) and old ones.

The shared BAR space can also be used in the future to move admin
queue processing into the SPDK vhost target; with that in place, the
QEMU driver becomes very small and easy to upstream.
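
For reference, the doorbell layout follows the NVMe register map: the
SQ tail and CQ head doorbells start at BAR0 offset 0x1000 and, with
CAP.DSTRD = 0, occupy consecutive 32-bit words. Below is a minimal
sketch of the offset math and of the polling choice the target makes;
the sq_offset()/cq_offset() definitions are assumptions inferred from
how the diff calls them, not copied from the tree:

#include <stdbool.h>
#include <stdint.h>

/* NVMe doorbells with CAP.DSTRD = 0: for queue pair qid, the SQ tail
 * doorbell is 32-bit word 2 * qid and the CQ head doorbell is word
 * 2 * qid + 1, relative to BAR0 + 0x1000. A db_stride of 1 matches
 * the sq_offset(qid, 1) / cq_offset(qid, 1) calls in the diff below.
 */
static inline uint32_t sq_offset(uint16_t qid, uint32_t db_stride)
{
	return (uint32_t)qid * 2 * db_stride;
}

static inline uint32_t cq_offset(uint16_t qid, uint32_t db_stride)
{
	return ((uint32_t)qid * 2 + 1) * db_stride;
}

/* Read one SQ tail: new guests publish it through the shadow doorbell
 * buffer (dbbuf), old guests write it through the shared BAR copy.
 */
static inline uint32_t
poll_sq_tail(volatile uint32_t *dbbuf_dbs, volatile uint32_t *bar_db,
	     uint16_t qid, bool dataplane_started)
{
	return dataplane_started ? dbbuf_dbs[sq_offset(qid, 1)] :
				   bar_db[sq_offset(qid, 1)];
}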

Change-Id: I9463e9f13421368f43bfe4076facddd119f4552e
Signed-off-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-on: https://review.gerrithub.io/419157
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Chandler-Test-Pool: SPDK Automated Test System <sys_sgsw@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com>

@@ -106,6 +106,7 @@ struct vhost_device_ops {
int (*features_changed)(int vid, uint64_t features);
int (*vhost_nvme_admin_passthrough)(int vid, void *cmd, void *cqe, void *buf);
int (*vhost_nvme_set_cq_call)(int vid, uint16_t qid, int fd);
int (*vhost_nvme_set_bar_mr)(int vid, void *bar_addr, uint64_t bar_size);
int (*vhost_nvme_get_cap)(int vid, uint64_t *cap);
int (*new_connection)(int vid);

@@ -200,6 +200,8 @@ struct virtio_net {
uint32_t max_guest_pages;
struct guest_page *guest_pages;
int has_new_mem_table;
void *bar_addr;
uint64_t bar_size;
struct VhostUserMemory mem_table;
int mem_table_fds[VHOST_MEMORY_MAX_NREGIONS];
} __rte_cache_aligned;

@@ -84,7 +84,8 @@ static const char *vhost_message_str[VHOST_USER_MAX] = {
[VHOST_USER_NVME_SET_CQ_CALL] = "VHOST_USER_NVME_SET_CQ_CALL",
[VHOST_USER_NVME_GET_CAP] = "VHOST_USER_NVME_GET_CAP",
[VHOST_USER_NVME_START_STOP] = "VHOST_USER_NVME_START_STOP",
[VHOST_USER_NVME_IO_CMD] = "VHOST_USER_NVME_IO_CMD"
[VHOST_USER_NVME_IO_CMD] = "VHOST_USER_NVME_IO_CMD",
[VHOST_USER_NVME_SET_BAR_MR] = "VHOST_USER_NVME_SET_BAR_MR"
};
static uint64_t
@@ -139,6 +140,11 @@ vhost_backend_cleanup(struct virtio_net *dev)
munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
dev->log_addr = 0;
}
if (dev->bar_addr) {
munmap((void *)(uintptr_t)dev->bar_addr, dev->bar_size);
dev->bar_addr = NULL;
dev->bar_size = 0;
}
}
/*
@@ -1119,6 +1125,90 @@ vhost_user_nvme_get_cap(struct virtio_net *dev, uint64_t *cap)
return -1;
}
static int
vhost_user_nvme_set_bar_mr(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
struct VhostUserMemory mem_table;
int fd = pmsg->fds[0];
void *mmap_addr;
uint64_t mmap_size;
uint64_t mmap_offset;
uint64_t alignment;
struct rte_vhost_mem_region reg;
int ret = 0;
memcpy(&mem_table, &pmsg->payload.memory, sizeof(mem_table));
reg.guest_phys_addr = mem_table.regions[0].guest_phys_addr;
reg.guest_user_addr = mem_table.regions[0].userspace_addr;
reg.size = mem_table.regions[0].memory_size;
reg.fd = fd;
mmap_offset = mem_table.regions[0].mmap_offset;
mmap_size = reg.size + mmap_offset;
alignment = get_blk_size(fd);
if (alignment == (uint64_t)-1) {
RTE_LOG(ERR, VHOST_CONFIG,
"couldn't get hugepage size through fstat\n");
return -1;
}
mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, fd, 0);
if (mmap_addr == MAP_FAILED) {
RTE_LOG(ERR, VHOST_CONFIG,
"mmap region failed.\n");
return -1;
}
if (madvise(mmap_addr, mmap_size, MADV_DONTDUMP) != 0) {
RTE_LOG(INFO, VHOST_CONFIG,
"MADV_DONTDUMP advice setting failed.\n");
}
reg.mmap_addr = mmap_addr;
reg.mmap_size = mmap_size;
reg.host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
mmap_offset;
RTE_LOG(INFO, VHOST_CONFIG,
"BAR memory region %u, size: 0x%" PRIx64 "\n"
"\t guest physical addr: 0x%" PRIx64 "\n"
"\t guest virtual addr: 0x%" PRIx64 "\n"
"\t host virtual addr: 0x%" PRIx64 "\n"
"\t mmap addr : 0x%" PRIx64 "\n"
"\t mmap size : 0x%" PRIx64 "\n"
"\t mmap align: 0x%" PRIx64 "\n"
"\t mmap off : 0x%" PRIx64 "\n",
0, reg.size,
reg.guest_phys_addr,
reg.guest_user_addr,
reg.host_user_addr,
(uint64_t)(uintptr_t)mmap_addr,
mmap_size,
alignment,
mmap_offset);
if (dev->bar_addr) {
munmap((void *)(uintptr_t)dev->bar_addr, dev->bar_size);
}
dev->bar_addr = (void *)(uintptr_t)reg.host_user_addr;
dev->bar_size = reg.mmap_size;
if (dev->notify_ops->vhost_nvme_set_bar_mr) {
ret = dev->notify_ops->vhost_nvme_set_bar_mr(dev->vid, dev->bar_addr, dev->bar_size);
if (ret) {
munmap((void *)(uintptr_t)dev->bar_addr, dev->bar_size);
dev->bar_addr = NULL;
dev->bar_size = 0;
}
}
return ret;
}
int
vhost_user_msg_handler(int vid, int fd)
{
@@ -1243,6 +1333,9 @@ vhost_user_msg_handler(int vid, int fd)
is_submission_queue = (msg.payload.nvme_io.queue_type == VHOST_USER_NVME_SUBMISSION_QUEUE) ? true : false;
vhost_user_nvme_io_request_passthrough(dev, qid, tail_head, is_submission_queue);
break;
case VHOST_USER_NVME_SET_BAR_MR:
ret = vhost_user_nvme_set_bar_mr(dev, &msg);
break;
case VHOST_USER_GET_FEATURES:
msg.payload.u64 = vhost_user_get_features(dev);
msg.size = sizeof(msg.payload.u64);

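For context, the master side is expected to hand over the BAR backing
as a file descriptor plus a one-region memory table, which the new
vhost_user_nvme_set_bar_mr() handler above mmaps. The sketch below is
hypothetical and only illustrates the message shape the handler
consumes; the struct packing, the version flag value (0x1), and the
helper name are assumptions, and QEMU's real vhost-user client code
differs:

#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

#define VHOST_MEMORY_MAX_NREGIONS 8

/* Wire layout assumed from the VhostUserMsg/VhostUserMemory usage in
 * this patch; treat the exact packing as an assumption. */
struct vhost_user_mem_region {
	uint64_t guest_phys_addr;
	uint64_t memory_size;
	uint64_t userspace_addr;
	uint64_t mmap_offset;
};

struct vhost_user_memory {
	uint32_t nregions;
	uint32_t padding;
	struct vhost_user_mem_region regions[VHOST_MEMORY_MAX_NREGIONS];
};

struct vhost_user_msg {
	uint32_t request;	/* VHOST_USER_NVME_SET_BAR_MR = 85 */
	uint32_t flags;		/* protocol version bits, 0x1 assumed */
	uint32_t size;		/* payload bytes following the header */
	struct vhost_user_memory memory;
} __attribute__((packed));

/* Send one shared BAR region; the fd rides along as SCM_RIGHTS
 * ancillary data on the vhost-user unix socket. */
static int
send_set_bar_mr(int sock, int bar_fd, uint64_t gpa, uint64_t size)
{
	struct vhost_user_msg msg;
	struct iovec iov;
	struct msghdr mh;
	char control[CMSG_SPACE(sizeof(int))];
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof(msg));
	msg.request = 85;	/* VHOST_USER_NVME_SET_BAR_MR */
	msg.flags = 0x1;
	msg.size = sizeof(msg.memory);
	msg.memory.nregions = 1;
	msg.memory.regions[0].guest_phys_addr = gpa;
	msg.memory.regions[0].memory_size = size;

	iov.iov_base = &msg;
	iov.iov_len = offsetof(struct vhost_user_msg, memory) + msg.size;

	memset(&mh, 0, sizeof(mh));
	memset(control, 0, sizeof(control));
	mh.msg_iov = &iov;
	mh.msg_iovlen = 1;
	mh.msg_control = control;
	mh.msg_controllen = sizeof(control);

	cmsg = CMSG_FIRSTHDR(&mh);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &bar_fd, sizeof(int));

	return sendmsg(sock, &mh, 0) < 0 ? -1 : 0;
}

In practice the backing would come from something like memfd_create()
or a hugepage file so the slave can mmap() it, which is what the
get_blk_size()/mmap() path in the handler above expects.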
@@ -91,6 +91,7 @@ typedef enum VhostUserRequest {
VHOST_USER_NVME_GET_CAP = 82,
VHOST_USER_NVME_START_STOP = 83,
VHOST_USER_NVME_IO_CMD = 84,
VHOST_USER_NVME_SET_BAR_MR = 85,
VHOST_USER_MAX
} VhostUserRequest;

@@ -82,6 +82,7 @@ const struct vhost_device_ops g_spdk_vhost_ops = {
.vhost_nvme_admin_passthrough = spdk_vhost_nvme_admin_passthrough,
.vhost_nvme_set_cq_call = spdk_vhost_nvme_set_cq_call,
.vhost_nvme_get_cap = spdk_vhost_nvme_get_cap,
.vhost_nvme_set_bar_mr = spdk_vhost_nvme_set_bar_mr,
};
static TAILQ_HEAD(, spdk_vhost_dev) g_spdk_vhost_devices = TAILQ_HEAD_INITIALIZER(

@@ -267,6 +267,7 @@ void spdk_vhost_unlock(void);
int spdk_remove_vhost_controller(struct spdk_vhost_dev *vdev);
int spdk_vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf);
int spdk_vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd);
int spdk_vhost_nvme_set_bar_mr(int vid, void *bar_addr, uint64_t bar_size);
int spdk_vhost_nvme_get_cap(int vid, uint64_t *cap);
int spdk_vhost_nvme_controller_construct(void);
int spdk_vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t io_queues);

@@ -136,6 +136,11 @@ struct spdk_vhost_nvme_dev {
uint32_t num_ns;
struct spdk_vhost_nvme_ns ns[MAX_NAMESPACE];
volatile uint32_t *bar;
volatile uint32_t *bar_db;
uint64_t bar_size;
bool dataplane_started;
volatile uint32_t *dbbuf_dbs;
volatile uint32_t *dbbuf_eis;
struct spdk_vhost_nvme_sq sq_queue[MAX_IO_QUEUES + 1];
@@ -224,6 +229,21 @@ spdk_vhost_nvme_get_cq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid)
return &dev->cq_queue[qid];
}
static inline uint32_t
spdk_vhost_nvme_get_queue_head(struct spdk_vhost_nvme_dev *nvme, uint32_t offset)
{
if (nvme->dataplane_started) {
return nvme->dbbuf_dbs[offset];
} else if (nvme->bar) {
return nvme->bar_db[offset];
}
assert(0);
return 0;
}
static int
spdk_nvme_map_prps(struct spdk_vhost_nvme_dev *nvme, struct spdk_nvme_cmd *cmd,
struct spdk_vhost_nvme_task *task, uint32_t len)
@@ -309,7 +329,7 @@ spdk_nvme_cq_signal_fd(struct spdk_vhost_nvme_dev *nvme)
continue;
}
cq_head = nvme->dbbuf_dbs[cq_offset(qid, 1)];
cq_head = spdk_vhost_nvme_get_queue_head(nvme, cq_offset(qid, 1));
if (cq->irq_enabled && cq->need_signaled_cnt && (cq->cq_head != cq_head)) {
eventfd_write(cq->virq, (eventfd_t)1);
cq->need_signaled_cnt = 0;
@@ -334,7 +354,7 @@ spdk_vhost_nvme_task_complete(struct spdk_vhost_nvme_task *task)
return;
}
cq->guest_signaled_cq_head = nvme->dbbuf_dbs[cq_offset(cqid, 1)];
cq->guest_signaled_cq_head = spdk_vhost_nvme_get_queue_head(nvme, cq_offset(cqid, 1));
if (spdk_unlikely(nvme_cq_is_full(cq))) {
STAILQ_INSERT_TAIL(&cq->cq_full_waited_tasks, task, stailq);
return;
@@ -355,7 +375,9 @@ spdk_vhost_nvme_task_complete(struct spdk_vhost_nvme_task *task)
cq->need_signaled_cnt++;
/* MMIO Control */
nvme->dbbuf_eis[cq_offset(cqid, 1)] = (uint32_t)(cq->guest_signaled_cq_head - 1);
if (nvme->dataplane_started) {
nvme->dbbuf_eis[cq_offset(cqid, 1)] = (uint32_t)(cq->guest_signaled_cq_head - 1);
}
STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq);
}
@@ -607,10 +629,7 @@ nvme_worker(void *arg)
return -1;
}
/* worker thread can't start before the admin doorbell
* buffer config command
*/
if (spdk_unlikely(!nvme->dbbuf_dbs)) {
if (spdk_unlikely(!nvme->dataplane_started && !nvme->bar)) {
return -1;
}
@@ -624,7 +643,7 @@ nvme_worker(void *arg)
if (spdk_unlikely(!cq)) {
return -1;
}
cq->guest_signaled_cq_head = nvme->dbbuf_dbs[cq_offset(sq->cqid, 1)];
cq->guest_signaled_cq_head = spdk_vhost_nvme_get_queue_head(nvme, cq_offset(sq->cqid, 1));
if (spdk_unlikely(!STAILQ_EMPTY(&cq->cq_full_waited_tasks) &&
!nvme_cq_is_full(cq))) {
task = STAILQ_FIRST(&cq->cq_full_waited_tasks);
@@ -632,7 +651,7 @@ nvme_worker(void *arg)
spdk_vhost_nvme_task_complete(task);
}
dbbuf_sq = nvme->dbbuf_dbs[sq_offset(qid, 1)];
dbbuf_sq = spdk_vhost_nvme_get_queue_head(nvme, sq_offset(qid, 1));
sq->sq_tail = (uint16_t)dbbuf_sq;
count = 0;
@@ -658,7 +677,9 @@ nvme_worker(void *arg)
}
/* MMIO Control */
nvme->dbbuf_eis[sq_offset(qid, 1)] = (uint32_t)(sq->sq_head - 1);
if (nvme->dataplane_started) {
nvme->dbbuf_eis[sq_offset(qid, 1)] = (uint32_t)(sq->sq_head - 1);
}
/* Maximum batch I/Os to pick up at once */
if (count++ == MAX_BATCH_IO) {
@@ -697,6 +718,10 @@ vhost_nvme_doorbell_buffer_config(struct spdk_vhost_nvme_dev *nvme,
cpl->status.sc = 0;
cpl->status.sct = 0;
/* Data plane started */
nvme->dataplane_started = true;
return 0;
}
@@ -744,6 +769,9 @@ vhost_nvme_create_io_sq(struct spdk_vhost_nvme_dev *nvme,
}
nvme->num_sqs++;
sq->valid = true;
if (nvme->bar) {
nvme->bar_db[sq_offset(qid, 1)] = 0;
}
cpl->status.sc = 0;
cpl->status.sct = 0;
@@ -824,6 +852,9 @@ vhost_nvme_create_io_cq(struct spdk_vhost_nvme_dev *nvme,
}
nvme->num_cqs++;
cq->valid = true;
if (nvme->bar) {
nvme->bar_db[cq_offset(qid, 1)] = 0;
}
STAILQ_INIT(&cq->cq_full_waited_tasks);
cpl->status.sc = 0;
@@ -890,7 +921,6 @@ spdk_vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf)
struct spdk_vhost_nvme_ns *ns;
int ret = 0;
struct spdk_vhost_nvme_dev *nvme;
uint32_t cq_head, sq_tail;
nvme = spdk_vhost_nvme_get_by_name(vid);
if (!nvme) {
@@ -943,10 +973,6 @@ spdk_vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf)
ret = vhost_nvme_doorbell_buffer_config(nvme, req, cpl);
break;
case SPDK_NVME_OPC_ABORT:
sq_tail = nvme->dbbuf_dbs[sq_offset(1, 1)] & 0xffffu;
cq_head = nvme->dbbuf_dbs[cq_offset(1, 1)] & 0xffffu;
SPDK_NOTICELOG("ABORT: CID %u, SQ_TAIL %u, CQ_HEAD %u\n",
(req->cdw10 >> 16) & 0xffffu, sq_tail, cq_head);
/* TODO: ABORT fails for now */
cpl->cdw0 = 1;
cpl->status.sc = 0;
@@ -961,6 +987,24 @@ spdk_vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf)
return 0;
}
int
spdk_vhost_nvme_set_bar_mr(int vid, void *bar_addr, uint64_t bar_size)
{
struct spdk_vhost_nvme_dev *nvme;
nvme = spdk_vhost_nvme_get_by_name(vid);
if (!nvme) {
return -1;
}
nvme->bar = (volatile uint32_t *)(uintptr_t)(bar_addr);
/* BAR0 SQ/CQ doorbell registers start from offset 0x1000 */
nvme->bar_db = (volatile uint32_t *)(uintptr_t)(bar_addr + 0x1000ull);
nvme->bar_size = bar_size;
return 0;
}
int
spdk_vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd)
{
@@ -1095,10 +1139,15 @@ destroy_device_poller_cb(void *arg)
ns_dev->bdev_io_channel = NULL;
}
}
/* Clear BAR space */
if (nvme->bar) {
memset((void *)nvme->bar, 0, nvme->bar_size);
}
nvme->num_sqs = 0;
nvme->num_cqs = 0;
nvme->dbbuf_dbs = NULL;
nvme->dbbuf_eis = NULL;
nvme->dataplane_started = false;
}
}

@@ -105,6 +105,7 @@ DEFINE_STUB(spdk_vhost_scsi_controller_construct, int, (void), 0);
DEFINE_STUB(spdk_vhost_blk_controller_construct, int, (void), 0);
DEFINE_STUB(spdk_vhost_nvme_admin_passthrough, int, (int vid, void *cmd, void *cqe, void *buf), 0);
DEFINE_STUB(spdk_vhost_nvme_set_cq_call, int, (int vid, uint16_t qid, int fd), 0);
DEFINE_STUB(spdk_vhost_nvme_set_bar_mr, int, (int vid, void *bar, uint64_t bar_size), 0);
DEFINE_STUB(spdk_vhost_nvme_get_cap, int, (int vid, uint64_t *cap), 0);
DEFINE_STUB(spdk_vhost_nvme_controller_construct, int, (void), 0);
DEFINE_STUB(rte_vhost_set_vhost_vring_last_idx, int,
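
On the guest side nothing changes: a pre-4.12 NVMe driver keeps
ringing the regular BAR0 doorbells, and with QEMU backing that MMIO
range with the shared region, those writes land directly in the
target's bar_db array. A schematic sketch only (volatile stands in
for the kernel's writel() accessor; this is not the actual driver
code):

#include <stdint.h>

/* db_base is the mapped BAR0 + 0x1000 doorbell window; with
 * CAP.DSTRD = 0 the SQ tail doorbell for queue qid is word 2 * qid. */
static inline void
ring_sq_doorbell(volatile uint32_t *db_base, uint16_t qid, uint16_t sq_tail)
{
	db_base[(uint32_t)qid * 2] = sq_tail;
}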