spdk/lib/vhost/vhost.c
Dariusz Stojaczyk 8123aef784 vhost: don't abort on mem region registration failure
A single warning message is printed instead.
This is required for VMs that have memory-mapped virtual devices
(e.g. NVDIMM in Clear Containers). This memory will never be used by the
guest kernel for I/O processing.

Change-Id: Ie24501acbdb3ba0d67bd29b146a09b222451427d
Signed-off-by: Dariusz Stojaczyk <dariuszx.stojaczyk@intel.com>
Reviewed-on: https://review.gerrithub.io/373224
Tested-by: SPDK Automated Test System <sys_sgsw@intel.com>
Reviewed-by: Daniel Verkamp <daniel.verkamp@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Pawel Wodkowski <pawelx.wodkowski@intel.com>
2017-08-10 13:32:36 -04:00


/*-
 *   BSD LICENSE
 *
 *   Copyright(c) Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"
#include "spdk/env.h"
#include "spdk/likely.h"
#include "spdk/util.h"
#include "spdk/vhost.h"
#include "vhost_internal.h"
#include "vhost_iommu.h"
static uint32_t g_num_ctrlrs[RTE_MAX_LCORE];
/* Path to folder where character device will be created. Can be set by user. */
static char dev_dirname[PATH_MAX] = "";
#define MAX_VHOST_DEVICES 64
static struct spdk_vhost_dev *g_spdk_vhost_devices[MAX_VHOST_DEVICES];
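/*
 * Translate a guest physical address into a host virtual address using the
 * memory table of the given vhost device. Returns 0 (NULL) if the address
 * does not fall into any of the guest's memory regions.
 */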
void *
spdk_vhost_gpa_to_vva(struct spdk_vhost_dev *vdev, uint64_t addr)
{
        return (void *)rte_vhost_gpa_to_vva(vdev->mem, addr);
}

/*
 * Get available requests from avail ring.
 */
uint16_t
spdk_vhost_vq_avail_ring_get(struct rte_vhost_vring *vq, uint16_t *reqs, uint16_t reqs_len)
{
        struct vring_avail *avail = vq->avail;
        uint16_t size_mask = vq->size - 1;
        uint16_t last_idx = vq->last_avail_idx, avail_idx = avail->idx;
        uint16_t count = RTE_MIN((avail_idx - last_idx) & size_mask, reqs_len);
        uint16_t i;

        /* In a polling model, most polls find no new requests. */
        if (spdk_likely(count == 0)) {
                return 0;
        }

        vq->last_avail_idx += count;
        for (i = 0; i < count; i++) {
                reqs[i] = vq->avail->ring[(last_idx + i) & size_mask];
        }

        SPDK_TRACELOG(SPDK_TRACE_VHOST_RING,
                      "AVAIL: last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n",
                      last_idx, avail_idx, count);

        return count;
}

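/*
 * Fetch the descriptor at the given index. The index is expected to come from
 * spdk_vhost_vq_avail_ring_get() and therefore be below vq->size.
 */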
struct vring_desc *
spdk_vhost_vq_get_desc(struct rte_vhost_vring *vq, uint16_t req_idx)
{
        assert(req_idx < vq->size);
        return &vq->desc[req_idx];
}

/*
 * Enqueue id and len to used ring.
 */
void
spdk_vhost_vq_used_ring_enqueue(struct spdk_vhost_dev *vdev, struct rte_vhost_vring *vq,
                                uint16_t id, uint32_t len)
{
        struct vring_used *used = vq->used;
        uint16_t size_mask = vq->size - 1;
        uint16_t last_idx = vq->last_used_idx;

        SPDK_TRACELOG(SPDK_TRACE_VHOST_RING, "USED: last_idx=%"PRIu16" req id=%"PRIu16" len=%"PRIu32"\n",
                      last_idx, id, len);

        vq->last_used_idx++;
        last_idx &= size_mask;

        used->ring[last_idx].id = id;
        used->ring[last_idx].len = len;

        /* Make the ring entry visible to the guest before updating the used index. */
        rte_compiler_barrier();
        vq->used->idx = vq->last_used_idx;

        /*
         * We should be able to use the interrupt-suppression hints from the
         * guest, but simply checking avail->flags has proven unreliable. Until
         * a reliable way to use avail->flags is figured out, an interrupt is
         * always sent to the guest.
         */
        eventfd_write(vq->callfd, (eventfd_t)1);
}

bool
spdk_vhost_vring_desc_has_next(struct vring_desc *cur_desc)
{
        return !!(cur_desc->flags & VRING_DESC_F_NEXT);
}

struct vring_desc *
spdk_vhost_vring_desc_get_next(struct vring_desc *vq_desc, struct vring_desc *cur_desc)
{
        assert(spdk_vhost_vring_desc_has_next(cur_desc));
        return &vq_desc[cur_desc->next];
}

bool
spdk_vhost_vring_desc_is_wr(struct vring_desc *cur_desc)
{
        return !!(cur_desc->flags & VRING_DESC_F_WRITE);
}

#define _2MB_OFFSET(ptr) ((ptr) & (0x200000 - 1))
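
/*
 * Append the buffer described by 'desc' to the iovec array. The buffer is
 * contiguous in guest physical memory but may span multiple discontiguous
 * host hugepages, so it is split at every 2MB boundary and each chunk is
 * translated separately.
 */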
int
spdk_vhost_vring_desc_to_iov(struct spdk_vhost_dev *vdev, struct iovec *iov,
                             uint16_t *iov_index, const struct vring_desc *desc)
{
        uint32_t remaining = desc->len;
        uint32_t len;
        uintptr_t payload = desc->addr;
        uintptr_t vva;

        while (remaining) {
                if (*iov_index >= SPDK_VHOST_IOVS_MAX) {
                        SPDK_ERRLOG("SPDK_VHOST_IOVS_MAX(%d) reached\n", SPDK_VHOST_IOVS_MAX);
                        return -1;
                }

                vva = (uintptr_t)spdk_vhost_gpa_to_vva(vdev, payload);
                if (vva == 0) {
                        SPDK_ERRLOG("gpa_to_vva(%p) == NULL\n", (void *)payload);
                        return -1;
                }

                len = spdk_min(remaining, 0x200000 - _2MB_OFFSET(payload));
                iov[*iov_index].iov_base = (void *)vva;
                iov[*iov_index].iov_len = len;
                remaining -= len;
                payload += len;
                (*iov_index)++;
        }

        return 0;
}

struct spdk_vhost_dev *
spdk_vhost_dev_find_by_vid(int vid)
{
        unsigned i;
        struct spdk_vhost_dev *vdev;

        for (i = 0; i < MAX_VHOST_DEVICES; i++) {
                vdev = g_spdk_vhost_devices[i];
                if (vdev && vdev->vid == vid) {
                        return vdev;
                }
        }

        return NULL;
}

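/*
 * vtophys registration operates on 2MB-aligned, 2MB-sized chunks, so guest
 * memory regions are expanded to the enclosing 2MB boundaries before being
 * registered or unregistered.
 */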
#define SHIFT_2MB	21
#define SIZE_2MB	(1ULL << SHIFT_2MB)
#define FLOOR_2MB(x)	((((uintptr_t)(x)) / SIZE_2MB) << SHIFT_2MB)
#define CEIL_2MB(x)	(((((uintptr_t)(x)) + SIZE_2MB - 1) / SIZE_2MB) << SHIFT_2MB)

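/*
 * Register all guest memory regions with the vtophys map (and with the IOMMU,
 * if one is in use) so guest buffers can be used for DMA. A registration
 * failure is not fatal: memory-mapped virtual devices in the guest (e.g.
 * NVDIMM in Clear Containers) may expose regions that cannot be registered,
 * but that memory is never used for I/O, so only a warning is printed.
 */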
void
spdk_vhost_dev_mem_register(struct spdk_vhost_dev *vdev)
{
        struct rte_vhost_mem_region *region;
        uint32_t i;

        for (i = 0; i < vdev->mem->nregions; i++) {
                uint64_t start, end, len;

                region = &vdev->mem->regions[i];
                start = FLOOR_2MB(region->mmap_addr);
                end = CEIL_2MB(region->mmap_addr + region->mmap_size);
                len = end - start;
                SPDK_NOTICELOG("Registering VM memory for vtophys translation - 0x%jx len:0x%jx\n",
                               start, len);

                if (spdk_mem_register((void *)start, len) != 0) {
                        SPDK_WARNLOG("Failed to register memory region %"PRIu32". Future vtophys translation might fail.\n",
                                     i);
                        continue;
                }

                if (spdk_iommu_mem_register(region->host_user_addr, region->size)) {
                        abort();
                }
        }
}

void
spdk_vhost_dev_mem_unregister(struct spdk_vhost_dev *vdev)
{
        struct rte_vhost_mem_region *region;
        uint32_t i;

        for (i = 0; i < vdev->mem->nregions; i++) {
                uint64_t start, end, len;

                region = &vdev->mem->regions[i];
                start = FLOOR_2MB(region->mmap_addr);
                end = CEIL_2MB(region->mmap_addr + region->mmap_size);
                len = end - start;

                if (spdk_vtophys((void *)start) == SPDK_VTOPHYS_ERROR) {
                        continue; /* region has not been registered */
                }

                if (spdk_iommu_mem_unregister(region->host_user_addr, region->size)) {
                        abort();
                }

                if (spdk_mem_unregister((void *)start, len) != 0) {
                        assert(false);
                }
        }
}

static void
spdk_vhost_free_reactor(uint32_t lcore)
{
        g_num_ctrlrs[lcore]--;
}

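/*
 * Look up a controller by name. A full socket path is also accepted - the
 * dev_dirname prefix is stripped before the comparison.
 */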
struct spdk_vhost_dev *
spdk_vhost_dev_find(const char *ctrlr_name)
{
        unsigned i;
        size_t dev_dirname_len = strlen(dev_dirname);

        if (strncmp(ctrlr_name, dev_dirname, dev_dirname_len) == 0) {
                ctrlr_name += dev_dirname_len;
        }

        for (i = 0; i < MAX_VHOST_DEVICES; i++) {
                if (g_spdk_vhost_devices[i] == NULL) {
                        continue;
                }

                if (strcmp(g_spdk_vhost_devices[i]->name, ctrlr_name) == 0) {
                        return g_spdk_vhost_devices[i];
                }
        }

        return NULL;
}

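/*
 * Register a new vhost controller: validate the name and cpumask, remove any
 * stale socket file, register the socket with the rte_vhost library, set the
 * supported feature bits and message callbacks, and start listening for
 * connections.
 */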
int
spdk_vhost_dev_construct(struct spdk_vhost_dev *vdev, const char *name, uint64_t cpumask,
                         enum spdk_vhost_dev_type type, const struct spdk_vhost_dev_backend *backend)
{
        unsigned ctrlr_num;
        char path[PATH_MAX];
        struct stat file_stat;

        assert(vdev);

        if (name == NULL) {
                SPDK_ERRLOG("Can't register controller with no name\n");
                return -EINVAL;
        }

        if ((cpumask & spdk_app_get_core_mask()) != cpumask) {
                SPDK_ERRLOG("cpumask 0x%jx not a subset of app mask 0x%jx\n",
                            cpumask, spdk_app_get_core_mask());
                return -EINVAL;
        }

        if (spdk_vhost_dev_find(name)) {
                SPDK_ERRLOG("vhost controller %s already exists.\n", name);
                return -EEXIST;
        }

        for (ctrlr_num = 0; ctrlr_num < MAX_VHOST_DEVICES; ctrlr_num++) {
                if (g_spdk_vhost_devices[ctrlr_num] == NULL) {
                        break;
                }
        }

        if (ctrlr_num == MAX_VHOST_DEVICES) {
                SPDK_ERRLOG("Max controllers reached (%d).\n", MAX_VHOST_DEVICES);
                return -ENOSPC;
        }

        if (snprintf(path, sizeof(path), "%s%s", dev_dirname, name) >= (int)sizeof(path)) {
                SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n", name, dev_dirname,
                            name);
                return -EINVAL;
        }

        /* Remove a stale socket file, if any, before registering with the vhost library. */
        if (stat(path, &file_stat) != -1) {
                if (!S_ISSOCK(file_stat.st_mode)) {
                        SPDK_ERRLOG("Cannot remove %s: not a socket.\n", path);
                        return -EIO;
                } else if (unlink(path) != 0) {
                        SPDK_ERRLOG("Cannot remove %s.\n", path);
                        return -EIO;
                }
        }

        if (rte_vhost_driver_register(path, 0) != 0) {
                SPDK_ERRLOG("Could not register controller %s with vhost library\n", name);
                SPDK_ERRLOG("Check if domain socket %s already exists\n", path);
                return -EIO;
        }

        if (rte_vhost_driver_set_features(path, backend->virtio_features) ||
            rte_vhost_driver_disable_features(path, backend->disabled_features)) {
                SPDK_ERRLOG("Couldn't set vhost features for controller %s\n", name);
                rte_vhost_driver_unregister(path);
                return -EIO;
        }

        if (rte_vhost_driver_callback_register(path, &backend->ops) != 0) {
                rte_vhost_driver_unregister(path);
                SPDK_ERRLOG("Couldn't register callbacks for controller %s\n", name);
                return -EIO;
        }

        vdev->name = strdup(name);
        vdev->path = strdup(path);
        vdev->vid = -1;
        vdev->lcore = -1;
        vdev->cpumask = cpumask;
        vdev->type = type;
        vdev->vhost_backend = backend;
        g_spdk_vhost_devices[ctrlr_num] = vdev;

        if (rte_vhost_driver_start(path) != 0) {
                SPDK_ERRLOG("Failed to start vhost driver for controller %s (%d): %s\n", name, errno,
                            strerror(errno));
                rte_vhost_driver_unregister(path);
                g_spdk_vhost_devices[ctrlr_num] = NULL;
                free(vdev->name);
                free(vdev->path);
                return -EIO;
        }

        SPDK_NOTICELOG("Controller %s: new controller added\n", vdev->name);
        return 0;
}

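/*
 * Unregister a controller and release its name and socket path. Fails if a
 * VM is still connected, since hot-remove is not supported.
 */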
int
spdk_vhost_dev_remove(struct spdk_vhost_dev *vdev)
{
        unsigned ctrlr_num;
        char path[PATH_MAX];

        if (vdev->lcore != -1) {
                SPDK_ERRLOG("Controller %s is in use and hotplug is not supported\n", vdev->name);
                return -ENODEV;
        }

        for (ctrlr_num = 0; ctrlr_num < MAX_VHOST_DEVICES; ctrlr_num++) {
                if (g_spdk_vhost_devices[ctrlr_num] == vdev) {
                        break;
                }
        }

        if (ctrlr_num == MAX_VHOST_DEVICES) {
                SPDK_ERRLOG("Trying to remove invalid controller: %s.\n", vdev->name);
                return -ENOSPC;
        }

        if (snprintf(path, sizeof(path), "%s%s", dev_dirname, vdev->name) >= (int)sizeof(path)) {
                SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n", vdev->name, dev_dirname,
                            vdev->name);
                return -EINVAL;
        }

        if (rte_vhost_driver_unregister(path) != 0) {
                SPDK_ERRLOG("Could not unregister controller %s with vhost library\n"
                            "Check if domain socket %s still exists\n", vdev->name, path);
                return -EIO;
        }

        SPDK_NOTICELOG("Controller %s: removed\n", vdev->name);

        free(vdev->name);
        free(vdev->path);
        g_spdk_vhost_devices[ctrlr_num] = NULL;
        return 0;
}

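/*
 * Parse a hexadecimal cpumask string and verify it is a non-empty subset of
 * the application core mask.
 */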
int
spdk_vhost_parse_core_mask(const char *mask, uint64_t *cpumask)
{
        char *end;

        if (mask == NULL || cpumask == NULL) {
                return -1;
        }

        errno = 0;
        *cpumask = strtoull(mask, &end, 16);

        if (*end != '\0' || errno || !*cpumask ||
            ((*cpumask & spdk_app_get_core_mask()) != *cpumask)) {
                SPDK_ERRLOG("cpumask %s is invalid or not a subset of app mask 0x%jx\n",
                            mask, spdk_app_get_core_mask());
                return -1;
        }

        return 0;
}

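/*
 * Iterate over registered controllers. Pass NULL to get the first one;
 * returns NULL when there are no more.
 */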
struct spdk_vhost_dev *
spdk_vhost_dev_next(struct spdk_vhost_dev *prev)
{
        int i = 0;

        if (prev != NULL) {
                for (; i < MAX_VHOST_DEVICES; i++) {
                        if (g_spdk_vhost_devices[i] == prev) {
                                break;
                        }
                }
                i++;
        }

        for (; i < MAX_VHOST_DEVICES; i++) {
                if (g_spdk_vhost_devices[i] != NULL) {
                        return g_spdk_vhost_devices[i];
                }
        }

        return NULL;
}

const char *
spdk_vhost_dev_get_name(struct spdk_vhost_dev *vdev)
{
        assert(vdev != NULL);
        return vdev->name;
}

uint64_t
spdk_vhost_dev_get_cpumask(struct spdk_vhost_dev *vdev)
{
        assert(vdev != NULL);
        return vdev->cpumask;
}

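/*
 * Pick the least-loaded core from the given cpumask (intersected with the
 * application core mask) and account the new controller against it.
 */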
static uint32_t
spdk_vhost_allocate_reactor(uint64_t cpumask)
{
        uint32_t i, selected_core;
        uint32_t min_ctrlrs;

        cpumask &= spdk_app_get_core_mask();

        if (cpumask == 0) {
                return 0;
        }

        min_ctrlrs = INT_MAX;
        selected_core = 0;

        for (i = 0; i < RTE_MAX_LCORE && i < 64; i++) {
                if (!((1ULL << i) & cpumask)) {
                        continue;
                }

                if (g_num_ctrlrs[i] < min_ctrlrs) {
                        selected_core = i;
                        min_ctrlrs = g_num_ctrlrs[i];
                }
        }

        g_num_ctrlrs[selected_core]++;
        return selected_core;
}

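/*
 * Detach a disconnected VM from its controller: save the vring indices back
 * into the rte_vhost library, free the memory table and release the core.
 */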
void
spdk_vhost_dev_unload(struct spdk_vhost_dev *vdev)
{
        struct rte_vhost_vring *q;
        uint16_t i;

        for (i = 0; i < vdev->num_queues; i++) {
                q = &vdev->virtqueue[i];
                rte_vhost_set_vhost_vring_last_idx(vdev->vid, i, q->last_avail_idx, q->last_used_idx);
        }

        free(vdev->mem);
        spdk_vhost_free_reactor(vdev->lcore);
        vdev->lcore = -1;
}

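/*
 * Attach a newly connected VM to its controller: fetch the vring information,
 * disable guest notifications (the rings are polled), read the negotiated
 * features and the guest memory table, and assign a core to the device.
 */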
struct spdk_vhost_dev *
spdk_vhost_dev_load(int vid)
{
        struct spdk_vhost_dev *vdev;
        char ifname[PATH_MAX];
        uint16_t num_queues = rte_vhost_get_vring_num(vid);
        uint16_t i;

        if (rte_vhost_get_ifname(vid, ifname, PATH_MAX) < 0) {
                SPDK_ERRLOG("Couldn't get a valid ifname for device %d\n", vid);
                return NULL;
        }

        vdev = spdk_vhost_dev_find(ifname);
        if (vdev == NULL) {
                SPDK_ERRLOG("Controller %s not found.\n", ifname);
                return NULL;
        }

        if (vdev->lcore != -1) {
                SPDK_ERRLOG("Controller %s already connected.\n", ifname);
                return NULL;
        }

        if (num_queues > SPDK_VHOST_MAX_VQUEUES) {
                SPDK_ERRLOG("vhost device %d: Too many queues (%"PRIu16"). Max %"PRIu16"\n", vid, num_queues,
                            SPDK_VHOST_MAX_VQUEUES);
                return NULL;
        }

        for (i = 0; i < num_queues; i++) {
                if (rte_vhost_get_vhost_vring(vid, i, &vdev->virtqueue[i])) {
                        SPDK_ERRLOG("vhost device %d: Failed to get information of queue %"PRIu16"\n", vid, i);
                        return NULL;
                }

                /* Disable notifications. */
                if (rte_vhost_enable_guest_notification(vid, i, 0) != 0) {
                        SPDK_ERRLOG("vhost device %d: Failed to disable guest notification on queue %"PRIu16"\n", vid, i);
                        return NULL;
                }
        }

        vdev->vid = vid;
        vdev->num_queues = num_queues;

        if (rte_vhost_get_negotiated_features(vid, &vdev->negotiated_features) != 0) {
                SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid);
                return NULL;
        }

        if (rte_vhost_get_mem_table(vid, &vdev->mem) != 0) {
                SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid);
                return NULL;
        }

        vdev->lcore = spdk_vhost_allocate_reactor(vdev->cpumask);
        return vdev;
}

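/*
 * Startup event: set the socket directory (appending a trailing '/' if it is
 * missing) and construct all vhost-scsi and vhost-blk controllers defined in
 * the configuration.
 */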
void
spdk_vhost_startup(void *arg1, void *arg2)
{
        int ret;
        const char *basename = arg1;

        if (basename && strlen(basename) > 0) {
                ret = snprintf(dev_dirname, sizeof(dev_dirname) - 2, "%s", basename);
                if ((size_t)ret >= sizeof(dev_dirname) - 2) {
                        SPDK_ERRLOG("Char dev dir path length %d is too long\n", ret);
                        abort();
                }

                if (dev_dirname[ret - 1] != '/') {
                        dev_dirname[ret] = '/';
                        dev_dirname[ret + 1] = '\0';
                }
        }

        ret = spdk_vhost_scsi_controller_construct();
        if (ret != 0) {
                SPDK_ERRLOG("Cannot construct vhost controllers\n");
                abort();
        }

        ret = spdk_vhost_blk_controller_construct();
        if (ret != 0) {
                SPDK_ERRLOG("Cannot construct vhost block controllers\n");
                abort();
        }
}

static void *
session_shutdown(void *arg)
{
        struct spdk_vhost_dev *vdev = NULL;
        int i;

        for (i = 0; i < MAX_VHOST_DEVICES; i++) {
                vdev = g_spdk_vhost_devices[i];
                if (vdev == NULL) {
                        continue;
                }

                /* Unregister by socket path - the same string the controller
                 * was registered with. */
                rte_vhost_driver_unregister(vdev->path);
        }

        SPDK_NOTICELOG("Exiting\n");
        spdk_app_stop(0);
        return NULL;
}

/*
 * When a SIGINT is received, execute the shutdown in a separate thread to
 * avoid a deadlock.
 */
void
spdk_vhost_shutdown_cb(void)
{
        pthread_t tid;
        int rc;

        /* pthread_create() returns an error number rather than setting errno. */
        rc = pthread_create(&tid, NULL, &session_shutdown, NULL);
        if (rc != 0) {
                SPDK_ERRLOG("Failed to start session shutdown thread (%d): %s\n", rc, strerror(rc));
                abort();
        }
        pthread_detach(tid);
}

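/*
 * Timed events deliver a callback to another core and let the caller block,
 * with a timeout, until the callback has completed. Completion is signaled
 * through a semaphore posted by the event function.
 */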
static void
vhost_timed_event_fn(void *arg1, void *arg2)
{
        struct spdk_vhost_timed_event *ev = arg1;

        if (ev->cb_fn) {
                ev->cb_fn(arg2);
        }

        sem_post(&ev->sem);
}

static void
vhost_timed_event_init(struct spdk_vhost_timed_event *ev, int32_t lcore,
                       spdk_vhost_timed_event_fn cb_fn, void *arg, unsigned timeout_sec)
{
        /* There is no way to free an spdk_event, so the event must not be
         * reused before it has been sent and waited for. */
        assert(ev->spdk_event == NULL);

        if (sem_init(&ev->sem, 0, 0) < 0) {
                SPDK_ERRLOG("Failed to initialize semaphore for vhost timed event\n");
        }

        ev->cb_fn = cb_fn;
        clock_gettime(CLOCK_REALTIME, &ev->timeout);
        ev->timeout.tv_sec += timeout_sec;
        ev->spdk_event = spdk_event_allocate(lcore, vhost_timed_event_fn, ev, arg);
}

void
spdk_vhost_timed_event_init(struct spdk_vhost_timed_event *ev, int32_t lcore,
                            spdk_vhost_timed_event_fn cb_fn, void *arg, unsigned timeout_sec)
{
        vhost_timed_event_init(ev, lcore, cb_fn, arg, timeout_sec);
}

void
spdk_vhost_timed_event_send(int32_t lcore, spdk_vhost_timed_event_fn cb_fn, void *arg,
                            unsigned timeout_sec, const char *errmsg)
{
        struct spdk_vhost_timed_event ev = {0};

        vhost_timed_event_init(&ev, lcore, cb_fn, arg, timeout_sec);
        spdk_event_call(ev.spdk_event);
        spdk_vhost_timed_event_wait(&ev, errmsg);
}

void
spdk_vhost_timed_event_wait(struct spdk_vhost_timed_event *ev, const char *errmsg)
{
        int rc;

        assert(ev->spdk_event != NULL);

        rc = sem_timedwait(&ev->sem, &ev->timeout);
        if (rc != 0) {
                SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg);
                abort();
        }

        ev->spdk_event = NULL;
        sem_destroy(&ev->sem);
}

void
spdk_vhost_dump_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
{
        assert(vdev->vhost_backend->dump_config_json != NULL);
        vdev->vhost_backend->dump_config_json(vdev, w);
}

SPDK_LOG_REGISTER_TRACE_FLAG("vhost_ring", SPDK_TRACE_VHOST_RING)