From a191eedb19da403a63ab321a383e733e93b23bef Mon Sep 17 00:00:00 2001
From: Jim Harris
Date: Mon, 8 May 2017 09:31:48 -0700
Subject: [PATCH] vhost: import copy of dpdk rte_vhost v17.05

This will be decoupled from the build to start. Subsequent patches will
modify this code to prepare it for use with SPDK vhost-scsi. The final
patch will replace the existing v17.02-based code with this version, and
make the necessary SPDK vhost changes to use it.

This makes it easier to track the differences between upstream DPDK and
our internal copy, while not breaking the build at any point in the git
history.

While here, expand the POSIX include file check to exclude any directory
starting with lib/vhost/rte_vhost (which would include this new
directory).

Signed-off-by: Jim Harris
Change-Id: Icf1202c1b7a898edff12aa226943a08b578cf962
---
 lib/vhost/rte_vhost_17_05/Makefile | 45 +
 lib/vhost/rte_vhost_17_05/fd_man.c | 300 ++++
 lib/vhost/rte_vhost_17_05/fd_man.h | 69 ++
 lib/vhost/rte_vhost_17_05/rte_vhost.h | 427 +++++++++
 lib/vhost/rte_vhost_17_05/socket.c | 797 ++++++++++++++++
 lib/vhost/rte_vhost_17_05/vhost.c | 477 ++++++++++
 lib/vhost/rte_vhost_17_05/vhost.h | 315 +++++++
 lib/vhost/rte_vhost_17_05/vhost_user.c | 1107 ++++++++++++++++++++++
 lib/vhost/rte_vhost_17_05/vhost_user.h | 134 +++
 lib/vhost/rte_vhost_17_05/virtio_net.c | 1164 ++++++++++++++++++++++++
 scripts/check_format.sh | 2 +-
 11 files changed, 4836 insertions(+), 1 deletion(-)
 create mode 100644 lib/vhost/rte_vhost_17_05/Makefile
 create mode 100644 lib/vhost/rte_vhost_17_05/fd_man.c
 create mode 100644 lib/vhost/rte_vhost_17_05/fd_man.h
 create mode 100644 lib/vhost/rte_vhost_17_05/rte_vhost.h
 create mode 100644 lib/vhost/rte_vhost_17_05/socket.c
 create mode 100644 lib/vhost/rte_vhost_17_05/vhost.c
 create mode 100644 lib/vhost/rte_vhost_17_05/vhost.h
 create mode 100644 lib/vhost/rte_vhost_17_05/vhost_user.c
 create mode 100644 lib/vhost/rte_vhost_17_05/vhost_user.h
 create mode 100644 lib/vhost/rte_vhost_17_05/virtio_net.c

diff --git a/lib/vhost/rte_vhost_17_05/Makefile b/lib/vhost/rte_vhost_17_05/Makefile
new file mode 100644
index 000000000..76136b3b8
--- /dev/null
+++ b/lib/vhost/rte_vhost_17_05/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +CFLAGS += -I. +CFLAGS += $(ENV_CFLAGS) + +# These are the DPDK vhost files copied (for now) into SPDK +C_SRCS += fd_man.c socket.c vhost_user.c virtio_net.c vhost.c + +LIBNAME = rte_vhost + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/lib/vhost/rte_vhost_17_05/fd_man.c b/lib/vhost/rte_vhost_17_05/fd_man.c new file mode 100644 index 000000000..2ceacc9ab --- /dev/null +++ b/lib/vhost/rte_vhost_17_05/fd_man.c @@ -0,0 +1,300 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "fd_man.h" + +#define FDPOLLERR (POLLERR | POLLHUP | POLLNVAL) + +static int +get_last_valid_idx(struct fdset *pfdset, int last_valid_idx) +{ + int i; + + for (i = last_valid_idx; i >= 0 && pfdset->fd[i].fd == -1; i--) + ; + + return i; +} + +static void +fdset_move(struct fdset *pfdset, int dst, int src) +{ + pfdset->fd[dst] = pfdset->fd[src]; + pfdset->rwfds[dst] = pfdset->rwfds[src]; +} + +static void +fdset_shrink_nolock(struct fdset *pfdset) +{ + int i; + int last_valid_idx = get_last_valid_idx(pfdset, pfdset->num - 1); + + for (i = 0; i < last_valid_idx; i++) { + if (pfdset->fd[i].fd != -1) + continue; + + fdset_move(pfdset, i, last_valid_idx); + last_valid_idx = get_last_valid_idx(pfdset, last_valid_idx - 1); + } + pfdset->num = last_valid_idx + 1; +} + +/* + * Find deleted fd entries and remove them + */ +static void +fdset_shrink(struct fdset *pfdset) +{ + pthread_mutex_lock(&pfdset->fd_mutex); + fdset_shrink_nolock(pfdset); + pthread_mutex_unlock(&pfdset->fd_mutex); +} + +/** + * Returns the index in the fdset for a given fd. + * @return + * index for the fd, or -1 if fd isn't in the fdset. + */ +static int +fdset_find_fd(struct fdset *pfdset, int fd) +{ + int i; + + for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++) + ; + + return i == pfdset->num ? -1 : i; +} + +static void +fdset_add_fd(struct fdset *pfdset, int idx, int fd, + fd_cb rcb, fd_cb wcb, void *dat) +{ + struct fdentry *pfdentry = &pfdset->fd[idx]; + struct pollfd *pfd = &pfdset->rwfds[idx]; + + pfdentry->fd = fd; + pfdentry->rcb = rcb; + pfdentry->wcb = wcb; + pfdentry->dat = dat; + + pfd->fd = fd; + pfd->events = rcb ? POLLIN : 0; + pfd->events |= wcb ? POLLOUT : 0; + pfd->revents = 0; +} + +void +fdset_init(struct fdset *pfdset) +{ + int i; + + if (pfdset == NULL) + return; + + for (i = 0; i < MAX_FDS; i++) { + pfdset->fd[i].fd = -1; + pfdset->fd[i].dat = NULL; + } + pfdset->num = 0; +} + +/** + * Register the fd in the fdset with read/write handler and context. + */ +int +fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat) +{ + int i; + + if (pfdset == NULL || fd == -1) + return -1; + + pthread_mutex_lock(&pfdset->fd_mutex); + i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; + if (i == -1) { + fdset_shrink_nolock(pfdset); + i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; + if (i == -1) { + pthread_mutex_unlock(&pfdset->fd_mutex); + return -2; + } + } + + fdset_add_fd(pfdset, i, fd, rcb, wcb, dat); + pthread_mutex_unlock(&pfdset->fd_mutex); + + return 0; +} + +/** + * Unregister the fd from the fdset. + * Returns context of a given fd or NULL. + */ +void * +fdset_del(struct fdset *pfdset, int fd) +{ + int i; + void *dat = NULL; + + if (pfdset == NULL || fd == -1) + return NULL; + + do { + pthread_mutex_lock(&pfdset->fd_mutex); + + i = fdset_find_fd(pfdset, fd); + if (i != -1 && pfdset->fd[i].busy == 0) { + /* busy indicates r/wcb is executing! */ + dat = pfdset->fd[i].dat; + pfdset->fd[i].fd = -1; + pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL; + pfdset->fd[i].dat = NULL; + i = -1; + } + pthread_mutex_unlock(&pfdset->fd_mutex); + } while (i != -1); + + return dat; +} + + +/** + * This functions runs in infinite blocking loop until there is no fd in + * pfdset. It calls corresponding r/w handler if there is event on the fd. 
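+ * poll() below is given a one second timeout, so fds registered while
+ * the loop is blocked are picked up on the following iteration.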
+ * + * Before the callback is called, we set the flag to busy status; If other + * thread(now rte_vhost_driver_unregister) calls fdset_del concurrently, it + * will wait until the flag is reset to zero(which indicates the callback is + * finished), then it could free the context after fdset_del. + */ +void * +fdset_event_dispatch(void *arg) +{ + int i; + struct pollfd *pfd; + struct fdentry *pfdentry; + fd_cb rcb, wcb; + void *dat; + int fd, numfds; + int remove1, remove2; + int need_shrink; + struct fdset *pfdset = arg; + + if (pfdset == NULL) + return NULL; + + while (1) { + + /* + * When poll is blocked, other threads might unregister + * listenfds from and register new listenfds into fdset. + * When poll returns, the entries for listenfds in the fdset + * might have been updated. It is ok if there is unwanted call + * for new listenfds. + */ + pthread_mutex_lock(&pfdset->fd_mutex); + numfds = pfdset->num; + pthread_mutex_unlock(&pfdset->fd_mutex); + + poll(pfdset->rwfds, numfds, 1000 /* millisecs */); + + need_shrink = 0; + for (i = 0; i < numfds; i++) { + pthread_mutex_lock(&pfdset->fd_mutex); + + pfdentry = &pfdset->fd[i]; + fd = pfdentry->fd; + pfd = &pfdset->rwfds[i]; + + if (fd < 0) { + need_shrink = 1; + pthread_mutex_unlock(&pfdset->fd_mutex); + continue; + } + + if (!pfd->revents) { + pthread_mutex_unlock(&pfdset->fd_mutex); + continue; + } + + remove1 = remove2 = 0; + + rcb = pfdentry->rcb; + wcb = pfdentry->wcb; + dat = pfdentry->dat; + pfdentry->busy = 1; + + pthread_mutex_unlock(&pfdset->fd_mutex); + + if (rcb && pfd->revents & (POLLIN | FDPOLLERR)) + rcb(fd, dat, &remove1); + if (wcb && pfd->revents & (POLLOUT | FDPOLLERR)) + wcb(fd, dat, &remove2); + pfdentry->busy = 0; + /* + * fdset_del needs to check busy flag. + * We don't allow fdset_del to be called in callback + * directly. + */ + /* + * When we are to clean up the fd from fdset, + * because the fd is closed in the cb, + * the old fd val could be reused by when creates new + * listen fd in another thread, we couldn't call + * fd_set_del. + */ + if (remove1 || remove2) { + pfdentry->fd = -1; + need_shrink = 1; + } + } + + if (need_shrink) + fdset_shrink(pfdset); + } + + return NULL; +} diff --git a/lib/vhost/rte_vhost_17_05/fd_man.h b/lib/vhost/rte_vhost_17_05/fd_man.h new file mode 100644 index 000000000..90d34db19 --- /dev/null +++ b/lib/vhost/rte_vhost_17_05/fd_man.h @@ -0,0 +1,69 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _FD_MAN_H_ +#define _FD_MAN_H_ +#include +#include +#include + +#define MAX_FDS 1024 + +typedef void (*fd_cb)(int fd, void *dat, int *remove); + +struct fdentry { + int fd; /* -1 indicates this entry is empty */ + fd_cb rcb; /* callback when this fd is readable. */ + fd_cb wcb; /* callback when this fd is writeable.*/ + void *dat; /* fd context */ + int busy; /* whether this entry is being used in cb. */ +}; + +struct fdset { + struct pollfd rwfds[MAX_FDS]; + struct fdentry fd[MAX_FDS]; + pthread_mutex_t fd_mutex; + int num; /* current fd number of this fdset */ +}; + + +void fdset_init(struct fdset *pfdset); + +int fdset_add(struct fdset *pfdset, int fd, + fd_cb rcb, fd_cb wcb, void *dat); + +void *fdset_del(struct fdset *pfdset, int fd); + +void *fdset_event_dispatch(void *arg); + +#endif diff --git a/lib/vhost/rte_vhost_17_05/rte_vhost.h b/lib/vhost/rte_vhost_17_05/rte_vhost.h new file mode 100644 index 000000000..5fb9dfb88 --- /dev/null +++ b/lib/vhost/rte_vhost_17_05/rte_vhost.h @@ -0,0 +1,427 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2017 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _RTE_VHOST_H_ +#define _RTE_VHOST_H_ + +/** + * @file + * Interface to vhost-user + */ + +#include +#include +#include +#include + +#include +#include + +#define RTE_VHOST_USER_CLIENT (1ULL << 0) +#define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1) +#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2) + +/** + * Information relating to memory regions including offsets to + * addresses in QEMUs memory file. + */ +struct rte_vhost_mem_region { + uint64_t guest_phys_addr; + uint64_t guest_user_addr; + uint64_t host_user_addr; + uint64_t size; + void *mmap_addr; + uint64_t mmap_size; + int fd; +}; + +/** + * Memory structure includes region and mapping information. + */ +struct rte_vhost_memory { + uint32_t nregions; + struct rte_vhost_mem_region regions[0]; +}; + +struct rte_vhost_vring { + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint64_t log_guest_addr; + + int callfd; + int kickfd; + uint16_t size; +}; + +/** + * Device and vring operations. + */ +struct vhost_device_ops { + int (*new_device)(int vid); /**< Add device. */ + void (*destroy_device)(int vid); /**< Remove device. */ + + int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */ + + /** + * Features could be changed after the feature negotiation. + * For example, VHOST_F_LOG_ALL will be set/cleared at the + * start/end of live migration, respectively. This callback + * is used to inform the application on such change. + */ + int (*features_changed)(int vid, uint64_t features); + + void *reserved[4]; /**< Reserved for future extension */ +}; + +/** + * Convert guest physical address to host virtual address + * + * @param mem + * the guest memory regions + * @param gpa + * the guest physical address for querying + * @return + * the host virtual address on success, 0 on failure + */ +static inline uint64_t __attribute__((always_inline)) +rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa) +{ + struct rte_vhost_mem_region *reg; + uint32_t i; + + for (i = 0; i < mem->nregions; i++) { + reg = &mem->regions[i]; + if (gpa >= reg->guest_phys_addr && + gpa < reg->guest_phys_addr + reg->size) { + return gpa - reg->guest_phys_addr + + reg->host_user_addr; + } + } + + return 0; +} + +#define RTE_VHOST_NEED_LOG(features) ((features) & (1ULL << VHOST_F_LOG_ALL)) + +/** + * Log the memory write start with given address. + * + * This function only need be invoked when the live migration starts. + * Therefore, we won't need call it at all in the most of time. For + * making the performance impact be minimum, it's suggested to do a + * check before calling it: + * + * if (unlikely(RTE_VHOST_NEED_LOG(features))) + * rte_vhost_log_write(vid, addr, len); + * + * @param vid + * vhost device ID + * @param addr + * the starting address for write + * @param len + * the length to write + */ +void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len); + +/** + * Log the used ring update start at given offset. 
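+ * The offset is relative to the start of the used ring; internally it
+ * is added to the ring's guest physical address before logging.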
+ * + * Same as rte_vhost_log_write, it's suggested to do a check before + * calling it: + * + * if (unlikely(RTE_VHOST_NEED_LOG(features))) + * rte_vhost_log_used_vring(vid, vring_idx, offset, len); + * + * @param vid + * vhost device ID + * @param vring_idx + * the vring index + * @param offset + * the offset inside the used ring + * @param len + * the length to write + */ +void rte_vhost_log_used_vring(int vid, uint16_t vring_idx, + uint64_t offset, uint64_t len); + +int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable); + +/** + * Register vhost driver. path could be different for multiple + * instance support. + */ +int rte_vhost_driver_register(const char *path, uint64_t flags); + +/* Unregister vhost driver. This is only meaningful to vhost user. */ +int rte_vhost_driver_unregister(const char *path); + +/** + * Set the feature bits the vhost-user driver supports. + * + * @param path + * The vhost-user socket file path + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_set_features(const char *path, uint64_t features); + +/** + * Enable vhost-user driver features. + * + * Note that + * - the param @features should be a subset of the feature bits provided + * by rte_vhost_driver_set_features(). + * - it must be invoked before vhost-user negotiation starts. + * + * @param path + * The vhost-user socket file path + * @param features + * Features to enable + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_enable_features(const char *path, uint64_t features); + +/** + * Disable vhost-user driver features. + * + * The two notes at rte_vhost_driver_enable_features() also apply here. + * + * @param path + * The vhost-user socket file path + * @param features + * Features to disable + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_disable_features(const char *path, uint64_t features); + +/** + * Get the feature bits before feature negotiation. + * + * @param path + * The vhost-user socket file path + * @param features + * A pointer to store the queried feature bits + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_get_features(const char *path, uint64_t *features); + +/** + * Get the feature bits after negotiation + * + * @param vid + * Vhost device ID + * @param features + * A pointer to store the queried feature bits + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_negotiated_features(int vid, uint64_t *features); + +/* Register callbacks. */ +int rte_vhost_driver_callback_register(const char *path, + struct vhost_device_ops const * const ops); + +/** + * + * Start the vhost-user driver. + * + * This function triggers the vhost-user negotiation. + * + * @param path + * The vhost-user socket file path + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_start(const char *path); + +/** + * Get the MTU value of the device if set in QEMU. + * + * @param vid + * virtio-net device ID + * @param mtu + * The variable to store the MTU value + * + * @return + * 0: success + * -EAGAIN: device not yet started + * -ENOTSUP: device does not support MTU feature + */ +int rte_vhost_get_mtu(int vid, uint16_t *mtu); + +/** + * Get the numa node from which the virtio net device's memory + * is allocated. + * + * @param vid + * vhost device ID + * + * @return + * The numa node, -1 on failure + */ +int rte_vhost_get_numa_node(int vid); + +/** + * @deprecated + * Get the number of queues the device supports. 
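+ * (The value returned is equivalent to rte_vhost_get_vring_num() / 2.)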
+ * + * Note this function is deprecated, as it returns a queue pair number, + * which is vhost specific. Instead, rte_vhost_get_vring_num should + * be used. + * + * @param vid + * vhost device ID + * + * @return + * The number of queues, 0 on failure + */ +__rte_deprecated +uint32_t rte_vhost_get_queue_num(int vid); + +/** + * Get the number of vrings the device supports. + * + * @param vid + * vhost device ID + * + * @return + * The number of vrings, 0 on failure + */ +uint16_t rte_vhost_get_vring_num(int vid); + +/** + * Get the virtio net device's ifname, which is the vhost-user socket + * file path. + * + * @param vid + * vhost device ID + * @param buf + * The buffer to stored the queried ifname + * @param len + * The length of buf + * + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_ifname(int vid, char *buf, size_t len); + +/** + * Get how many avail entries are left in the queue + * + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index + * + * @return + * num of avail entires left + */ +uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id); + +struct rte_mbuf; +struct rte_mempool; +/** + * This function adds buffers to the virtio devices RX virtqueue. Buffers can + * be received from the physical port or from another virtual device. A packet + * count is returned to indicate the number of packets that were succesfully + * added to the RX queue. + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index in mq case + * @param pkts + * array to contain packets to be enqueued + * @param count + * packets num to be enqueued + * @return + * num of packets enqueued + */ +uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id, + struct rte_mbuf **pkts, uint16_t count); + +/** + * This function gets guest buffers from the virtio device TX virtqueue, + * construct host mbufs, copies guest buffer content to host mbufs and + * store them in pkts to be processed. + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index in mq case + * @param mbuf_pool + * mbuf_pool where host mbuf is allocated. + * @param pkts + * array to contain packets to be dequeued + * @param count + * packets num to be dequeued + * @return + * num of packets dequeued + */ +uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count); + +/** + * Get guest mem table: a list of memory regions. + * + * An rte_vhost_vhost_memory object will be allocated internaly, to hold the + * guest memory regions. Application should free it at destroy_device() + * callback. + * + * @param vid + * vhost device ID + * @param mem + * To store the returned mem regions + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem); + +/** + * Get guest vring info, including the vring address, vring size, etc. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param vring + * the structure to hold the requested vring info + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, + struct rte_vhost_vring *vring); + +#endif /* _RTE_VHOST_H_ */ diff --git a/lib/vhost/rte_vhost_17_05/socket.c b/lib/vhost/rte_vhost_17_05/socket.c new file mode 100644 index 000000000..66fd335c8 --- /dev/null +++ b/lib/vhost/rte_vhost_17_05/socket.c @@ -0,0 +1,797 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "fd_man.h" +#include "vhost.h" +#include "vhost_user.h" + + +TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection); + +/* + * Every time rte_vhost_driver_register() is invoked, an associated + * vhost_user_socket struct will be created. + */ +struct vhost_user_socket { + struct vhost_user_connection_list conn_list; + pthread_mutex_t conn_mutex; + char *path; + int socket_fd; + struct sockaddr_un un; + bool is_server; + bool reconnect; + bool dequeue_zero_copy; + + /* + * The "supported_features" indicates the feature bits the + * vhost driver supports. The "features" indicates the feature + * bits after the rte_vhost_driver_features_disable/enable(). + * It is also the final feature bits used for vhost-user + * features negotiation. + */ + uint64_t supported_features; + uint64_t features; + + struct vhost_device_ops const *notify_ops; +}; + +struct vhost_user_connection { + struct vhost_user_socket *vsocket; + int connfd; + int vid; + + TAILQ_ENTRY(vhost_user_connection) next; +}; + +#define MAX_VHOST_SOCKET 1024 +struct vhost_user { + struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET]; + struct fdset fdset; + int vsocket_cnt; + pthread_mutex_t mutex; +}; + +#define MAX_VIRTIO_BACKLOG 128 + +static void vhost_user_server_new_connection(int fd, void *data, int *remove); +static void vhost_user_read_cb(int fd, void *dat, int *remove); +static int create_unix_socket(struct vhost_user_socket *vsocket); +static int vhost_user_start_client(struct vhost_user_socket *vsocket); + +static struct vhost_user vhost_user = { + .fdset = { + .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} }, + .fd_mutex = PTHREAD_MUTEX_INITIALIZER, + .num = 0 + }, + .vsocket_cnt = 0, + .mutex = PTHREAD_MUTEX_INITIALIZER, +}; + +/* return bytes# of read on success or negative val on failure. 
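+ * Any fds received as SCM_RIGHTS ancillary data are copied into the
+ * fds array (at most fd_num of them).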
*/ +int +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + struct iovec iov; + struct msghdr msgh; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + ret = recvmsg(sockfd, &msgh, 0); + if (ret <= 0) { + RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n"); + return ret; + } + + if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n"); + return -1; + } + + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + if ((cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_RIGHTS)) { + memcpy(fds, CMSG_DATA(cmsg), fdsize); + break; + } + } + + return ret; +} + +int +send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + + struct iovec iov; + struct msghdr msgh; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + if (fds && fd_num > 0) { + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + cmsg = CMSG_FIRSTHDR(&msgh); + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), fds, fdsize); + } else { + msgh.msg_control = NULL; + msgh.msg_controllen = 0; + } + + do { + ret = sendmsg(sockfd, &msgh, 0); + } while (ret < 0 && errno == EINTR); + + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n"); + return ret; + } + + return ret; +} + +static void +vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) +{ + int vid; + size_t size; + struct vhost_user_connection *conn; + int ret; + + conn = malloc(sizeof(*conn)); + if (conn == NULL) { + close(fd); + return; + } + + vid = vhost_new_device(); + if (vid == -1) { + close(fd); + free(conn); + return; + } + + size = strnlen(vsocket->path, PATH_MAX); + vhost_set_ifname(vid, vsocket->path, size); + + if (vsocket->dequeue_zero_copy) + vhost_enable_dequeue_zero_copy(vid); + + RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid); + + conn->connfd = fd; + conn->vsocket = vsocket; + conn->vid = vid; + ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb, + NULL, conn); + if (ret < 0) { + conn->connfd = -1; + free(conn); + close(fd); + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add fd %d into vhost server fdset\n", + fd); + } + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next); + pthread_mutex_unlock(&vsocket->conn_mutex); +} + +/* call back when there is new vhost-user connection from client */ +static void +vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused) +{ + struct vhost_user_socket *vsocket = dat; + + fd = accept(fd, NULL, NULL); + if (fd < 0) + return; + + RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd); + vhost_user_add_connection(fd, vsocket); +} + +static void +vhost_user_read_cb(int connfd, void *dat, int *remove) +{ + struct vhost_user_connection *conn = dat; + struct vhost_user_socket *vsocket = conn->vsocket; + int ret; + + ret = vhost_user_msg_handler(conn->vid, connfd); + if (ret < 0) { + close(connfd); + *remove = 1; + vhost_destroy_device(conn->vid); + + 
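+		/* Unlink the connection from its socket's conn_list
+		 * before freeing it. */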
pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_REMOVE(&vsocket->conn_list, conn, next); + pthread_mutex_unlock(&vsocket->conn_mutex); + + free(conn); + + if (vsocket->reconnect) { + create_unix_socket(vsocket); + vhost_user_start_client(vsocket); + } + } +} + +static int +create_unix_socket(struct vhost_user_socket *vsocket) +{ + int fd; + struct sockaddr_un *un = &vsocket->un; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return -1; + RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n", + vsocket->is_server ? "server" : "client", fd); + + if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) { + RTE_LOG(ERR, VHOST_CONFIG, + "vhost-user: can't set nonblocking mode for socket, fd: " + "%d (%s)\n", fd, strerror(errno)); + close(fd); + return -1; + } + + memset(un, 0, sizeof(*un)); + un->sun_family = AF_UNIX; + strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path)); + un->sun_path[sizeof(un->sun_path) - 1] = '\0'; + + vsocket->socket_fd = fd; + return 0; +} + +static int +vhost_user_start_server(struct vhost_user_socket *vsocket) +{ + int ret; + int fd = vsocket->socket_fd; + const char *path = vsocket->path; + + ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un)); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to bind to %s: %s; remove it and try again\n", + path, strerror(errno)); + goto err; + } + RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path); + + ret = listen(fd, MAX_VIRTIO_BACKLOG); + if (ret < 0) + goto err; + + ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection, + NULL, vsocket); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add listen fd %d to vhost server fdset\n", + fd); + goto err; + } + + return 0; + +err: + close(fd); + return -1; +} + +struct vhost_user_reconnect { + struct sockaddr_un un; + int fd; + struct vhost_user_socket *vsocket; + + TAILQ_ENTRY(vhost_user_reconnect) next; +}; + +TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect); +struct vhost_user_reconnect_list { + struct vhost_user_reconnect_tailq_list head; + pthread_mutex_t mutex; +}; + +static struct vhost_user_reconnect_list reconn_list; +static pthread_t reconn_tid; + +static int +vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz) +{ + int ret, flags; + + ret = connect(fd, un, sz); + if (ret < 0 && errno != EISCONN) + return -1; + + flags = fcntl(fd, F_GETFL, 0); + if (flags < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "can't get flags for connfd %d\n", fd); + return -2; + } + if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) { + RTE_LOG(ERR, VHOST_CONFIG, + "can't disable nonblocking on fd %d\n", fd); + return -2; + } + return 0; +} + +static void * +vhost_user_client_reconnect(void *arg __rte_unused) +{ + int ret; + struct vhost_user_reconnect *reconn, *next; + + while (1) { + pthread_mutex_lock(&reconn_list.mutex); + + /* + * An equal implementation of TAILQ_FOREACH_SAFE, + * which does not exist on all platforms. 
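+		 * The next pointer is saved up front so that the current
+		 * entry may be removed and freed inside the loop body.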
+ */ + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + ret = vhost_user_connect_nonblock(reconn->fd, + (struct sockaddr *)&reconn->un, + sizeof(reconn->un)); + if (ret == -2) { + close(reconn->fd); + RTE_LOG(ERR, VHOST_CONFIG, + "reconnection for fd %d failed\n", + reconn->fd); + goto remove_fd; + } + if (ret == -1) + continue; + + RTE_LOG(INFO, VHOST_CONFIG, + "%s: connected\n", reconn->vsocket->path); + vhost_user_add_connection(reconn->fd, reconn->vsocket); +remove_fd: + TAILQ_REMOVE(&reconn_list.head, reconn, next); + free(reconn); + } + + pthread_mutex_unlock(&reconn_list.mutex); + sleep(1); + } + + return NULL; +} + +static int +vhost_user_reconnect_init(void) +{ + int ret; + + pthread_mutex_init(&reconn_list.mutex, NULL); + TAILQ_INIT(&reconn_list.head); + + ret = pthread_create(&reconn_tid, NULL, + vhost_user_client_reconnect, NULL); + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread"); + + return ret; +} + +static int +vhost_user_start_client(struct vhost_user_socket *vsocket) +{ + int ret; + int fd = vsocket->socket_fd; + const char *path = vsocket->path; + struct vhost_user_reconnect *reconn; + + ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un, + sizeof(vsocket->un)); + if (ret == 0) { + vhost_user_add_connection(fd, vsocket); + return 0; + } + + RTE_LOG(WARNING, VHOST_CONFIG, + "failed to connect to %s: %s\n", + path, strerror(errno)); + + if (ret == -2 || !vsocket->reconnect) { + close(fd); + return -1; + } + + RTE_LOG(INFO, VHOST_CONFIG, "%s: reconnecting...\n", path); + reconn = malloc(sizeof(*reconn)); + if (reconn == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for reconnect\n"); + close(fd); + return -1; + } + reconn->un = vsocket->un; + reconn->fd = fd; + reconn->vsocket = vsocket; + pthread_mutex_lock(&reconn_list.mutex); + TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next); + pthread_mutex_unlock(&reconn_list.mutex); + + return 0; +} + +static struct vhost_user_socket * +find_vhost_user_socket(const char *path) +{ + int i; + + for (i = 0; i < vhost_user.vsocket_cnt; i++) { + struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; + + if (!strcmp(vsocket->path, path)) + return vsocket; + } + + return NULL; +} + +int +rte_vhost_driver_disable_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->features &= ~features; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_enable_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) { + if ((vsocket->supported_features & features) != features) { + /* + * trying to enable features the driver doesn't + * support. + */ + pthread_mutex_unlock(&vhost_user.mutex); + return -1; + } + vsocket->features |= features; + } + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_set_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) { + vsocket->supported_features = features; + vsocket->features = features; + } + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 
0 : -1; +} + +int +rte_vhost_driver_get_features(const char *path, uint64_t *features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + *features = vsocket->features; + pthread_mutex_unlock(&vhost_user.mutex); + + if (!vsocket) { + RTE_LOG(ERR, VHOST_CONFIG, + "socket file %s is not registered yet.\n", path); + return -1; + } else { + return 0; + } +} + +/* + * Register a new vhost-user socket; here we could act as server + * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag + * is set. + */ +int +rte_vhost_driver_register(const char *path, uint64_t flags) +{ + int ret = -1; + struct vhost_user_socket *vsocket; + + if (!path) + return -1; + + pthread_mutex_lock(&vhost_user.mutex); + + if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) { + RTE_LOG(ERR, VHOST_CONFIG, + "error: the number of vhost sockets reaches maximum\n"); + goto out; + } + + vsocket = malloc(sizeof(struct vhost_user_socket)); + if (!vsocket) + goto out; + memset(vsocket, 0, sizeof(struct vhost_user_socket)); + vsocket->path = strdup(path); + TAILQ_INIT(&vsocket->conn_list); + pthread_mutex_init(&vsocket->conn_mutex, NULL); + vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY; + + /* + * Set the supported features correctly for the builtin vhost-user + * net driver. + * + * Applications know nothing about features the builtin virtio net + * driver (virtio_net.c) supports, thus it's not possible for them + * to invoke rte_vhost_driver_set_features(). To workaround it, here + * we set it unconditionally. If the application want to implement + * another vhost-user driver (say SCSI), it should call the + * rte_vhost_driver_set_features(), which will overwrite following + * two values. 
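+	 * (The two values in question are supported_features and
+	 * features, assigned just below.)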
+ */ + vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES; + vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES; + + if ((flags & RTE_VHOST_USER_CLIENT) != 0) { + vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT); + if (vsocket->reconnect && reconn_tid == 0) { + if (vhost_user_reconnect_init() < 0) { + free(vsocket->path); + free(vsocket); + goto out; + } + } + } else { + vsocket->is_server = true; + } + ret = create_unix_socket(vsocket); + if (ret < 0) { + free(vsocket->path); + free(vsocket); + goto out; + } + + vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket; + +out: + pthread_mutex_unlock(&vhost_user.mutex); + + return ret; +} + +static bool +vhost_user_remove_reconnect(struct vhost_user_socket *vsocket) +{ + int found = false; + struct vhost_user_reconnect *reconn, *next; + + pthread_mutex_lock(&reconn_list.mutex); + + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + if (reconn->vsocket == vsocket) { + TAILQ_REMOVE(&reconn_list.head, reconn, next); + close(reconn->fd); + free(reconn); + found = true; + break; + } + } + pthread_mutex_unlock(&reconn_list.mutex); + return found; +} + +/** + * Unregister the specified vhost socket + */ +int +rte_vhost_driver_unregister(const char *path) +{ + int i; + int count; + struct vhost_user_connection *conn, *next; + + pthread_mutex_lock(&vhost_user.mutex); + + for (i = 0; i < vhost_user.vsocket_cnt; i++) { + struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; + + if (!strcmp(vsocket->path, path)) { + if (vsocket->is_server) { + fdset_del(&vhost_user.fdset, vsocket->socket_fd); + close(vsocket->socket_fd); + unlink(path); + } else if (vsocket->reconnect) { + vhost_user_remove_reconnect(vsocket); + } + + pthread_mutex_lock(&vsocket->conn_mutex); + for (conn = TAILQ_FIRST(&vsocket->conn_list); + conn != NULL; + conn = next) { + next = TAILQ_NEXT(conn, next); + + fdset_del(&vhost_user.fdset, conn->connfd); + RTE_LOG(INFO, VHOST_CONFIG, + "free connfd = %d for device '%s'\n", + conn->connfd, path); + close(conn->connfd); + vhost_destroy_device(conn->vid); + TAILQ_REMOVE(&vsocket->conn_list, conn, next); + free(conn); + } + pthread_mutex_unlock(&vsocket->conn_mutex); + + free(vsocket->path); + free(vsocket); + + count = --vhost_user.vsocket_cnt; + vhost_user.vsockets[i] = vhost_user.vsockets[count]; + vhost_user.vsockets[count] = NULL; + pthread_mutex_unlock(&vhost_user.mutex); + + return 0; + } + } + pthread_mutex_unlock(&vhost_user.mutex); + + return -1; +} + +/* + * Register ops so that we can add/remove device to data core. + */ +int +rte_vhost_driver_callback_register(const char *path, + struct vhost_device_ops const * const ops) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->notify_ops = ops; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +struct vhost_device_ops const * +vhost_driver_callback_get(const char *path) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 
vsocket->notify_ops : NULL; +} + +int +rte_vhost_driver_start(const char *path) +{ + struct vhost_user_socket *vsocket; + static pthread_t fdset_tid; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + pthread_mutex_unlock(&vhost_user.mutex); + + if (!vsocket) + return -1; + + if (fdset_tid == 0) { + int ret = pthread_create(&fdset_tid, NULL, fdset_event_dispatch, + &vhost_user.fdset); + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, + "failed to create fdset handling thread"); + } + + if (vsocket->is_server) + return vhost_user_start_server(vsocket); + else + return vhost_user_start_client(vsocket); +} diff --git a/lib/vhost/rte_vhost_17_05/vhost.c b/lib/vhost/rte_vhost_17_05/vhost.c new file mode 100644 index 000000000..0b19d2eb8 --- /dev/null +++ b/lib/vhost/rte_vhost_17_05/vhost.c @@ -0,0 +1,477 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#ifdef RTE_LIBRTE_VHOST_NUMA +#include +#endif + +#include +#include +#include +#include +#include +#include + +#include "vhost.h" + +struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; + +struct virtio_net * +get_device(int vid) +{ + struct virtio_net *dev = vhost_devices[vid]; + + if (unlikely(!dev)) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) device not found.\n", vid); + } + + return dev; +} + +static void +cleanup_vq(struct vhost_virtqueue *vq, int destroy) +{ + if ((vq->callfd >= 0) && (destroy != 0)) + close(vq->callfd); + if (vq->kickfd >= 0) + close(vq->kickfd); +} + +/* + * Unmap any memory, close any file descriptors and + * free any memory owned by a device. + */ +void +cleanup_device(struct virtio_net *dev, int destroy) +{ + uint32_t i; + + vhost_backend_cleanup(dev); + + for (i = 0; i < dev->nr_vring; i++) + cleanup_vq(dev->virtqueue[i], destroy); +} + +/* + * Release virtqueues and device memory. 
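+ * Callers are expected to run cleanup_device() first so that any
+ * event fds are closed before the memory is freed.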
+ */ +static void +free_device(struct virtio_net *dev) +{ + uint32_t i; + struct vhost_virtqueue *vq; + + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; + + rte_free(vq->shadow_used_ring); + + rte_free(vq); + } + + rte_free(dev); +} + +static void +init_vring_queue(struct vhost_virtqueue *vq) +{ + memset(vq, 0, sizeof(struct vhost_virtqueue)); + + vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; + + /* Backends are set to -1 indicating an inactive device. */ + vq->backend = -1; + + /* + * always set the vq to enabled; this is to keep compatibility + * with the old QEMU, whereas there is no SET_VRING_ENABLE message. + */ + vq->enabled = 1; + + TAILQ_INIT(&vq->zmbuf_list); +} + +static void +reset_vring_queue(struct vhost_virtqueue *vq) +{ + int callfd; + + callfd = vq->callfd; + init_vring_queue(vq); + vq->callfd = callfd; +} + +int +alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx) +{ + struct vhost_virtqueue *vq; + + vq = rte_malloc(NULL, sizeof(struct vhost_virtqueue), 0); + if (vq == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for vring:%u.\n", vring_idx); + return -1; + } + + dev->virtqueue[vring_idx] = vq; + init_vring_queue(vq); + + dev->nr_vring += 1; + + return 0; +} + +/* + * Reset some variables in device structure, while keeping few + * others untouched, such as vid, ifname, nr_vring: they + * should be same unless the device is removed. + */ +void +reset_device(struct virtio_net *dev) +{ + uint32_t i; + + dev->features = 0; + dev->protocol_features = 0; + dev->flags = 0; + + for (i = 0; i < dev->nr_vring; i++) + reset_vring_queue(dev->virtqueue[i]); +} + +/* + * Invoked when there is a new vhost-user connection established (when + * there is a new virtio device being attached). + */ +int +vhost_new_device(void) +{ + struct virtio_net *dev; + int i; + + dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0); + if (dev == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for new dev.\n"); + return -1; + } + + for (i = 0; i < MAX_VHOST_DEVICE; i++) { + if (vhost_devices[i] == NULL) + break; + } + if (i == MAX_VHOST_DEVICE) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to find a free slot for new device.\n"); + rte_free(dev); + return -1; + } + + vhost_devices[i] = dev; + dev->vid = i; + + return i; +} + +/* + * Invoked when there is the vhost-user connection is broken (when + * the virtio device is being detached). + */ +void +vhost_destroy_device(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(vid); + } + + cleanup_device(dev, 1); + free_device(dev); + + vhost_devices[vid] = NULL; +} + +void +vhost_set_ifname(int vid, const char *if_name, unsigned int if_len) +{ + struct virtio_net *dev; + unsigned int len; + + dev = get_device(vid); + if (dev == NULL) + return; + + len = if_len > sizeof(dev->ifname) ? 
+ sizeof(dev->ifname) : if_len; + + strncpy(dev->ifname, if_name, len); + dev->ifname[sizeof(dev->ifname) - 1] = '\0'; +} + +void +vhost_enable_dequeue_zero_copy(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + dev->dequeue_zero_copy = 1; +} + +int +rte_vhost_get_mtu(int vid, uint16_t *mtu) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return -ENODEV; + + if (!(dev->flags & VIRTIO_DEV_READY)) + return -EAGAIN; + + if (!(dev->features & VIRTIO_NET_F_MTU)) + return -ENOTSUP; + + *mtu = dev->mtu; + + return 0; +} + +int +rte_vhost_get_numa_node(int vid) +{ +#ifdef RTE_LIBRTE_VHOST_NUMA + struct virtio_net *dev = get_device(vid); + int numa_node; + int ret; + + if (dev == NULL) + return -1; + + ret = get_mempolicy(&numa_node, NULL, 0, dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to query numa node: %d\n", vid, ret); + return -1; + } + + return numa_node; +#else + RTE_SET_USED(vid); + return -1; +#endif +} + +uint32_t +rte_vhost_get_queue_num(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return 0; + + return dev->nr_vring / 2; +} + +uint16_t +rte_vhost_get_vring_num(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return 0; + + return dev->nr_vring; +} + +int +rte_vhost_get_ifname(int vid, char *buf, size_t len) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + len = RTE_MIN(len, sizeof(dev->ifname)); + + strncpy(buf, dev->ifname, len); + buf[len - 1] = '\0'; + + return 0; +} + +int +rte_vhost_get_negotiated_features(int vid, uint64_t *features) +{ + struct virtio_net *dev; + + dev = get_device(vid); + if (!dev) + return -1; + + *features = dev->features; + return 0; +} + +int +rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem) +{ + struct virtio_net *dev; + struct rte_vhost_memory *m; + size_t size; + + dev = get_device(vid); + if (!dev) + return -1; + + size = dev->mem->nregions * sizeof(struct rte_vhost_mem_region); + m = malloc(size); + if (!m) + return -1; + + m->nregions = dev->mem->nregions; + memcpy(m->regions, dev->mem->regions, size); + *mem = m; + + return 0; +} + +int +rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, + struct rte_vhost_vring *vring) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + vring->desc = vq->desc; + vring->avail = vq->avail; + vring->used = vq->used; + vring->log_guest_addr = vq->log_guest_addr; + + vring->callfd = vq->callfd; + vring->kickfd = vq->kickfd; + vring->size = vq->size; + + return 0; +} + +uint16_t +rte_vhost_avail_entries(int vid, uint16_t queue_id) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return 0; + + vq = dev->virtqueue[queue_id]; + if (!vq->enabled) + return 0; + + return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx; +} + +int +rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + if (enable) { + RTE_LOG(ERR, VHOST_CONFIG, + "guest notification isn't supported.\n"); + return -1; + } + + dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY; + return 0; +} + +void +rte_vhost_log_write(int vid, uint64_t addr, uint64_t len) +{ + struct virtio_net *dev = 
get_device(vid); + + if (dev == NULL) + return; + + vhost_log_write(dev, addr, len); +} + +void +rte_vhost_log_used_vring(int vid, uint16_t vring_idx, + uint64_t offset, uint64_t len) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (dev == NULL) + return; + + if (vring_idx >= VHOST_MAX_VRING) + return; + vq = dev->virtqueue[vring_idx]; + if (!vq) + return; + + vhost_log_used_vring(dev, vq, offset, len); +} diff --git a/lib/vhost/rte_vhost_17_05/vhost.h b/lib/vhost/rte_vhost_17_05/vhost.h new file mode 100644 index 000000000..ddd8a9c43 --- /dev/null +++ b/lib/vhost/rte_vhost_17_05/vhost.h @@ -0,0 +1,315 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_NET_CDEV_H_ +#define _VHOST_NET_CDEV_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "rte_vhost.h" + +/* Used to indicate that the device is running on a data core */ +#define VIRTIO_DEV_RUNNING 1 +/* Used to indicate that the device is ready to operate */ +#define VIRTIO_DEV_READY 2 + +/* Backend value set by guest. */ +#define VIRTIO_DEV_STOPPED -1 + +#define BUF_VECTOR_MAX 256 + +/** + * Structure contains buffer address, length and descriptor index + * from vring to do scatter RX. + */ +struct buf_vector { + uint64_t buf_addr; + uint32_t buf_len; + uint32_t desc_idx; +}; + +/* + * A structure to hold some fields needed in zero copy code path, + * mainly for associating an mbuf with the right desc_idx. + */ +struct zcopy_mbuf { + struct rte_mbuf *mbuf; + uint32_t desc_idx; + uint16_t in_use; + + TAILQ_ENTRY(zcopy_mbuf) next; +}; +TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf); + +/** + * Structure contains variables relevant to RX/TX virtqueues. 
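+ * One instance is allocated per vring by alloc_vring_queue(), and it
+ * is reset (preserving only callfd) when the device is reset.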
+ */ +struct vhost_virtqueue { + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint32_t size; + + uint16_t last_avail_idx; + uint16_t last_used_idx; +#define VIRTIO_INVALID_EVENTFD (-1) +#define VIRTIO_UNINITIALIZED_EVENTFD (-2) + + /* Backend value to determine if device should started/stopped */ + int backend; + /* Used to notify the guest (trigger interrupt) */ + int callfd; + /* Currently unused as polling mode is enabled */ + int kickfd; + int enabled; + + /* Physical address of used ring, for logging */ + uint64_t log_guest_addr; + + uint16_t nr_zmbuf; + uint16_t zmbuf_size; + uint16_t last_zmbuf_idx; + struct zcopy_mbuf *zmbufs; + struct zcopy_mbuf_list zmbuf_list; + + struct vring_used_elem *shadow_used_ring; + uint16_t shadow_used_idx; +} __rte_cache_aligned; + +/* Old kernels have no such macros defined */ +#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE + #define VIRTIO_NET_F_GUEST_ANNOUNCE 21 +#endif + +#ifndef VIRTIO_NET_F_MQ + #define VIRTIO_NET_F_MQ 22 +#endif + +#define VHOST_MAX_VRING 0x100 +#define VHOST_MAX_QUEUE_PAIRS 0x80 + +#ifndef VIRTIO_NET_F_MTU + #define VIRTIO_NET_F_MTU 3 +#endif + +/* + * Define virtio 1.0 for older kernels + */ +#ifndef VIRTIO_F_VERSION_1 + #define VIRTIO_F_VERSION_1 32 +#endif + +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +/* Features supported by this builtin vhost-user net driver. */ +#define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ + (1ULL << VIRTIO_NET_F_CTRL_VQ) | \ + (1ULL << VIRTIO_NET_F_CTRL_RX) | \ + (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \ + (1ULL << VIRTIO_NET_F_MQ) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VHOST_F_LOG_ALL) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO4) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO6) | \ + (1ULL << VIRTIO_NET_F_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ + (1ULL << VIRTIO_NET_F_MTU)) + + +struct guest_page { + uint64_t guest_phys_addr; + uint64_t host_phys_addr; + uint64_t size; +}; + +/** + * Device structure contains all configuration information relating + * to the device. + */ +struct virtio_net { + /* Frontend (QEMU) memory and memory region information */ + struct rte_vhost_memory *mem; + uint64_t features; + uint64_t protocol_features; + int vid; + uint32_t flags; + uint16_t vhost_hlen; + /* to tell if we need broadcast rarp packet */ + rte_atomic16_t broadcast_rarp; + uint32_t nr_vring; + int dequeue_zero_copy; + struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; +#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? 
PATH_MAX : IFNAMSIZ) + char ifname[IF_NAME_SZ]; + uint64_t log_size; + uint64_t log_base; + uint64_t log_addr; + struct ether_addr mac; + uint16_t mtu; + + struct vhost_device_ops const *notify_ops; + + uint32_t nr_guest_pages; + uint32_t max_guest_pages; + struct guest_page *guest_pages; +} __rte_cache_aligned; + + +#define VHOST_LOG_PAGE 4096 + +static inline void __attribute__((always_inline)) +vhost_log_page(uint8_t *log_base, uint64_t page) +{ + log_base[page / 8] |= 1 << (page % 8); +} + +static inline void __attribute__((always_inline)) +vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) +{ + uint64_t page; + + if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) || + !dev->log_base || !len)) + return; + + if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) + return; + + /* To make sure guest memory updates are committed before logging */ + rte_smp_wmb(); + + page = addr / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < addr + len) { + vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); + page += 1; + } +} + +static inline void __attribute__((always_inline)) +vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t offset, uint64_t len) +{ + vhost_log_write(dev, vq->log_guest_addr + offset, len); +} + +/* Macros for printing using RTE_LOG */ +#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 +#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 + +#ifdef RTE_LIBRTE_VHOST_DEBUG +#define VHOST_MAX_PRINT_BUFF 6072 +#define LOG_LEVEL RTE_LOG_DEBUG +#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args) +#define PRINT_PACKET(device, addr, size, header) do { \ + char *pkt_addr = (char *)(addr); \ + unsigned int index; \ + char packet[VHOST_MAX_PRINT_BUFF]; \ + \ + if ((header)) \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \ + else \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \ + for (index = 0; index < (size); index++) { \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \ + "%02hhx ", pkt_addr[index]); \ + } \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \ + \ + LOG_DEBUG(VHOST_DATA, "%s", packet); \ +} while (0) +#else +#define LOG_LEVEL RTE_LOG_INFO +#define LOG_DEBUG(log_type, fmt, args...) 
do {} while (0) +#define PRINT_PACKET(device, addr, size, header) do {} while (0) +#endif + +extern uint64_t VHOST_FEATURES; +#define MAX_VHOST_DEVICE 1024 +extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; + +/* Convert guest physical address to host physical address */ +static inline phys_addr_t __attribute__((always_inline)) +gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size) +{ + uint32_t i; + struct guest_page *page; + + for (i = 0; i < dev->nr_guest_pages; i++) { + page = &dev->guest_pages[i]; + + if (gpa >= page->guest_phys_addr && + gpa + size < page->guest_phys_addr + page->size) { + return gpa - page->guest_phys_addr + + page->host_phys_addr; + } + } + + return 0; +} + +struct virtio_net *get_device(int vid); + +int vhost_new_device(void); +void cleanup_device(struct virtio_net *dev, int destroy); +void reset_device(struct virtio_net *dev); +void vhost_destroy_device(int); + +int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx); + +void vhost_set_ifname(int, const char *if_name, unsigned int if_len); +void vhost_enable_dequeue_zero_copy(int vid); + +struct vhost_device_ops const *vhost_driver_callback_get(const char *path); + +/* + * Backend-specific cleanup. + * + * TODO: fix it; we have one backend now + */ +void vhost_backend_cleanup(struct virtio_net *dev); + +#endif /* _VHOST_NET_CDEV_H_ */ diff --git a/lib/vhost/rte_vhost_17_05/vhost_user.c b/lib/vhost/rte_vhost_17_05/vhost_user.c new file mode 100644 index 000000000..6cafb7500 --- /dev/null +++ b/lib/vhost/rte_vhost_17_05/vhost_user.c @@ -0,0 +1,1107 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef RTE_LIBRTE_VHOST_NUMA +#include +#endif + +#include +#include +#include + +#include "vhost.h" +#include "vhost_user.h" + +#define VIRTIO_MIN_MTU 68 +#define VIRTIO_MAX_MTU 65535 + +static const char *vhost_message_str[VHOST_USER_MAX] = { + [VHOST_USER_NONE] = "VHOST_USER_NONE", + [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", + [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", + [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", + [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", + [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", + [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", + [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", + [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", + [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", + [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", + [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", + [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", + [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", + [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", + [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", + [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", + [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", + [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", + [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", + [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU", +}; + +static uint64_t +get_blk_size(int fd) +{ + struct stat stat; + int ret; + + ret = fstat(fd, &stat); + return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; +} + +static void +free_mem_region(struct virtio_net *dev) +{ + uint32_t i; + struct rte_vhost_mem_region *reg; + + if (!dev || !dev->mem) + return; + + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + if (reg->host_user_addr) { + munmap(reg->mmap_addr, reg->mmap_size); + close(reg->fd); + } + } +} + +void +vhost_backend_cleanup(struct virtio_net *dev) +{ + if (dev->mem) { + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + } + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + dev->log_addr = 0; + } +} + +/* + * This function just returns success at the moment unless + * the device hasn't been initialised. + */ +static int +vhost_user_set_owner(void) +{ + return 0; +} + +static int +vhost_user_reset_owner(struct virtio_net *dev) +{ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + cleanup_device(dev, 0); + reset_device(dev); + return 0; +} + +/* + * The features that we support are requested. + */ +static uint64_t +vhost_user_get_features(struct virtio_net *dev) +{ + uint64_t features = 0; + + rte_vhost_driver_get_features(dev->ifname, &features); + return features; +} + +/* + * We receive the negotiated features supported by us and the virtio device. 
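+ *
+ * A hypothetical caller-side check of the outcome (the helper and the
+ * feature bit exist in this library; the snippet itself is only a
+ * sketch):
+ *
+ *   uint64_t feats;
+ *   if (rte_vhost_get_negotiated_features(vid, &feats) == 0 &&
+ *       (feats & (1ULL << VIRTIO_NET_F_MRG_RXBUF)))
+ *       ; /* mergeable RX buffers were negotiated */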
+ */ +static int +vhost_user_set_features(struct virtio_net *dev, uint64_t features) +{ + uint64_t vhost_features = 0; + + rte_vhost_driver_get_features(dev->ifname, &vhost_features); + if (features & ~vhost_features) + return -1; + + if ((dev->flags & VIRTIO_DEV_RUNNING) && dev->features != features) { + if (dev->notify_ops->features_changed) + dev->notify_ops->features_changed(dev->vid, features); + } + + dev->features = features; + if (dev->features & + ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) { + dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } else { + dev->vhost_hlen = sizeof(struct virtio_net_hdr); + } + LOG_DEBUG(VHOST_CONFIG, + "(%d) mergeable RX buffers %s, virtio 1 %s\n", + dev->vid, + (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off", + (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off"); + + return 0; +} + +/* + * The virtio device sends us the size of the descriptor ring. + */ +static int +vhost_user_set_vring_num(struct virtio_net *dev, + struct vhost_vring_state *state) +{ + struct vhost_virtqueue *vq = dev->virtqueue[state->index]; + + vq->size = state->num; + + if (dev->dequeue_zero_copy) { + vq->nr_zmbuf = 0; + vq->last_zmbuf_idx = 0; + vq->zmbuf_size = vq->size; + vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size * + sizeof(struct zcopy_mbuf), 0); + if (vq->zmbufs == NULL) { + RTE_LOG(WARNING, VHOST_CONFIG, + "failed to allocate mem for zero copy; " + "zero copy is force disabled\n"); + dev->dequeue_zero_copy = 0; + } + } + + vq->shadow_used_ring = rte_malloc(NULL, + vq->size * sizeof(struct vring_used_elem), + RTE_CACHE_LINE_SIZE); + if (!vq->shadow_used_ring) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for shadow used ring.\n"); + return -1; + } + + return 0; +} + +/* + * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the + * same numa node as the memory of vring descriptor. 
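+ *
+ * The node backing an address is queried the same way this function
+ * does below (illustrative only):
+ *
+ *   int node;
+ *   if (get_mempolicy(&node, NULL, 0, vq->desc,
+ *                     MPOL_F_NODE | MPOL_F_ADDR) == 0)
+ *       ; /* 'node' is the NUMA node holding the descriptor ring */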
+ */ +#ifdef RTE_LIBRTE_VHOST_NUMA +static struct virtio_net* +numa_realloc(struct virtio_net *dev, int index) +{ + int oldnode, newnode; + struct virtio_net *old_dev; + struct vhost_virtqueue *old_vq, *vq; + int ret; + + old_dev = dev; + vq = old_vq = dev->virtqueue[index]; + + ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc, + MPOL_F_NODE | MPOL_F_ADDR); + + /* check if we need to reallocate vq */ + ret |= get_mempolicy(&oldnode, NULL, 0, old_vq, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "Unable to get vq numa information.\n"); + return dev; + } + if (oldnode != newnode) { + RTE_LOG(INFO, VHOST_CONFIG, + "reallocate vq from %d to %d node\n", oldnode, newnode); + vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode); + if (!vq) + return dev; + + memcpy(vq, old_vq, sizeof(*vq) * VIRTIO_QNUM); + rte_free(old_vq); + } + + /* check if we need to reallocate dev */ + ret = get_mempolicy(&oldnode, NULL, 0, old_dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "Unable to get dev numa information.\n"); + goto out; + } + if (oldnode != newnode) { + RTE_LOG(INFO, VHOST_CONFIG, + "reallocate dev from %d to %d node\n", + oldnode, newnode); + dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode); + if (!dev) { + dev = old_dev; + goto out; + } + + memcpy(dev, old_dev, sizeof(*dev)); + rte_free(old_dev); + } + +out: + dev->virtqueue[index] = vq; + vhost_devices[dev->vid] = dev; + + return dev; +} +#else +static struct virtio_net* +numa_realloc(struct virtio_net *dev, int index __rte_unused) +{ + return dev; +} +#endif + +/* + * Converts QEMU virtual address to Vhost virtual address. This function is + * used to convert the ring addresses to our address space. + */ +static uint64_t +qva_to_vva(struct virtio_net *dev, uint64_t qva) +{ + struct rte_vhost_mem_region *reg; + uint32_t i; + + /* Find the region where the address lives. */ + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + + if (qva >= reg->guest_user_addr && + qva < reg->guest_user_addr + reg->size) { + return qva - reg->guest_user_addr + + reg->host_user_addr; + } + } + + return 0; +} + +/* + * The virtio device sends us the desc, used and avail ring addresses. + * This function then converts these to our address space. + */ +static int +vhost_user_set_vring_addr(struct virtio_net *dev, struct vhost_vring_addr *addr) +{ + struct vhost_virtqueue *vq; + + if (dev->mem == NULL) + return -1; + + /* addr->index refers to the queue index. The txq 1, rxq is 0. */ + vq = dev->virtqueue[addr->index]; + + /* The addresses are converted from QEMU virtual to Vhost virtual. 
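+ * For the region containing the address this is plain offset
+ * arithmetic, as qva_to_vva() above computes:
+ *
+ *   vva = qva - reg->guest_user_addr + reg->host_user_addr;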
*/ + vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev, + addr->desc_user_addr); + if (vq->desc == 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find desc ring address.\n", + dev->vid); + return -1; + } + + dev = numa_realloc(dev, addr->index); + vq = dev->virtqueue[addr->index]; + + vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev, + addr->avail_user_addr); + if (vq->avail == 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find avail ring address.\n", + dev->vid); + return -1; + } + + vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev, + addr->used_user_addr); + if (vq->used == 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find used ring address.\n", + dev->vid); + return -1; + } + + if (vq->last_used_idx != vq->used->idx) { + RTE_LOG(WARNING, VHOST_CONFIG, + "last_used_idx (%u) and vq->used->idx (%u) mismatches; " + "some packets maybe resent for Tx and dropped for Rx\n", + vq->last_used_idx, vq->used->idx); + vq->last_used_idx = vq->used->idx; + vq->last_avail_idx = vq->used->idx; + } + + vq->log_guest_addr = addr->log_guest_addr; + + LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n", + dev->vid, vq->desc); + LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n", + dev->vid, vq->avail); + LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n", + dev->vid, vq->used); + LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n", + dev->vid, vq->log_guest_addr); + + return 0; +} + +/* + * The virtio device sends us the available ring last used index. + */ +static int +vhost_user_set_vring_base(struct virtio_net *dev, + struct vhost_vring_state *state) +{ + dev->virtqueue[state->index]->last_used_idx = state->num; + dev->virtqueue[state->index]->last_avail_idx = state->num; + + return 0; +} + +static void +add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, + uint64_t host_phys_addr, uint64_t size) +{ + struct guest_page *page, *last_page; + + if (dev->nr_guest_pages == dev->max_guest_pages) { + dev->max_guest_pages *= 2; + dev->guest_pages = realloc(dev->guest_pages, + dev->max_guest_pages * sizeof(*page)); + } + + if (dev->nr_guest_pages > 0) { + last_page = &dev->guest_pages[dev->nr_guest_pages - 1]; + /* merge if the two pages are continuous */ + if (host_phys_addr == last_page->host_phys_addr + + last_page->size) { + last_page->size += size; + return; + } + } + + page = &dev->guest_pages[dev->nr_guest_pages++]; + page->guest_phys_addr = guest_phys_addr; + page->host_phys_addr = host_phys_addr; + page->size = size; +} + +static void +add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, + uint64_t page_size) +{ + uint64_t reg_size = reg->size; + uint64_t host_user_addr = reg->host_user_addr; + uint64_t guest_phys_addr = reg->guest_phys_addr; + uint64_t host_phys_addr; + uint64_t size; + + host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr); + size = page_size - (guest_phys_addr & (page_size - 1)); + size = RTE_MIN(size, reg_size); + + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); + host_user_addr += size; + guest_phys_addr += size; + reg_size -= size; + + while (reg_size > 0) { + size = RTE_MIN(reg_size, page_size); + host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t) + host_user_addr); + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); + + host_user_addr += size; + guest_phys_addr += size; + reg_size -= size; + } +} + +#ifdef RTE_LIBRTE_VHOST_DEBUG +/* TODO: enable it only in debug mode? 
*/ +static void +dump_guest_pages(struct virtio_net *dev) +{ + uint32_t i; + struct guest_page *page; + + for (i = 0; i < dev->nr_guest_pages; i++) { + page = &dev->guest_pages[i]; + + RTE_LOG(INFO, VHOST_CONFIG, + "guest physical page region %u\n" + "\t guest_phys_addr: %" PRIx64 "\n" + "\t host_phys_addr : %" PRIx64 "\n" + "\t size : %" PRIx64 "\n", + i, + page->guest_phys_addr, + page->host_phys_addr, + page->size); + } +} +#else +#define dump_guest_pages(dev) +#endif + +static int +vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct VhostUserMemory memory = pmsg->payload.memory; + struct rte_vhost_mem_region *reg; + void *mmap_addr; + uint64_t mmap_size; + uint64_t mmap_offset; + uint64_t alignment; + uint32_t i; + int fd; + + if (dev->mem) { + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + } + + dev->nr_guest_pages = 0; + if (!dev->guest_pages) { + dev->max_guest_pages = 8; + dev->guest_pages = malloc(dev->max_guest_pages * + sizeof(struct guest_page)); + } + + dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) + + sizeof(struct rte_vhost_mem_region) * memory.nregions, 0); + if (dev->mem == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to allocate memory for dev->mem\n", + dev->vid); + return -1; + } + dev->mem->nregions = memory.nregions; + + for (i = 0; i < memory.nregions; i++) { + fd = pmsg->fds[i]; + reg = &dev->mem->regions[i]; + + reg->guest_phys_addr = memory.regions[i].guest_phys_addr; + reg->guest_user_addr = memory.regions[i].userspace_addr; + reg->size = memory.regions[i].memory_size; + reg->fd = fd; + + mmap_offset = memory.regions[i].mmap_offset; + mmap_size = reg->size + mmap_offset; + + /* mmap() without flag of MAP_ANONYMOUS, should be called + * with length argument aligned with hugepagesz at older + * longterm version Linux, like 2.6.32 and 3.2.72, or + * mmap() will fail with EINVAL. + * + * to avoid failure, make sure in caller to keep length + * aligned. 
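+ *
+ * The round-up is plain arithmetic; e.g. with 2 MB hugepages
+ * (alignment = 0x200000) a 3 MB mapping request maps 4 MB:
+ *
+ *   mmap_size = RTE_ALIGN_CEIL(0x300000, 0x200000); /* == 0x400000 */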
+ */ + alignment = get_blk_size(fd); + if (alignment == (uint64_t)-1) { + RTE_LOG(ERR, VHOST_CONFIG, + "couldn't get hugepage size through fstat\n"); + goto err_mmap; + } + mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); + + mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + + if (mmap_addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, + "mmap region %u failed.\n", i); + goto err_mmap; + } + + reg->mmap_addr = mmap_addr; + reg->mmap_size = mmap_size; + reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + + mmap_offset; + + if (dev->dequeue_zero_copy) + add_guest_pages(dev, reg, alignment); + + RTE_LOG(INFO, VHOST_CONFIG, + "guest memory region %u, size: 0x%" PRIx64 "\n" + "\t guest physical addr: 0x%" PRIx64 "\n" + "\t guest virtual addr: 0x%" PRIx64 "\n" + "\t host virtual addr: 0x%" PRIx64 "\n" + "\t mmap addr : 0x%" PRIx64 "\n" + "\t mmap size : 0x%" PRIx64 "\n" + "\t mmap align: 0x%" PRIx64 "\n" + "\t mmap off : 0x%" PRIx64 "\n", + i, reg->size, + reg->guest_phys_addr, + reg->guest_user_addr, + reg->host_user_addr, + (uint64_t)(uintptr_t)mmap_addr, + mmap_size, + alignment, + mmap_offset); + } + + dump_guest_pages(dev); + + return 0; + +err_mmap: + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + return -1; +} + +static int +vq_is_ready(struct vhost_virtqueue *vq) +{ + return vq && vq->desc && + vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && + vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD; +} + +static int +virtio_is_ready(struct virtio_net *dev) +{ + struct vhost_virtqueue *vq; + uint32_t i; + + if (dev->nr_vring == 0) + return 0; + + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; + + if (!vq_is_ready(vq)) + return 0; + } + + RTE_LOG(INFO, VHOST_CONFIG, + "virtio is now ready for processing.\n"); + return 1; +} + +static void +vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + struct vhost_virtqueue *vq; + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.fd = VIRTIO_INVALID_EVENTFD; + else + file.fd = pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring call idx:%d file:%d\n", file.index, file.fd); + + vq = dev->virtqueue[file.index]; + if (vq->callfd >= 0) + close(vq->callfd); + + vq->callfd = file.fd; +} + +static void +vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + struct vhost_virtqueue *vq; + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.fd = VIRTIO_INVALID_EVENTFD; + else + file.fd = pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring kick idx:%d file:%d\n", file.index, file.fd); + + vq = dev->virtqueue[file.index]; + if (vq->kickfd >= 0) + close(vq->kickfd); + vq->kickfd = file.fd; +} + +static void +free_zmbufs(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *zmbuf, *next; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + rte_pktmbuf_free(zmbuf->mbuf); + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + } + + rte_free(vq->zmbufs); +} + +/* + * when virtio is stopped, qemu will send us the GET_VRING_BASE message. + */ +static int +vhost_user_get_vring_base(struct virtio_net *dev, + struct vhost_vring_state *state) +{ + struct vhost_virtqueue *vq = dev->virtqueue[state->index]; + + /* We have to stop the queue (virtio) if it is running. 
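+ *
+ * The teardown mirrors the startup done in vhost_user_msg_handler():
+ * drop VIRTIO_DEV_RUNNING first, then let the backend drain in-flight
+ * work via its destroy_device() callback (the exact sequence below):
+ *
+ *   dev->flags &= ~VIRTIO_DEV_RUNNING;
+ *   dev->notify_ops->destroy_device(dev->vid);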
*/ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + dev->flags &= ~VIRTIO_DEV_READY; + + /* Here we are safe to get the last used index */ + state->num = vq->last_used_idx; + + RTE_LOG(INFO, VHOST_CONFIG, + "vring base idx:%d file:%d\n", state->index, state->num); + /* + * Based on current qemu vhost-user implementation, this message is + * sent and only sent in vhost_vring_stop. + * TODO: cleanup the vring, it isn't usable since here. + */ + if (vq->kickfd >= 0) + close(vq->kickfd); + + vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + + if (dev->dequeue_zero_copy) + free_zmbufs(vq); + rte_free(vq->shadow_used_ring); + vq->shadow_used_ring = NULL; + + return 0; +} + +/* + * when virtio queues are ready to work, qemu will send us to + * enable the virtio queue pair. + */ +static int +vhost_user_set_vring_enable(struct virtio_net *dev, + struct vhost_vring_state *state) +{ + int enable = (int)state->num; + + RTE_LOG(INFO, VHOST_CONFIG, + "set queue enable: %d to qp idx: %d\n", + enable, state->index); + + if (dev->notify_ops->vring_state_changed) + dev->notify_ops->vring_state_changed(dev->vid, state->index, enable); + + dev->virtqueue[state->index]->enabled = enable; + + return 0; +} + +static void +vhost_user_set_protocol_features(struct virtio_net *dev, + uint64_t protocol_features) +{ + if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES) + return; + + dev->protocol_features = protocol_features; +} + +static int +vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + int fd = msg->fds[0]; + uint64_t size, off; + void *addr; + + if (fd < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd); + return -1; + } + + if (msg->size != sizeof(VhostUserLog)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid log base msg size: %"PRId32" != %d\n", + msg->size, (int)sizeof(VhostUserLog)); + return -1; + } + + size = msg->payload.log.mmap_size; + off = msg->payload.log.mmap_offset; + RTE_LOG(INFO, VHOST_CONFIG, + "log mmap size: %"PRId64", offset: %"PRId64"\n", + size, off); + + /* + * mmap from 0 to workaround a hugepage mmap bug: mmap will + * fail when offset is not page size aligned. + */ + addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n"); + return -1; + } + + /* + * Free previously mapped log memory on occasionally + * multiple VHOST_USER_SET_LOG_BASE. + */ + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + } + dev->log_addr = (uint64_t)(uintptr_t)addr; + dev->log_base = dev->log_addr + off; + dev->log_size = size; + + return 0; +} + +/* + * An rarp packet is constructed and broadcasted to notify switches about + * the new location of the migrated VM, so that packets from outside will + * not be lost after migration. + * + * However, we don't actually "send" a rarp packet here, instead, we set + * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. + */ +static int +vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + uint8_t *mac = (uint8_t *)&msg->payload.u64; + + RTE_LOG(DEBUG, VHOST_CONFIG, + ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); + memcpy(dev->mac.addr_bytes, mac, 6); + + /* + * Set the flag to inject a RARP broadcast packet at + * rte_vhost_dequeue_burst(). + * + * rte_smp_wmb() is for making sure the mac is copied + * before the flag is set. 
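+ *
+ * The publish pattern, in full (the dequeue side is assumed to pair it
+ * with a matching read barrier before it reads the mac):
+ *
+ *   memcpy(dev->mac.addr_bytes, mac, 6);        /* payload first */
+ *   rte_smp_wmb();
+ *   rte_atomic16_set(&dev->broadcast_rarp, 1);  /* then the flag */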
+ */ + rte_smp_wmb(); + rte_atomic16_set(&dev->broadcast_rarp, 1); + + return 0; +} + +static int +vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + if (msg->payload.u64 < VIRTIO_MIN_MTU || + msg->payload.u64 > VIRTIO_MAX_MTU) { + RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n", + msg->payload.u64); + + return -1; + } + + dev->mtu = msg->payload.u64; + + return 0; +} + +/* return bytes# of read on success or negative val on failure. */ +static int +read_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, + msg->fds, VHOST_MEMORY_MAX_NREGIONS); + if (ret <= 0) + return ret; + + if (msg && msg->size) { + if (msg->size > sizeof(msg->payload)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid msg size: %d\n", msg->size); + return -1; + } + ret = read(sockfd, &msg->payload, msg->size); + if (ret <= 0) + return ret; + if (ret != (int)msg->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "read control message failed\n"); + return -1; + } + } + + return ret; +} + +static int +send_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + if (!msg) + return 0; + + msg->flags &= ~VHOST_USER_VERSION_MASK; + msg->flags &= ~VHOST_USER_NEED_REPLY; + msg->flags |= VHOST_USER_VERSION; + msg->flags |= VHOST_USER_REPLY_MASK; + + ret = send_fd_message(sockfd, (char *)msg, + VHOST_USER_HDR_SIZE + msg->size, NULL, 0); + + return ret; +} + +/* + * Allocate a queue pair if it hasn't been allocated yet + */ +static int +vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg) +{ + uint16_t vring_idx; + + switch (msg->request) { + case VHOST_USER_SET_VRING_KICK: + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_VRING_ERR: + vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + break; + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ENABLE: + vring_idx = msg->payload.state.index; + break; + case VHOST_USER_SET_VRING_ADDR: + vring_idx = msg->payload.addr.index; + break; + default: + return 0; + } + + if (vring_idx >= VHOST_MAX_VRING) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid vring index: %u\n", vring_idx); + return -1; + } + + if (dev->virtqueue[vring_idx]) + return 0; + + return alloc_vring_queue(dev, vring_idx); +} + +int +vhost_user_msg_handler(int vid, int fd) +{ + struct virtio_net *dev; + struct VhostUserMsg msg; + int ret; + + dev = get_device(vid); + if (dev == NULL) + return -1; + + if (!dev->notify_ops) { + dev->notify_ops = vhost_driver_callback_get(dev->ifname); + if (!dev->notify_ops) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to get callback ops for driver %s\n", + dev->ifname); + return -1; + } + } + + ret = read_vhost_message(fd, &msg); + if (ret <= 0 || msg.request >= VHOST_USER_MAX) { + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, + "vhost read message failed\n"); + else if (ret == 0) + RTE_LOG(INFO, VHOST_CONFIG, + "vhost peer closed\n"); + else + RTE_LOG(ERR, VHOST_CONFIG, + "vhost read incorrect message\n"); + + return -1; + } + + ret = 0; + RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n", + vhost_message_str[msg.request]); + + ret = vhost_user_check_and_alloc_queue_pair(dev, &msg); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to alloc queue\n"); + return -1; + } + + switch (msg.request) { + case VHOST_USER_GET_FEATURES: + msg.payload.u64 = vhost_user_get_features(dev); + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_FEATURES: + 
vhost_user_set_features(dev, msg.payload.u64); + break; + + case VHOST_USER_GET_PROTOCOL_FEATURES: + msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_PROTOCOL_FEATURES: + vhost_user_set_protocol_features(dev, msg.payload.u64); + break; + + case VHOST_USER_SET_OWNER: + vhost_user_set_owner(); + break; + case VHOST_USER_RESET_OWNER: + vhost_user_reset_owner(dev); + break; + + case VHOST_USER_SET_MEM_TABLE: + ret = vhost_user_set_mem_table(dev, &msg); + break; + + case VHOST_USER_SET_LOG_BASE: + vhost_user_set_log_base(dev, &msg); + + /* it needs a reply */ + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_LOG_FD: + close(msg.fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); + break; + + case VHOST_USER_SET_VRING_NUM: + vhost_user_set_vring_num(dev, &msg.payload.state); + break; + case VHOST_USER_SET_VRING_ADDR: + vhost_user_set_vring_addr(dev, &msg.payload.addr); + break; + case VHOST_USER_SET_VRING_BASE: + vhost_user_set_vring_base(dev, &msg.payload.state); + break; + + case VHOST_USER_GET_VRING_BASE: + vhost_user_get_vring_base(dev, &msg.payload.state); + msg.size = sizeof(msg.payload.state); + send_vhost_message(fd, &msg); + break; + + case VHOST_USER_SET_VRING_KICK: + vhost_user_set_vring_kick(dev, &msg); + break; + case VHOST_USER_SET_VRING_CALL: + vhost_user_set_vring_call(dev, &msg); + break; + + case VHOST_USER_SET_VRING_ERR: + if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) + close(msg.fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); + break; + + case VHOST_USER_GET_QUEUE_NUM: + msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + + case VHOST_USER_SET_VRING_ENABLE: + vhost_user_set_vring_enable(dev, &msg.payload.state); + break; + case VHOST_USER_SEND_RARP: + vhost_user_send_rarp(dev, &msg); + break; + + case VHOST_USER_NET_SET_MTU: + ret = vhost_user_net_set_mtu(dev, &msg); + break; + + default: + ret = -1; + break; + + } + + if (msg.flags & VHOST_USER_NEED_REPLY) { + msg.payload.u64 = !!ret; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + } + + if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) { + dev->flags |= VIRTIO_DEV_READY; + + if (!(dev->flags & VIRTIO_DEV_RUNNING)) { + if (dev->dequeue_zero_copy) { + RTE_LOG(INFO, VHOST_CONFIG, + "dequeue zero copy is enabled\n"); + } + + if (dev->notify_ops->new_device(dev->vid) == 0) + dev->flags |= VIRTIO_DEV_RUNNING; + } + } + + return 0; +} diff --git a/lib/vhost/rte_vhost_17_05/vhost_user.h b/lib/vhost/rte_vhost_17_05/vhost_user.h new file mode 100644 index 000000000..2ba22dbb0 --- /dev/null +++ b/lib/vhost/rte_vhost_17_05/vhost_user.h @@ -0,0 +1,134 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_NET_USER_H +#define _VHOST_NET_USER_H + +#include +#include + +#include "rte_vhost.h" + +/* refer to hw/virtio/vhost-user.c */ + +#define VHOST_MEMORY_MAX_NREGIONS 8 + +#define VHOST_USER_PROTOCOL_F_MQ 0 +#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 +#define VHOST_USER_PROTOCOL_F_RARP 2 +#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 +#define VHOST_USER_PROTOCOL_F_NET_MTU 4 + +#define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ + (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\ + (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \ + (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ + (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU)) + +typedef enum VhostUserRequest { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_NET_SET_MTU = 20, + VHOST_USER_MAX +} VhostUserRequest; + +typedef struct VhostUserMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemoryRegion; + +typedef struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; +} VhostUserMemory; + +typedef struct VhostUserLog { + uint64_t mmap_size; + uint64_t mmap_offset; +} VhostUserLog; + +typedef struct VhostUserMsg { + VhostUserRequest request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) +#define VHOST_USER_NEED_REPLY (0x1 << 3) + uint32_t flags; + uint32_t size; /* the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + VhostUserLog log; + } payload; + int fds[VHOST_MEMORY_MAX_NREGIONS]; +} __attribute((packed)) VhostUserMsg; + +#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION 0x1 + + +/* 
vhost_user.c */ +int vhost_user_msg_handler(int vid, int fd); + +/* socket.c */ +int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); +int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); + +#endif diff --git a/lib/vhost/rte_vhost_17_05/virtio_net.c b/lib/vhost/rte_vhost_17_05/virtio_net.c new file mode 100644 index 000000000..d6b7c7a22 --- /dev/null +++ b/lib/vhost/rte_vhost_17_05/virtio_net.c @@ -0,0 +1,1164 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vhost.h" + +#define MAX_PKT_BURST 32 + +static bool +is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) +{ + return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring; +} + +static inline void __attribute__((always_inline)) +do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint16_t to, uint16_t from, uint16_t size) +{ + rte_memcpy(&vq->used->ring[to], + &vq->shadow_used_ring[from], + size * sizeof(struct vring_used_elem)); + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, ring[to]), + size * sizeof(struct vring_used_elem)); +} + +static inline void __attribute__((always_inline)) +flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + uint16_t used_idx = vq->last_used_idx & (vq->size - 1); + + if (used_idx + vq->shadow_used_idx <= vq->size) { + do_flush_shadow_used_ring(dev, vq, used_idx, 0, + vq->shadow_used_idx); + } else { + uint16_t size; + + /* update used ring interval [used_idx, vq->size] */ + size = vq->size - used_idx; + do_flush_shadow_used_ring(dev, vq, used_idx, 0, size); + + /* update the left half used ring interval [0, left_size] */ + do_flush_shadow_used_ring(dev, vq, 0, size, + vq->shadow_used_idx - size); + } + vq->last_used_idx += vq->shadow_used_idx; + + rte_smp_wmb(); + + *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx; + vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), + sizeof(vq->used->idx)); +} + +static inline void __attribute__((always_inline)) +update_shadow_used_ring(struct vhost_virtqueue *vq, + uint16_t desc_idx, uint16_t len) +{ + uint16_t i = vq->shadow_used_idx++; + + vq->shadow_used_ring[i].id = desc_idx; + vq->shadow_used_ring[i].len = len; +} + +static void +virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr) +{ + if (m_buf->ol_flags & PKT_TX_L4_MASK) { + net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len; + + switch (m_buf->ol_flags & PKT_TX_L4_MASK) { + case PKT_TX_TCP_CKSUM: + net_hdr->csum_offset = (offsetof(struct tcp_hdr, + cksum)); + break; + case PKT_TX_UDP_CKSUM: + net_hdr->csum_offset = (offsetof(struct udp_hdr, + dgram_cksum)); + break; + case PKT_TX_SCTP_CKSUM: + net_hdr->csum_offset = (offsetof(struct sctp_hdr, + cksum)); + break; + } + } + + if (m_buf->ol_flags & PKT_TX_TCP_SEG) { + if (m_buf->ol_flags & PKT_TX_IPV4) + net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + else + net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + net_hdr->gso_size = m_buf->tso_segsz; + net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len + + m_buf->l4_len; + } +} + +static inline void +copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr, + struct virtio_net_hdr_mrg_rxbuf hdr) +{ + if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf)) + *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr; + else + *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr; +} + +static inline int __attribute__((always_inline)) +copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs, + struct rte_mbuf *m, uint16_t desc_idx, uint32_t size) +{ + uint32_t desc_avail, desc_offset; + uint32_t mbuf_avail, mbuf_offset; + uint32_t cpy_len; + struct vring_desc *desc; + uint64_t desc_addr; + struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0}; + /* A counter to avoid desc dead loop chain */ + uint16_t nr_desc = 1; + + desc = 
&descs[desc_idx]; + desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr); + /* + * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid + * performance issue with some versions of gcc (4.8.4 and 5.3.0) which + * otherwise stores offset on the stack instead of in a register. + */ + if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr) + return -1; + + rte_prefetch0((void *)(uintptr_t)desc_addr); + + virtio_enqueue_offload(m, &virtio_hdr.hdr); + copy_virtio_net_hdr(dev, desc_addr, virtio_hdr); + vhost_log_write(dev, desc->addr, dev->vhost_hlen); + PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0); + + desc_offset = dev->vhost_hlen; + desc_avail = desc->len - dev->vhost_hlen; + + mbuf_avail = rte_pktmbuf_data_len(m); + mbuf_offset = 0; + while (mbuf_avail != 0 || m->next != NULL) { + /* done with current mbuf, fetch next */ + if (mbuf_avail == 0) { + m = m->next; + + mbuf_offset = 0; + mbuf_avail = rte_pktmbuf_data_len(m); + } + + /* done with current desc buf, fetch next */ + if (desc_avail == 0) { + if ((desc->flags & VRING_DESC_F_NEXT) == 0) { + /* Room in vring buffer is not enough */ + return -1; + } + if (unlikely(desc->next >= size || ++nr_desc > size)) + return -1; + + desc = &descs[desc->next]; + desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr); + if (unlikely(!desc_addr)) + return -1; + + desc_offset = 0; + desc_avail = desc->len; + } + + cpy_len = RTE_MIN(desc_avail, mbuf_avail); + rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)), + rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), + cpy_len); + vhost_log_write(dev, desc->addr + desc_offset, cpy_len); + PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), + cpy_len, 0); + + mbuf_avail -= cpy_len; + mbuf_offset += cpy_len; + desc_avail -= cpy_len; + desc_offset += cpy_len; + } + + return 0; +} + +/** + * This function adds buffers to the virtio devices RX virtqueue. Buffers can + * be received from the physical port or from another virtio device. A packet + * count is returned to indicate the number of packets that are succesfully + * added to the RX queue. This function works when the mbuf is scattered, but + * it doesn't support the mergeable feature. + */ +static inline uint32_t __attribute__((always_inline)) +virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t count) +{ + struct vhost_virtqueue *vq; + uint16_t avail_idx, free_entries, start_idx; + uint16_t desc_indexes[MAX_PKT_BURST]; + struct vring_desc *descs; + uint16_t used_idx; + uint32_t i, sz; + + LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); + if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) { + RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, queue_id); + return 0; + } + + vq = dev->virtqueue[queue_id]; + if (unlikely(vq->enabled == 0)) + return 0; + + avail_idx = *((volatile uint16_t *)&vq->avail->idx); + start_idx = vq->last_used_idx; + free_entries = avail_idx - start_idx; + count = RTE_MIN(count, free_entries); + count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST); + if (count == 0) + return 0; + + LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n", + dev->vid, start_idx, start_idx + count); + + /* Retrieve all of the desc indexes first to avoid caching issues. 
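+ *
+ * Each avail entry names the head of a descriptor chain; with the
+ * power-of-two ring the lookup is (as the loop below does):
+ *
+ *   desc_indexes[i] = vq->avail->ring[(start_idx + i) & (vq->size - 1)];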
*/ + rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]); + for (i = 0; i < count; i++) { + used_idx = (start_idx + i) & (vq->size - 1); + desc_indexes[i] = vq->avail->ring[used_idx]; + vq->used->ring[used_idx].id = desc_indexes[i]; + vq->used->ring[used_idx].len = pkts[i]->pkt_len + + dev->vhost_hlen; + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, ring[used_idx]), + sizeof(vq->used->ring[used_idx])); + } + + rte_prefetch0(&vq->desc[desc_indexes[0]]); + for (i = 0; i < count; i++) { + uint16_t desc_idx = desc_indexes[i]; + int err; + + if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) { + descs = (struct vring_desc *)(uintptr_t) + rte_vhost_gpa_to_vva(dev->mem, + vq->desc[desc_idx].addr); + if (unlikely(!descs)) { + count = i; + break; + } + + desc_idx = 0; + sz = vq->desc[desc_idx].len / sizeof(*descs); + } else { + descs = vq->desc; + sz = vq->size; + } + + err = copy_mbuf_to_desc(dev, descs, pkts[i], desc_idx, sz); + if (unlikely(err)) { + used_idx = (start_idx + i) & (vq->size - 1); + vq->used->ring[used_idx].len = dev->vhost_hlen; + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, ring[used_idx]), + sizeof(vq->used->ring[used_idx])); + } + + if (i + 1 < count) + rte_prefetch0(&vq->desc[desc_indexes[i+1]]); + } + + rte_smp_wmb(); + + *(volatile uint16_t *)&vq->used->idx += count; + vq->last_used_idx += count; + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, idx), + sizeof(vq->used->idx)); + + /* flush used->idx update before we read avail->flags. */ + rte_mb(); + + /* Kick the guest if necessary. */ + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) + && (vq->callfd >= 0)) + eventfd_write(vq->callfd, (eventfd_t)1); + return count; +} + +static inline int __attribute__((always_inline)) +fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint32_t avail_idx, uint32_t *vec_idx, + struct buf_vector *buf_vec, uint16_t *desc_chain_head, + uint16_t *desc_chain_len) +{ + uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)]; + uint32_t vec_id = *vec_idx; + uint32_t len = 0; + struct vring_desc *descs = vq->desc; + + *desc_chain_head = idx; + + if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) { + descs = (struct vring_desc *)(uintptr_t) + rte_vhost_gpa_to_vva(dev->mem, vq->desc[idx].addr); + if (unlikely(!descs)) + return -1; + + idx = 0; + } + + while (1) { + if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size)) + return -1; + + len += descs[idx].len; + buf_vec[vec_id].buf_addr = descs[idx].addr; + buf_vec[vec_id].buf_len = descs[idx].len; + buf_vec[vec_id].desc_idx = idx; + vec_id++; + + if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0) + break; + + idx = descs[idx].next; + } + + *desc_chain_len = len; + *vec_idx = vec_id; + + return 0; +} + +/* + * Returns -1 on fail, 0 on success + */ +static inline int +reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint32_t size, struct buf_vector *buf_vec, + uint16_t *num_buffers, uint16_t avail_head) +{ + uint16_t cur_idx; + uint32_t vec_idx = 0; + uint16_t tries = 0; + + uint16_t head_idx = 0; + uint16_t len = 0; + + *num_buffers = 0; + cur_idx = vq->last_avail_idx; + + while (size > 0) { + if (unlikely(cur_idx == avail_head)) + return -1; + + if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec, + &head_idx, &len) < 0)) + return -1; + len = RTE_MIN(len, size); + update_shadow_used_ring(vq, head_idx, len); + size -= len; + + cur_idx++; + tries++; + *num_buffers += 1; + + /* + * if we tried all available ring items, and still + * 
can't get enough buf, it means something abnormal + * happened. + */ + if (unlikely(tries >= vq->size)) + return -1; + } + + return 0; +} + +static inline int __attribute__((always_inline)) +copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m, + struct buf_vector *buf_vec, uint16_t num_buffers) +{ + struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0}; + uint32_t vec_idx = 0; + uint64_t desc_addr; + uint32_t mbuf_offset, mbuf_avail; + uint32_t desc_offset, desc_avail; + uint32_t cpy_len; + uint64_t hdr_addr, hdr_phys_addr; + struct rte_mbuf *hdr_mbuf; + + if (unlikely(m == NULL)) + return -1; + + desc_addr = rte_vhost_gpa_to_vva(dev->mem, buf_vec[vec_idx].buf_addr); + if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr) + return -1; + + hdr_mbuf = m; + hdr_addr = desc_addr; + hdr_phys_addr = buf_vec[vec_idx].buf_addr; + rte_prefetch0((void *)(uintptr_t)hdr_addr); + + virtio_hdr.num_buffers = num_buffers; + LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n", + dev->vid, num_buffers); + + desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen; + desc_offset = dev->vhost_hlen; + + mbuf_avail = rte_pktmbuf_data_len(m); + mbuf_offset = 0; + while (mbuf_avail != 0 || m->next != NULL) { + /* done with current desc buf, get the next one */ + if (desc_avail == 0) { + vec_idx++; + desc_addr = rte_vhost_gpa_to_vva(dev->mem, + buf_vec[vec_idx].buf_addr); + if (unlikely(!desc_addr)) + return -1; + + /* Prefetch buffer address. */ + rte_prefetch0((void *)(uintptr_t)desc_addr); + desc_offset = 0; + desc_avail = buf_vec[vec_idx].buf_len; + } + + /* done with current mbuf, get the next one */ + if (mbuf_avail == 0) { + m = m->next; + + mbuf_offset = 0; + mbuf_avail = rte_pktmbuf_data_len(m); + } + + if (hdr_addr) { + virtio_enqueue_offload(hdr_mbuf, &virtio_hdr.hdr); + copy_virtio_net_hdr(dev, hdr_addr, virtio_hdr); + vhost_log_write(dev, hdr_phys_addr, dev->vhost_hlen); + PRINT_PACKET(dev, (uintptr_t)hdr_addr, + dev->vhost_hlen, 0); + + hdr_addr = 0; + } + + cpy_len = RTE_MIN(desc_avail, mbuf_avail); + rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)), + rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), + cpy_len); + vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset, + cpy_len); + PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), + cpy_len, 0); + + mbuf_avail -= cpy_len; + mbuf_offset += cpy_len; + desc_avail -= cpy_len; + desc_offset += cpy_len; + } + + return 0; +} + +static inline uint32_t __attribute__((always_inline)) +virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t count) +{ + struct vhost_virtqueue *vq; + uint32_t pkt_idx = 0; + uint16_t num_buffers; + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t avail_head; + + LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); + if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) { + RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, queue_id); + return 0; + } + + vq = dev->virtqueue[queue_id]; + if (unlikely(vq->enabled == 0)) + return 0; + + count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); + if (count == 0) + return 0; + + rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); + + vq->shadow_used_idx = 0; + avail_head = *((volatile uint16_t *)&vq->avail->idx); + for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { + uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; + + if (unlikely(reserve_avail_buf_mergeable(dev, vq, + pkt_len, buf_vec, &num_buffers, + 
avail_head) < 0)) { + LOG_DEBUG(VHOST_DATA, + "(%d) failed to get enough desc from vring\n", + dev->vid); + vq->shadow_used_idx -= num_buffers; + break; + } + + LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, vq->last_avail_idx, + vq->last_avail_idx + num_buffers); + + if (copy_mbuf_to_desc_mergeable(dev, pkts[pkt_idx], + buf_vec, num_buffers) < 0) { + vq->shadow_used_idx -= num_buffers; + break; + } + + vq->last_avail_idx += num_buffers; + } + + if (likely(vq->shadow_used_idx)) { + flush_shadow_used_ring(dev, vq); + + /* flush used->idx update before we read avail->flags. */ + rte_mb(); + + /* Kick the guest if necessary. */ + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) + && (vq->callfd >= 0)) + eventfd_write(vq->callfd, (eventfd_t)1); + } + + return pkt_idx; +} + +uint16_t +rte_vhost_enqueue_burst(int vid, uint16_t queue_id, + struct rte_mbuf **pkts, uint16_t count) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return 0; + + if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) + return virtio_dev_merge_rx(dev, queue_id, pkts, count); + else + return virtio_dev_rx(dev, queue_id, pkts, count); +} + +static inline bool +virtio_net_with_host_offload(struct virtio_net *dev) +{ + if (dev->features & + (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_ECN | + VIRTIO_NET_F_HOST_TSO4 | VIRTIO_NET_F_HOST_TSO6 | + VIRTIO_NET_F_HOST_UFO)) + return true; + + return false; +} + +static void +parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr) +{ + struct ipv4_hdr *ipv4_hdr; + struct ipv6_hdr *ipv6_hdr; + void *l3_hdr = NULL; + struct ether_hdr *eth_hdr; + uint16_t ethertype; + + eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); + + m->l2_len = sizeof(struct ether_hdr); + ethertype = rte_be_to_cpu_16(eth_hdr->ether_type); + + if (ethertype == ETHER_TYPE_VLAN) { + struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1); + + m->l2_len += sizeof(struct vlan_hdr); + ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto); + } + + l3_hdr = (char *)eth_hdr + m->l2_len; + + switch (ethertype) { + case ETHER_TYPE_IPv4: + ipv4_hdr = (struct ipv4_hdr *)l3_hdr; + *l4_proto = ipv4_hdr->next_proto_id; + m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4; + *l4_hdr = (char *)l3_hdr + m->l3_len; + m->ol_flags |= PKT_TX_IPV4; + break; + case ETHER_TYPE_IPv6: + ipv6_hdr = (struct ipv6_hdr *)l3_hdr; + *l4_proto = ipv6_hdr->proto; + m->l3_len = sizeof(struct ipv6_hdr); + *l4_hdr = (char *)l3_hdr + m->l3_len; + m->ol_flags |= PKT_TX_IPV6; + break; + default: + m->l3_len = 0; + *l4_proto = 0; + *l4_hdr = NULL; + break; + } +} + +static inline void __attribute__((always_inline)) +vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m) +{ + uint16_t l4_proto = 0; + void *l4_hdr = NULL; + struct tcp_hdr *tcp_hdr = NULL; + + if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE) + return; + + parse_ethernet(m, &l4_proto, &l4_hdr); + if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) { + if (hdr->csum_start == (m->l2_len + m->l3_len)) { + switch (hdr->csum_offset) { + case (offsetof(struct tcp_hdr, cksum)): + if (l4_proto == IPPROTO_TCP) + m->ol_flags |= PKT_TX_TCP_CKSUM; + break; + case (offsetof(struct udp_hdr, dgram_cksum)): + if (l4_proto == IPPROTO_UDP) + m->ol_flags |= PKT_TX_UDP_CKSUM; + break; + case (offsetof(struct sctp_hdr, cksum)): + if (l4_proto == IPPROTO_SCTP) + m->ol_flags |= PKT_TX_SCTP_CKSUM; + break; + default: + break; + } + } + } + + if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { + switch (hdr->gso_type & 
+ case VIRTIO_NET_HDR_GSO_TCPV4:
+ case VIRTIO_NET_HDR_GSO_TCPV6:
+ tcp_hdr = (struct tcp_hdr *)l4_hdr;
+ m->ol_flags |= PKT_TX_TCP_SEG;
+ m->tso_segsz = hdr->gso_size;
+ m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
+ break;
+ default:
+ RTE_LOG(WARNING, VHOST_DATA,
+ "unsupported gso type %u.\n", hdr->gso_type);
+ break;
+ }
+ }
+}
+
+#define RARP_PKT_SIZE 64
+
+static int
+make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
+{
+ struct ether_hdr *eth_hdr;
+ struct arp_hdr *rarp;
+
+ if (rarp_mbuf->buf_len < 64) {
+ RTE_LOG(WARNING, VHOST_DATA,
+ "failed to make RARP; mbuf size too small %u (< %d)\n",
+ rarp_mbuf->buf_len, RARP_PKT_SIZE);
+ return -1;
+ }
+
+ /* Ethernet header. */
+ eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
+ memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
+ ether_addr_copy(mac, &eth_hdr->s_addr);
+ eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
+
+ /* RARP header. */
+ rarp = (struct arp_hdr *)(eth_hdr + 1);
+ rarp->arp_hrd = htons(ARP_HRD_ETHER);
+ rarp->arp_pro = htons(ETHER_TYPE_IPv4);
+ rarp->arp_hln = ETHER_ADDR_LEN;
+ rarp->arp_pln = 4;
+ rarp->arp_op = htons(ARP_OP_REVREQUEST);
+
+ ether_addr_copy(mac, &rarp->arp_data.arp_sha);
+ ether_addr_copy(mac, &rarp->arp_data.arp_tha);
+ memset(&rarp->arp_data.arp_sip, 0x00, 4);
+ memset(&rarp->arp_data.arp_tip, 0x00, 4);
+
+ rarp_mbuf->pkt_len = rarp_mbuf->data_len = RARP_PKT_SIZE;
+
+ return 0;
+}
+
+static inline void __attribute__((always_inline))
+put_zmbuf(struct zcopy_mbuf *zmbuf)
+{
+ zmbuf->in_use = 0;
+}
+
+static inline int __attribute__((always_inline))
+copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
+ uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
+ struct rte_mempool *mbuf_pool)
+{
+ struct vring_desc *desc;
+ uint64_t desc_addr;
+ uint32_t desc_avail, desc_offset;
+ uint32_t mbuf_avail, mbuf_offset;
+ uint32_t cpy_len;
+ struct rte_mbuf *cur = m, *prev = m;
+ struct virtio_net_hdr *hdr = NULL;
+ /* A counter to avoid a dead loop in the desc chain */
+ uint32_t nr_desc = 1;
+
+ desc = &descs[desc_idx];
+ if (unlikely((desc->len < dev->vhost_hlen)) ||
+ (desc->flags & VRING_DESC_F_INDIRECT))
+ return -1;
+
+ desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
+ if (unlikely(!desc_addr))
+ return -1;
+
+ if (virtio_net_with_host_offload(dev)) {
+ hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
+ rte_prefetch0(hdr);
+ }
+
+ /*
+ * A virtio driver normally uses at least 2 desc buffers
+ * for Tx: the first for storing the header, and others
+ * for storing the data.
+ */
+ if (likely((desc->len == dev->vhost_hlen) &&
+ (desc->flags & VRING_DESC_F_NEXT) != 0)) {
+ desc = &descs[desc->next];
+ if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
+ return -1;
+
+ desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
+ if (unlikely(!desc_addr))
+ return -1;
+
+ desc_offset = 0;
+ desc_avail = desc->len;
+ nr_desc += 1;
+ } else {
+ desc_avail = desc->len - dev->vhost_hlen;
+ desc_offset = dev->vhost_hlen;
+ }
+
+ rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset));
+
+ PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), desc_avail, 0);
+
+ mbuf_offset = 0;
+ mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
+ while (1) {
+ uint64_t hpa;
+
+ cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+
+ /*
+ * A desc buf might span two host physical pages that are
+ * not contiguous. In such a case (gpa_to_hpa returns 0), data
+ * will be copied even though zero copy is enabled.
+ */
+ if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
+ desc->addr + desc_offset, cpy_len)))) {
+ cur->data_len = cpy_len;
+ cur->data_off = 0;
+ cur->buf_addr = (void *)(uintptr_t)desc_addr;
+ cur->buf_physaddr = hpa;
+
+ /*
+ * In zero copy mode, one mbuf can only reference data
+ * for one desc buf, or part of one.
+ */
+ mbuf_avail = cpy_len;
+ } else {
+ rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
+ mbuf_offset),
+ (void *)((uintptr_t)(desc_addr + desc_offset)),
+ cpy_len);
+ }
+
+ mbuf_avail -= cpy_len;
+ mbuf_offset += cpy_len;
+ desc_avail -= cpy_len;
+ desc_offset += cpy_len;
+
+ /* This desc buf is fully consumed; get the next one */
+ if (desc_avail == 0) {
+ if ((desc->flags & VRING_DESC_F_NEXT) == 0)
+ break;
+
+ if (unlikely(desc->next >= max_desc ||
+ ++nr_desc > max_desc))
+ return -1;
+ desc = &descs[desc->next];
+ if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
+ return -1;
+
+ desc_addr = rte_vhost_gpa_to_vva(dev->mem, desc->addr);
+ if (unlikely(!desc_addr))
+ return -1;
+
+ rte_prefetch0((void *)(uintptr_t)desc_addr);
+
+ desc_offset = 0;
+ desc_avail = desc->len;
+
+ PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
+ }
+
+ /*
+ * This mbuf is full; get a new one
+ * to hold more data.
+ */
+ if (mbuf_avail == 0) {
+ cur = rte_pktmbuf_alloc(mbuf_pool);
+ if (unlikely(cur == NULL)) {
+ RTE_LOG(ERR, VHOST_DATA, "Failed to "
+ "allocate memory for mbuf.\n");
+ return -1;
+ }
+
+ prev->next = cur;
+ prev->data_len = mbuf_offset;
+ m->nb_segs += 1;
+ m->pkt_len += mbuf_offset;
+ prev = cur;
+
+ mbuf_offset = 0;
+ mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+ }
+ }
+
+ prev->data_len = mbuf_offset;
+ m->pkt_len += mbuf_offset;
+
+ if (hdr)
+ vhost_dequeue_offload(hdr, m);
+
+ return 0;
+}
+
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t used_idx, uint32_t desc_idx)
+{
+ vq->used->ring[used_idx].id = desc_idx;
+ vq->used->ring[used_idx].len = 0;
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used, ring[used_idx]),
+ sizeof(vq->used->ring[used_idx]));
+}
+
+static inline void __attribute__((always_inline))
+update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint32_t count)
+{
+ if (unlikely(count == 0))
+ return;
+
+ rte_smp_wmb();
+ rte_smp_rmb();
+
+ vq->used->idx += count;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+ sizeof(vq->used->idx));
+
+ /* Kick guest if required. */
+ if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+ && (vq->callfd >= 0))
+ eventfd_write(vq->callfd, (eventfd_t)1);
+}
+
+static inline struct zcopy_mbuf *__attribute__((always_inline))
+get_zmbuf(struct vhost_virtqueue *vq)
+{
+ uint16_t i;
+ uint16_t last;
+ int tries = 0;
+
+ /* search [last_zmbuf_idx, zmbuf_size) */
+ i = vq->last_zmbuf_idx;
+ last = vq->zmbuf_size;
+
+again:
+ for (; i < last; i++) {
+ if (vq->zmbufs[i].in_use == 0) {
+ vq->last_zmbuf_idx = i + 1;
+ vq->zmbufs[i].in_use = 1;
+ return &vq->zmbufs[i];
+ }
+ }
+
+ tries++;
+ if (tries == 1) {
+ /* search [0, last_zmbuf_idx) */
+ i = 0;
+ last = vq->last_zmbuf_idx;
+ goto again;
+ }
+
+ return NULL;
+}
+
+static inline bool __attribute__((always_inline))
+mbuf_is_consumed(struct rte_mbuf *m)
+{
+ while (m) {
+ if (rte_mbuf_refcnt_read(m) > 1)
+ return false;
+ m = m->next;
+ }
+
+ return true;
+}
+
+uint16_t
+rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
+ struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+{
+ struct virtio_net *dev;
+ struct rte_mbuf *rarp_mbuf = NULL;
+ struct vhost_virtqueue *vq;
+ uint32_t desc_indexes[MAX_PKT_BURST];
+ uint32_t used_idx;
+ uint32_t i = 0;
+ uint16_t free_entries;
+ uint16_t avail_idx;
+
+ dev = get_device(vid);
+ if (!dev)
+ return 0;
+
+ if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
+ RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+ dev->vid, __func__, queue_id);
+ return 0;
+ }
+
+ vq = dev->virtqueue[queue_id];
+ if (unlikely(vq->enabled == 0))
+ return 0;
+
+ if (unlikely(dev->dequeue_zero_copy)) {
+ struct zcopy_mbuf *zmbuf, *next;
+ int nr_updated = 0;
+
+ for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
+ zmbuf != NULL; zmbuf = next) {
+ next = TAILQ_NEXT(zmbuf, next);
+
+ if (mbuf_is_consumed(zmbuf->mbuf)) {
+ used_idx = vq->last_used_idx++ & (vq->size - 1);
+ update_used_ring(dev, vq, used_idx,
+ zmbuf->desc_idx);
+ nr_updated += 1;
+
+ TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
+ rte_pktmbuf_free(zmbuf->mbuf);
+ put_zmbuf(zmbuf);
+ vq->nr_zmbuf -= 1;
+ }
+ }
+
+ update_used_idx(dev, vq, nr_updated);
+ }
+
+ /*
+ * Construct a RARP broadcast packet, and inject it into the "pkts"
+ * array, so that it looks like the guest actually sent such a packet.
+ *
+ * Check user_send_rarp() for more information.
+ *
+ * broadcast_rarp shares a cacheline in the virtio_net structure
+ * with some fields that are accessed during enqueue, and
+ * rte_atomic16_cmpset() causes a write if using cmpxchg. This could
+ * result in false sharing between enqueue and dequeue.
+ *
+ * Prevent unnecessary false sharing by reading broadcast_rarp first
+ * and only performing cmpset if the read indicates it is likely to
+ * be set.
+ */
+
+ if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
+ rte_atomic16_cmpset((volatile uint16_t *)
+ &dev->broadcast_rarp.cnt, 1, 0))) {
+
+ rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
+ if (rarp_mbuf == NULL) {
+ RTE_LOG(ERR, VHOST_DATA,
+ "Failed to allocate memory for mbuf.\n");
+ return 0;
+ }
+
+ if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
+ rte_pktmbuf_free(rarp_mbuf);
+ rarp_mbuf = NULL;
+ } else {
+ count -= 1;
+ }
+ }
+
+ free_entries = *((volatile uint16_t *)&vq->avail->idx) -
+ vq->last_avail_idx;
+ if (free_entries == 0)
+ goto out;
+
+ LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
+
+ /* Prefetch available and used ring */
+ avail_idx = vq->last_avail_idx & (vq->size - 1);
+ used_idx = vq->last_used_idx & (vq->size - 1);
+ rte_prefetch0(&vq->avail->ring[avail_idx]);
+ rte_prefetch0(&vq->used->ring[used_idx]);
+
+ count = RTE_MIN(count, MAX_PKT_BURST);
+ count = RTE_MIN(count, free_entries);
+ LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
+ dev->vid, count);
+
+ /* Retrieve all of the head indexes first to avoid caching issues. */
+ for (i = 0; i < count; i++) {
+ avail_idx = (vq->last_avail_idx + i) & (vq->size - 1);
+ used_idx = (vq->last_used_idx + i) & (vq->size - 1);
+ desc_indexes[i] = vq->avail->ring[avail_idx];
+
+ if (likely(dev->dequeue_zero_copy == 0))
+ update_used_ring(dev, vq, used_idx, desc_indexes[i]);
+ }
+
+ /* Prefetch descriptor index. */
+ rte_prefetch0(&vq->desc[desc_indexes[0]]);
+ for (i = 0; i < count; i++) {
+ struct vring_desc *desc;
+ uint16_t sz, idx;
+ int err;
+
+ if (likely(i + 1 < count))
+ rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
+
+ if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
+ desc = (struct vring_desc *)(uintptr_t)
+ rte_vhost_gpa_to_vva(dev->mem,
+ vq->desc[desc_indexes[i]].addr);
+ if (unlikely(!desc))
+ break;
+
+ rte_prefetch0(desc);
+ sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
+ idx = 0;
+ } else {
+ desc = vq->desc;
+ sz = vq->size;
+ idx = desc_indexes[i];
+ }
+
+ pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
+ if (unlikely(pkts[i] == NULL)) {
+ RTE_LOG(ERR, VHOST_DATA,
+ "Failed to allocate memory for mbuf.\n");
+ break;
+ }
+
+ err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
+ if (unlikely(err)) {
+ rte_pktmbuf_free(pkts[i]);
+ break;
+ }
+
+ if (unlikely(dev->dequeue_zero_copy)) {
+ struct zcopy_mbuf *zmbuf;
+
+ zmbuf = get_zmbuf(vq);
+ if (!zmbuf) {
+ rte_pktmbuf_free(pkts[i]);
+ break;
+ }
+ zmbuf->mbuf = pkts[i];
+ zmbuf->desc_idx = desc_indexes[i];
+
+ /*
+ * Pin the mbuf; we will check later to see
+ * whether the mbuf is freed (when we are the last
+ * user) or not. If so, we can then update the
+ * used ring safely.
+ */
+ rte_mbuf_refcnt_update(pkts[i], 1);
+
+ vq->nr_zmbuf += 1;
+ TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
+ }
+ }
+ vq->last_avail_idx += i;
+
+ if (likely(dev->dequeue_zero_copy == 0)) {
+ vq->last_used_idx += i;
+ update_used_idx(dev, vq, i);
+ }
+
+out:
+ if (unlikely(rarp_mbuf != NULL)) {
+ /*
+ * Inject it at the head of the "pkts" array, so that the
+ * switch's MAC learning table gets updated first.
+ */
+ memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
+ pkts[0] = rarp_mbuf;
+ i += 1;
+ }
+
+ return i;
+}
diff --git a/scripts/check_format.sh b/scripts/check_format.sh
index a723f93dc..d319624eb 100755
--- a/scripts/check_format.sh
+++ b/scripts/check_format.sh
@@ -63,7 +63,7 @@ fi
rm -f eofnl.log

echo -n "Checking for POSIX includes..."
-git grep -I -i -f scripts/posix.txt -- './*' ':!include/spdk/stdinc.h' ':!lib/vhost/rte_vhost/**' ':!scripts/posix.txt' > scripts/posix.log || true +git grep -I -i -f scripts/posix.txt -- './*' ':!include/spdk/stdinc.h' ':!lib/vhost/rte_vhost*/**' ':!scripts/posix.txt' > scripts/posix.log || true if [ -s scripts/posix.log ]; then echo "POSIX includes detected. Please include spdk/stdinc.h instead." cat scripts/posix.log
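
For reference, the two public data-path entry points imported above are rte_vhost_enqueue_burst() and rte_vhost_dequeue_burst(). The sketch below shows how a backend might drive them in a simple poll loop; it is illustrative only and not part of the patch: the vid, mbuf_pool, and BURST_SZ names are hypothetical, and the queue indices follow the even-RX/odd-TX convention that is_valid_virt_queue_idx() enforces in virtio_net.c. Because the enqueue path copies packet data into guest descriptors, the caller keeps ownership of its mbufs and must free them itself.

/*
 * Illustrative sketch (not part of the imported code): poll one vhost
 * device, echoing guest TX traffic back to the guest RX queue.
 * vid, mbuf_pool, and BURST_SZ are hypothetical placeholders.
 */
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include "rte_vhost.h"

#define BURST_SZ 32

static void
poll_vhost_device(int vid, struct rte_mempool *mbuf_pool)
{
	struct rte_mbuf *pkts[BURST_SZ];
	uint16_t nr_rx, i;

	/* Drain packets the guest transmitted (guest TX queue = 1). */
	nr_rx = rte_vhost_dequeue_burst(vid, 1, mbuf_pool, pkts, BURST_SZ);

	/* Hand them back to the guest (guest RX queue = 0). The data is
	 * copied into guest buffers, so the mbufs remain ours to free. */
	(void)rte_vhost_enqueue_burst(vid, 0, pkts, nr_rx);

	for (i = 0; i < nr_rx; i++)
		rte_pktmbuf_free(pkts[i]);
}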