diff --git a/lib/vhost/rte_vhost/Makefile b/lib/vhost/rte_vhost/Makefile
index 336425818..537a3c70e 100644
--- a/lib/vhost/rte_vhost/Makefile
+++ b/lib/vhost/rte_vhost/Makefile
@@ -34,10 +34,11 @@
 SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
 include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
 
+CFLAGS += -I.
 CFLAGS += $(ENV_CFLAGS)
 
 # These are the DPDK vhost files copied (for now) into SPDK
-C_SRCS += fd_man.c socket.c vhost_user.c virtio_net.c vhost.c
+C_SRCS += fd_man.c socket.c vhost_user.c vhost.c
 
 LIBNAME = rte_vhost
 
diff --git a/lib/vhost/rte_vhost/fd_man.c b/lib/vhost/rte_vhost/fd_man.c
index 2d3eeb7d7..2ceacc9ab 100644
--- a/lib/vhost/rte_vhost/fd_man.c
+++ b/lib/vhost/rte_vhost/fd_man.c
@@ -35,19 +35,65 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <sys/select.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/types.h>
+#include <poll.h>
 #include <unistd.h>
 
 #include <rte_common.h>
 #include <rte_log.h>
 
 #include "fd_man.h"
 
+#define FDPOLLERR (POLLERR | POLLHUP | POLLNVAL)
+
+static int
+get_last_valid_idx(struct fdset *pfdset, int last_valid_idx)
+{
+	int i;
+
+	for (i = last_valid_idx; i >= 0 && pfdset->fd[i].fd == -1; i--)
+		;
+
+	return i;
+}
+
+static void
+fdset_move(struct fdset *pfdset, int dst, int src)
+{
+	pfdset->fd[dst] = pfdset->fd[src];
+	pfdset->rwfds[dst] = pfdset->rwfds[src];
+}
+
+static void
+fdset_shrink_nolock(struct fdset *pfdset)
+{
+	int i;
+	int last_valid_idx = get_last_valid_idx(pfdset, pfdset->num - 1);
+
+	for (i = 0; i < last_valid_idx; i++) {
+		if (pfdset->fd[i].fd != -1)
+			continue;
+
+		fdset_move(pfdset, i, last_valid_idx);
+		last_valid_idx = get_last_valid_idx(pfdset, last_valid_idx - 1);
+	}
+	pfdset->num = last_valid_idx + 1;
+}
+
+/*
+ * Find deleted fd entries and remove them
+ */
+static void
+fdset_shrink(struct fdset *pfdset)
+{
+	pthread_mutex_lock(&pfdset->fd_mutex);
+	fdset_shrink_nolock(pfdset);
+	pthread_mutex_unlock(&pfdset->fd_mutex);
+}
+
 /**
  * Returns the index in the fdset for a given fd.
- * If fd is -1, it means to search for a free entry.
  * @return
  *   index for the fd, or -1 if fd isn't in the fdset.
  */
@@ -56,72 +102,28 @@
 fdset_find_fd(struct fdset *pfdset, int fd)
 {
 	int i;
 
-	if (pfdset == NULL)
-		return -1;
-
-	for (i = 0; i < MAX_FDS && pfdset->fd[i].fd != fd; i++)
+	for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++)
 		;
 
-	return i == MAX_FDS ? -1 : i;
+	return i == pfdset->num ? -1 : i;
 }
 
-static int
-fdset_find_free_slot(struct fdset *pfdset)
-{
-	return fdset_find_fd(pfdset, -1);
-}
-
-static int
-fdset_add_fd(struct fdset *pfdset, int idx, int fd,
+static void
+fdset_add_fd(struct fdset *pfdset, int idx, int fd,
 	fd_cb rcb, fd_cb wcb, void *dat)
 {
-	struct fdentry *pfdentry;
+	struct fdentry *pfdentry = &pfdset->fd[idx];
+	struct pollfd *pfd = &pfdset->rwfds[idx];
 
-	if (pfdset == NULL || idx >= MAX_FDS || fd >= FD_SETSIZE)
-		return -1;
-
-	pfdentry = &pfdset->fd[idx];
-	pfdentry->fd = fd;
+	pfdentry->fd = fd;
 	pfdentry->rcb = rcb;
 	pfdentry->wcb = wcb;
 	pfdentry->dat = dat;
 
-	return 0;
-}
-
-/**
- * Fill the read/write fd_set with the fds in the fdset.
- * @return
- *  the maximum fds filled in the read/write fd_set.
- */
-static int
-fdset_fill(fd_set *rfset, fd_set *wfset, struct fdset *pfdset)
-{
-	struct fdentry *pfdentry;
-	int i, maxfds = -1;
-	int num = MAX_FDS;
-
-	if (pfdset == NULL)
-		return -1;
-
-	for (i = 0; i < num; i++) {
-		pfdentry = &pfdset->fd[i];
-		if (pfdentry->fd != -1) {
-			int added = 0;
-			if (pfdentry->rcb && rfset) {
-				FD_SET(pfdentry->fd, rfset);
-				added = 1;
-			}
-			if (pfdentry->wcb && wfset) {
-				FD_SET(pfdentry->fd, wfset);
-				added = 1;
-			}
-			if (added)
-				maxfds = pfdentry->fd < maxfds ?
- maxfds : pfdentry->fd; - } - } - return maxfds; + pfd->fd = fd; + pfd->events = rcb ? POLLIN : 0; + pfd->events |= wcb ? POLLOUT : 0; + pfd->revents = 0; } void @@ -151,16 +153,17 @@ fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat) return -1; pthread_mutex_lock(&pfdset->fd_mutex); - - /* Find a free slot in the list. */ - i = fdset_find_free_slot(pfdset); - if (i == -1 || fdset_add_fd(pfdset, i, fd, rcb, wcb, dat) < 0) { - pthread_mutex_unlock(&pfdset->fd_mutex); - return -2; + i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; + if (i == -1) { + fdset_shrink_nolock(pfdset); + i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; + if (i == -1) { + pthread_mutex_unlock(&pfdset->fd_mutex); + return -2; + } } - pfdset->num++; - + fdset_add_fd(pfdset, i, fd, rcb, wcb, dat); pthread_mutex_unlock(&pfdset->fd_mutex); return 0; @@ -189,7 +192,6 @@ fdset_del(struct fdset *pfdset, int fd) pfdset->fd[i].fd = -1; pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL; pfdset->fd[i].dat = NULL; - pfdset->num--; i = -1; } pthread_mutex_unlock(&pfdset->fd_mutex); @@ -198,24 +200,6 @@ fdset_del(struct fdset *pfdset, int fd) return dat; } -/** - * Unregister the fd at the specified slot from the fdset. - */ -static void -fdset_del_slot(struct fdset *pfdset, int index) -{ - if (pfdset == NULL || index < 0 || index >= MAX_FDS) - return; - - pthread_mutex_lock(&pfdset->fd_mutex); - - pfdset->fd[index].fd = -1; - pfdset->fd[index].rcb = pfdset->fd[index].wcb = NULL; - pfdset->fd[index].dat = NULL; - pfdset->num--; - - pthread_mutex_unlock(&pfdset->fd_mutex); -} /** * This functions runs in infinite blocking loop until there is no fd in @@ -226,58 +210,68 @@ fdset_del_slot(struct fdset *pfdset, int index) * will wait until the flag is reset to zero(which indicates the callback is * finished), then it could free the context after fdset_del. */ -void -fdset_event_dispatch(struct fdset *pfdset) +void * +fdset_event_dispatch(void *arg) { - fd_set rfds, wfds; - int i, maxfds; + int i; + struct pollfd *pfd; struct fdentry *pfdentry; - int num = MAX_FDS; fd_cb rcb, wcb; void *dat; - int fd; + int fd, numfds; int remove1, remove2; - int ret; + int need_shrink; + struct fdset *pfdset = arg; if (pfdset == NULL) - return; + return NULL; while (1) { - struct timeval tv; - tv.tv_sec = 1; - tv.tv_usec = 0; - FD_ZERO(&rfds); - FD_ZERO(&wfds); - pthread_mutex_lock(&pfdset->fd_mutex); - - maxfds = fdset_fill(&rfds, &wfds, pfdset); - - pthread_mutex_unlock(&pfdset->fd_mutex); /* - * When select is blocked, other threads might unregister + * When poll is blocked, other threads might unregister * listenfds from and register new listenfds into fdset. - * When select returns, the entries for listenfds in the fdset + * When poll returns, the entries for listenfds in the fdset * might have been updated. It is ok if there is unwanted call * for new listenfds. 
	 */
-		ret = select(maxfds + 1, &rfds, &wfds, NULL, &tv);
-		if (ret <= 0)
-			continue;
+		pthread_mutex_lock(&pfdset->fd_mutex);
+		numfds = pfdset->num;
+		pthread_mutex_unlock(&pfdset->fd_mutex);
 
-		for (i = 0; i < num; i++) {
-			remove1 = remove2 = 0;
+		poll(pfdset->rwfds, numfds, 1000 /* millisecs */);
+
+		need_shrink = 0;
+		for (i = 0; i < numfds; i++) {
 			pthread_mutex_lock(&pfdset->fd_mutex);
+
 			pfdentry = &pfdset->fd[i];
 			fd = pfdentry->fd;
+			pfd = &pfdset->rwfds[i];
+
+			if (fd < 0) {
+				need_shrink = 1;
+				pthread_mutex_unlock(&pfdset->fd_mutex);
+				continue;
+			}
+
+			if (!pfd->revents) {
+				pthread_mutex_unlock(&pfdset->fd_mutex);
+				continue;
+			}
+
+			remove1 = remove2 = 0;
+
 			rcb = pfdentry->rcb;
 			wcb = pfdentry->wcb;
 			dat = pfdentry->dat;
 			pfdentry->busy = 1;
+
 			pthread_mutex_unlock(&pfdset->fd_mutex);
-			if (fd >= 0 && FD_ISSET(fd, &rfds) && rcb)
+
+			if (rcb && pfd->revents & (POLLIN | FDPOLLERR))
 				rcb(fd, dat, &remove1);
-			if (fd >= 0 && FD_ISSET(fd, &wfds) && wcb)
+			if (wcb && pfd->revents & (POLLOUT | FDPOLLERR))
 				wcb(fd, dat, &remove2);
 			pfdentry->busy = 0;
 			/*
@@ -292,8 +286,15 @@ fdset_event_dispatch(struct fdset *pfdset)
 			 * listen fd in another thread, we couldn't call
 			 * fd_set_del.
 			 */
-			if (remove1 || remove2)
-				fdset_del_slot(pfdset, i);
+			if (remove1 || remove2) {
+				pfdentry->fd = -1;
+				need_shrink = 1;
+			}
 		}
+
+		if (need_shrink)
+			fdset_shrink(pfdset);
 	}
+
+	return NULL;
 }
diff --git a/lib/vhost/rte_vhost/fd_man.h b/lib/vhost/rte_vhost/fd_man.h
index b1e7ad53a..3a9d269b3 100644
--- a/lib/vhost/rte_vhost/fd_man.h
+++ b/lib/vhost/rte_vhost/fd_man.h
@@ -35,6 +35,7 @@
 #define _FD_MAN_H_
 #include <stdint.h>
 #include <pthread.h>
+#include <poll.h>
 
 #define MAX_FDS 1024
 
@@ -49,6 +50,7 @@ struct fdentry {
 };
 
 struct fdset {
+	struct pollfd rwfds[MAX_FDS];
 	struct fdentry fd[MAX_FDS];
 	pthread_mutex_t fd_mutex;
 	int num;	/* current fd number of this fdset */
@@ -62,6 +64,6 @@ int fdset_add(struct fdset *pfdset, int fd,
 
 void *fdset_del(struct fdset *pfdset, int fd);
 
-void fdset_event_dispatch(struct fdset *pfdset);
+void *fdset_event_dispatch(void *arg);
 
 #endif
diff --git a/lib/vhost/rte_vhost_17_05/rte_vhost.h b/lib/vhost/rte_vhost/rte_vhost.h
similarity index 100%
rename from lib/vhost/rte_vhost_17_05/rte_vhost.h
rename to lib/vhost/rte_vhost/rte_vhost.h
diff --git a/lib/vhost/rte_vhost/rte_virtio_net.h b/lib/vhost/rte_vhost/rte_virtio_net.h
deleted file mode 100644
index 926039c5a..000000000
--- a/lib/vhost/rte_vhost/rte_virtio_net.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _VIRTIO_NET_H_
-#define _VIRTIO_NET_H_
-
-/**
- * @file
- * Interface to vhost net
- */
-
-#include <stdint.h>
-#include <linux/vhost.h>
-#include <linux/virtio_ring.h>
-#include <linux/virtio_net.h>
-#include <sys/eventfd.h>
-#include <sys/socket.h>
-#include <linux/if.h>
-
-#include <rte_memory.h>
-#include <rte_mempool.h>
-#include <rte_ether.h>
-
-#define RTE_VHOST_USER_CLIENT		(1ULL << 0)
-#define RTE_VHOST_USER_NO_RECONNECT	(1ULL << 1)
-#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY	(1ULL << 2)
-
-/* Enum for virtqueue management. */
-enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
-
-/**
- * Device and vring operations.
- */
-struct virtio_net_device_ops {
-	int (*new_device)(int vid);		/**< Add device. */
-	void (*destroy_device)(int vid);	/**< Remove device. */
-
-	int (*vring_state_changed)(int vid, uint16_t queue_id, int enable);	/**< triggered when a vring is enabled or disabled */
-
-	void *reserved[5]; /**< Reserved for future extension */
-};
-
-/**
- * Disable features in feature_mask. Returns 0 on success.
- */
-int rte_vhost_feature_disable(uint64_t feature_mask);
-
-/**
- * Enable features in feature_mask. Returns 0 on success.
- */
-int rte_vhost_feature_enable(uint64_t feature_mask);
-
-/* Returns currently supported vhost features */
-uint64_t rte_vhost_feature_get(void);
-
-int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
-
-/**
- * Register vhost driver. path could be different for multiple
- * instance support.
- */
-int rte_vhost_driver_register(const char *path, uint64_t flags);
-
-/* Unregister vhost driver. This is only meaningful to vhost user. */
-int rte_vhost_driver_unregister(const char *path);
-
-/* Register callbacks. */
-int rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const);
-/* Start vhost driver session blocking loop. */
-int rte_vhost_driver_session_start(void);
-
-/**
- * Get the numa node from which the virtio net device's memory
- * is allocated.
- *
- * @param vid
- *  virtio-net device ID
- *
- * @return
- *  The numa node, -1 on failure
- */
-int rte_vhost_get_numa_node(int vid);
-
-/**
- * Get the number of queues the device supports.
- *
- * @param vid
- *  virtio-net device ID
- *
- * @return
- *  The number of queues, 0 on failure
- */
-uint32_t rte_vhost_get_queue_num(int vid);
-
-/**
- * Get the virtio net device's ifname, which is the vhost-user socket
- * file path.
- * - * @param vid - * virtio-net device ID - * @param buf - * The buffer to stored the queried ifname - * @param len - * The length of buf - * - * @return - * 0 on success, -1 on failure - */ -int rte_vhost_get_ifname(int vid, char *buf, size_t len); - -/** - * Get how many avail entries are left in the queue - * - * @param vid - * virtio-net device ID - * @param queue_id - * virtio queue index - * - * @return - * num of avail entires left - */ -uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id); - -/** - * This function adds buffers to the virtio devices RX virtqueue. Buffers can - * be received from the physical port or from another virtual device. A packet - * count is returned to indicate the number of packets that were succesfully - * added to the RX queue. - * @param vid - * virtio-net device ID - * @param queue_id - * virtio queue index in mq case - * @param pkts - * array to contain packets to be enqueued - * @param count - * packets num to be enqueued - * @return - * num of packets enqueued - */ -uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id, - struct rte_mbuf **pkts, uint16_t count); - -/** - * This function gets guest buffers from the virtio device TX virtqueue, - * construct host mbufs, copies guest buffer content to host mbufs and - * store them in pkts to be processed. - * @param vid - * virtio-net device - * @param queue_id - * virtio queue index in mq case - * @param mbuf_pool - * mbuf_pool where host mbuf is allocated. - * @param pkts - * array to contain packets to be dequeued - * @param count - * packets num to be dequeued - * @return - * num of packets dequeued - */ -uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count); - -#endif /* _VIRTIO_NET_H_ */ diff --git a/lib/vhost/rte_vhost/socket.c b/lib/vhost/rte_vhost/socket.c index 9276ce58c..4eea67893 100644 --- a/lib/vhost/rte_vhost/socket.c +++ b/lib/vhost/rte_vhost/socket.c @@ -52,22 +52,42 @@ #include "vhost.h" #include "vhost_user.h" + +TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection); + /* * Every time rte_vhost_driver_register() is invoked, an associated * vhost_user_socket struct will be created. */ struct vhost_user_socket { + struct vhost_user_connection_list conn_list; + pthread_mutex_t conn_mutex; char *path; - int listenfd; - int connfd; + int socket_fd; + struct sockaddr_un un; bool is_server; bool reconnect; bool dequeue_zero_copy; + + /* + * The "supported_features" indicates the feature bits the + * vhost driver supports. The "features" indicates the feature + * bits after the rte_vhost_driver_features_disable/enable(). + * It is also the final feature bits used for vhost-user + * features negotiation. 
+ */ + uint64_t supported_features; + uint64_t features; + + struct vhost_device_ops const *notify_ops; }; struct vhost_user_connection { struct vhost_user_socket *vsocket; + int connfd; int vid; + + TAILQ_ENTRY(vhost_user_connection) next; }; #define MAX_VHOST_SOCKET 1024 @@ -82,7 +102,8 @@ struct vhost_user { static void vhost_user_server_new_connection(int fd, void *data, int *remove); static void vhost_user_read_cb(int fd, void *dat, int *remove); -static int vhost_user_create_client(struct vhost_user_socket *vsocket); +static int create_unix_socket(struct vhost_user_socket *vsocket); +static int vhost_user_start_client(struct vhost_user_socket *vsocket); static struct vhost_user vhost_user = { .fdset = { @@ -160,7 +181,8 @@ send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) msgh.msg_controllen = sizeof(control); cmsg = CMSG_FIRSTHDR(&msgh); if (cmsg == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, "null cmsg\n"); + RTE_LOG(ERR, VHOST_CONFIG, "cmsg == NULL\n"); + errno = EINVAL; return -1; } cmsg->cmsg_len = CMSG_LEN(fdsize); @@ -213,19 +235,23 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid); - vsocket->connfd = fd; + conn->connfd = fd; conn->vsocket = vsocket; conn->vid = vid; ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb, NULL, conn); if (ret < 0) { - vsocket->connfd = -1; + conn->connfd = -1; free(conn); close(fd); RTE_LOG(ERR, VHOST_CONFIG, "failed to add fd %d into vhost server fdset\n", fd); } + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next); + pthread_mutex_unlock(&vsocket->conn_mutex); } /* call back when there is new vhost-user connection from client */ @@ -251,29 +277,36 @@ vhost_user_read_cb(int connfd, void *dat, int *remove) ret = vhost_user_msg_handler(conn->vid, connfd); if (ret < 0) { - vsocket->connfd = -1; close(connfd); *remove = 1; vhost_destroy_device(conn->vid); + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_REMOVE(&vsocket->conn_list, conn, next); + pthread_mutex_unlock(&vsocket->conn_mutex); + free(conn); - if (vsocket->reconnect) - vhost_user_create_client(vsocket); + if (vsocket->reconnect) { + create_unix_socket(vsocket); + vhost_user_start_client(vsocket); + } } } static int -create_unix_socket(const char *path, struct sockaddr_un *un, bool is_server) +create_unix_socket(struct vhost_user_socket *vsocket) { int fd; + struct sockaddr_un *un = &vsocket->un; fd = socket(AF_UNIX, SOCK_STREAM, 0); if (fd < 0) return -1; RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n", - is_server ? "server" : "client", fd); + vsocket->is_server ? 
"server" : "client", fd); - if (!is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) { + if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) { RTE_LOG(ERR, VHOST_CONFIG, "vhost-user: can't set nonblocking mode for socket, fd: " "%d (%s)\n", fd, strerror(errno)); @@ -283,25 +316,21 @@ create_unix_socket(const char *path, struct sockaddr_un *un, bool is_server) memset(un, 0, sizeof(*un)); un->sun_family = AF_UNIX; - strncpy(un->sun_path, path, sizeof(un->sun_path)); + strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path)); un->sun_path[sizeof(un->sun_path) - 1] = '\0'; - return fd; + vsocket->socket_fd = fd; + return 0; } static int -vhost_user_create_server(struct vhost_user_socket *vsocket) +vhost_user_start_server(struct vhost_user_socket *vsocket) { - int fd; int ret; - struct sockaddr_un un; + int fd = vsocket->socket_fd; const char *path = vsocket->path; - fd = create_unix_socket(path, &un, vsocket->is_server); - if (fd < 0) - return -1; - - ret = bind(fd, (struct sockaddr *)&un, sizeof(un)); + ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un)); if (ret < 0) { RTE_LOG(ERR, VHOST_CONFIG, "failed to bind to %s: %s; remove it and try again\n", @@ -314,7 +343,6 @@ vhost_user_create_server(struct vhost_user_socket *vsocket) if (ret < 0) goto err; - vsocket->listenfd = fd; ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection, NULL, vsocket); if (ret < 0) { @@ -433,26 +461,21 @@ vhost_user_reconnect_init(void) } static int -vhost_user_create_client(struct vhost_user_socket *vsocket) +vhost_user_start_client(struct vhost_user_socket *vsocket) { - int fd; int ret; - struct sockaddr_un un; + int fd = vsocket->socket_fd; const char *path = vsocket->path; struct vhost_user_reconnect *reconn; - fd = create_unix_socket(path, &un, vsocket->is_server); - if (fd < 0) - return -1; - - ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&un, - sizeof(un)); + ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un, + sizeof(vsocket->un)); if (ret == 0) { vhost_user_add_connection(fd, vsocket); return 0; } - RTE_LOG(ERR, VHOST_CONFIG, + RTE_LOG(WARNING, VHOST_CONFIG, "failed to connect to %s: %s\n", path, strerror(errno)); @@ -461,7 +484,7 @@ vhost_user_create_client(struct vhost_user_socket *vsocket) return -1; } - RTE_LOG(ERR, VHOST_CONFIG, "%s: reconnecting...\n", path); + RTE_LOG(INFO, VHOST_CONFIG, "%s: reconnecting...\n", path); reconn = malloc(sizeof(*reconn)); if (reconn == NULL) { RTE_LOG(ERR, VHOST_CONFIG, @@ -469,7 +492,7 @@ vhost_user_create_client(struct vhost_user_socket *vsocket) close(fd); return -1; } - reconn->un = un; + reconn->un = vsocket->un; reconn->fd = fd; reconn->vsocket = vsocket; pthread_mutex_lock(&reconn_list.mutex); @@ -479,6 +502,94 @@ vhost_user_create_client(struct vhost_user_socket *vsocket) return 0; } +static struct vhost_user_socket * +find_vhost_user_socket(const char *path) +{ + int i; + + for (i = 0; i < vhost_user.vsocket_cnt; i++) { + struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; + + if (!strcmp(vsocket->path, path)) + return vsocket; + } + + return NULL; +} + +int +rte_vhost_driver_disable_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->features &= ~features; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 
0 : -1; +} + +int +rte_vhost_driver_enable_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) { + if ((vsocket->supported_features & features) != features) { + /* + * trying to enable features the driver doesn't + * support. + */ + pthread_mutex_unlock(&vhost_user.mutex); + return -1; + } + vsocket->features |= features; + } + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_set_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) { + vsocket->supported_features = features; + vsocket->features = features; + } + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_get_features(const char *path, uint64_t *features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + *features = vsocket->features; + pthread_mutex_unlock(&vhost_user.mutex); + + if (!vsocket) { + RTE_LOG(ERR, VHOST_CONFIG, + "socket file %s is not registered yet.\n", path); + return -1; + } else { + return 0; + } +} + /* * Register a new vhost-user socket; here we could act as server * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag @@ -506,9 +617,25 @@ rte_vhost_driver_register(const char *path, uint64_t flags) goto out; memset(vsocket, 0, sizeof(struct vhost_user_socket)); vsocket->path = strdup(path); - vsocket->connfd = -1; + TAILQ_INIT(&vsocket->conn_list); + pthread_mutex_init(&vsocket->conn_mutex, NULL); vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY; + /* + * Set the supported features correctly for the builtin vhost-user + * net driver. + * + * Applications know nothing about features the builtin virtio net + * driver (virtio_net.c) supports, thus it's not possible for them + * to invoke rte_vhost_driver_set_features(). To workaround it, here + * we set it unconditionally. If the application want to implement + * another vhost-user driver (say SCSI), it should call the + * rte_vhost_driver_set_features(), which will overwrite following + * two values. 
+ */ + vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES; + vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES; + if ((flags & RTE_VHOST_USER_CLIENT) != 0) { vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT); if (vsocket->reconnect && reconn_tid == 0) { @@ -518,11 +645,10 @@ rte_vhost_driver_register(const char *path, uint64_t flags) goto out; } } - ret = vhost_user_create_client(vsocket); } else { vsocket->is_server = true; - ret = vhost_user_create_server(vsocket); } + ret = create_unix_socket(vsocket); if (ret < 0) { free(vsocket->path); free(vsocket); @@ -569,7 +695,7 @@ rte_vhost_driver_unregister(const char *path) { int i; int count; - struct vhost_user_connection *conn; + struct vhost_user_connection *conn, *next; pthread_mutex_lock(&vhost_user.mutex); @@ -578,22 +704,29 @@ rte_vhost_driver_unregister(const char *path) if (!strcmp(vsocket->path, path)) { if (vsocket->is_server) { - fdset_del(&vhost_user.fdset, vsocket->listenfd); - close(vsocket->listenfd); + fdset_del(&vhost_user.fdset, vsocket->socket_fd); + close(vsocket->socket_fd); unlink(path); } else if (vsocket->reconnect) { vhost_user_remove_reconnect(vsocket); } - conn = fdset_del(&vhost_user.fdset, vsocket->connfd); - if (conn) { + pthread_mutex_lock(&vsocket->conn_mutex); + for (conn = TAILQ_FIRST(&vsocket->conn_list); + conn != NULL; + conn = next) { + next = TAILQ_NEXT(conn, next); + + fdset_del(&vhost_user.fdset, conn->connfd); RTE_LOG(INFO, VHOST_CONFIG, "free connfd = %d for device '%s'\n", - vsocket->connfd, path); - close(vsocket->connfd); + conn->connfd, path); + close(conn->connfd); vhost_destroy_device(conn->vid); + TAILQ_REMOVE(&vsocket->conn_list, conn, next); free(conn); } + pthread_mutex_unlock(&vsocket->conn_mutex); free(vsocket->path); free(vsocket); @@ -611,9 +744,59 @@ rte_vhost_driver_unregister(const char *path) return -1; } +/* + * Register ops so that we can add/remove device to data core. + */ int -rte_vhost_driver_session_start(void) +rte_vhost_driver_callback_register(const char *path, + struct vhost_device_ops const * const ops) { - fdset_event_dispatch(&vhost_user.fdset); - return 0; + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->notify_ops = ops; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +struct vhost_device_ops const * +vhost_driver_callback_get(const char *path) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 
vsocket->notify_ops : NULL;
+}
+
+int
+rte_vhost_driver_start(const char *path)
+{
+	struct vhost_user_socket *vsocket;
+	static pthread_t fdset_tid;
+
+	pthread_mutex_lock(&vhost_user.mutex);
+	vsocket = find_vhost_user_socket(path);
+	pthread_mutex_unlock(&vhost_user.mutex);
+
+	if (!vsocket)
+		return -1;
+
+	if (fdset_tid == 0) {
+		int ret = pthread_create(&fdset_tid, NULL, fdset_event_dispatch,
+				     &vhost_user.fdset);
+		if (ret < 0)
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"failed to create fdset handling thread");
+	}
+
+	if (vsocket->is_server)
+		return vhost_user_start_server(vsocket);
+	else
+		return vhost_user_start_client(vsocket);
+}
diff --git a/lib/vhost/rte_vhost/vhost.c b/lib/vhost/rte_vhost/vhost.c
index ccb8c4f19..74c12040e 100644
--- a/lib/vhost/rte_vhost/vhost.c
+++ b/lib/vhost/rte_vhost/vhost.c
@@ -45,34 +45,12 @@
 #include <rte_string_fns.h>
 #include <rte_memory.h>
 #include <rte_malloc.h>
+#include <rte_vhost.h>
 
 #include "vhost.h"
 
-#define VHOST_USER_F_PROTOCOL_FEATURES	30
-
-/* Features supported by this lib. */
-#define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
-				(1ULL << VIRTIO_NET_F_CTRL_VQ) | \
-				(1ULL << VIRTIO_NET_F_CTRL_RX) | \
-				(1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
-				(VHOST_SUPPORTS_MQ) | \
-				(1ULL << VIRTIO_F_VERSION_1) | \
-				(1ULL << VHOST_F_LOG_ALL) | \
-				(1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
-				(1ULL << VIRTIO_NET_F_HOST_TSO4) | \
-				(1ULL << VIRTIO_NET_F_HOST_TSO6) | \
-				(1ULL << VIRTIO_NET_F_CSUM) | \
-				(1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
-				(1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
-				(1ULL << VIRTIO_NET_F_GUEST_TSO6))
-
-uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
-
 struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
 
-/* device ops to add/remove device to/from data core. */
-struct virtio_net_device_ops const *notify_ops;
-
 struct virtio_net *
 get_device(int vid)
 {
@@ -106,10 +84,8 @@ cleanup_device(struct virtio_net *dev, int destroy)
 
 	vhost_backend_cleanup(dev);
 
-	for (i = 0; i < dev->virt_qp_nb; i++) {
-		cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ], destroy);
-		cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ], destroy);
-	}
+	for (i = 0; i < dev->nr_vring; i++)
+		cleanup_vq(dev->virtqueue[i], destroy);
 }
 
 /*
@@ -119,24 +95,21 @@ static void
 free_device(struct virtio_net *dev)
 {
 	uint32_t i;
-	struct vhost_virtqueue *rxq, *txq;
+	struct vhost_virtqueue *vq;
 
-	for (i = 0; i < dev->virt_qp_nb; i++) {
-		rxq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ];
-		txq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ];
+	for (i = 0; i < dev->nr_vring; i++) {
+		vq = dev->virtqueue[i];
 
-		rte_free(rxq->shadow_used_ring);
-		rte_free(txq->shadow_used_ring);
+		rte_free(vq->shadow_used_ring);
 
-		/* rxq and txq are allocated together as queue-pair */
-		rte_free(rxq);
+		rte_free(vq);
 	}
 
 	rte_free(dev);
 }
 
 static void
-init_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
+init_vring_queue(struct vhost_virtqueue *vq)
 {
 	memset(vq, 0, sizeof(struct vhost_virtqueue));
 
@@ -146,69 +119,48 @@
 	/* Backends are set to -1 indicating an inactive device. */
 	vq->backend = -1;
 
-	/* always set the default vq pair to enabled */
-	if (qp_idx == 0)
-		vq->enabled = 1;
+	/*
+	 * Always set the vq to enabled; this keeps compatibility with old
+	 * QEMU versions, which do not send the SET_VRING_ENABLE message.
+ */ + vq->enabled = 1; TAILQ_INIT(&vq->zmbuf_list); } static void -init_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx) -{ - uint32_t base_idx = qp_idx * VIRTIO_QNUM; - - init_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx); - init_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx); -} - -static void -reset_vring_queue(struct vhost_virtqueue *vq, int qp_idx) +reset_vring_queue(struct vhost_virtqueue *vq) { int callfd; callfd = vq->callfd; - init_vring_queue(vq, qp_idx); + init_vring_queue(vq); vq->callfd = callfd; } -static void -reset_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx) -{ - uint32_t base_idx = qp_idx * VIRTIO_QNUM; - - reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx); - reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx); -} - int -alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx) +alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx) { - struct vhost_virtqueue *virtqueue = NULL; - uint32_t virt_rx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_RXQ; - uint32_t virt_tx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_TXQ; + struct vhost_virtqueue *vq; - virtqueue = rte_malloc(NULL, - sizeof(struct vhost_virtqueue) * VIRTIO_QNUM, 0); - if (virtqueue == NULL) { + vq = rte_malloc(NULL, sizeof(struct vhost_virtqueue), 0); + if (vq == NULL) { RTE_LOG(ERR, VHOST_CONFIG, - "Failed to allocate memory for virt qp:%d.\n", qp_idx); + "Failed to allocate memory for vring:%u.\n", vring_idx); return -1; } - dev->virtqueue[virt_rx_q_idx] = virtqueue; - dev->virtqueue[virt_tx_q_idx] = virtqueue + VIRTIO_TXQ; + dev->virtqueue[vring_idx] = vq; + init_vring_queue(vq); - init_vring_queue_pair(dev, qp_idx); - - dev->virt_qp_nb += 1; + dev->nr_vring += 1; return 0; } /* * Reset some variables in device structure, while keeping few - * others untouched, such as vid, ifname, virt_qp_nb: they + * others untouched, such as vid, ifname, nr_vring: they * should be same unless the device is removed. 
*/ void @@ -220,8 +172,8 @@ reset_device(struct virtio_net *dev) dev->protocol_features = 0; dev->flags = 0; - for (i = 0; i < dev->virt_qp_nb; i++) - reset_vring_queue_pair(dev, i); + for (i = 0; i < dev->nr_vring; i++) + reset_vring_queue(dev->virtqueue[i]); } /* @@ -248,6 +200,7 @@ vhost_new_device(void) if (i == MAX_VHOST_DEVICE) { RTE_LOG(ERR, VHOST_CONFIG, "Failed to find a free slot for new device.\n"); + rte_free(dev); return -1; } @@ -271,7 +224,7 @@ vhost_destroy_device(int vid) if (dev->flags & VIRTIO_DEV_RUNNING) { dev->flags &= ~VIRTIO_DEV_RUNNING; - notify_ops->destroy_device(vid); + dev->notify_ops->destroy_device(vid); } cleanup_device(dev, 1); @@ -308,6 +261,25 @@ vhost_enable_dequeue_zero_copy(int vid) dev->dequeue_zero_copy = 1; } +int +rte_vhost_get_mtu(int vid, uint16_t *mtu) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return -ENODEV; + + if (!(dev->flags & VIRTIO_DEV_READY)) + return -EAGAIN; + + if (!(dev->features & VIRTIO_NET_F_MTU)) + return -ENOTSUP; + + *mtu = dev->mtu; + + return 0; +} + int rte_vhost_get_numa_node(int vid) { @@ -342,7 +314,18 @@ rte_vhost_get_queue_num(int vid) if (dev == NULL) return 0; - return dev->virt_qp_nb; + return dev->nr_vring / 2; +} + +uint16_t +rte_vhost_get_vring_num(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return 0; + + return dev->nr_vring; } int @@ -361,6 +344,75 @@ rte_vhost_get_ifname(int vid, char *buf, size_t len) return 0; } +int +rte_vhost_get_negotiated_features(int vid, uint64_t *features) +{ + struct virtio_net *dev; + + dev = get_device(vid); + if (!dev) + return -1; + + *features = dev->features; + return 0; +} + +int +rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem) +{ + struct virtio_net *dev; + struct rte_vhost_memory *m; + size_t size; + + dev = get_device(vid); + if (!dev) + return -1; + + size = dev->mem->nregions * sizeof(struct rte_vhost_mem_region); + m = malloc(sizeof(struct rte_vhost_memory) + size); + if (!m) + return -1; + + m->nregions = dev->mem->nregions; + memcpy(m->regions, dev->mem->regions, size); + *mem = m; + + return 0; +} + +int +rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, + struct rte_vhost_vring *vring) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + vring->desc = vq->desc; + vring->avail = vq->avail; + vring->used = vq->used; + vring->log_guest_addr = vq->log_guest_addr; + + vring->callfd = vq->callfd; + vring->kickfd = vq->kickfd; + vring->size = vq->size; + + vring->last_avail_idx = vq->last_avail_idx; + vring->last_used_idx = vq->last_used_idx; + + return 0; +} + uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id) { @@ -396,33 +448,56 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) return 0; } -uint64_t rte_vhost_feature_get(void) +void +rte_vhost_log_write(int vid, uint64_t addr, uint64_t len) { - return VHOST_FEATURES; + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + vhost_log_write(dev, addr, len); } -int rte_vhost_feature_disable(uint64_t feature_mask) +void +rte_vhost_log_used_vring(int vid, uint16_t vring_idx, + uint64_t offset, uint64_t len) { - VHOST_FEATURES = VHOST_FEATURES & ~feature_mask; - return 0; + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (dev == NULL) + return; + + if (vring_idx >= VHOST_MAX_VRING) + 
return;
+
+	vq = dev->virtqueue[vring_idx];
+	if (!vq)
+		return;
+
+	vhost_log_used_vring(dev, vq, offset, len);
 }
 
-int rte_vhost_feature_enable(uint64_t feature_mask)
-{
-	if ((feature_mask & VHOST_SUPPORTED_FEATURES) == feature_mask) {
-		VHOST_FEATURES = VHOST_FEATURES | feature_mask;
-		return 0;
-	}
-	return -1;
-}
-
-/*
- * Register ops so that we can add/remove device to data core.
- */
 int
-rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const ops)
-{
-	notify_ops = ops;
+rte_vhost_set_vhost_vring_last_idx(int vid, uint16_t vring_idx,
+	uint16_t last_avail_idx, uint16_t last_used_idx)
+{
+	struct virtio_net *dev;
+	struct vhost_virtqueue *vq;
+
+	dev = get_device(vid);
+	if (!dev)
+		return -1;
+
+	if (vring_idx >= VHOST_MAX_VRING)
+		return -1;
+
+	vq = dev->virtqueue[vring_idx];
+	if (!vq)
+		return -1;
+
+	vq->last_avail_idx = last_avail_idx;
+	vq->last_used_idx = last_used_idx;
 	return 0;
 }
diff --git a/lib/vhost/rte_vhost/vhost.h b/lib/vhost/rte_vhost/vhost.h
index 65ac83b14..fec175f20 100644
--- a/lib/vhost/rte_vhost/vhost.h
+++ b/lib/vhost/rte_vhost/vhost.h
@@ -39,14 +39,20 @@
 #include <sys/queue.h>
 #include <unistd.h>
 #include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <sys/socket.h>
+#include <linux/if.h>
 
 #include <rte_log.h>
+#include <rte_ether.h>
 
-#include "rte_virtio_net.h"
+#include "rte_vhost.h"
 #include "vhost_user.h"
 
 /* Used to indicate that the device is running on a data core */
 #define VIRTIO_DEV_RUNNING 1
+/* Used to indicate that the device is ready to operate */
+#define VIRTIO_DEV_READY 2
 
 /* Backend value set by guest. */
 #define VIRTIO_DEV_STOPPED -1
 
@@ -111,24 +117,20 @@ struct vhost_virtqueue {
 	uint16_t	shadow_used_idx;
 } __rte_cache_aligned;
 
-/* Old kernels have no such macro defined */
+/* Old kernels have no such macros defined */
 #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
  #define VIRTIO_NET_F_GUEST_ANNOUNCE 21
 #endif
+#ifndef VIRTIO_NET_F_MQ
+ #define VIRTIO_NET_F_MQ		22
+#endif
 
-/*
- * Make an extra wrapper for VIRTIO_NET_F_MQ and
- * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX as they are
- * introduced since kernel v3.8. This makes our
- * code buildable for older kernel.
- */
-#ifdef VIRTIO_NET_F_MQ
- #define VHOST_MAX_QUEUE_PAIRS	VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX
- #define VHOST_SUPPORTS_MQ	(1ULL << VIRTIO_NET_F_MQ)
-#else
- #define VHOST_MAX_QUEUE_PAIRS	1
- #define VHOST_SUPPORTS_MQ	0
+#define VHOST_MAX_VRING		0x100
+#define VHOST_MAX_QUEUE_PAIRS	0x80
+
+#ifndef VIRTIO_NET_F_MTU
+ #define VIRTIO_NET_F_MTU 3
 #endif
 
 /*
@@ -138,6 +140,27 @@ struct vhost_virtqueue {
 #define VIRTIO_F_VERSION_1 32
 #endif
 
+#define VHOST_USER_F_PROTOCOL_FEATURES	30
+
+/* Features supported by this builtin vhost-user net driver.
*/ +#define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ + (1ULL << VIRTIO_NET_F_CTRL_VQ) | \ + (1ULL << VIRTIO_NET_F_CTRL_RX) | \ + (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \ + (1ULL << VIRTIO_NET_F_MQ) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VHOST_F_LOG_ALL) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO4) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO6) | \ + (1ULL << VIRTIO_NET_F_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ + (1ULL << VIRTIO_NET_F_MTU)) + + struct guest_page { uint64_t guest_phys_addr; uint64_t host_phys_addr; @@ -150,7 +173,7 @@ struct guest_page { */ struct virtio_net { /* Frontend (QEMU) memory and memory region information */ - struct virtio_memory *mem; + struct rte_vhost_memory *mem; uint64_t features; uint64_t protocol_features; int vid; @@ -158,8 +181,7 @@ struct virtio_net { uint16_t vhost_hlen; /* to tell if we need broadcast rarp packet */ rte_atomic16_t broadcast_rarp; - uint32_t virt_qp_nb; - uint32_t num_queues; + uint32_t nr_vring; int dequeue_zero_copy; struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ) @@ -168,38 +190,55 @@ struct virtio_net { uint64_t log_base; uint64_t log_addr; struct ether_addr mac; + uint16_t mtu; + + struct vhost_device_ops const *notify_ops; uint32_t nr_guest_pages; uint32_t max_guest_pages; struct guest_page *guest_pages; - int has_new_mem_table; - struct VhostUserMemory mem_table; - int mem_table_fds[VHOST_MEMORY_MAX_NREGIONS]; + int has_new_mem_table; + struct VhostUserMemory mem_table; + int mem_table_fds[VHOST_MEMORY_MAX_NREGIONS]; } __rte_cache_aligned; -/** - * Information relating to memory regions including offsets to - * addresses in QEMUs memory file. - */ -struct virtio_memory_region { - uint64_t guest_phys_addr; - uint64_t guest_user_addr; - uint64_t host_user_addr; - uint64_t size; - void *mmap_addr; - uint64_t mmap_size; - int fd; -}; +#define VHOST_LOG_PAGE 4096 -/** - * Memory structure includes region and mapping information. 
- */ -struct virtio_memory { - uint32_t nregions; - struct virtio_memory_region regions[0]; -}; +static inline void __attribute__((always_inline)) +vhost_log_page(uint8_t *log_base, uint64_t page) +{ + log_base[page / 8] |= 1 << (page % 8); +} +static inline void __attribute__((always_inline)) +vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) +{ + uint64_t page; + + if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) || + !dev->log_base || !len)) + return; + + if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) + return; + + /* To make sure guest memory updates are committed before logging */ + rte_smp_wmb(); + + page = addr / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < addr + len) { + vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); + page += 1; + } +} + +static inline void __attribute__((always_inline)) +vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t offset, uint64_t len) +{ + vhost_log_write(dev, vq->log_guest_addr + offset, len); +} /* Macros for printing using RTE_LOG */ #define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 @@ -236,25 +275,6 @@ extern uint64_t VHOST_FEATURES; #define MAX_VHOST_DEVICE 1024 extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; -/* Convert guest physical Address to host virtual address */ -static inline uint64_t __attribute__((always_inline)) -gpa_to_vva(struct virtio_net *dev, uint64_t gpa) -{ - struct virtio_memory_region *reg; - uint32_t i; - - for (i = 0; i < dev->mem->nregions; i++) { - reg = &dev->mem->regions[i]; - if (gpa >= reg->guest_phys_addr && - gpa < reg->guest_phys_addr + reg->size) { - return gpa - reg->guest_phys_addr + - reg->host_user_addr; - } - } - - return 0; -} - /* Convert guest physical address to host physical address */ static inline phys_addr_t __attribute__((always_inline)) gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size) @@ -275,7 +295,6 @@ gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size) return 0; } -extern struct virtio_net_device_ops const *notify_ops; struct virtio_net *get_device(int vid); int vhost_new_device(void); @@ -283,11 +302,13 @@ void cleanup_device(struct virtio_net *dev, int destroy); void reset_device(struct virtio_net *dev); void vhost_destroy_device(int); -int alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx); +int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx); void vhost_set_ifname(int, const char *if_name, unsigned int if_len); void vhost_enable_dequeue_zero_copy(int vid); +struct vhost_device_ops const *vhost_driver_callback_get(const char *path); + /* * Backend-specific cleanup. 
* diff --git a/lib/vhost/rte_vhost/vhost_user.c b/lib/vhost/rte_vhost/vhost_user.c index 5c7829275..a0b136959 100644 --- a/lib/vhost/rte_vhost/vhost_user.c +++ b/lib/vhost/rte_vhost/vhost_user.c @@ -51,6 +51,9 @@ #include "vhost.h" #include "vhost_user.h" +#define VIRTIO_MIN_MTU 68 +#define VIRTIO_MAX_MTU 65535 + static const char *vhost_message_str[VHOST_USER_MAX] = { [VHOST_USER_NONE] = "VHOST_USER_NONE", [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", @@ -72,6 +75,7 @@ static const char *vhost_message_str[VHOST_USER_MAX] = { [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", + [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU", }; static uint64_t @@ -88,7 +92,7 @@ static void free_mem_region(struct virtio_net *dev) { uint32_t i; - struct virtio_memory_region *reg; + struct rte_vhost_mem_region *reg; if (!dev || !dev->mem) return; @@ -110,6 +114,10 @@ vhost_backend_cleanup(struct virtio_net *dev) rte_free(dev->mem); dev->mem = NULL; } + + free(dev->guest_pages); + dev->guest_pages = NULL; + if (dev->log_addr) { munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); dev->log_addr = 0; @@ -131,7 +139,7 @@ vhost_user_reset_owner(struct virtio_net *dev) { if (dev->flags & VIRTIO_DEV_RUNNING) { dev->flags &= ~VIRTIO_DEV_RUNNING; - notify_ops->destroy_device(dev->vid); + dev->notify_ops->destroy_device(dev->vid); } cleanup_device(dev, 0); @@ -143,9 +151,12 @@ vhost_user_reset_owner(struct virtio_net *dev) * The features that we support are requested. */ static uint64_t -vhost_user_get_features(void) +vhost_user_get_features(struct virtio_net *dev) { - return VHOST_FEATURES; + uint64_t features = 0; + + rte_vhost_driver_get_features(dev->ifname, &features); + return features; } /* @@ -154,9 +165,17 @@ vhost_user_get_features(void) static int vhost_user_set_features(struct virtio_net *dev, uint64_t features) { - if (features & ~VHOST_FEATURES) + uint64_t vhost_features = 0; + + rte_vhost_driver_get_features(dev->ifname, &vhost_features); + if (features & ~vhost_features) return -1; + if ((dev->flags & VIRTIO_DEV_RUNNING) && dev->features != features) { + if (dev->notify_ops->features_changed) + dev->notify_ops->features_changed(dev->vid, features); + } + dev->features = features; if (dev->features & ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) { @@ -223,13 +242,6 @@ numa_realloc(struct virtio_net *dev, int index) struct vhost_virtqueue *old_vq, *vq; int ret; - /* - * vq is allocated on pairs, we should try to do realloc - * on first queue of one queue pair only. - */ - if (index % VIRTIO_QNUM != 0) - return dev; - old_dev = dev; vq = old_vq = dev->virtqueue[index]; @@ -247,8 +259,7 @@ numa_realloc(struct virtio_net *dev, int index) if (oldnode != newnode) { RTE_LOG(INFO, VHOST_CONFIG, "reallocate vq from %d to %d node\n", oldnode, newnode); - vq = rte_malloc_socket(NULL, sizeof(*vq) * VIRTIO_QNUM, 0, - newnode); + vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode); if (!vq) return dev; @@ -280,7 +291,6 @@ numa_realloc(struct virtio_net *dev, int index) out: dev->virtqueue[index] = vq; - dev->virtqueue[index + 1] = vq + 1; vhost_devices[dev->vid] = dev; return dev; @@ -300,7 +310,7 @@ numa_realloc(struct virtio_net *dev, int index __rte_unused) static uint64_t qva_to_vva(struct virtio_net *dev, uint64_t qva) { - struct virtio_memory_region *reg; + struct rte_vhost_mem_region *reg; uint32_t i; /* Find the region where the address lives. 
*/ @@ -401,6 +411,12 @@ static int vhost_user_set_vring_base(struct virtio_net *dev, VhostUserMsg *msg) { + /* Remove from the data plane. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + dev->virtqueue[msg->payload.state.index]->last_used_idx = msg->payload.state.num; dev->virtqueue[msg->payload.state.index]->last_avail_idx = msg->payload.state.num; @@ -413,15 +429,10 @@ add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, { struct guest_page *page, *last_page; - if (dev->nr_guest_pages == dev->max_guest_pages && - dev->nr_guest_pages > 0) { - dev->max_guest_pages *= 2; + if (dev->nr_guest_pages == dev->max_guest_pages) { + dev->max_guest_pages = RTE_MAX(8U, dev->max_guest_pages * 2); dev->guest_pages = realloc(dev->guest_pages, dev->max_guest_pages * sizeof(*page)); - if (!dev->guest_pages) { - RTE_LOG(ERR, VHOST_CONFIG, "cannot realloc guest_pages\n"); - abort(); - } } if (dev->nr_guest_pages > 0) { @@ -441,7 +452,7 @@ add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, } static void -add_guest_pages(struct virtio_net *dev, struct virtio_memory_region *reg, +add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, uint64_t page_size) { uint64_t reg_size = reg->size; @@ -460,14 +471,14 @@ add_guest_pages(struct virtio_net *dev, struct virtio_memory_region *reg, reg_size -= size; while (reg_size > 0) { + size = RTE_MIN(reg_size, page_size); host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t) host_user_addr); - add_one_guest_page(dev, guest_phys_addr, host_phys_addr, - page_size); + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); - host_user_addr += page_size; - guest_phys_addr += page_size; - reg_size -= page_size; + host_user_addr += size; + guest_phys_addr += size; + reg_size -= size; } } @@ -520,11 +531,11 @@ vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) return 0; } -static int + static int vhost_setup_mem_table(struct virtio_net *dev) { struct VhostUserMemory memory = dev->mem_table; - struct virtio_memory_region *reg; + struct rte_vhost_mem_region *reg; void *mmap_addr; uint64_t mmap_size; uint64_t mmap_offset; @@ -545,8 +556,8 @@ vhost_setup_mem_table(struct virtio_net *dev) sizeof(struct guest_page)); } - dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct virtio_memory) + - sizeof(struct virtio_memory_region) * memory.nregions, 0); + dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) + + sizeof(struct rte_vhost_mem_region) * memory.nregions, 0); if (dev->mem == NULL) { RTE_LOG(ERR, VHOST_CONFIG, "(%d) failed to allocate memory for dev->mem\n", @@ -597,7 +608,8 @@ vhost_setup_mem_table(struct virtio_net *dev) reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset; - add_guest_pages(dev, reg, alignment); + if (dev->dequeue_zero_copy) + add_guest_pages(dev, reg, alignment); RTE_LOG(INFO, VHOST_CONFIG, "guest memory region %u, size: 0x%" PRIx64 "\n" @@ -643,14 +655,14 @@ virtio_is_ready(struct virtio_net *dev) struct vhost_virtqueue *vq; uint32_t i; - for (i = 0; i < dev->num_queues; i++) { + if (dev->nr_vring == 0) + return 0; + + for (i = 0; i < dev->nr_vring; i++) { vq = dev->virtqueue[i]; - if (!vq_is_ready(vq)) { - RTE_LOG(INFO, VHOST_CONFIG, - "virtio is not ready for processing.\n"); + if (!vq_is_ready(vq)) return 0; - } } RTE_LOG(INFO, VHOST_CONFIG, @@ -663,7 +675,12 @@ vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) { struct 
vhost_vring_file file; struct vhost_virtqueue *vq; - uint32_t cur_qp_idx; + + /* Remove from the data plane. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) @@ -673,43 +690,25 @@ vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) RTE_LOG(INFO, VHOST_CONFIG, "vring call idx:%d file:%d\n", file.index, file.fd); - if (file.index + 1 > dev->num_queues) { - dev->num_queues = file.index + 1; - } - - /* - * FIXME: VHOST_SET_VRING_CALL is the first per-vring message - * we get, so we do vring queue pair allocation here. - */ - cur_qp_idx = file.index / VIRTIO_QNUM; - if (cur_qp_idx + 1 > dev->virt_qp_nb) { - if (alloc_vring_queue_pair(dev, cur_qp_idx) < 0) - return; - } - vq = dev->virtqueue[file.index]; - assert(vq != NULL); - if (vq->callfd >= 0) close(vq->callfd); vq->callfd = file.fd; - - if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) { - notify_ops->new_device(dev->vid); - } } -/* - * In vhost-user, when we receive kick message, will test whether virtio - * device is ready for packet processing. - */ static void vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg) { struct vhost_vring_file file; struct vhost_virtqueue *vq; + /* Remove from the data plane. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) file.fd = VIRTIO_INVALID_EVENTFD; @@ -722,16 +721,6 @@ vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg) if (vq->kickfd >= 0) close(vq->kickfd); vq->kickfd = file.fd; - - if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) { - if (dev->dequeue_zero_copy) { - RTE_LOG(INFO, VHOST_CONFIG, - "dequeue zero copy is enabled\n"); - } - - if (notify_ops->new_device(dev->vid) == 0) - dev->flags |= VIRTIO_DEV_RUNNING; - } } static void @@ -762,9 +751,11 @@ vhost_user_get_vring_base(struct virtio_net *dev, /* We have to stop the queue (virtio) if it is running. 
*/ if (dev->flags & VIRTIO_DEV_RUNNING) { dev->flags &= ~VIRTIO_DEV_RUNNING; - notify_ops->destroy_device(dev->vid); + dev->notify_ops->destroy_device(dev->vid); } + dev->flags &= ~VIRTIO_DEV_READY; + /* Here we are safe to get the last used index */ msg->payload.state.num = vq->last_used_idx; @@ -779,6 +770,10 @@ vhost_user_get_vring_base(struct virtio_net *dev, close(vq->kickfd); vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + + if (vq->callfd >= 0) + close(vq->callfd); + vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; if (dev->dequeue_zero_copy) @@ -803,8 +798,8 @@ vhost_user_set_vring_enable(struct virtio_net *dev, "set queue enable: %d to qp idx: %d\n", enable, msg->payload.state.index); - if (notify_ops->vring_state_changed) - notify_ops->vring_state_changed(dev->vid, msg->payload.state.index, enable); + if (dev->notify_ops->vring_state_changed) + dev->notify_ops->vring_state_changed(dev->vid, msg->payload.state.index, enable); dev->virtqueue[msg->payload.state.index]->enabled = enable; @@ -902,6 +897,22 @@ vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg) return 0; } +static int +vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + if (msg->payload.u64 < VIRTIO_MIN_MTU || + msg->payload.u64 > VIRTIO_MAX_MTU) { + RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n", + msg->payload.u64); + + return -1; + } + + dev->mtu = msg->payload.u64; + + return 0; +} + /* return bytes# of read on success or negative val on failure. */ static int read_vhost_message(int sockfd, struct VhostUserMsg *msg) @@ -941,6 +952,7 @@ send_vhost_message(int sockfd, struct VhostUserMsg *msg) return 0; msg->flags &= ~VHOST_USER_VERSION_MASK; + msg->flags &= ~VHOST_USER_NEED_REPLY; msg->flags |= VHOST_USER_VERSION; msg->flags |= VHOST_USER_REPLY_MASK; @@ -950,6 +962,44 @@ send_vhost_message(int sockfd, struct VhostUserMsg *msg) return ret; } +/* + * Allocate a queue pair if it hasn't been allocated yet + */ +static int +vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg) +{ + uint16_t vring_idx; + + switch (msg->request) { + case VHOST_USER_SET_VRING_KICK: + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_VRING_ERR: + vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + break; + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ENABLE: + vring_idx = msg->payload.state.index; + break; + case VHOST_USER_SET_VRING_ADDR: + vring_idx = msg->payload.addr.index; + break; + default: + return 0; + } + + if (vring_idx >= VHOST_MAX_VRING) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid vring index: %u\n", vring_idx); + return -1; + } + + if (dev->virtqueue[vring_idx]) + return 0; + + return alloc_vring_queue(dev, vring_idx); +} + int vhost_user_msg_handler(int vid, int fd) { @@ -961,6 +1011,16 @@ vhost_user_msg_handler(int vid, int fd) if (dev == NULL) return -1; + if (!dev->notify_ops) { + dev->notify_ops = vhost_driver_callback_get(dev->ifname); + if (!dev->notify_ops) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to get callback ops for driver %s\n", + dev->ifname); + return -1; + } + } + ret = read_vhost_message(fd, &msg); if (ret <= 0 || msg.request >= VHOST_USER_MAX) { if (ret < 0) @@ -978,9 +1038,17 @@ vhost_user_msg_handler(int vid, int fd) RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n", vhost_message_str[msg.request]); + + ret = vhost_user_check_and_alloc_queue_pair(dev, &msg); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to alloc queue\n"); + return -1; + } + switch (msg.request) { 
 	case VHOST_USER_GET_FEATURES:
-		msg.payload.u64 = vhost_user_get_features();
+		msg.payload.u64 = vhost_user_get_features(dev);
 		msg.size = sizeof(msg.payload.u64);
 		send_vhost_message(fd, &msg);
 		break;
@@ -1005,7 +1073,7 @@
 		break;
 
 	case VHOST_USER_SET_MEM_TABLE:
-		vhost_user_set_mem_table(dev, &msg);
+		ret = vhost_user_set_mem_table(dev, &msg);
 		break;
 
 	case VHOST_USER_SET_LOG_BASE:
@@ -1062,10 +1130,35 @@
 		vhost_user_send_rarp(dev, &msg);
 		break;
 
+	case VHOST_USER_NET_SET_MTU:
+		ret = vhost_user_net_set_mtu(dev, &msg);
+		break;
+
 	default:
+		ret = -1;
 		break;
 
 	}
 
+	if (msg.flags & VHOST_USER_NEED_REPLY) {
+		msg.payload.u64 = !!ret;
+		msg.size = sizeof(msg.payload.u64);
+		send_vhost_message(fd, &msg);
+	}
+
+	if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) {
+		dev->flags |= VIRTIO_DEV_READY;
+
+		if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
+			if (dev->dequeue_zero_copy) {
+				RTE_LOG(INFO, VHOST_CONFIG,
+					"dequeue zero copy is enabled\n");
+			}
+
+			if (dev->notify_ops->new_device(dev->vid) == 0)
+				dev->flags |= VIRTIO_DEV_RUNNING;
+		}
+	}
+
 	return 0;
 }
diff --git a/lib/vhost/rte_vhost/vhost_user.h b/lib/vhost/rte_vhost/vhost_user.h
index ba78d3268..2ba22dbb0 100644
--- a/lib/vhost/rte_vhost/vhost_user.h
+++ b/lib/vhost/rte_vhost/vhost_user.h
@@ -37,7 +37,7 @@
 #include <stdint.h>
 #include <linux/vhost.h>
 
-#include "rte_virtio_net.h"
+#include "rte_vhost.h"
 
 /* refer to hw/virtio/vhost-user.c */
 
@@ -46,10 +46,14 @@
 #define VHOST_USER_PROTOCOL_F_MQ	0
 #define VHOST_USER_PROTOCOL_F_LOG_SHMFD	1
 #define VHOST_USER_PROTOCOL_F_RARP	2
+#define VHOST_USER_PROTOCOL_F_REPLY_ACK	3
+#define VHOST_USER_PROTOCOL_F_NET_MTU	4
 
 #define VHOST_USER_PROTOCOL_FEATURES	((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
 					 (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
-					 (1ULL << VHOST_USER_PROTOCOL_F_RARP))
+					 (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \
+					 (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \
+					 (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU))
 
 typedef enum VhostUserRequest {
 	VHOST_USER_NONE = 0,
@@ -72,6 +76,7 @@ typedef enum VhostUserRequest {
 	VHOST_USER_GET_QUEUE_NUM = 17,
 	VHOST_USER_SET_VRING_ENABLE = 18,
 	VHOST_USER_SEND_RARP = 19,
+	VHOST_USER_NET_SET_MTU = 20,
 	VHOST_USER_MAX
 } VhostUserRequest;
 
@@ -98,6 +103,7 @@ typedef struct VhostUserMsg {
 
 #define VHOST_USER_VERSION_MASK     0x3
 #define VHOST_USER_REPLY_MASK       (0x1 << 2)
+#define VHOST_USER_NEED_REPLY       (0x1 << 3)
 	uint32_t flags;
 	uint32_t size; /* the following payload size */
 	union {
diff --git a/lib/vhost/rte_vhost/virtio_net.c b/lib/vhost/rte_vhost/virtio_net.c
deleted file mode 100644
index 8a2bc5228..000000000
--- a/lib/vhost/rte_vhost/virtio_net.c
+++ /dev/null
@@ -1,1185 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "vhost.h" - -#define MAX_PKT_BURST 32 -#define VHOST_LOG_PAGE 4096 - -static inline void __attribute__((always_inline)) -vhost_log_page(uint8_t *log_base, uint64_t page) -{ - log_base[page / 8] |= 1 << (page % 8); -} - -static inline void __attribute__((always_inline)) -vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) -{ - uint64_t page; - - if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) || - !dev->log_base || !len)) - return; - - if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) - return; - - /* To make sure guest memory updates are committed before logging */ - rte_smp_wmb(); - - page = addr / VHOST_LOG_PAGE; - while (page * VHOST_LOG_PAGE < addr + len) { - vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); - page += 1; - } -} - -static inline void __attribute__((always_inline)) -vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint64_t offset, uint64_t len) -{ - vhost_log_write(dev, vq->log_guest_addr + offset, len); -} - -static bool -is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb) -{ - return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM; -} - -static inline void __attribute__((always_inline)) -do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint16_t to, uint16_t from, uint16_t size) -{ - rte_memcpy(&vq->used->ring[to], - &vq->shadow_used_ring[from], - size * sizeof(struct vring_used_elem)); - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, ring[to]), - size * sizeof(struct vring_used_elem)); -} - -static inline void __attribute__((always_inline)) -flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq) -{ - uint16_t used_idx = vq->last_used_idx & (vq->size - 1); - - if (used_idx + vq->shadow_used_idx <= vq->size) { - do_flush_shadow_used_ring(dev, vq, used_idx, 0, - vq->shadow_used_idx); - } else { - uint16_t size; - - /* update used ring interval [used_idx, vq->size] */ - size = vq->size - used_idx; - do_flush_shadow_used_ring(dev, vq, used_idx, 0, size); - - /* update the left half used ring interval [0, left_size] */ - do_flush_shadow_used_ring(dev, vq, 0, size, - vq->shadow_used_idx - size); - } - vq->last_used_idx += vq->shadow_used_idx; - - rte_smp_wmb(); - - *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx; - vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), - sizeof(vq->used->idx)); -} - -static inline void __attribute__((always_inline)) -update_shadow_used_ring(struct vhost_virtqueue *vq, - uint16_t desc_idx, uint16_t len) -{ - uint16_t i = 
vq->shadow_used_idx++; - - vq->shadow_used_ring[i].id = desc_idx; - vq->shadow_used_ring[i].len = len; -} - -static void -virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr) -{ - if (m_buf->ol_flags & PKT_TX_L4_MASK) { - net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len; - - switch (m_buf->ol_flags & PKT_TX_L4_MASK) { - case PKT_TX_TCP_CKSUM: - net_hdr->csum_offset = (offsetof(struct tcp_hdr, - cksum)); - break; - case PKT_TX_UDP_CKSUM: - net_hdr->csum_offset = (offsetof(struct udp_hdr, - dgram_cksum)); - break; - case PKT_TX_SCTP_CKSUM: - net_hdr->csum_offset = (offsetof(struct sctp_hdr, - cksum)); - break; - } - } - - if (m_buf->ol_flags & PKT_TX_TCP_SEG) { - if (m_buf->ol_flags & PKT_TX_IPV4) - net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; - else - net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; - net_hdr->gso_size = m_buf->tso_segsz; - net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len - + m_buf->l4_len; - } -} - -static inline void -copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr, - struct virtio_net_hdr_mrg_rxbuf hdr) -{ - if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf)) - *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr; - else - *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr; -} - -static inline int __attribute__((always_inline)) -copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs, - struct rte_mbuf *m, uint16_t desc_idx, uint32_t size) -{ - uint32_t desc_avail, desc_offset; - uint32_t mbuf_avail, mbuf_offset; - uint32_t cpy_len; - struct vring_desc *desc; - uint64_t desc_addr; - struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0}; - - desc = &descs[desc_idx]; - desc_addr = gpa_to_vva(dev, desc->addr); - /* - * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid - * performance issue with some versions of gcc (4.8.4 and 5.3.0) which - * otherwise stores offset on the stack instead of in a register. 
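The vhost_log_write()/vhost_log_page() pair deleted above implements live-migration dirty logging as a plain bitmap: one bit per 4 KiB guest page, set for every page a write touches. A standalone sketch of the same bookkeeping (log_dirty_range is a hypothetical name; the original's feature-bit check, bounds check, and write barrier are omitted here):

#include <stdint.h>

#define LOG_PAGE_SIZE 4096   /* mirrors VHOST_LOG_PAGE */

void
log_dirty_range(uint8_t *log_base, uint64_t addr, uint64_t len)
{
	uint64_t page = addr / LOG_PAGE_SIZE;

	if (len == 0)
		return;

	/* set one bit per touched 4 KiB page, same layout as vhost_log_page() */
	while (page * LOG_PAGE_SIZE < addr + len) {
		log_base[page / 8] |= (uint8_t)(1 << (page % 8));
		page++;
	}
}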
- */ - if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr) - return -1; - - rte_prefetch0((void *)(uintptr_t)desc_addr); - - virtio_enqueue_offload(m, &virtio_hdr.hdr); - copy_virtio_net_hdr(dev, desc_addr, virtio_hdr); - vhost_log_write(dev, desc->addr, dev->vhost_hlen); - PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0); - - desc_offset = dev->vhost_hlen; - desc_avail = desc->len - dev->vhost_hlen; - - mbuf_avail = rte_pktmbuf_data_len(m); - mbuf_offset = 0; - while (mbuf_avail != 0 || m->next != NULL) { - /* done with current mbuf, fetch next */ - if (mbuf_avail == 0) { - m = m->next; - - mbuf_offset = 0; - mbuf_avail = rte_pktmbuf_data_len(m); - } - - /* done with current desc buf, fetch next */ - if (desc_avail == 0) { - if ((desc->flags & VRING_DESC_F_NEXT) == 0) { - /* Room in vring buffer is not enough */ - return -1; - } - if (unlikely(desc->next >= size)) - return -1; - - desc = &descs[desc->next]; - desc_addr = gpa_to_vva(dev, desc->addr); - if (unlikely(!desc_addr)) - return -1; - - desc_offset = 0; - desc_avail = desc->len; - } - - cpy_len = RTE_MIN(desc_avail, mbuf_avail); - rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)), - rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), - cpy_len); - vhost_log_write(dev, desc->addr + desc_offset, cpy_len); - PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), - cpy_len, 0); - - mbuf_avail -= cpy_len; - mbuf_offset += cpy_len; - desc_avail -= cpy_len; - desc_offset += cpy_len; - } - - return 0; -} - -/** - * This function adds buffers to the virtio devices RX virtqueue. Buffers can - * be received from the physical port or from another virtio device. A packet - * count is returned to indicate the number of packets that are succesfully - * added to the RX queue. This function works when the mbuf is scattered, but - * it doesn't support the mergeable feature. - */ -static inline uint32_t __attribute__((always_inline)) -virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, - struct rte_mbuf **pkts, uint32_t count) -{ - struct vhost_virtqueue *vq; - uint16_t avail_idx, free_entries, start_idx; - uint16_t desc_indexes[MAX_PKT_BURST]; - struct vring_desc *descs; - uint16_t used_idx; - uint32_t i, sz; - - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) { - RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", - dev->vid, __func__, queue_id); - return 0; - } - - vq = dev->virtqueue[queue_id]; - if (unlikely(vq->enabled == 0)) - return 0; - - avail_idx = *((volatile uint16_t *)&vq->avail->idx); - start_idx = vq->last_used_idx; - free_entries = avail_idx - start_idx; - count = RTE_MIN(count, free_entries); - count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST); - if (count == 0) - return 0; - - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n", - dev->vid, start_idx, start_idx + count); - - /* Retrieve all of the desc indexes first to avoid caching issues. 
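virtio_dev_rx() above leans on two vring conventions: indexes are free-running uint16_t counters compared by unsigned subtraction, and ring slots are addressed with `& (vq->size - 1)`, which is why ring sizes must be powers of two. A small self-checking example of the wraparound arithmetic (the values are made up for illustration):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	const uint16_t size = 256;        /* ring size, must be a power of two */
	uint16_t last_used = 65530;       /* counter about to wrap */
	uint16_t avail = 4;               /* counter already wrapped */

	/* unsigned subtraction stays correct across the 16-bit wrap */
	uint16_t free_entries = avail - last_used;
	assert(free_entries == 10);

	/* only memory accesses are masked, never the counters themselves */
	assert((last_used & (size - 1)) == 250);

	return 0;
}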
*/ - rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]); - for (i = 0; i < count; i++) { - used_idx = (start_idx + i) & (vq->size - 1); - desc_indexes[i] = vq->avail->ring[used_idx]; - vq->used->ring[used_idx].id = desc_indexes[i]; - vq->used->ring[used_idx].len = pkts[i]->pkt_len + - dev->vhost_hlen; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, ring[used_idx]), - sizeof(vq->used->ring[used_idx])); - } - - rte_prefetch0(&vq->desc[desc_indexes[0]]); - for (i = 0; i < count; i++) { - uint16_t desc_idx = desc_indexes[i]; - int err; - - if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) { - descs = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev, - vq->desc[desc_idx].addr); - if (unlikely(!descs)) { - count = i; - break; - } - - desc_idx = 0; - sz = vq->desc[desc_idx].len / sizeof(*descs); - } else { - descs = vq->desc; - sz = vq->size; - } - - err = copy_mbuf_to_desc(dev, descs, pkts[i], desc_idx, sz); - if (unlikely(err)) { - used_idx = (start_idx + i) & (vq->size - 1); - vq->used->ring[used_idx].len = dev->vhost_hlen; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, ring[used_idx]), - sizeof(vq->used->ring[used_idx])); - } - - if (i + 1 < count) - rte_prefetch0(&vq->desc[desc_indexes[i+1]]); - } - - rte_smp_wmb(); - - *(volatile uint16_t *)&vq->used->idx += count; - vq->last_used_idx += count; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, idx), - sizeof(vq->used->idx)); - - /* flush used->idx update before we read avail->flags. */ - rte_mb(); - - /* Kick the guest if necessary. */ - if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) - && (vq->callfd >= 0)) - eventfd_write(vq->callfd, (eventfd_t)1); - return count; -} - -static inline int __attribute__((always_inline)) -fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint32_t avail_idx, uint32_t *vec_idx, - struct buf_vector *buf_vec, uint16_t *desc_chain_head, - uint16_t *desc_chain_len) -{ - uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)]; - uint32_t vec_id = *vec_idx; - uint32_t len = 0; - struct vring_desc *descs = vq->desc; - - *desc_chain_head = idx; - - if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) { - descs = (struct vring_desc *)(uintptr_t) - gpa_to_vva(dev, vq->desc[idx].addr); - if (unlikely(!descs)) - return -1; - - idx = 0; - } - - while (1) { - if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size)) - return -1; - - len += descs[idx].len; - buf_vec[vec_id].buf_addr = descs[idx].addr; - buf_vec[vec_id].buf_len = descs[idx].len; - buf_vec[vec_id].desc_idx = idx; - vec_id++; - - if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0) - break; - - idx = descs[idx].next; - } - - *desc_chain_len = len; - *vec_idx = vec_id; - - return 0; -} - -/* - * Returns -1 on fail, 0 on success - */ -static inline int -reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint32_t size, struct buf_vector *buf_vec, - uint16_t *num_buffers, uint16_t avail_head) -{ - uint16_t cur_idx; - uint32_t vec_idx = 0; - uint16_t tries = 0; - - uint16_t head_idx = 0; - uint16_t len = 0; - - *num_buffers = 0; - cur_idx = vq->last_avail_idx; - - while (size > 0) { - if (unlikely(cur_idx == avail_head)) - return -1; - - if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec, - &head_idx, &len) < 0)) - return -1; - len = RTE_MIN(len, size); - update_shadow_used_ring(vq, head_idx, len); - size -= len; - - cur_idx++; - tries++; - *num_buffers += 1; - - /* - * if we tried all available ring items, and still - * can't get enough buf, it means 
something abnormal - * happened. - */ - if (unlikely(tries >= vq->size)) - return -1; - } - - return 0; -} - -static inline int __attribute__((always_inline)) -copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m, - struct buf_vector *buf_vec, uint16_t num_buffers) -{ - struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0}; - uint32_t vec_idx = 0; - uint64_t desc_addr; - uint32_t mbuf_offset, mbuf_avail; - uint32_t desc_offset, desc_avail; - uint32_t cpy_len; - uint64_t hdr_addr, hdr_phys_addr; - struct rte_mbuf *hdr_mbuf; - - if (unlikely(m == NULL)) - return -1; - - desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr); - if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr) - return -1; - - hdr_mbuf = m; - hdr_addr = desc_addr; - hdr_phys_addr = buf_vec[vec_idx].buf_addr; - rte_prefetch0((void *)(uintptr_t)hdr_addr); - - virtio_hdr.num_buffers = num_buffers; - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n", - dev->vid, num_buffers); - - desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen; - desc_offset = dev->vhost_hlen; - - mbuf_avail = rte_pktmbuf_data_len(m); - mbuf_offset = 0; - while (mbuf_avail != 0 || m->next != NULL) { - /* done with current desc buf, get the next one */ - if (desc_avail == 0) { - vec_idx++; - desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr); - if (unlikely(!desc_addr)) - return -1; - - /* Prefetch buffer address. */ - rte_prefetch0((void *)(uintptr_t)desc_addr); - desc_offset = 0; - desc_avail = buf_vec[vec_idx].buf_len; - } - - /* done with current mbuf, get the next one */ - if (mbuf_avail == 0) { - m = m->next; - - mbuf_offset = 0; - mbuf_avail = rte_pktmbuf_data_len(m); - } - - if (hdr_addr) { - virtio_enqueue_offload(hdr_mbuf, &virtio_hdr.hdr); - copy_virtio_net_hdr(dev, hdr_addr, virtio_hdr); - vhost_log_write(dev, hdr_phys_addr, dev->vhost_hlen); - PRINT_PACKET(dev, (uintptr_t)hdr_addr, - dev->vhost_hlen, 0); - - hdr_addr = 0; - } - - cpy_len = RTE_MIN(desc_avail, mbuf_avail); - rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)), - rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), - cpy_len); - vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset, - cpy_len); - PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), - cpy_len, 0); - - mbuf_avail -= cpy_len; - mbuf_offset += cpy_len; - desc_avail -= cpy_len; - desc_offset += cpy_len; - } - - return 0; -} - -static inline uint32_t __attribute__((always_inline)) -virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, - struct rte_mbuf **pkts, uint32_t count) -{ - struct vhost_virtqueue *vq; - uint32_t pkt_idx = 0; - uint16_t num_buffers; - struct buf_vector buf_vec[BUF_VECTOR_MAX]; - uint16_t avail_head; - - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) { - RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", - dev->vid, __func__, queue_id); - return 0; - } - - vq = dev->virtqueue[queue_id]; - if (unlikely(vq->enabled == 0)) - return 0; - - count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); - if (count == 0) - return 0; - - rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); - - vq->shadow_used_idx = 0; - avail_head = *((volatile uint16_t *)&vq->avail->idx); - for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; - - if (unlikely(reserve_avail_buf_mergeable(dev, vq, - pkt_len, buf_vec, &num_buffers, - avail_head) < 0)) { - VHOST_LOG_DEBUG(VHOST_DATA, 
- "(%d) failed to get enough desc from vring\n", - dev->vid); - vq->shadow_used_idx -= num_buffers; - break; - } - - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", - dev->vid, vq->last_avail_idx, - vq->last_avail_idx + num_buffers); - - if (pkt_len > 0 && - copy_mbuf_to_desc_mergeable(dev, pkts[pkt_idx], buf_vec, num_buffers) < 0) { - vq->shadow_used_idx -= num_buffers; - break; - } - - vq->last_avail_idx += num_buffers; - } - - if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring(dev, vq); - - /* flush used->idx update before we read avail->flags. */ - rte_mb(); - - /* Kick the guest if necessary. */ - if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) - && (vq->callfd >= 0)) - eventfd_write(vq->callfd, (eventfd_t)1); - } - - return pkt_idx; -} - -uint16_t -rte_vhost_enqueue_burst(int vid, uint16_t queue_id, - struct rte_mbuf **pkts, uint16_t count) -{ - struct virtio_net *dev = get_device(vid); - - if (!dev) - return 0; - - if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) - return virtio_dev_merge_rx(dev, queue_id, pkts, count); - else - return virtio_dev_rx(dev, queue_id, pkts, count); -} - -static inline bool -virtio_net_with_host_offload(struct virtio_net *dev) -{ - if (dev->features & - (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_ECN | - VIRTIO_NET_F_HOST_TSO4 | VIRTIO_NET_F_HOST_TSO6 | - VIRTIO_NET_F_HOST_UFO)) - return true; - - return false; -} - -static void -parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr) -{ - struct ipv4_hdr *ipv4_hdr; - struct ipv6_hdr *ipv6_hdr; - void *l3_hdr = NULL; - struct ether_hdr *eth_hdr; - uint16_t ethertype; - - eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); - - m->l2_len = sizeof(struct ether_hdr); - ethertype = rte_be_to_cpu_16(eth_hdr->ether_type); - - if (ethertype == ETHER_TYPE_VLAN) { - struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1); - - m->l2_len += sizeof(struct vlan_hdr); - ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto); - } - - l3_hdr = (char *)eth_hdr + m->l2_len; - - switch (ethertype) { - case ETHER_TYPE_IPv4: - ipv4_hdr = (struct ipv4_hdr *)l3_hdr; - *l4_proto = ipv4_hdr->next_proto_id; - m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4; - *l4_hdr = (char *)l3_hdr + m->l3_len; - m->ol_flags |= PKT_TX_IPV4; - break; - case ETHER_TYPE_IPv6: - ipv6_hdr = (struct ipv6_hdr *)l3_hdr; - *l4_proto = ipv6_hdr->proto; - m->l3_len = sizeof(struct ipv6_hdr); - *l4_hdr = (char *)l3_hdr + m->l3_len; - m->ol_flags |= PKT_TX_IPV6; - break; - default: - m->l3_len = 0; - *l4_proto = 0; - break; - } -} - -static inline void __attribute__((always_inline)) -vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m) -{ - uint16_t l4_proto = 0; - void *l4_hdr = NULL; - struct tcp_hdr *tcp_hdr = NULL; - - if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE) - return; - - parse_ethernet(m, &l4_proto, &l4_hdr); - if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) { - if (hdr->csum_start == (m->l2_len + m->l3_len)) { - switch (hdr->csum_offset) { - case (offsetof(struct tcp_hdr, cksum)): - if (l4_proto == IPPROTO_TCP) - m->ol_flags |= PKT_TX_TCP_CKSUM; - break; - case (offsetof(struct udp_hdr, dgram_cksum)): - if (l4_proto == IPPROTO_UDP) - m->ol_flags |= PKT_TX_UDP_CKSUM; - break; - case (offsetof(struct sctp_hdr, cksum)): - if (l4_proto == IPPROTO_SCTP) - m->ol_flags |= PKT_TX_SCTP_CKSUM; - break; - default: - break; - } - } - } - - if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { - switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { - case VIRTIO_NET_HDR_GSO_TCPV4: - case 
VIRTIO_NET_HDR_GSO_TCPV6: - if (l4_hdr == NULL) { - RTE_LOG(ERR, VHOST_DATA, "l4_hdr is NULL\n"); - break; - } - tcp_hdr = (struct tcp_hdr *)l4_hdr; - m->ol_flags |= PKT_TX_TCP_SEG; - m->tso_segsz = hdr->gso_size; - m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2; - break; - default: - RTE_LOG(WARNING, VHOST_DATA, - "unsupported gso type %u.\n", hdr->gso_type); - break; - } - } -} - -#define RARP_PKT_SIZE 64 - -static int -make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac) -{ - struct ether_hdr *eth_hdr; - struct arp_hdr *rarp; - - if (rarp_mbuf->buf_len < 64) { - RTE_LOG(WARNING, VHOST_DATA, - "failed to make RARP; mbuf size too small %u (< %d)\n", - rarp_mbuf->buf_len, RARP_PKT_SIZE); - return -1; - } - - /* Ethernet header. */ - eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0); - memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN); - ether_addr_copy(mac, &eth_hdr->s_addr); - eth_hdr->ether_type = htons(ETHER_TYPE_RARP); - - /* RARP header. */ - rarp = (struct arp_hdr *)(eth_hdr + 1); - rarp->arp_hrd = htons(ARP_HRD_ETHER); - rarp->arp_pro = htons(ETHER_TYPE_IPv4); - rarp->arp_hln = ETHER_ADDR_LEN; - rarp->arp_pln = 4; - rarp->arp_op = htons(ARP_OP_REVREQUEST); - - ether_addr_copy(mac, &rarp->arp_data.arp_sha); - ether_addr_copy(mac, &rarp->arp_data.arp_tha); - memset(&rarp->arp_data.arp_sip, 0x00, 4); - memset(&rarp->arp_data.arp_tip, 0x00, 4); - - rarp_mbuf->pkt_len = rarp_mbuf->data_len = RARP_PKT_SIZE; - - return 0; -} - -static inline void __attribute__((always_inline)) -put_zmbuf(struct zcopy_mbuf *zmbuf) -{ - zmbuf->in_use = 0; -} - -static inline int __attribute__((always_inline)) -copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs, - uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx, - struct rte_mempool *mbuf_pool) -{ - struct vring_desc *desc; - uint64_t desc_addr; - uint32_t desc_avail, desc_offset; - uint32_t mbuf_avail, mbuf_offset; - uint32_t cpy_len; - struct rte_mbuf *cur = m, *prev = m; - struct virtio_net_hdr *hdr = NULL; - /* A counter to avoid desc dead loop chain */ - uint32_t nr_desc = 1; - - desc = &descs[desc_idx]; - if (unlikely((desc->len < dev->vhost_hlen)) || - (desc->flags & VRING_DESC_F_INDIRECT)) - return -1; - - desc_addr = gpa_to_vva(dev, desc->addr); - if (unlikely(!desc_addr)) - return -1; - - if (virtio_net_with_host_offload(dev)) { - hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr); - rte_prefetch0(hdr); - } - - /* - * A virtio driver normally uses at least 2 desc buffers - * for Tx: the first for storing the header, and others - * for storing the data. - */ - if (likely((desc->len == dev->vhost_hlen) && - (desc->flags & VRING_DESC_F_NEXT) != 0)) { - desc = &descs[desc->next]; - if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) - return -1; - - desc_addr = gpa_to_vva(dev, desc->addr); - if (unlikely(!desc_addr)) - return -1; - - desc_offset = 0; - desc_avail = desc->len; - nr_desc += 1; - } else { - desc_avail = desc->len - dev->vhost_hlen; - desc_offset = dev->vhost_hlen; - } - - rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset)); - - PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), desc_avail, 0); - - mbuf_offset = 0; - mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM; - while (1) { - uint64_t hpa; - - cpy_len = RTE_MIN(desc_avail, mbuf_avail); - - /* - * A desc buf might across two host physical pages that are - * not continuous. In such case (gpa_to_hpa returns 0), data - * will be copied even though zero copy is enabled. 
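The comment ending here is the crux of the dequeue zero-copy fallback: a guest buffer may be referenced in place only if its whole guest-physical range maps to one contiguous host-physical extent, and gpa_to_hpa() returning 0 signals that it does not. A toy sketch of that decision, assuming a fake translator that treats anything within a single 4 KiB page as contiguous (the real gpa_to_hpa() walks the registered memory regions):

#include <stdint.h>
#include <string.h>

#define TOY_PAGE 4096

/* toy model only: one-page ranges get a fake host-physical address */
static uint64_t
toy_gpa_to_hpa(uint64_t gpa, uint64_t len)
{
	return (gpa / TOY_PAGE == (gpa + len - 1) / TOY_PAGE) ? (0x100000 + gpa) : 0;
}

static const void *
fetch_desc_data(void *dst, const void *src_vva, uint64_t gpa, uint64_t len,
		int zero_copy)
{
	if (zero_copy && toy_gpa_to_hpa(gpa, len) != 0)
		return src_vva;          /* reference in place: one physical extent */

	memcpy(dst, src_vva, len);       /* page-crossing range, or zero copy off */
	return dst;
}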
- */ - if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev, - desc->addr + desc_offset, cpy_len)))) { - cur->data_len = cpy_len; - cur->data_off = 0; - cur->buf_addr = (void *)(uintptr_t)desc_addr; - cur->buf_physaddr = hpa; - - /* - * In zero copy mode, one mbuf can only reference data - * for one or partial of one desc buff. - */ - mbuf_avail = cpy_len; - } else { - rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, - mbuf_offset), - (void *)((uintptr_t)(desc_addr + desc_offset)), - cpy_len); - } - - mbuf_avail -= cpy_len; - mbuf_offset += cpy_len; - desc_avail -= cpy_len; - desc_offset += cpy_len; - - /* This desc reaches to its end, get the next one */ - if (desc_avail == 0) { - if ((desc->flags & VRING_DESC_F_NEXT) == 0) - break; - - if (unlikely(desc->next >= max_desc || - ++nr_desc > max_desc)) - return -1; - desc = &descs[desc->next]; - if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) - return -1; - - desc_addr = gpa_to_vva(dev, desc->addr); - if (unlikely(!desc_addr)) - return -1; - - rte_prefetch0((void *)(uintptr_t)desc_addr); - - desc_offset = 0; - desc_avail = desc->len; - - PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0); - } - - /* - * This mbuf reaches to its end, get a new one - * to hold more data. - */ - if (mbuf_avail == 0) { - cur = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(cur == NULL)) { - RTE_LOG(ERR, VHOST_DATA, "Failed to " - "allocate memory for mbuf.\n"); - return -1; - } - - prev->next = cur; - prev->data_len = mbuf_offset; - m->nb_segs += 1; - m->pkt_len += mbuf_offset; - prev = cur; - - mbuf_offset = 0; - mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM; - } - } - - prev->data_len = mbuf_offset; - m->pkt_len += mbuf_offset; - - if (hdr) - vhost_dequeue_offload(hdr, m); - - return 0; -} - -static inline void __attribute__((always_inline)) -update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint32_t used_idx, uint32_t desc_idx) -{ - vq->used->ring[used_idx].id = desc_idx; - vq->used->ring[used_idx].len = 0; - vhost_log_used_vring(dev, vq, - offsetof(struct vring_used, ring[used_idx]), - sizeof(vq->used->ring[used_idx])); -} - -static inline void __attribute__((always_inline)) -update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint32_t count) -{ - if (unlikely(count == 0)) - return; - - rte_smp_wmb(); - rte_smp_rmb(); - - vq->used->idx += count; - vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), - sizeof(vq->used->idx)); - - /* Kick guest if required. 
*/ - if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) - && (vq->callfd >= 0)) - eventfd_write(vq->callfd, (eventfd_t)1); -} - -static inline struct zcopy_mbuf *__attribute__((always_inline)) -get_zmbuf(struct vhost_virtqueue *vq) -{ - uint16_t i; - uint16_t last; - int tries = 0; - - /* search [last_zmbuf_idx, zmbuf_size) */ - i = vq->last_zmbuf_idx; - last = vq->zmbuf_size; - -again: - for (; i < last; i++) { - if (vq->zmbufs[i].in_use == 0) { - vq->last_zmbuf_idx = i + 1; - vq->zmbufs[i].in_use = 1; - return &vq->zmbufs[i]; - } - } - - tries++; - if (tries == 1) { - /* search [0, last_zmbuf_idx) */ - i = 0; - last = vq->last_zmbuf_idx; - goto again; - } - - return NULL; -} - -static inline bool __attribute__((always_inline)) -mbuf_is_consumed(struct rte_mbuf *m) -{ - while (m) { - if (rte_mbuf_refcnt_read(m) > 1) - return false; - m = m->next; - } - - return true; -} - -uint16_t -rte_vhost_dequeue_burst(int vid, uint16_t queue_id, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -{ - struct virtio_net *dev; - struct rte_mbuf *rarp_mbuf = NULL; - struct vhost_virtqueue *vq; - uint32_t desc_indexes[MAX_PKT_BURST]; - uint32_t used_idx; - uint32_t i = 0; - uint16_t free_entries; - uint16_t avail_idx; - - dev = get_device(vid); - if (!dev) - return 0; - - if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) { - RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", - dev->vid, __func__, queue_id); - return 0; - } - - vq = dev->virtqueue[queue_id]; - if (unlikely(vq->enabled == 0)) - return 0; - - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf, *next; - int nr_updated = 0; - - for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); - zmbuf != NULL; zmbuf = next) { - next = TAILQ_NEXT(zmbuf, next); - - if (mbuf_is_consumed(zmbuf->mbuf)) { - used_idx = vq->last_used_idx++ & (vq->size - 1); - update_used_ring(dev, vq, used_idx, - zmbuf->desc_idx); - nr_updated += 1; - - TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); - rte_pktmbuf_free(zmbuf->mbuf); - put_zmbuf(zmbuf); - vq->nr_zmbuf -= 1; - } - } - - update_used_idx(dev, vq, nr_updated); - } - - /* - * Construct a RARP broadcast packet, and inject it to the "pkts" - * array, to looks like that guest actually send such packet. - * - * Check user_send_rarp() for more information. - */ - if (unlikely(rte_atomic16_cmpset((volatile uint16_t *) - &dev->broadcast_rarp.cnt, 1, 0))) { - rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool); - if (rarp_mbuf == NULL) { - RTE_LOG(ERR, VHOST_DATA, - "Failed to allocate memory for mbuf.\n"); - return 0; - } - - if (make_rarp_packet(rarp_mbuf, &dev->mac)) { - rte_pktmbuf_free(rarp_mbuf); - rarp_mbuf = NULL; - } else { - count -= 1; - } - } - - free_entries = *((volatile uint16_t *)&vq->avail->idx) - - vq->last_avail_idx; - if (free_entries == 0) - goto out; - - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - - /* Prefetch available and used ring */ - avail_idx = vq->last_avail_idx & (vq->size - 1); - used_idx = vq->last_used_idx & (vq->size - 1); - rte_prefetch0(&vq->avail->ring[avail_idx]); - rte_prefetch0(&vq->used->ring[used_idx]); - - count = RTE_MIN(count, MAX_PKT_BURST); - count = RTE_MIN(count, free_entries); - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", - dev->vid, count); - - /* Retrieve all of the head indexes first to avoid caching issues. 
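The zero-copy dequeue bookkeeping around this point follows one lifetime rule: each mbuf handed to the application is pinned with an extra refcount, and the used-ring entry for its descriptor is published only once every segment's refcount has dropped back to 1, which is what mbuf_is_consumed() above tests. A toy model of that consume check, with a plain struct standing in for rte_mbuf:

#include <stdbool.h>
#include <stddef.h>

/* seg is a stand-in for rte_mbuf: refcnt 2 means the application still
 * holds the reference taken by rte_mbuf_refcnt_update(); refcnt 1 means
 * the backend owns the last reference and may recycle the descriptor. */
struct seg {
	int refcnt;
	struct seg *next;
};

static bool
chain_is_consumed(const struct seg *s)
{
	for (; s != NULL; s = s->next)
		if (s->refcnt > 1)
			return false;    /* application reference still live */

	return true;
}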
*/ - for (i = 0; i < count; i++) { - avail_idx = (vq->last_avail_idx + i) & (vq->size - 1); - used_idx = (vq->last_used_idx + i) & (vq->size - 1); - desc_indexes[i] = vq->avail->ring[avail_idx]; - - if (likely(dev->dequeue_zero_copy == 0)) - update_used_ring(dev, vq, used_idx, desc_indexes[i]); - } - - /* Prefetch descriptor index. */ - rte_prefetch0(&vq->desc[desc_indexes[0]]); - for (i = 0; i < count; i++) { - struct vring_desc *desc; - uint16_t sz, idx; - int err; - - if (likely(i + 1 < count)) - rte_prefetch0(&vq->desc[desc_indexes[i + 1]]); - - if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) { - desc = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev, - vq->desc[desc_indexes[i]].addr); - if (unlikely(!desc)) - break; - - rte_prefetch0(desc); - sz = vq->desc[desc_indexes[i]].len / sizeof(*desc); - idx = 0; - } else { - desc = vq->desc; - sz = vq->size; - idx = desc_indexes[i]; - } - - pkts[i] = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(pkts[i] == NULL)) { - RTE_LOG(ERR, VHOST_DATA, - "Failed to allocate memory for mbuf.\n"); - break; - } - - err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool); - if (unlikely(err)) { - rte_pktmbuf_free(pkts[i]); - break; - } - - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf; - - zmbuf = get_zmbuf(vq); - if (!zmbuf) { - rte_pktmbuf_free(pkts[i]); - break; - } - zmbuf->mbuf = pkts[i]; - zmbuf->desc_idx = desc_indexes[i]; - - /* - * Pin lock the mbuf; we will check later to see - * whether the mbuf is freed (when we are the last - * user) or not. If that's the case, we then could - * update the used ring safely. - */ - rte_mbuf_refcnt_update(pkts[i], 1); - - vq->nr_zmbuf += 1; - TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); - } - } - vq->last_avail_idx += i; - - if (likely(dev->dequeue_zero_copy == 0)) { - vq->last_used_idx += i; - update_used_idx(dev, vq, i); - } - -out: - if (unlikely(rarp_mbuf != NULL)) { - /* - * Inject it to the head of "pkts" array, so that switch's mac - * learning table will get updated first. - */ - memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *)); - pkts[0] = rarp_mbuf; - i += 1; - } - - return i; -} diff --git a/lib/vhost/rte_vhost_17_05/Makefile b/lib/vhost/rte_vhost_17_05/Makefile deleted file mode 100644 index 537a3c70e..000000000 --- a/lib/vhost/rte_vhost_17_05/Makefile +++ /dev/null @@ -1,45 +0,0 @@ -# -# BSD LICENSE -# -# Copyright (c) Intel Corporation. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# - -SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) -include $(SPDK_ROOT_DIR)/mk/spdk.common.mk - -CFLAGS += -I. -CFLAGS += $(ENV_CFLAGS) - -# These are the DPDK vhost files copied (for now) into SPDK -C_SRCS += fd_man.c socket.c vhost_user.c vhost.c - -LIBNAME = rte_vhost - -include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/lib/vhost/rte_vhost_17_05/fd_man.c b/lib/vhost/rte_vhost_17_05/fd_man.c deleted file mode 100644 index 2ceacc9ab..000000000 --- a/lib/vhost/rte_vhost_17_05/fd_man.c +++ /dev/null @@ -1,300 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "fd_man.h" - -#define FDPOLLERR (POLLERR | POLLHUP | POLLNVAL) - -static int -get_last_valid_idx(struct fdset *pfdset, int last_valid_idx) -{ - int i; - - for (i = last_valid_idx; i >= 0 && pfdset->fd[i].fd == -1; i--) - ; - - return i; -} - -static void -fdset_move(struct fdset *pfdset, int dst, int src) -{ - pfdset->fd[dst] = pfdset->fd[src]; - pfdset->rwfds[dst] = pfdset->rwfds[src]; -} - -static void -fdset_shrink_nolock(struct fdset *pfdset) -{ - int i; - int last_valid_idx = get_last_valid_idx(pfdset, pfdset->num - 1); - - for (i = 0; i < last_valid_idx; i++) { - if (pfdset->fd[i].fd != -1) - continue; - - fdset_move(pfdset, i, last_valid_idx); - last_valid_idx = get_last_valid_idx(pfdset, last_valid_idx - 1); - } - pfdset->num = last_valid_idx + 1; -} - -/* - * Find deleted fd entries and remove them - */ -static void -fdset_shrink(struct fdset *pfdset) -{ - pthread_mutex_lock(&pfdset->fd_mutex); - fdset_shrink_nolock(pfdset); - pthread_mutex_unlock(&pfdset->fd_mutex); -} - -/** - * Returns the index in the fdset for a given fd. - * @return - * index for the fd, or -1 if fd isn't in the fdset. - */ -static int -fdset_find_fd(struct fdset *pfdset, int fd) -{ - int i; - - for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++) - ; - - return i == pfdset->num ? -1 : i; -} - -static void -fdset_add_fd(struct fdset *pfdset, int idx, int fd, - fd_cb rcb, fd_cb wcb, void *dat) -{ - struct fdentry *pfdentry = &pfdset->fd[idx]; - struct pollfd *pfd = &pfdset->rwfds[idx]; - - pfdentry->fd = fd; - pfdentry->rcb = rcb; - pfdentry->wcb = wcb; - pfdentry->dat = dat; - - pfd->fd = fd; - pfd->events = rcb ? POLLIN : 0; - pfd->events |= wcb ? POLLOUT : 0; - pfd->revents = 0; -} - -void -fdset_init(struct fdset *pfdset) -{ - int i; - - if (pfdset == NULL) - return; - - for (i = 0; i < MAX_FDS; i++) { - pfdset->fd[i].fd = -1; - pfdset->fd[i].dat = NULL; - } - pfdset->num = 0; -} - -/** - * Register the fd in the fdset with read/write handler and context. - */ -int -fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat) -{ - int i; - - if (pfdset == NULL || fd == -1) - return -1; - - pthread_mutex_lock(&pfdset->fd_mutex); - i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; - if (i == -1) { - fdset_shrink_nolock(pfdset); - i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; - if (i == -1) { - pthread_mutex_unlock(&pfdset->fd_mutex); - return -2; - } - } - - fdset_add_fd(pfdset, i, fd, rcb, wcb, dat); - pthread_mutex_unlock(&pfdset->fd_mutex); - - return 0; -} - -/** - * Unregister the fd from the fdset. - * Returns context of a given fd or NULL. - */ -void * -fdset_del(struct fdset *pfdset, int fd) -{ - int i; - void *dat = NULL; - - if (pfdset == NULL || fd == -1) - return NULL; - - do { - pthread_mutex_lock(&pfdset->fd_mutex); - - i = fdset_find_fd(pfdset, fd); - if (i != -1 && pfdset->fd[i].busy == 0) { - /* busy indicates r/wcb is executing! */ - dat = pfdset->fd[i].dat; - pfdset->fd[i].fd = -1; - pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL; - pfdset->fd[i].dat = NULL; - i = -1; - } - pthread_mutex_unlock(&pfdset->fd_mutex); - } while (i != -1); - - return dat; -} - - -/** - * This functions runs in infinite blocking loop until there is no fd in - * pfdset. It calls corresponding r/w handler if there is event on the fd. 
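The dispatch loop being removed here (it lives on in the updated fd_man.c) synchronizes with fdset_del() through the per-entry busy flag: the dispatcher marks an entry busy for the duration of its callback, and a concurrent delete spins until the flag clears before releasing the context. A condensed pthread sketch of that handshake, reduced to a single entry for clarity:

#include <pthread.h>

struct entry {
	int fd;                  /* -1 marks a deleted slot */
	int busy;                /* set while the callback is running */
	void *dat;               /* callback context */
	pthread_mutex_t lock;
};

/* dispatcher side: snapshot under the lock, run the callback without it */
static void
dispatch_once(struct entry *e, void (*cb)(int fd, void *dat))
{
	int fd;
	void *dat;

	pthread_mutex_lock(&e->lock);
	if (e->fd < 0) {
		pthread_mutex_unlock(&e->lock);
		return;
	}
	fd = e->fd;
	dat = e->dat;
	e->busy = 1;
	pthread_mutex_unlock(&e->lock);

	cb(fd, dat);
	e->busy = 0;             /* cleared outside the lock, as in the original */
}

/* deleter side: retry until no callback is in flight, then reclaim */
static void *
entry_del(struct entry *e)
{
	void *dat = NULL;
	int in_flight;

	do {
		pthread_mutex_lock(&e->lock);
		in_flight = e->busy;
		if (!in_flight) {
			dat = e->dat;
			e->fd = -1;
			e->dat = NULL;
		}
		pthread_mutex_unlock(&e->lock);
	} while (in_flight);

	return dat;              /* caller may free the context now */
}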
- * - * Before the callback is called, we set the flag to busy status; If other - * thread(now rte_vhost_driver_unregister) calls fdset_del concurrently, it - * will wait until the flag is reset to zero(which indicates the callback is - * finished), then it could free the context after fdset_del. - */ -void * -fdset_event_dispatch(void *arg) -{ - int i; - struct pollfd *pfd; - struct fdentry *pfdentry; - fd_cb rcb, wcb; - void *dat; - int fd, numfds; - int remove1, remove2; - int need_shrink; - struct fdset *pfdset = arg; - - if (pfdset == NULL) - return NULL; - - while (1) { - - /* - * When poll is blocked, other threads might unregister - * listenfds from and register new listenfds into fdset. - * When poll returns, the entries for listenfds in the fdset - * might have been updated. It is ok if there is unwanted call - * for new listenfds. - */ - pthread_mutex_lock(&pfdset->fd_mutex); - numfds = pfdset->num; - pthread_mutex_unlock(&pfdset->fd_mutex); - - poll(pfdset->rwfds, numfds, 1000 /* millisecs */); - - need_shrink = 0; - for (i = 0; i < numfds; i++) { - pthread_mutex_lock(&pfdset->fd_mutex); - - pfdentry = &pfdset->fd[i]; - fd = pfdentry->fd; - pfd = &pfdset->rwfds[i]; - - if (fd < 0) { - need_shrink = 1; - pthread_mutex_unlock(&pfdset->fd_mutex); - continue; - } - - if (!pfd->revents) { - pthread_mutex_unlock(&pfdset->fd_mutex); - continue; - } - - remove1 = remove2 = 0; - - rcb = pfdentry->rcb; - wcb = pfdentry->wcb; - dat = pfdentry->dat; - pfdentry->busy = 1; - - pthread_mutex_unlock(&pfdset->fd_mutex); - - if (rcb && pfd->revents & (POLLIN | FDPOLLERR)) - rcb(fd, dat, &remove1); - if (wcb && pfd->revents & (POLLOUT | FDPOLLERR)) - wcb(fd, dat, &remove2); - pfdentry->busy = 0; - /* - * fdset_del needs to check busy flag. - * We don't allow fdset_del to be called in callback - * directly. - */ - /* - * When we are to clean up the fd from fdset, - * because the fd is closed in the cb, - * the old fd val could be reused by when creates new - * listen fd in another thread, we couldn't call - * fd_set_del. - */ - if (remove1 || remove2) { - pfdentry->fd = -1; - need_shrink = 1; - } - } - - if (need_shrink) - fdset_shrink(pfdset); - } - - return NULL; -} diff --git a/lib/vhost/rte_vhost_17_05/fd_man.h b/lib/vhost/rte_vhost_17_05/fd_man.h deleted file mode 100644 index 3a9d269b3..000000000 --- a/lib/vhost/rte_vhost_17_05/fd_man.h +++ /dev/null @@ -1,69 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _FD_MAN_H_ -#define _FD_MAN_H_ -#include -#include -#include - -#define MAX_FDS 1024 - -typedef void (*fd_cb)(int fd, void *dat, int *remove); - -struct fdentry { - int fd; /* -1 indicates this entry is empty */ - fd_cb rcb; /* callback when this fd is readable. */ - fd_cb wcb; /* callback when this fd is writeable. */ - void *dat; /* fd context */ - int busy; /* whether this entry is being used in cb. */ -}; - -struct fdset { - struct pollfd rwfds[MAX_FDS]; - struct fdentry fd[MAX_FDS]; - pthread_mutex_t fd_mutex; - int num; /* current fd number of this fdset */ -}; - - -void fdset_init(struct fdset *pfdset); - -int fdset_add(struct fdset *pfdset, int fd, - fd_cb rcb, fd_cb wcb, void *dat); - -void *fdset_del(struct fdset *pfdset, int fd); - -void *fdset_event_dispatch(void *arg); - -#endif diff --git a/lib/vhost/rte_vhost_17_05/socket.c b/lib/vhost/rte_vhost_17_05/socket.c deleted file mode 100644 index 4eea67893..000000000 --- a/lib/vhost/rte_vhost_17_05/socket.c +++ /dev/null @@ -1,802 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "fd_man.h" -#include "vhost.h" -#include "vhost_user.h" - - -TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection); - -/* - * Every time rte_vhost_driver_register() is invoked, an associated - * vhost_user_socket struct will be created. - */ -struct vhost_user_socket { - struct vhost_user_connection_list conn_list; - pthread_mutex_t conn_mutex; - char *path; - int socket_fd; - struct sockaddr_un un; - bool is_server; - bool reconnect; - bool dequeue_zero_copy; - - /* - * The "supported_features" indicates the feature bits the - * vhost driver supports. The "features" indicates the feature - * bits after the rte_vhost_driver_features_disable/enable(). - * It is also the final feature bits used for vhost-user - * features negotiation. - */ - uint64_t supported_features; - uint64_t features; - - struct vhost_device_ops const *notify_ops; -}; - -struct vhost_user_connection { - struct vhost_user_socket *vsocket; - int connfd; - int vid; - - TAILQ_ENTRY(vhost_user_connection) next; -}; - -#define MAX_VHOST_SOCKET 1024 -struct vhost_user { - struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET]; - struct fdset fdset; - int vsocket_cnt; - pthread_mutex_t mutex; -}; - -#define MAX_VIRTIO_BACKLOG 128 - -static void vhost_user_server_new_connection(int fd, void *data, int *remove); -static void vhost_user_read_cb(int fd, void *dat, int *remove); -static int create_unix_socket(struct vhost_user_socket *vsocket); -static int vhost_user_start_client(struct vhost_user_socket *vsocket); - -static struct vhost_user vhost_user = { - .fdset = { - .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} }, - .fd_mutex = PTHREAD_MUTEX_INITIALIZER, - .num = 0 - }, - .vsocket_cnt = 0, - .mutex = PTHREAD_MUTEX_INITIALIZER, -}; - -/* return bytes# of read on success or negative val on failure. 
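The read_fd_message()/send_fd_message() pair in this file carries the vhost-user control channel's file descriptors (kick/call eventfds, memory-region fds) as SCM_RIGHTS ancillary data on the Unix socket. For reference, the minimal send side for a single descriptor looks roughly like this (error handling trimmed):

#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/uio.h>

ssize_t
send_one_fd(int sockfd, void *buf, size_t buflen, int fd)
{
	struct iovec iov = { .iov_base = buf, .iov_len = buflen };
	char control[CMSG_SPACE(sizeof(int))];
	struct msghdr msgh;
	struct cmsghdr *cmsg;

	memset(&msgh, 0, sizeof(msgh));
	msgh.msg_iov = &iov;
	msgh.msg_iovlen = 1;
	msgh.msg_control = control;
	msgh.msg_controllen = sizeof(control);

	cmsg = CMSG_FIRSTHDR(&msgh);
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;    /* kernel dups the fd into the peer */
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

	return sendmsg(sockfd, &msgh, 0);
}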
*/ -int -read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) -{ - struct iovec iov; - struct msghdr msgh; - size_t fdsize = fd_num * sizeof(int); - char control[CMSG_SPACE(fdsize)]; - struct cmsghdr *cmsg; - int ret; - - memset(&msgh, 0, sizeof(msgh)); - iov.iov_base = buf; - iov.iov_len = buflen; - - msgh.msg_iov = &iov; - msgh.msg_iovlen = 1; - msgh.msg_control = control; - msgh.msg_controllen = sizeof(control); - - ret = recvmsg(sockfd, &msgh, 0); - if (ret <= 0) { - RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n"); - return ret; - } - - if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { - RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n"); - return -1; - } - - for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; - cmsg = CMSG_NXTHDR(&msgh, cmsg)) { - if ((cmsg->cmsg_level == SOL_SOCKET) && - (cmsg->cmsg_type == SCM_RIGHTS)) { - memcpy(fds, CMSG_DATA(cmsg), fdsize); - break; - } - } - - return ret; -} - -int -send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) -{ - - struct iovec iov; - struct msghdr msgh; - size_t fdsize = fd_num * sizeof(int); - char control[CMSG_SPACE(fdsize)]; - struct cmsghdr *cmsg; - int ret; - - memset(&msgh, 0, sizeof(msgh)); - iov.iov_base = buf; - iov.iov_len = buflen; - - msgh.msg_iov = &iov; - msgh.msg_iovlen = 1; - - if (fds && fd_num > 0) { - msgh.msg_control = control; - msgh.msg_controllen = sizeof(control); - cmsg = CMSG_FIRSTHDR(&msgh); - if (cmsg == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, "cmsg == NULL\n"); - errno = EINVAL; - return -1; - } - cmsg->cmsg_len = CMSG_LEN(fdsize); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - memcpy(CMSG_DATA(cmsg), fds, fdsize); - } else { - msgh.msg_control = NULL; - msgh.msg_controllen = 0; - } - - do { - ret = sendmsg(sockfd, &msgh, 0); - } while (ret < 0 && errno == EINTR); - - if (ret < 0) { - RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n"); - return ret; - } - - return ret; -} - -static void -vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) -{ - int vid; - size_t size; - struct vhost_user_connection *conn; - int ret; - - conn = malloc(sizeof(*conn)); - if (conn == NULL) { - close(fd); - return; - } - - vid = vhost_new_device(); - if (vid == -1) { - close(fd); - free(conn); - return; - } - - size = strnlen(vsocket->path, PATH_MAX); - vhost_set_ifname(vid, vsocket->path, size); - - if (vsocket->dequeue_zero_copy) - vhost_enable_dequeue_zero_copy(vid); - - RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid); - - conn->connfd = fd; - conn->vsocket = vsocket; - conn->vid = vid; - ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb, - NULL, conn); - if (ret < 0) { - conn->connfd = -1; - free(conn); - close(fd); - RTE_LOG(ERR, VHOST_CONFIG, - "failed to add fd %d into vhost server fdset\n", - fd); - } - - pthread_mutex_lock(&vsocket->conn_mutex); - TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next); - pthread_mutex_unlock(&vsocket->conn_mutex); -} - -/* call back when there is new vhost-user connection from client */ -static void -vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused) -{ - struct vhost_user_socket *vsocket = dat; - - fd = accept(fd, NULL, NULL); - if (fd < 0) - return; - - RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd); - vhost_user_add_connection(fd, vsocket); -} - -static void -vhost_user_read_cb(int connfd, void *dat, int *remove) -{ - struct vhost_user_connection *conn = dat; - struct vhost_user_socket *vsocket = conn->vsocket; - int ret; - - ret = 
vhost_user_msg_handler(conn->vid, connfd); - if (ret < 0) { - close(connfd); - *remove = 1; - vhost_destroy_device(conn->vid); - - pthread_mutex_lock(&vsocket->conn_mutex); - TAILQ_REMOVE(&vsocket->conn_list, conn, next); - pthread_mutex_unlock(&vsocket->conn_mutex); - - free(conn); - - if (vsocket->reconnect) { - create_unix_socket(vsocket); - vhost_user_start_client(vsocket); - } - } -} - -static int -create_unix_socket(struct vhost_user_socket *vsocket) -{ - int fd; - struct sockaddr_un *un = &vsocket->un; - - fd = socket(AF_UNIX, SOCK_STREAM, 0); - if (fd < 0) - return -1; - RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n", - vsocket->is_server ? "server" : "client", fd); - - if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) { - RTE_LOG(ERR, VHOST_CONFIG, - "vhost-user: can't set nonblocking mode for socket, fd: " - "%d (%s)\n", fd, strerror(errno)); - close(fd); - return -1; - } - - memset(un, 0, sizeof(*un)); - un->sun_family = AF_UNIX; - strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path)); - un->sun_path[sizeof(un->sun_path) - 1] = '\0'; - - vsocket->socket_fd = fd; - return 0; -} - -static int -vhost_user_start_server(struct vhost_user_socket *vsocket) -{ - int ret; - int fd = vsocket->socket_fd; - const char *path = vsocket->path; - - ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un)); - if (ret < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "failed to bind to %s: %s; remove it and try again\n", - path, strerror(errno)); - goto err; - } - RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path); - - ret = listen(fd, MAX_VIRTIO_BACKLOG); - if (ret < 0) - goto err; - - ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection, - NULL, vsocket); - if (ret < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "failed to add listen fd %d to vhost server fdset\n", - fd); - goto err; - } - - return 0; - -err: - close(fd); - return -1; -} - -struct vhost_user_reconnect { - struct sockaddr_un un; - int fd; - struct vhost_user_socket *vsocket; - - TAILQ_ENTRY(vhost_user_reconnect) next; -}; - -TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect); -struct vhost_user_reconnect_list { - struct vhost_user_reconnect_tailq_list head; - pthread_mutex_t mutex; -}; - -static struct vhost_user_reconnect_list reconn_list; -static pthread_t reconn_tid; - -static int -vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz) -{ - int ret, flags; - - ret = connect(fd, un, sz); - if (ret < 0 && errno != EISCONN) - return -1; - - flags = fcntl(fd, F_GETFL, 0); - if (flags < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "can't get flags for connfd %d\n", fd); - return -2; - } - if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) { - RTE_LOG(ERR, VHOST_CONFIG, - "can't disable nonblocking on fd %d\n", fd); - return -2; - } - return 0; -} - -static void * -vhost_user_client_reconnect(void *arg __rte_unused) -{ - int ret; - struct vhost_user_reconnect *reconn, *next; - - while (1) { - pthread_mutex_lock(&reconn_list.mutex); - - /* - * An equal implementation of TAILQ_FOREACH_SAFE, - * which does not exist on all platforms. 
- */ - for (reconn = TAILQ_FIRST(&reconn_list.head); - reconn != NULL; reconn = next) { - next = TAILQ_NEXT(reconn, next); - - ret = vhost_user_connect_nonblock(reconn->fd, - (struct sockaddr *)&reconn->un, - sizeof(reconn->un)); - if (ret == -2) { - close(reconn->fd); - RTE_LOG(ERR, VHOST_CONFIG, - "reconnection for fd %d failed\n", - reconn->fd); - goto remove_fd; - } - if (ret == -1) - continue; - - RTE_LOG(INFO, VHOST_CONFIG, - "%s: connected\n", reconn->vsocket->path); - vhost_user_add_connection(reconn->fd, reconn->vsocket); -remove_fd: - TAILQ_REMOVE(&reconn_list.head, reconn, next); - free(reconn); - } - - pthread_mutex_unlock(&reconn_list.mutex); - sleep(1); - } - - return NULL; -} - -static int -vhost_user_reconnect_init(void) -{ - int ret; - - pthread_mutex_init(&reconn_list.mutex, NULL); - TAILQ_INIT(&reconn_list.head); - - ret = pthread_create(&reconn_tid, NULL, - vhost_user_client_reconnect, NULL); - if (ret < 0) - RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread"); - - return ret; -} - -static int -vhost_user_start_client(struct vhost_user_socket *vsocket) -{ - int ret; - int fd = vsocket->socket_fd; - const char *path = vsocket->path; - struct vhost_user_reconnect *reconn; - - ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un, - sizeof(vsocket->un)); - if (ret == 0) { - vhost_user_add_connection(fd, vsocket); - return 0; - } - - RTE_LOG(WARNING, VHOST_CONFIG, - "failed to connect to %s: %s\n", - path, strerror(errno)); - - if (ret == -2 || !vsocket->reconnect) { - close(fd); - return -1; - } - - RTE_LOG(INFO, VHOST_CONFIG, "%s: reconnecting...\n", path); - reconn = malloc(sizeof(*reconn)); - if (reconn == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "failed to allocate memory for reconnect\n"); - close(fd); - return -1; - } - reconn->un = vsocket->un; - reconn->fd = fd; - reconn->vsocket = vsocket; - pthread_mutex_lock(&reconn_list.mutex); - TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next); - pthread_mutex_unlock(&reconn_list.mutex); - - return 0; -} - -static struct vhost_user_socket * -find_vhost_user_socket(const char *path) -{ - int i; - - for (i = 0; i < vhost_user.vsocket_cnt; i++) { - struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; - - if (!strcmp(vsocket->path, path)) - return vsocket; - } - - return NULL; -} - -int -rte_vhost_driver_disable_features(const char *path, uint64_t features) -{ - struct vhost_user_socket *vsocket; - - pthread_mutex_lock(&vhost_user.mutex); - vsocket = find_vhost_user_socket(path); - if (vsocket) - vsocket->features &= ~features; - pthread_mutex_unlock(&vhost_user.mutex); - - return vsocket ? 0 : -1; -} - -int -rte_vhost_driver_enable_features(const char *path, uint64_t features) -{ - struct vhost_user_socket *vsocket; - - pthread_mutex_lock(&vhost_user.mutex); - vsocket = find_vhost_user_socket(path); - if (vsocket) { - if ((vsocket->supported_features & features) != features) { - /* - * trying to enable features the driver doesn't - * support. - */ - pthread_mutex_unlock(&vhost_user.mutex); - return -1; - } - vsocket->features |= features; - } - pthread_mutex_unlock(&vhost_user.mutex); - - return vsocket ? 0 : -1; -} - -int -rte_vhost_driver_set_features(const char *path, uint64_t features) -{ - struct vhost_user_socket *vsocket; - - pthread_mutex_lock(&vhost_user.mutex); - vsocket = find_vhost_user_socket(path); - if (vsocket) { - vsocket->supported_features = features; - vsocket->features = features; - } - pthread_mutex_unlock(&vhost_user.mutex); - - return vsocket ? 
0 : -1; -} - -int -rte_vhost_driver_get_features(const char *path, uint64_t *features) -{ - struct vhost_user_socket *vsocket; - - pthread_mutex_lock(&vhost_user.mutex); - vsocket = find_vhost_user_socket(path); - if (vsocket) - *features = vsocket->features; - pthread_mutex_unlock(&vhost_user.mutex); - - if (!vsocket) { - RTE_LOG(ERR, VHOST_CONFIG, - "socket file %s is not registered yet.\n", path); - return -1; - } else { - return 0; - } -} - -/* - * Register a new vhost-user socket; here we could act as server - * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag - * is set. - */ -int -rte_vhost_driver_register(const char *path, uint64_t flags) -{ - int ret = -1; - struct vhost_user_socket *vsocket; - - if (!path) - return -1; - - pthread_mutex_lock(&vhost_user.mutex); - - if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) { - RTE_LOG(ERR, VHOST_CONFIG, - "error: the number of vhost sockets reaches maximum\n"); - goto out; - } - - vsocket = malloc(sizeof(struct vhost_user_socket)); - if (!vsocket) - goto out; - memset(vsocket, 0, sizeof(struct vhost_user_socket)); - vsocket->path = strdup(path); - TAILQ_INIT(&vsocket->conn_list); - pthread_mutex_init(&vsocket->conn_mutex, NULL); - vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY; - - /* - * Set the supported features correctly for the builtin vhost-user - * net driver. - * - * Applications know nothing about features the builtin virtio net - * driver (virtio_net.c) supports, thus it's not possible for them - * to invoke rte_vhost_driver_set_features(). To workaround it, here - * we set it unconditionally. If the application want to implement - * another vhost-user driver (say SCSI), it should call the - * rte_vhost_driver_set_features(), which will overwrite following - * two values. 
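For a non-net backend, the override described above looks roughly like the sketch below. It assumes the DPDK 17.05 public rte_vhost.h; the MY_* bit positions are local stand-ins whose values mirror the definitions elsewhere in this patch.

#include <rte_vhost.h>

/* Local stand-ins so the sketch compiles on its own; the values
 * match VIRTIO_F_VERSION_1 (32) and VHOST_USER_F_PROTOCOL_FEATURES
 * (30) as defined later in this patch. */
#define MY_VIRTIO_F_VERSION_1			32
#define MY_VHOST_USER_F_PROTOCOL_FEATURES	30
#define MY_SCSI_FEATURES ((1ULL << MY_VIRTIO_F_VERSION_1) | \
			  (1ULL << MY_VHOST_USER_F_PROTOCOL_FEATURES))

static int
register_scsi_socket(const char *path)
{
	if (rte_vhost_driver_register(path, 0) < 0)
		return -1;

	/* Replaces the builtin VIRTIO_NET_SUPPORTED_FEATURES default:
	 * both supported_features and features are overwritten. */
	if (rte_vhost_driver_set_features(path, MY_SCSI_FEATURES) < 0)
		return -1;

	/* A real backend would also call
	 * rte_vhost_driver_callback_register() before this point. */
	return rte_vhost_driver_start(path);
}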
- */ - vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES; - vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES; - - if ((flags & RTE_VHOST_USER_CLIENT) != 0) { - vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT); - if (vsocket->reconnect && reconn_tid == 0) { - if (vhost_user_reconnect_init() < 0) { - free(vsocket->path); - free(vsocket); - goto out; - } - } - } else { - vsocket->is_server = true; - } - ret = create_unix_socket(vsocket); - if (ret < 0) { - free(vsocket->path); - free(vsocket); - goto out; - } - - vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket; - -out: - pthread_mutex_unlock(&vhost_user.mutex); - - return ret; -} - -static bool -vhost_user_remove_reconnect(struct vhost_user_socket *vsocket) -{ - int found = false; - struct vhost_user_reconnect *reconn, *next; - - pthread_mutex_lock(&reconn_list.mutex); - - for (reconn = TAILQ_FIRST(&reconn_list.head); - reconn != NULL; reconn = next) { - next = TAILQ_NEXT(reconn, next); - - if (reconn->vsocket == vsocket) { - TAILQ_REMOVE(&reconn_list.head, reconn, next); - close(reconn->fd); - free(reconn); - found = true; - break; - } - } - pthread_mutex_unlock(&reconn_list.mutex); - return found; -} - -/** - * Unregister the specified vhost socket - */ -int -rte_vhost_driver_unregister(const char *path) -{ - int i; - int count; - struct vhost_user_connection *conn, *next; - - pthread_mutex_lock(&vhost_user.mutex); - - for (i = 0; i < vhost_user.vsocket_cnt; i++) { - struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; - - if (!strcmp(vsocket->path, path)) { - if (vsocket->is_server) { - fdset_del(&vhost_user.fdset, vsocket->socket_fd); - close(vsocket->socket_fd); - unlink(path); - } else if (vsocket->reconnect) { - vhost_user_remove_reconnect(vsocket); - } - - pthread_mutex_lock(&vsocket->conn_mutex); - for (conn = TAILQ_FIRST(&vsocket->conn_list); - conn != NULL; - conn = next) { - next = TAILQ_NEXT(conn, next); - - fdset_del(&vhost_user.fdset, conn->connfd); - RTE_LOG(INFO, VHOST_CONFIG, - "free connfd = %d for device '%s'\n", - conn->connfd, path); - close(conn->connfd); - vhost_destroy_device(conn->vid); - TAILQ_REMOVE(&vsocket->conn_list, conn, next); - free(conn); - } - pthread_mutex_unlock(&vsocket->conn_mutex); - - free(vsocket->path); - free(vsocket); - - count = --vhost_user.vsocket_cnt; - vhost_user.vsockets[i] = vhost_user.vsockets[count]; - vhost_user.vsockets[count] = NULL; - pthread_mutex_unlock(&vhost_user.mutex); - - return 0; - } - } - pthread_mutex_unlock(&vhost_user.mutex); - - return -1; -} - -/* - * Register ops so that we can add/remove device to data core. - */ -int -rte_vhost_driver_callback_register(const char *path, - struct vhost_device_ops const * const ops) -{ - struct vhost_user_socket *vsocket; - - pthread_mutex_lock(&vhost_user.mutex); - vsocket = find_vhost_user_socket(path); - if (vsocket) - vsocket->notify_ops = ops; - pthread_mutex_unlock(&vhost_user.mutex); - - return vsocket ? 0 : -1; -} - -struct vhost_device_ops const * -vhost_driver_callback_get(const char *path) -{ - struct vhost_user_socket *vsocket; - - pthread_mutex_lock(&vhost_user.mutex); - vsocket = find_vhost_user_socket(path); - pthread_mutex_unlock(&vhost_user.mutex); - - return vsocket ? 
vsocket->notify_ops : NULL; -} - -int -rte_vhost_driver_start(const char *path) -{ - struct vhost_user_socket *vsocket; - static pthread_t fdset_tid; - - pthread_mutex_lock(&vhost_user.mutex); - vsocket = find_vhost_user_socket(path); - pthread_mutex_unlock(&vhost_user.mutex); - - if (!vsocket) - return -1; - - if (fdset_tid == 0) { - int ret = pthread_create(&fdset_tid, NULL, fdset_event_dispatch, - &vhost_user.fdset); - if (ret < 0) - RTE_LOG(ERR, VHOST_CONFIG, - "failed to create fdset handling thread"); - } - - if (vsocket->is_server) - return vhost_user_start_server(vsocket); - else - return vhost_user_start_client(vsocket); -} diff --git a/lib/vhost/rte_vhost_17_05/vhost.c b/lib/vhost/rte_vhost_17_05/vhost.c deleted file mode 100644 index 74c12040e..000000000 --- a/lib/vhost/rte_vhost_17_05/vhost.c +++ /dev/null @@ -1,503 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include -#ifdef RTE_LIBRTE_VHOST_NUMA -#include -#endif - -#include -#include -#include -#include -#include -#include - -#include "vhost.h" - -struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; - -struct virtio_net * -get_device(int vid) -{ - struct virtio_net *dev = vhost_devices[vid]; - - if (unlikely(!dev)) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%d) device not found.\n", vid); - } - - return dev; -} - -static void -cleanup_vq(struct vhost_virtqueue *vq, int destroy) -{ - if ((vq->callfd >= 0) && (destroy != 0)) - close(vq->callfd); - if (vq->kickfd >= 0) - close(vq->kickfd); -} - -/* - * Unmap any memory, close any file descriptors and - * free any memory owned by a device. - */ -void -cleanup_device(struct virtio_net *dev, int destroy) -{ - uint32_t i; - - vhost_backend_cleanup(dev); - - for (i = 0; i < dev->nr_vring; i++) - cleanup_vq(dev->virtqueue[i], destroy); -} - -/* - * Release virtqueues and device memory. 
- */ -static void -free_device(struct virtio_net *dev) -{ - uint32_t i; - struct vhost_virtqueue *vq; - - for (i = 0; i < dev->nr_vring; i++) { - vq = dev->virtqueue[i]; - - rte_free(vq->shadow_used_ring); - - rte_free(vq); - } - - rte_free(dev); -} - -static void -init_vring_queue(struct vhost_virtqueue *vq) -{ - memset(vq, 0, sizeof(struct vhost_virtqueue)); - - vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; - vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; - - /* Backends are set to -1 indicating an inactive device. */ - vq->backend = -1; - - /* - * always set the vq to enabled; this is to keep compatibility - * with the old QEMU, whereas there is no SET_VRING_ENABLE message. - */ - vq->enabled = 1; - - TAILQ_INIT(&vq->zmbuf_list); -} - -static void -reset_vring_queue(struct vhost_virtqueue *vq) -{ - int callfd; - - callfd = vq->callfd; - init_vring_queue(vq); - vq->callfd = callfd; -} - -int -alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx) -{ - struct vhost_virtqueue *vq; - - vq = rte_malloc(NULL, sizeof(struct vhost_virtqueue), 0); - if (vq == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "Failed to allocate memory for vring:%u.\n", vring_idx); - return -1; - } - - dev->virtqueue[vring_idx] = vq; - init_vring_queue(vq); - - dev->nr_vring += 1; - - return 0; -} - -/* - * Reset some variables in device structure, while keeping few - * others untouched, such as vid, ifname, nr_vring: they - * should be same unless the device is removed. - */ -void -reset_device(struct virtio_net *dev) -{ - uint32_t i; - - dev->features = 0; - dev->protocol_features = 0; - dev->flags = 0; - - for (i = 0; i < dev->nr_vring; i++) - reset_vring_queue(dev->virtqueue[i]); -} - -/* - * Invoked when there is a new vhost-user connection established (when - * there is a new virtio device being attached). - */ -int -vhost_new_device(void) -{ - struct virtio_net *dev; - int i; - - dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0); - if (dev == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "Failed to allocate memory for new dev.\n"); - return -1; - } - - for (i = 0; i < MAX_VHOST_DEVICE; i++) { - if (vhost_devices[i] == NULL) - break; - } - if (i == MAX_VHOST_DEVICE) { - RTE_LOG(ERR, VHOST_CONFIG, - "Failed to find a free slot for new device.\n"); - rte_free(dev); - return -1; - } - - vhost_devices[i] = dev; - dev->vid = i; - - return i; -} - -/* - * Invoked when there is the vhost-user connection is broken (when - * the virtio device is being detached). - */ -void -vhost_destroy_device(int vid) -{ - struct virtio_net *dev = get_device(vid); - - if (dev == NULL) - return; - - if (dev->flags & VIRTIO_DEV_RUNNING) { - dev->flags &= ~VIRTIO_DEV_RUNNING; - dev->notify_ops->destroy_device(vid); - } - - cleanup_device(dev, 1); - free_device(dev); - - vhost_devices[vid] = NULL; -} - -void -vhost_set_ifname(int vid, const char *if_name, unsigned int if_len) -{ - struct virtio_net *dev; - unsigned int len; - - dev = get_device(vid); - if (dev == NULL) - return; - - len = if_len > sizeof(dev->ifname) ? 
- sizeof(dev->ifname) : if_len; - - strncpy(dev->ifname, if_name, len); - dev->ifname[sizeof(dev->ifname) - 1] = '\0'; -} - -void -vhost_enable_dequeue_zero_copy(int vid) -{ - struct virtio_net *dev = get_device(vid); - - if (dev == NULL) - return; - - dev->dequeue_zero_copy = 1; -} - -int -rte_vhost_get_mtu(int vid, uint16_t *mtu) -{ - struct virtio_net *dev = get_device(vid); - - if (!dev) - return -ENODEV; - - if (!(dev->flags & VIRTIO_DEV_READY)) - return -EAGAIN; - - if (!(dev->features & VIRTIO_NET_F_MTU)) - return -ENOTSUP; - - *mtu = dev->mtu; - - return 0; -} - -int -rte_vhost_get_numa_node(int vid) -{ -#ifdef RTE_LIBRTE_VHOST_NUMA - struct virtio_net *dev = get_device(vid); - int numa_node; - int ret; - - if (dev == NULL) - return -1; - - ret = get_mempolicy(&numa_node, NULL, 0, dev, - MPOL_F_NODE | MPOL_F_ADDR); - if (ret < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%d) failed to query numa node: %d\n", vid, ret); - return -1; - } - - return numa_node; -#else - RTE_SET_USED(vid); - return -1; -#endif -} - -uint32_t -rte_vhost_get_queue_num(int vid) -{ - struct virtio_net *dev = get_device(vid); - - if (dev == NULL) - return 0; - - return dev->nr_vring / 2; -} - -uint16_t -rte_vhost_get_vring_num(int vid) -{ - struct virtio_net *dev = get_device(vid); - - if (dev == NULL) - return 0; - - return dev->nr_vring; -} - -int -rte_vhost_get_ifname(int vid, char *buf, size_t len) -{ - struct virtio_net *dev = get_device(vid); - - if (dev == NULL) - return -1; - - len = RTE_MIN(len, sizeof(dev->ifname)); - - strncpy(buf, dev->ifname, len); - buf[len - 1] = '\0'; - - return 0; -} - -int -rte_vhost_get_negotiated_features(int vid, uint64_t *features) -{ - struct virtio_net *dev; - - dev = get_device(vid); - if (!dev) - return -1; - - *features = dev->features; - return 0; -} - -int -rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem) -{ - struct virtio_net *dev; - struct rte_vhost_memory *m; - size_t size; - - dev = get_device(vid); - if (!dev) - return -1; - - size = dev->mem->nregions * sizeof(struct rte_vhost_mem_region); - m = malloc(sizeof(struct rte_vhost_memory) + size); - if (!m) - return -1; - - m->nregions = dev->mem->nregions; - memcpy(m->regions, dev->mem->regions, size); - *mem = m; - - return 0; -} - -int -rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, - struct rte_vhost_vring *vring) -{ - struct virtio_net *dev; - struct vhost_virtqueue *vq; - - dev = get_device(vid); - if (!dev) - return -1; - - if (vring_idx >= VHOST_MAX_VRING) - return -1; - - vq = dev->virtqueue[vring_idx]; - if (!vq) - return -1; - - vring->desc = vq->desc; - vring->avail = vq->avail; - vring->used = vq->used; - vring->log_guest_addr = vq->log_guest_addr; - - vring->callfd = vq->callfd; - vring->kickfd = vq->kickfd; - vring->size = vq->size; - - vring->last_avail_idx = vq->last_avail_idx; - vring->last_used_idx = vq->last_used_idx; - - return 0; -} - -uint16_t -rte_vhost_avail_entries(int vid, uint16_t queue_id) -{ - struct virtio_net *dev; - struct vhost_virtqueue *vq; - - dev = get_device(vid); - if (!dev) - return 0; - - vq = dev->virtqueue[queue_id]; - if (!vq->enabled) - return 0; - - return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx; -} - -int -rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) -{ - struct virtio_net *dev = get_device(vid); - - if (dev == NULL) - return -1; - - if (enable) { - RTE_LOG(ERR, VHOST_CONFIG, - "guest notification isn't supported.\n"); - return -1; - } - - dev->virtqueue[queue_id]->used->flags = 
VRING_USED_F_NO_NOTIFY; - return 0; -} - -void -rte_vhost_log_write(int vid, uint64_t addr, uint64_t len) -{ - struct virtio_net *dev = get_device(vid); - - if (dev == NULL) - return; - - vhost_log_write(dev, addr, len); -} - -void -rte_vhost_log_used_vring(int vid, uint16_t vring_idx, - uint64_t offset, uint64_t len) -{ - struct virtio_net *dev; - struct vhost_virtqueue *vq; - - dev = get_device(vid); - if (dev == NULL) - return; - - if (vring_idx >= VHOST_MAX_VRING) - return; - vq = dev->virtqueue[vring_idx]; - if (!vq) - return; - - vhost_log_used_vring(dev, vq, offset, len); -} - -int -rte_vhost_set_vhost_vring_last_idx(int vid, uint16_t vring_idx, - uint16_t last_avail_idx, uint16_t last_used_idx) { - struct virtio_net *dev; - struct vhost_virtqueue *vq; - - dev = get_device(vid); - if (!dev) - return -1; - - if (vring_idx >= VHOST_MAX_VRING) - return -1; - - vq = dev->virtqueue[vring_idx]; - if (!vq) - return -1; - - vq->last_avail_idx = last_avail_idx; - vq->last_used_idx = last_used_idx; - - return 0; -} diff --git a/lib/vhost/rte_vhost_17_05/vhost.h b/lib/vhost/rte_vhost_17_05/vhost.h deleted file mode 100644 index fd37f4340..000000000 --- a/lib/vhost/rte_vhost_17_05/vhost.h +++ /dev/null @@ -1,319 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _VHOST_NET_CDEV_H_ -#define _VHOST_NET_CDEV_H_ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "rte_vhost.h" -#include "vhost_user.h" - -/* Used to indicate that the device is running on a data core */ -#define VIRTIO_DEV_RUNNING 1 -/* Used to indicate that the device is ready to operate */ -#define VIRTIO_DEV_READY 2 - -/* Backend value set by guest. */ -#define VIRTIO_DEV_STOPPED -1 - -#define BUF_VECTOR_MAX 256 - -/** - * Structure contains buffer address, length and descriptor index - * from vring to do scatter RX. 
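The struct defined next collects one (address, length, descriptor index) triple per descriptor backing a scattered request. A hedged sketch of how such a chain is typically consumed, using a local `buf_vec` mirror so it stands alone:

#include <stdint.h>
#include <stddef.h>

#define MY_BUF_VECTOR_MAX 256	/* mirrors BUF_VECTOR_MAX below */

/* Local mirror of buf_vector: one entry per descriptor of a
 * scattered request. */
struct buf_vec {
	uint64_t buf_addr;
	uint32_t buf_len;
	uint32_t desc_idx;
};

/* Sum the bytes gathered into a buf_vec chain; callers typically
 * compare this against the request size before copying. */
static uint64_t
buf_vec_total_len(const struct buf_vec *vec, size_t nr)
{
	uint64_t total = 0;
	size_t i;

	for (i = 0; i < nr && i < MY_BUF_VECTOR_MAX; i++)
		total += vec[i].buf_len;
	return total;
}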
- */ -struct buf_vector { - uint64_t buf_addr; - uint32_t buf_len; - uint32_t desc_idx; -}; - -/* - * A structure to hold some fields needed in zero copy code path, - * mainly for associating an mbuf with the right desc_idx. - */ -struct zcopy_mbuf { - struct rte_mbuf *mbuf; - uint32_t desc_idx; - uint16_t in_use; - - TAILQ_ENTRY(zcopy_mbuf) next; -}; -TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf); - -/** - * Structure contains variables relevant to RX/TX virtqueues. - */ -struct vhost_virtqueue { - struct vring_desc *desc; - struct vring_avail *avail; - struct vring_used *used; - uint32_t size; - - uint16_t last_avail_idx; - uint16_t last_used_idx; -#define VIRTIO_INVALID_EVENTFD (-1) -#define VIRTIO_UNINITIALIZED_EVENTFD (-2) - - /* Backend value to determine if device should started/stopped */ - int backend; - /* Used to notify the guest (trigger interrupt) */ - int callfd; - /* Currently unused as polling mode is enabled */ - int kickfd; - int enabled; - - /* Physical address of used ring, for logging */ - uint64_t log_guest_addr; - - uint16_t nr_zmbuf; - uint16_t zmbuf_size; - uint16_t last_zmbuf_idx; - struct zcopy_mbuf *zmbufs; - struct zcopy_mbuf_list zmbuf_list; - - struct vring_used_elem *shadow_used_ring; - uint16_t shadow_used_idx; -} __rte_cache_aligned; - -/* Old kernels have no such macros defined */ -#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE - #define VIRTIO_NET_F_GUEST_ANNOUNCE 21 -#endif - -#ifndef VIRTIO_NET_F_MQ - #define VIRTIO_NET_F_MQ 22 -#endif - -#define VHOST_MAX_VRING 0x100 -#define VHOST_MAX_QUEUE_PAIRS 0x80 - -#ifndef VIRTIO_NET_F_MTU - #define VIRTIO_NET_F_MTU 3 -#endif - -/* - * Define virtio 1.0 for older kernels - */ -#ifndef VIRTIO_F_VERSION_1 - #define VIRTIO_F_VERSION_1 32 -#endif - -#define VHOST_USER_F_PROTOCOL_FEATURES 30 - -/* Features supported by this builtin vhost-user net driver. */ -#define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ - (1ULL << VIRTIO_NET_F_CTRL_VQ) | \ - (1ULL << VIRTIO_NET_F_CTRL_RX) | \ - (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \ - (1ULL << VIRTIO_NET_F_MQ) | \ - (1ULL << VIRTIO_F_VERSION_1) | \ - (1ULL << VHOST_F_LOG_ALL) | \ - (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ - (1ULL << VIRTIO_NET_F_HOST_TSO4) | \ - (1ULL << VIRTIO_NET_F_HOST_TSO6) | \ - (1ULL << VIRTIO_NET_F_CSUM) | \ - (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \ - (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ - (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ - (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ - (1ULL << VIRTIO_NET_F_MTU)) - - -struct guest_page { - uint64_t guest_phys_addr; - uint64_t host_phys_addr; - uint64_t size; -}; - -/** - * Device structure contains all configuration information relating - * to the device. - */ -struct virtio_net { - /* Frontend (QEMU) memory and memory region information */ - struct rte_vhost_memory *mem; - uint64_t features; - uint64_t protocol_features; - int vid; - uint32_t flags; - uint16_t vhost_hlen; - /* to tell if we need broadcast rarp packet */ - rte_atomic16_t broadcast_rarp; - uint32_t nr_vring; - int dequeue_zero_copy; - struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; -#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? 
PATH_MAX : IFNAMSIZ) - char ifname[IF_NAME_SZ]; - uint64_t log_size; - uint64_t log_base; - uint64_t log_addr; - struct ether_addr mac; - uint16_t mtu; - - struct vhost_device_ops const *notify_ops; - - uint32_t nr_guest_pages; - uint32_t max_guest_pages; - struct guest_page *guest_pages; - int has_new_mem_table; - struct VhostUserMemory mem_table; - int mem_table_fds[VHOST_MEMORY_MAX_NREGIONS]; -} __rte_cache_aligned; - - -#define VHOST_LOG_PAGE 4096 - -static inline void __attribute__((always_inline)) -vhost_log_page(uint8_t *log_base, uint64_t page) -{ - log_base[page / 8] |= 1 << (page % 8); -} - -static inline void __attribute__((always_inline)) -vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) -{ - uint64_t page; - - if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) || - !dev->log_base || !len)) - return; - - if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) - return; - - /* To make sure guest memory updates are committed before logging */ - rte_smp_wmb(); - - page = addr / VHOST_LOG_PAGE; - while (page * VHOST_LOG_PAGE < addr + len) { - vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); - page += 1; - } -} - -static inline void __attribute__((always_inline)) -vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint64_t offset, uint64_t len) -{ - vhost_log_write(dev, vq->log_guest_addr + offset, len); -} - -/* Macros for printing using RTE_LOG */ -#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 -#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 - -#ifdef RTE_LIBRTE_VHOST_DEBUG -#define VHOST_MAX_PRINT_BUFF 6072 -#define LOG_LEVEL RTE_LOG_DEBUG -#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args) -#define PRINT_PACKET(device, addr, size, header) do { \ - char *pkt_addr = (char *)(addr); \ - unsigned int index; \ - char packet[VHOST_MAX_PRINT_BUFF]; \ - \ - if ((header)) \ - snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \ - else \ - snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \ - for (index = 0; index < (size); index++) { \ - snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \ - "%02hhx ", pkt_addr[index]); \ - } \ - snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \ - \ - LOG_DEBUG(VHOST_DATA, "%s", packet); \ -} while (0) -#else -#define LOG_LEVEL RTE_LOG_INFO -#define LOG_DEBUG(log_type, fmt, args...) 
do {} while (0) -#define PRINT_PACKET(device, addr, size, header) do {} while (0) -#endif - -extern uint64_t VHOST_FEATURES; -#define MAX_VHOST_DEVICE 1024 -extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; - -/* Convert guest physical address to host physical address */ -static inline phys_addr_t __attribute__((always_inline)) -gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size) -{ - uint32_t i; - struct guest_page *page; - - for (i = 0; i < dev->nr_guest_pages; i++) { - page = &dev->guest_pages[i]; - - if (gpa >= page->guest_phys_addr && - gpa + size < page->guest_phys_addr + page->size) { - return gpa - page->guest_phys_addr + - page->host_phys_addr; - } - } - - return 0; -} - -struct virtio_net *get_device(int vid); - -int vhost_new_device(void); -void cleanup_device(struct virtio_net *dev, int destroy); -void reset_device(struct virtio_net *dev); -void vhost_destroy_device(int); - -int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx); - -void vhost_set_ifname(int, const char *if_name, unsigned int if_len); -void vhost_enable_dequeue_zero_copy(int vid); - -struct vhost_device_ops const *vhost_driver_callback_get(const char *path); - -/* - * Backend-specific cleanup. - * - * TODO: fix it; we have one backend now - */ -void vhost_backend_cleanup(struct virtio_net *dev); - -#endif /* _VHOST_NET_CDEV_H_ */ diff --git a/lib/vhost/rte_vhost_17_05/vhost_user.c b/lib/vhost/rte_vhost_17_05/vhost_user.c deleted file mode 100644 index 5b83d48a1..000000000 --- a/lib/vhost/rte_vhost_17_05/vhost_user.c +++ /dev/null @@ -1,1164 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef RTE_LIBRTE_VHOST_NUMA -#include -#endif - -#include -#include -#include - -#include "vhost.h" -#include "vhost_user.h" - -#define VIRTIO_MIN_MTU 68 -#define VIRTIO_MAX_MTU 65535 - -static const char *vhost_message_str[VHOST_USER_MAX] = { - [VHOST_USER_NONE] = "VHOST_USER_NONE", - [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", - [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", - [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", - [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", - [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", - [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", - [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", - [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", - [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", - [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", - [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", - [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", - [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", - [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", - [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", - [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", - [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", - [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", - [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", - [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU", -}; - -static uint64_t -get_blk_size(int fd) -{ - struct stat stat; - int ret; - - ret = fstat(fd, &stat); - return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; -} - -static void -free_mem_region(struct virtio_net *dev) -{ - uint32_t i; - struct rte_vhost_mem_region *reg; - - if (!dev || !dev->mem) - return; - - for (i = 0; i < dev->mem->nregions; i++) { - reg = &dev->mem->regions[i]; - if (reg->host_user_addr) { - munmap(reg->mmap_addr, reg->mmap_size); - close(reg->fd); - } - } -} - -void -vhost_backend_cleanup(struct virtio_net *dev) -{ - if (dev->mem) { - free_mem_region(dev); - rte_free(dev->mem); - dev->mem = NULL; - } - - free(dev->guest_pages); - dev->guest_pages = NULL; - - if (dev->log_addr) { - munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); - dev->log_addr = 0; - } -} - -/* - * This function just returns success at the moment unless - * the device hasn't been initialised. - */ -static int -vhost_user_set_owner(void) -{ - return 0; -} - -static int -vhost_user_reset_owner(struct virtio_net *dev) -{ - if (dev->flags & VIRTIO_DEV_RUNNING) { - dev->flags &= ~VIRTIO_DEV_RUNNING; - dev->notify_ops->destroy_device(dev->vid); - } - - cleanup_device(dev, 0); - reset_device(dev); - return 0; -} - -/* - * The features that we support are requested. - */ -static uint64_t -vhost_user_get_features(struct virtio_net *dev) -{ - uint64_t features = 0; - - rte_vhost_driver_get_features(dev->ifname, &features); - return features; -} - -/* - * We receive the negotiated features supported by us and the virtio device. 
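The first check in the handler below enforces that the acked feature set is a subset of what the backend offered. The rule in isolation, as a self-contained toy example:

#include <stdint.h>
#include <stdio.h>

/* Every bit the frontend acks must also have been offered. */
static int
check_negotiated(uint64_t offered, uint64_t acked)
{
	if (acked & ~offered)
		return -1;	/* acked a bit we never offered */
	return 0;
}

int main(void)
{
	uint64_t offered = (1ULL << 32) | (1ULL << 30);

	printf("%d\n", check_negotiated(offered, 1ULL << 32)); /*  0: ok */
	printf("%d\n", check_negotiated(offered, 1ULL << 21)); /* -1: rejected */
	return 0;
}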
- */ -static int -vhost_user_set_features(struct virtio_net *dev, uint64_t features) -{ - uint64_t vhost_features = 0; - - rte_vhost_driver_get_features(dev->ifname, &vhost_features); - if (features & ~vhost_features) - return -1; - - if ((dev->flags & VIRTIO_DEV_RUNNING) && dev->features != features) { - if (dev->notify_ops->features_changed) - dev->notify_ops->features_changed(dev->vid, features); - } - - dev->features = features; - if (dev->features & - ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) { - dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); - } else { - dev->vhost_hlen = sizeof(struct virtio_net_hdr); - } - LOG_DEBUG(VHOST_CONFIG, - "(%d) mergeable RX buffers %s, virtio 1 %s\n", - dev->vid, - (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off", - (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off"); - - return 0; -} - -/* - * The virtio device sends us the size of the descriptor ring. - */ -static int -vhost_user_set_vring_num(struct virtio_net *dev, - VhostUserMsg *msg) -{ - struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; - - vq->size = msg->payload.state.num; - - if (dev->dequeue_zero_copy) { - vq->nr_zmbuf = 0; - vq->last_zmbuf_idx = 0; - vq->zmbuf_size = vq->size; - vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size * - sizeof(struct zcopy_mbuf), 0); - if (vq->zmbufs == NULL) { - RTE_LOG(WARNING, VHOST_CONFIG, - "failed to allocate mem for zero copy; " - "zero copy is force disabled\n"); - dev->dequeue_zero_copy = 0; - } - } - - vq->shadow_used_ring = rte_malloc(NULL, - vq->size * sizeof(struct vring_used_elem), - RTE_CACHE_LINE_SIZE); - if (!vq->shadow_used_ring) { - RTE_LOG(ERR, VHOST_CONFIG, - "failed to allocate memory for shadow used ring.\n"); - return -1; - } - - return 0; -} - -/* - * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the - * same numa node as the memory of vring descriptor. 
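numa_realloc() below leans on get_mempolicy() with MPOL_F_NODE | MPOL_F_ADDR to ask the kernel which node backs a given address. A standalone sketch of that query (declared in numaif.h; link with -lnuma):

#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	void *buf = malloc(4096);
	int node = -1;

	if (buf == NULL)
		return 1;

	/* With MPOL_F_NODE | MPOL_F_ADDR the kernel reports the node
	 * holding the page at 'buf', faulting the page in first if it
	 * has not been touched yet. */
	if (get_mempolicy(&node, NULL, 0, buf,
			  MPOL_F_NODE | MPOL_F_ADDR) == 0)
		printf("allocation lives on node %d\n", node);

	free(buf);
	return 0;
}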
- */ -#ifdef RTE_LIBRTE_VHOST_NUMA -static struct virtio_net* -numa_realloc(struct virtio_net *dev, int index) -{ - int oldnode, newnode; - struct virtio_net *old_dev; - struct vhost_virtqueue *old_vq, *vq; - int ret; - - old_dev = dev; - vq = old_vq = dev->virtqueue[index]; - - ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc, - MPOL_F_NODE | MPOL_F_ADDR); - - /* check if we need to reallocate vq */ - ret |= get_mempolicy(&oldnode, NULL, 0, old_vq, - MPOL_F_NODE | MPOL_F_ADDR); - if (ret) { - RTE_LOG(ERR, VHOST_CONFIG, - "Unable to get vq numa information.\n"); - return dev; - } - if (oldnode != newnode) { - RTE_LOG(INFO, VHOST_CONFIG, - "reallocate vq from %d to %d node\n", oldnode, newnode); - vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode); - if (!vq) - return dev; - - memcpy(vq, old_vq, sizeof(*vq) * VIRTIO_QNUM); - rte_free(old_vq); - } - - /* check if we need to reallocate dev */ - ret = get_mempolicy(&oldnode, NULL, 0, old_dev, - MPOL_F_NODE | MPOL_F_ADDR); - if (ret) { - RTE_LOG(ERR, VHOST_CONFIG, - "Unable to get dev numa information.\n"); - goto out; - } - if (oldnode != newnode) { - RTE_LOG(INFO, VHOST_CONFIG, - "reallocate dev from %d to %d node\n", - oldnode, newnode); - dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode); - if (!dev) { - dev = old_dev; - goto out; - } - - memcpy(dev, old_dev, sizeof(*dev)); - rte_free(old_dev); - } - -out: - dev->virtqueue[index] = vq; - vhost_devices[dev->vid] = dev; - - return dev; -} -#else -static struct virtio_net* -numa_realloc(struct virtio_net *dev, int index __rte_unused) -{ - return dev; -} -#endif - -/* - * Converts QEMU virtual address to Vhost virtual address. This function is - * used to convert the ring addresses to our address space. - */ -static uint64_t -qva_to_vva(struct virtio_net *dev, uint64_t qva) -{ - struct rte_vhost_mem_region *reg; - uint32_t i; - - /* Find the region where the address lives. */ - for (i = 0; i < dev->mem->nregions; i++) { - reg = &dev->mem->regions[i]; - - if (qva >= reg->guest_user_addr && - qva < reg->guest_user_addr + reg->size) { - return qva - reg->guest_user_addr + - reg->host_user_addr; - } - } - - return 0; -} - -static int vhost_setup_mem_table(struct virtio_net *dev); - -/* - * The virtio device sends us the desc, used and avail ring addresses. - * This function then converts these to our address space. - */ -static int -vhost_user_set_vring_addr(struct virtio_net *dev, VhostUserMsg *msg) -{ - struct vhost_virtqueue *vq; - - if (dev->has_new_mem_table) { - vhost_setup_mem_table(dev); - dev->has_new_mem_table = 0; - } - - - if (dev->mem == NULL) - return -1; - - /* addr->index refers to the queue index. The txq 1, rxq is 0. */ - vq = dev->virtqueue[msg->payload.addr.index]; - - /* The addresses are converted from QEMU virtual to Vhost virtual. 
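qva_to_vva() above is a linear lookup over the region table. The same translation restated as a self-contained sketch, with a local `mem_region` stand-in for rte_vhost_mem_region:

#include <stdint.h>
#include <stddef.h>

struct mem_region {
	uint64_t guest_user_addr;	/* QEMU-process virtual address */
	uint64_t host_user_addr;	/* our mmap()ed virtual address */
	uint64_t size;
};

static uint64_t
translate(const struct mem_region *regions, size_t n, uint64_t qva)
{
	size_t i;

	for (i = 0; i < n; i++) {
		const struct mem_region *r = &regions[i];

		if (qva >= r->guest_user_addr &&
		    qva < r->guest_user_addr + r->size)
			return qva - r->guest_user_addr + r->host_user_addr;
	}
	return 0;	/* not covered by any region */
}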
*/ - vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev, - msg->payload.addr.desc_user_addr); - if (vq->desc == 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%d) failed to find desc ring address.\n", - dev->vid); - return -1; - } - - dev = numa_realloc(dev, msg->payload.addr.index); - vq = dev->virtqueue[msg->payload.addr.index]; - - vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev, - msg->payload.addr.avail_user_addr); - if (vq->avail == 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%d) failed to find avail ring address.\n", - dev->vid); - return -1; - } - - vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev, - msg->payload.addr.used_user_addr); - if (vq->used == 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%d) failed to find used ring address.\n", - dev->vid); - return -1; - } - - if (vq->last_used_idx != vq->used->idx) { - RTE_LOG(WARNING, VHOST_CONFIG, - "last_used_idx (%u) and vq->used->idx (%u) mismatches; " - "some packets maybe resent for Tx and dropped for Rx\n", - vq->last_used_idx, vq->used->idx); - vq->last_used_idx = vq->used->idx; - vq->last_avail_idx = vq->used->idx; - } - - vq->log_guest_addr = msg->payload.addr.log_guest_addr; - - LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n", - dev->vid, vq->desc); - LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n", - dev->vid, vq->avail); - LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n", - dev->vid, vq->used); - LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n", - dev->vid, vq->log_guest_addr); - - return 0; -} - -/* - * The virtio device sends us the available ring last used index. - */ -static int -vhost_user_set_vring_base(struct virtio_net *dev, - VhostUserMsg *msg) -{ - /* Remove from the data plane. */ - if (dev->flags & VIRTIO_DEV_RUNNING) { - dev->flags &= ~VIRTIO_DEV_RUNNING; - dev->notify_ops->destroy_device(dev->vid); - } - - dev->virtqueue[msg->payload.state.index]->last_used_idx = msg->payload.state.num; - dev->virtqueue[msg->payload.state.index]->last_avail_idx = msg->payload.state.num; - - return 0; -} - -static void -add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, - uint64_t host_phys_addr, uint64_t size) -{ - struct guest_page *page, *last_page; - - if (dev->nr_guest_pages == dev->max_guest_pages) { - dev->max_guest_pages = RTE_MAX(8U, dev->max_guest_pages * 2); - dev->guest_pages = realloc(dev->guest_pages, - dev->max_guest_pages * sizeof(*page)); - } - - if (dev->nr_guest_pages > 0) { - last_page = &dev->guest_pages[dev->nr_guest_pages - 1]; - /* merge if the two pages are continuous */ - if (host_phys_addr == last_page->host_phys_addr + - last_page->size) { - last_page->size += size; - return; - } - } - - page = &dev->guest_pages[dev->nr_guest_pages++]; - page->guest_phys_addr = guest_phys_addr; - page->host_phys_addr = host_phys_addr; - page->size = size; -} - -static void -add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, - uint64_t page_size) -{ - uint64_t reg_size = reg->size; - uint64_t host_user_addr = reg->host_user_addr; - uint64_t guest_phys_addr = reg->guest_phys_addr; - uint64_t host_phys_addr; - uint64_t size; - - host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr); - size = page_size - (guest_phys_addr & (page_size - 1)); - size = RTE_MIN(size, reg_size); - - add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); - host_user_addr += size; - guest_phys_addr += size; - reg_size -= size; - - while (reg_size > 0) { - size = RTE_MIN(reg_size, page_size); - host_phys_addr = rte_mem_virt2phy((void 
*)(uintptr_t) - host_user_addr); - add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); - - host_user_addr += size; - guest_phys_addr += size; - reg_size -= size; - } -} - -#ifdef RTE_LIBRTE_VHOST_DEBUG -/* TODO: enable it only in debug mode? */ -static void -dump_guest_pages(struct virtio_net *dev) -{ - uint32_t i; - struct guest_page *page; - - for (i = 0; i < dev->nr_guest_pages; i++) { - page = &dev->guest_pages[i]; - - RTE_LOG(INFO, VHOST_CONFIG, - "guest physical page region %u\n" - "\t guest_phys_addr: %" PRIx64 "\n" - "\t host_phys_addr : %" PRIx64 "\n" - "\t size : %" PRIx64 "\n", - i, - page->guest_phys_addr, - page->host_phys_addr, - page->size); - } -} -#else -#define dump_guest_pages(dev) -#endif - -static int -vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) -{ - uint32_t i; - - if (dev->has_new_mem_table) { - /* - * The previous mem table was not consumed, so close the - * file descriptors from that mem table before copying - * the new one. - */ - for (i = 0; i < dev->mem_table.nregions; i++) { - close(dev->mem_table_fds[i]); - } - } - - memcpy(&dev->mem_table, &pmsg->payload.memory, sizeof(dev->mem_table)); - memcpy(dev->mem_table_fds, pmsg->fds, sizeof(dev->mem_table_fds)); - dev->has_new_mem_table = 1; - - return 0; -} - - static int -vhost_setup_mem_table(struct virtio_net *dev) -{ - struct VhostUserMemory memory = dev->mem_table; - struct rte_vhost_mem_region *reg; - void *mmap_addr; - uint64_t mmap_size; - uint64_t mmap_offset; - uint64_t alignment; - uint32_t i; - int fd; - - if (dev->mem) { - free_mem_region(dev); - rte_free(dev->mem); - dev->mem = NULL; - } - - dev->nr_guest_pages = 0; - if (!dev->guest_pages) { - dev->max_guest_pages = 8; - dev->guest_pages = malloc(dev->max_guest_pages * - sizeof(struct guest_page)); - } - - dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) + - sizeof(struct rte_vhost_mem_region) * memory.nregions, 0); - if (dev->mem == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%d) failed to allocate memory for dev->mem\n", - dev->vid); - return -1; - } - dev->mem->nregions = memory.nregions; - - for (i = 0; i < memory.nregions; i++) { - fd = dev->mem_table_fds[i]; - reg = &dev->mem->regions[i]; - - reg->guest_phys_addr = memory.regions[i].guest_phys_addr; - reg->guest_user_addr = memory.regions[i].userspace_addr; - reg->size = memory.regions[i].memory_size; - reg->fd = fd; - - mmap_offset = memory.regions[i].mmap_offset; - mmap_size = reg->size + mmap_offset; - - /* mmap() without flag of MAP_ANONYMOUS, should be called - * with length argument aligned with hugepagesz at older - * longterm version Linux, like 2.6.32 and 3.2.72, or - * mmap() will fail with EINVAL. - * - * to avoid failure, make sure in caller to keep length - * aligned. 
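The rounding that follows is RTE_ALIGN_CEIL() over the block size reported by fstat(). The same arithmetic in plain C, assuming a power-of-two alignment (hugepage sizes are):

#include <stdint.h>
#include <stdio.h>

static uint64_t
align_ceil(uint64_t len, uint64_t align)
{
	return (len + align - 1) & ~(align - 1);
}

int main(void)
{
	/* e.g. a 5 MiB + 1 byte mapping over 2 MiB hugepages */
	printf("%llu\n", (unsigned long long)
	       align_ceil(5 * 1024 * 1024 + 1, 2 * 1024 * 1024)); /* 6 MiB */
	return 0;
}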
- */ - alignment = get_blk_size(fd); - if (alignment == (uint64_t)-1) { - RTE_LOG(ERR, VHOST_CONFIG, - "couldn't get hugepage size through fstat\n"); - goto err_mmap; - } - mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); - - mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, fd, 0); - - if (mmap_addr == MAP_FAILED) { - RTE_LOG(ERR, VHOST_CONFIG, - "mmap region %u failed.\n", i); - goto err_mmap; - } - - reg->mmap_addr = mmap_addr; - reg->mmap_size = mmap_size; - reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + - mmap_offset; - - if (dev->dequeue_zero_copy) - add_guest_pages(dev, reg, alignment); - - RTE_LOG(INFO, VHOST_CONFIG, - "guest memory region %u, size: 0x%" PRIx64 "\n" - "\t guest physical addr: 0x%" PRIx64 "\n" - "\t guest virtual addr: 0x%" PRIx64 "\n" - "\t host virtual addr: 0x%" PRIx64 "\n" - "\t mmap addr : 0x%" PRIx64 "\n" - "\t mmap size : 0x%" PRIx64 "\n" - "\t mmap align: 0x%" PRIx64 "\n" - "\t mmap off : 0x%" PRIx64 "\n", - i, reg->size, - reg->guest_phys_addr, - reg->guest_user_addr, - reg->host_user_addr, - (uint64_t)(uintptr_t)mmap_addr, - mmap_size, - alignment, - mmap_offset); - } - - dump_guest_pages(dev); - - return 0; - -err_mmap: - free_mem_region(dev); - rte_free(dev->mem); - dev->mem = NULL; - return -1; -} - -static int -vq_is_ready(struct vhost_virtqueue *vq) -{ - return vq && vq->desc && - vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && - vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD; -} - -static int -virtio_is_ready(struct virtio_net *dev) -{ - struct vhost_virtqueue *vq; - uint32_t i; - - if (dev->nr_vring == 0) - return 0; - - for (i = 0; i < dev->nr_vring; i++) { - vq = dev->virtqueue[i]; - - if (!vq_is_ready(vq)) - return 0; - } - - RTE_LOG(INFO, VHOST_CONFIG, - "virtio is now ready for processing.\n"); - return 1; -} - -static void -vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) -{ - struct vhost_vring_file file; - struct vhost_virtqueue *vq; - - /* Remove from the data plane. */ - if (dev->flags & VIRTIO_DEV_RUNNING) { - dev->flags &= ~VIRTIO_DEV_RUNNING; - dev->notify_ops->destroy_device(dev->vid); - } - - file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; - if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) - file.fd = VIRTIO_INVALID_EVENTFD; - else - file.fd = pmsg->fds[0]; - RTE_LOG(INFO, VHOST_CONFIG, - "vring call idx:%d file:%d\n", file.index, file.fd); - - vq = dev->virtqueue[file.index]; - if (vq->callfd >= 0) - close(vq->callfd); - - vq->callfd = file.fd; -} - -static void -vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg) -{ - struct vhost_vring_file file; - struct vhost_virtqueue *vq; - - /* Remove from the data plane. 
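Both SET_VRING_KICK and SET_VRING_CALL pack the ring index and an optional fd flag into one u64, decoded just below. A sketch of only that decode step, with local mask names shadowing the VHOST_USER_* definitions from vhost_user.h:

#include <stdint.h>

#define MY_VRING_IDX_MASK	0xff
#define MY_VRING_NOFD_MASK	(0x1 << 8)
#define MY_INVALID_EVENTFD	(-1)

/* Low byte carries the ring index; bit 8 means "no fd attached",
 * in which case the ancillary fd array is not consulted. */
static void
decode_vring_file(uint64_t u64, const int *fds, int *index, int *fd)
{
	*index = u64 & MY_VRING_IDX_MASK;
	*fd = (u64 & MY_VRING_NOFD_MASK) ? MY_INVALID_EVENTFD : fds[0];
}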
*/ - if (dev->flags & VIRTIO_DEV_RUNNING) { - dev->flags &= ~VIRTIO_DEV_RUNNING; - dev->notify_ops->destroy_device(dev->vid); - } - - file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; - if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) - file.fd = VIRTIO_INVALID_EVENTFD; - else - file.fd = pmsg->fds[0]; - RTE_LOG(INFO, VHOST_CONFIG, - "vring kick idx:%d file:%d\n", file.index, file.fd); - - vq = dev->virtqueue[file.index]; - if (vq->kickfd >= 0) - close(vq->kickfd); - vq->kickfd = file.fd; -} - -static void -free_zmbufs(struct vhost_virtqueue *vq) -{ - struct zcopy_mbuf *zmbuf, *next; - - for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); - zmbuf != NULL; zmbuf = next) { - next = TAILQ_NEXT(zmbuf, next); - - rte_pktmbuf_free(zmbuf->mbuf); - TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); - } - - rte_free(vq->zmbufs); -} - -/* - * when virtio is stopped, qemu will send us the GET_VRING_BASE message. - */ -static int -vhost_user_get_vring_base(struct virtio_net *dev, - VhostUserMsg *msg) -{ - struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; - - /* We have to stop the queue (virtio) if it is running. */ - if (dev->flags & VIRTIO_DEV_RUNNING) { - dev->flags &= ~VIRTIO_DEV_RUNNING; - dev->notify_ops->destroy_device(dev->vid); - } - - dev->flags &= ~VIRTIO_DEV_READY; - - /* Here we are safe to get the last used index */ - msg->payload.state.num = vq->last_used_idx; - - RTE_LOG(INFO, VHOST_CONFIG, - "vring base idx:%d file:%d\n", msg->payload.state.index, msg->payload.state.num); - /* - * Based on current qemu vhost-user implementation, this message is - * sent and only sent in vhost_vring_stop. - * TODO: cleanup the vring, it isn't usable since here. - */ - if (vq->kickfd >= 0) - close(vq->kickfd); - - vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; - - if (vq->callfd >= 0) - close(vq->callfd); - - vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; - - if (dev->dequeue_zero_copy) - free_zmbufs(vq); - rte_free(vq->shadow_used_ring); - vq->shadow_used_ring = NULL; - - return 0; -} - -/* - * when virtio queues are ready to work, qemu will send us to - * enable the virtio queue pair. - */ -static int -vhost_user_set_vring_enable(struct virtio_net *dev, - VhostUserMsg *msg) -{ - int enable = (int)msg->payload.state.num; - - RTE_LOG(INFO, VHOST_CONFIG, - "set queue enable: %d to qp idx: %d\n", - enable, msg->payload.state.index); - - if (dev->notify_ops->vring_state_changed) - dev->notify_ops->vring_state_changed(dev->vid, msg->payload.state.index, enable); - - dev->virtqueue[msg->payload.state.index]->enabled = enable; - - return 0; -} - -static void -vhost_user_set_protocol_features(struct virtio_net *dev, - uint64_t protocol_features) -{ - if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES) - return; - - dev->protocol_features = protocol_features; -} - -static int -vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) -{ - int fd = msg->fds[0]; - uint64_t size, off; - void *addr; - - if (fd < 0) { - RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd); - return -1; - } - - if (msg->size != sizeof(VhostUserLog)) { - RTE_LOG(ERR, VHOST_CONFIG, - "invalid log base msg size: %"PRId32" != %d\n", - msg->size, (int)sizeof(VhostUserLog)); - return -1; - } - - size = msg->payload.log.mmap_size; - off = msg->payload.log.mmap_offset; - RTE_LOG(INFO, VHOST_CONFIG, - "log mmap size: %"PRId64", offset: %"PRId64"\n", - size, off); - - /* - * mmap from 0 to workaround a hugepage mmap bug: mmap will - * fail when offset is not page size aligned. 
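The workaround is to map from file offset zero and apply the offset in user space afterwards; a hedged standalone sketch of that shape:

#include <stdint.h>
#include <sys/mman.h>

/* Map from offset 0 (hugetlbfs rejects unaligned offsets), then add
 * the offset by hand.  The caller munmap()s 'addr', not 'log_base'. */
static void *
map_log_region(int fd, uint64_t size, uint64_t off, uint64_t *log_base)
{
	void *addr = mmap(NULL, size, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, 0);

	if (addr == MAP_FAILED)
		return NULL;

	*log_base = (uint64_t)(uintptr_t)addr + off;
	return addr;
}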
- */ - addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - close(fd); - if (addr == MAP_FAILED) { - RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n"); - return -1; - } - - /* - * Free previously mapped log memory on occasionally - * multiple VHOST_USER_SET_LOG_BASE. - */ - if (dev->log_addr) { - munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); - } - dev->log_addr = (uint64_t)(uintptr_t)addr; - dev->log_base = dev->log_addr + off; - dev->log_size = size; - - return 0; -} - -/* - * An rarp packet is constructed and broadcasted to notify switches about - * the new location of the migrated VM, so that packets from outside will - * not be lost after migration. - * - * However, we don't actually "send" a rarp packet here, instead, we set - * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. - */ -static int -vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg) -{ - uint8_t *mac = (uint8_t *)&msg->payload.u64; - - RTE_LOG(DEBUG, VHOST_CONFIG, - ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n", - mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); - memcpy(dev->mac.addr_bytes, mac, 6); - - /* - * Set the flag to inject a RARP broadcast packet at - * rte_vhost_dequeue_burst(). - * - * rte_smp_wmb() is for making sure the mac is copied - * before the flag is set. - */ - rte_smp_wmb(); - rte_atomic16_set(&dev->broadcast_rarp, 1); - - return 0; -} - -static int -vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg) -{ - if (msg->payload.u64 < VIRTIO_MIN_MTU || - msg->payload.u64 > VIRTIO_MAX_MTU) { - RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n", - msg->payload.u64); - - return -1; - } - - dev->mtu = msg->payload.u64; - - return 0; -} - -/* return bytes# of read on success or negative val on failure. 
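read_vhost_message() below reads the fixed-size header (plus any SCM_RIGHTS fds) first and the variable payload second, rejecting payloads larger than the in-memory union. The second stage in isolation, as a sketch:

#include <stdint.h>
#include <unistd.h>

/* 'hdr_size' bytes and the fds were already consumed, so only
 * 'payload_size' bytes remain on the stream socket. */
static int
read_payload(int sockfd, void *payload, uint32_t payload_size,
	     uint32_t payload_cap)
{
	ssize_t n;

	if (payload_size > payload_cap)
		return -1;	/* malformed: larger than our union */
	if (payload_size == 0)
		return 0;

	n = read(sockfd, payload, payload_size);
	return (n == (ssize_t)payload_size) ? 0 : -1;
}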
*/ -static int -read_vhost_message(int sockfd, struct VhostUserMsg *msg) -{ - int ret; - - ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, - msg->fds, VHOST_MEMORY_MAX_NREGIONS); - if (ret <= 0) - return ret; - - if (msg && msg->size) { - if (msg->size > sizeof(msg->payload)) { - RTE_LOG(ERR, VHOST_CONFIG, - "invalid msg size: %d\n", msg->size); - return -1; - } - ret = read(sockfd, &msg->payload, msg->size); - if (ret <= 0) - return ret; - if (ret != (int)msg->size) { - RTE_LOG(ERR, VHOST_CONFIG, - "read control message failed\n"); - return -1; - } - } - - return ret; -} - -static int -send_vhost_message(int sockfd, struct VhostUserMsg *msg) -{ - int ret; - - if (!msg) - return 0; - - msg->flags &= ~VHOST_USER_VERSION_MASK; - msg->flags &= ~VHOST_USER_NEED_REPLY; - msg->flags |= VHOST_USER_VERSION; - msg->flags |= VHOST_USER_REPLY_MASK; - - ret = send_fd_message(sockfd, (char *)msg, - VHOST_USER_HDR_SIZE + msg->size, NULL, 0); - - return ret; -} - -/* - * Allocate a queue pair if it hasn't been allocated yet - */ -static int -vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg) -{ - uint16_t vring_idx; - - switch (msg->request) { - case VHOST_USER_SET_VRING_KICK: - case VHOST_USER_SET_VRING_CALL: - case VHOST_USER_SET_VRING_ERR: - vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; - break; - case VHOST_USER_SET_VRING_NUM: - case VHOST_USER_SET_VRING_BASE: - case VHOST_USER_SET_VRING_ENABLE: - vring_idx = msg->payload.state.index; - break; - case VHOST_USER_SET_VRING_ADDR: - vring_idx = msg->payload.addr.index; - break; - default: - return 0; - } - - if (vring_idx >= VHOST_MAX_VRING) { - RTE_LOG(ERR, VHOST_CONFIG, - "invalid vring index: %u\n", vring_idx); - return -1; - } - - if (dev->virtqueue[vring_idx]) - return 0; - - return alloc_vring_queue(dev, vring_idx); -} - -int -vhost_user_msg_handler(int vid, int fd) -{ - struct virtio_net *dev; - struct VhostUserMsg msg; - int ret; - - dev = get_device(vid); - if (dev == NULL) - return -1; - - if (!dev->notify_ops) { - dev->notify_ops = vhost_driver_callback_get(dev->ifname); - if (!dev->notify_ops) { - RTE_LOG(ERR, VHOST_CONFIG, - "failed to get callback ops for driver %s\n", - dev->ifname); - return -1; - } - } - - ret = read_vhost_message(fd, &msg); - if (ret <= 0 || msg.request >= VHOST_USER_MAX) { - if (ret < 0) - RTE_LOG(ERR, VHOST_CONFIG, - "vhost read message failed\n"); - else if (ret == 0) - RTE_LOG(INFO, VHOST_CONFIG, - "vhost peer closed\n"); - else - RTE_LOG(ERR, VHOST_CONFIG, - "vhost read incorrect message\n"); - - return -1; - } - - RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n", - vhost_message_str[msg.request]); - - ret = vhost_user_check_and_alloc_queue_pair(dev, &msg); - if (ret < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "failed to alloc queue\n"); - return -1; - } - - switch (msg.request) { - case VHOST_USER_GET_FEATURES: - msg.payload.u64 = vhost_user_get_features(dev); - msg.size = sizeof(msg.payload.u64); - send_vhost_message(fd, &msg); - break; - case VHOST_USER_SET_FEATURES: - vhost_user_set_features(dev, msg.payload.u64); - break; - - case VHOST_USER_GET_PROTOCOL_FEATURES: - msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES; - msg.size = sizeof(msg.payload.u64); - send_vhost_message(fd, &msg); - break; - case VHOST_USER_SET_PROTOCOL_FEATURES: - vhost_user_set_protocol_features(dev, msg.payload.u64); - break; - - case VHOST_USER_SET_OWNER: - vhost_user_set_owner(); - break; - case VHOST_USER_RESET_OWNER: - vhost_user_reset_owner(dev); - break; - - case 
VHOST_USER_SET_MEM_TABLE: - ret = vhost_user_set_mem_table(dev, &msg); - break; - - case VHOST_USER_SET_LOG_BASE: - vhost_user_set_log_base(dev, &msg); - - /* it needs a reply */ - msg.size = sizeof(msg.payload.u64); - send_vhost_message(fd, &msg); - break; - case VHOST_USER_SET_LOG_FD: - close(msg.fds[0]); - RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); - break; - - case VHOST_USER_SET_VRING_NUM: - vhost_user_set_vring_num(dev, &msg); - break; - case VHOST_USER_SET_VRING_ADDR: - vhost_user_set_vring_addr(dev, &msg); - break; - case VHOST_USER_SET_VRING_BASE: - vhost_user_set_vring_base(dev, &msg); - break; - - case VHOST_USER_GET_VRING_BASE: - vhost_user_get_vring_base(dev, &msg); - msg.size = sizeof(msg.payload.state); - send_vhost_message(fd, &msg); - break; - - case VHOST_USER_SET_VRING_KICK: - vhost_user_set_vring_kick(dev, &msg); - break; - case VHOST_USER_SET_VRING_CALL: - vhost_user_set_vring_call(dev, &msg); - break; - - case VHOST_USER_SET_VRING_ERR: - if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) - close(msg.fds[0]); - RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); - break; - - case VHOST_USER_GET_QUEUE_NUM: - msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS; - msg.size = sizeof(msg.payload.u64); - send_vhost_message(fd, &msg); - break; - - case VHOST_USER_SET_VRING_ENABLE: - vhost_user_set_vring_enable(dev, &msg); - break; - case VHOST_USER_SEND_RARP: - vhost_user_send_rarp(dev, &msg); - break; - - case VHOST_USER_NET_SET_MTU: - ret = vhost_user_net_set_mtu(dev, &msg); - break; - - default: - ret = -1; - break; - - } - - if (msg.flags & VHOST_USER_NEED_REPLY) { - msg.payload.u64 = !!ret; - msg.size = sizeof(msg.payload.u64); - send_vhost_message(fd, &msg); - } - - if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) { - dev->flags |= VIRTIO_DEV_READY; - - if (!(dev->flags & VIRTIO_DEV_RUNNING)) { - if (dev->dequeue_zero_copy) { - RTE_LOG(INFO, VHOST_CONFIG, - "dequeue zero copy is enabled\n"); - } - - if (dev->notify_ops->new_device(dev->vid) == 0) - dev->flags |= VIRTIO_DEV_RUNNING; - } - } - - return 0; -} diff --git a/lib/vhost/rte_vhost_17_05/vhost_user.h b/lib/vhost/rte_vhost_17_05/vhost_user.h deleted file mode 100644 index 2ba22dbb0..000000000 --- a/lib/vhost/rte_vhost_17_05/vhost_user.h +++ /dev/null @@ -1,134 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _VHOST_NET_USER_H -#define _VHOST_NET_USER_H - -#include -#include - -#include "rte_vhost.h" - -/* refer to hw/virtio/vhost-user.c */ - -#define VHOST_MEMORY_MAX_NREGIONS 8 - -#define VHOST_USER_PROTOCOL_F_MQ 0 -#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 -#define VHOST_USER_PROTOCOL_F_RARP 2 -#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 -#define VHOST_USER_PROTOCOL_F_NET_MTU 4 - -#define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ - (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\ - (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \ - (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ - (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU)) - -typedef enum VhostUserRequest { - VHOST_USER_NONE = 0, - VHOST_USER_GET_FEATURES = 1, - VHOST_USER_SET_FEATURES = 2, - VHOST_USER_SET_OWNER = 3, - VHOST_USER_RESET_OWNER = 4, - VHOST_USER_SET_MEM_TABLE = 5, - VHOST_USER_SET_LOG_BASE = 6, - VHOST_USER_SET_LOG_FD = 7, - VHOST_USER_SET_VRING_NUM = 8, - VHOST_USER_SET_VRING_ADDR = 9, - VHOST_USER_SET_VRING_BASE = 10, - VHOST_USER_GET_VRING_BASE = 11, - VHOST_USER_SET_VRING_KICK = 12, - VHOST_USER_SET_VRING_CALL = 13, - VHOST_USER_SET_VRING_ERR = 14, - VHOST_USER_GET_PROTOCOL_FEATURES = 15, - VHOST_USER_SET_PROTOCOL_FEATURES = 16, - VHOST_USER_GET_QUEUE_NUM = 17, - VHOST_USER_SET_VRING_ENABLE = 18, - VHOST_USER_SEND_RARP = 19, - VHOST_USER_NET_SET_MTU = 20, - VHOST_USER_MAX -} VhostUserRequest; - -typedef struct VhostUserMemoryRegion { - uint64_t guest_phys_addr; - uint64_t memory_size; - uint64_t userspace_addr; - uint64_t mmap_offset; -} VhostUserMemoryRegion; - -typedef struct VhostUserMemory { - uint32_t nregions; - uint32_t padding; - VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; -} VhostUserMemory; - -typedef struct VhostUserLog { - uint64_t mmap_size; - uint64_t mmap_offset; -} VhostUserLog; - -typedef struct VhostUserMsg { - VhostUserRequest request; - -#define VHOST_USER_VERSION_MASK 0x3 -#define VHOST_USER_REPLY_MASK (0x1 << 2) -#define VHOST_USER_NEED_REPLY (0x1 << 3) - uint32_t flags; - uint32_t size; /* the following payload size */ - union { -#define VHOST_USER_VRING_IDX_MASK 0xff -#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) - uint64_t u64; - struct vhost_vring_state state; - struct vhost_vring_addr addr; - VhostUserMemory memory; - VhostUserLog log; - } payload; - int fds[VHOST_MEMORY_MAX_NREGIONS]; -} __attribute((packed)) VhostUserMsg; - -#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) - -/* The version of the protocol we support */ -#define VHOST_USER_VERSION 0x1 - - -/* vhost_user.c */ -int vhost_user_msg_handler(int vid, int fd); - -/* socket.c */ -int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); -int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); - -#endif diff --git a/lib/vhost/task.h b/lib/vhost/task.h index c60d867de..46bcffb17 100644 --- a/lib/vhost/task.h +++ b/lib/vhost/task.h @@ -52,7 +52,7 @@ struct spdk_vhost_task { int req_idx; - struct vhost_virtqueue *vq; 
diff --git a/lib/vhost/task.h b/lib/vhost/task.h
index c60d867de..46bcffb17 100644
--- a/lib/vhost/task.h
+++ b/lib/vhost/task.h
@@ -52,7 +52,7 @@ struct spdk_vhost_task {
 
 	int req_idx;
 
-	struct vhost_virtqueue *vq;
+	struct rte_vhost_vring *vq;
 
 	TAILQ_ENTRY(spdk_vhost_task) iovecs_link;
 };
diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
index a899f4667..6bc3a8516 100644
--- a/lib/vhost/vhost.c
+++ b/lib/vhost/vhost.c
@@ -36,11 +36,7 @@
 #include
 #include
 
-#include
-#include
-#include
-#include
-#include
+#include
 
 #include "spdk_internal/log.h"
 #include "spdk/env.h"
@@ -65,24 +61,110 @@ static uint32_t g_num_ctrlrs[RTE_MAX_LCORE];
 /* Path to folder where character device will be created. Can be set by user. */
 static char dev_dirname[PATH_MAX] = "";
 
-struct spdk_vaddr_region {
-	void *vaddr;
-	uint64_t len;
+#define SPDK_CACHE_LINE_SIZE RTE_CACHE_LINE_SIZE
 
-	uint64_t host_user_addr;
-	uint64_t host_user_size;
+#define MAX_VHOST_DEVICE	1024
+#define VHOST_USER_F_PROTOCOL_FEATURES	30
+
+/* Features supported by SPDK VHOST lib. */
+#define SPDK_VHOST_SCSI_FEATURES	((1ULL << VIRTIO_F_VERSION_1) | \
+					 (1ULL << VHOST_F_LOG_ALL) | \
+					 (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+					 (1ULL << VIRTIO_SCSI_F_INOUT) | \
+					 (1ULL << VIRTIO_SCSI_F_HOTPLUG) | \
+					 (1ULL << VIRTIO_SCSI_F_CHANGE) | \
+					 (1ULL << VIRTIO_SCSI_F_T10_PI))
+
+/* Features that are specified in VIRTIO SCSI but currently not supported:
+ * - Live migration not supported yet
+ * - Hotplug/hotremove
+ * - LUN params change
+ * - T10 PI
+ */
+#define SPDK_VHOST_SCSI_DISABLED_FEATURES	((1ULL << VHOST_F_LOG_ALL) | \
+						 (1ULL << VIRTIO_SCSI_F_HOTPLUG) | \
+						 (1ULL << VIRTIO_SCSI_F_CHANGE) | \
+						 (1ULL << VIRTIO_SCSI_F_T10_PI))
+
+struct spdk_vhost_dev {
+	struct rte_vhost_memory *mem;
+	int vid;
+	uint16_t num_queues;
+	struct rte_vhost_vring virtqueue[0] __attribute((aligned(SPDK_CACHE_LINE_SIZE)));
 };
 
-/*
- * Device linked list structure for data path.
- */
+static void
+spdk_vhost_dev_free(struct spdk_vhost_dev *dev)
+{
+	free(dev->mem);
+	spdk_free(dev);
+}
+
+static void
+spdk_vhost_dev_destruct(struct spdk_vhost_dev *dev)
+{
+	struct rte_vhost_vring *q;
+	uint16_t i;
+
+	for (i = 0; i < dev->num_queues; i++) {
+		q = &dev->virtqueue[i];
+		rte_vhost_set_vhost_vring_last_idx(dev->vid, i, q->last_avail_idx, q->last_used_idx);
+	}
+
+	spdk_vhost_dev_free(dev);
+}
+
+static struct spdk_vhost_dev *
+spdk_vhost_dev_create(int vid)
+{
+	uint16_t num_queues = rte_vhost_get_vring_num(vid);
+	size_t size = sizeof(struct spdk_vhost_dev) + num_queues * sizeof(struct rte_vhost_vring);
+	struct spdk_vhost_dev *dev = spdk_zmalloc(size, SPDK_CACHE_LINE_SIZE, NULL);
+	uint16_t i;
+
+	if (dev == NULL) {
+		SPDK_ERRLOG("vhost device %d: Failed to allocate new vhost device with %"PRIu16" queues\n", vid,
+			    num_queues);
+		return NULL;
+	}
+
+	for (i = 0; i < num_queues; i++) {
+		if (rte_vhost_get_vhost_vring(vid, i, &dev->virtqueue[i])) {
+			SPDK_ERRLOG("vhost device %d: Failed to get information of queue %"PRIu16"\n", vid, i);
+			goto err;
+		}
+
+		/* Disable notifications. */
+		if (rte_vhost_enable_guest_notification(vid, i, 0) != 0) {
+			SPDK_ERRLOG("vhost device %d: Failed to disable guest notification on queue %"PRIu16"\n", vid, i);
+			goto err;
+		}
+
+	}
+
+	dev->vid = vid;
+	dev->num_queues = num_queues;
+
+	if (rte_vhost_get_mem_table(vid, &dev->mem) != 0) {
+		SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid);
+		goto err;
+	}
+	return dev;
+
+err:
+	spdk_vhost_dev_free(dev);
+	return NULL;
+}
+
+static uint64_t
+gpa_to_vva(struct spdk_vhost_dev *vdev, uint64_t addr)
+{
+	return rte_vhost_gpa_to_vva(vdev->mem, addr);
+}
+
 struct spdk_vhost_scsi_ctrlr {
 	char *name;
-	/**< Pointer to device created by vhost lib. */
-	struct virtio_net *dev;
-
-	struct spdk_vaddr_region region[VHOST_MEMORY_MAX_NREGIONS];
-	uint32_t nregions;
+	struct spdk_vhost_dev *dev;
 
 	/**< TODO make this an array of spdk_scsi_devs.  The vhost scsi
 	 * request will tell us which scsi_dev to use.
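The gpa_to_vva() wrapper added above leans on rte_vhost_gpa_to_vva(), which walks the guest memory table returned by rte_vhost_get_mem_table(). As of the DPDK 17.05-style API this translation is roughly the sketch below (the real helper is an inline in rte_vhost.h; this re-implementation is only illustrative):

#include <stdint.h>
#include <rte_vhost.h>

/* Illustrative only: map a guest physical address to a host virtual
 * address by finding the memory region that contains it. Returns 0 if
 * no region covers the address. */
static uint64_t
example_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa)
{
	struct rte_vhost_mem_region *reg;
	uint32_t i;

	for (i = 0; i < mem->nregions; i++) {
		reg = &mem->regions[i];
		if (gpa >= reg->guest_phys_addr &&
		    gpa < reg->guest_phys_addr + reg->size) {
			return gpa - reg->guest_phys_addr + reg->host_user_addr;
		}
	}

	return 0;
}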
@@ -107,7 +189,7 @@ static struct spdk_vhost_scsi_ctrlr *dpdk_vid_mapping[MAX_VHOST_DEVICE];
  * Get available requests from avail ring.
  */
 static uint16_t
-vq_avail_ring_get(struct vhost_virtqueue *vq, uint16_t *reqs, uint16_t reqs_len)
+vq_avail_ring_get(struct rte_vhost_vring *vq, uint16_t *reqs, uint16_t reqs_len)
 {
 	struct vring_avail *avail = vq->avail;
 	uint16_t size_mask = vq->size - 1;
@@ -135,7 +217,7 @@ vq_avail_ring_get(struct vhost_virtqueue *vq, uint16_t *reqs, uint16_t reqs_len)
  * Enqueue id and len to used ring.
  */
 static void
-vq_used_ring_enqueue(struct vhost_virtqueue *vq, uint16_t id, uint32_t len)
+vq_used_ring_enqueue(struct rte_vhost_vring *vq, uint16_t id, uint32_t len)
 {
 	struct vring_used *used = vq->used;
 	uint16_t size_mask = vq->size - 1;
@@ -310,7 +392,7 @@ get_scsi_lun(struct spdk_scsi_dev *scsi_dev, const __u8 *lun)
 }
 
 static void
-process_ctrl_request(struct spdk_vhost_scsi_ctrlr *vdev, struct vhost_virtqueue *controlq,
+process_ctrl_request(struct spdk_vhost_scsi_ctrlr *vdev, struct rte_vhost_vring *controlq,
 		     uint16_t req_idx)
 {
 	struct spdk_vhost_task *task;
@@ -323,9 +405,9 @@ process_ctrl_request(struct spdk_vhost_scsi_ctrlr *vdev, struct vhost_virtqueue
 	ctrl_req = (void *)gpa_to_vva(vdev->dev, desc->addr);
 
 	SPDK_TRACELOG(SPDK_TRACE_VHOST_QUEUE,
-		      "Processing controlq descriptor: desc %d/%p, desc_addr %p, len %d, flags %d, last_used_idx %d; enabled %d; kickfd %d; size %d\n",
+		      "Processing controlq descriptor: desc %d/%p, desc_addr %p, len %d, flags %d, last_used_idx %d; kickfd %d; size %d\n",
 		      req_idx, desc, (void *)desc->addr, desc->len, desc->flags, controlq->last_used_idx,
-		      controlq->enabled, controlq->kickfd, controlq->size);
+		      controlq->kickfd, controlq->size);
 	SPDK_TRACEDUMP(SPDK_TRACE_VHOST_QUEUE, "Request desriptor", (uint8_t *)ctrl_req, desc->len);
 
@@ -393,8 +475,8 @@ static int
 task_data_setup(struct spdk_vhost_task *task,
 		struct virtio_scsi_cmd_req **req)
 {
-	struct vhost_virtqueue *vq = task->vq;
-	struct virtio_net *dev = task->vdev->dev;
+	struct rte_vhost_vring *vq = task->vq;
+	struct spdk_vhost_dev *dev = task->vdev->dev;
 	struct vring_desc *desc = &task->vq->desc[task->req_idx];
 	struct iovec *iovs = task->scsi.iovs;
 	uint16_t iovcnt = 0, iovcnt_max = task->scsi.iovcnt;
@@ -550,7 +632,7 @@ process_request(struct spdk_vhost_task *task)
 }
 
 static void
-process_controlq(struct spdk_vhost_scsi_ctrlr *vdev, struct vhost_virtqueue *vq)
+process_controlq(struct spdk_vhost_scsi_ctrlr *vdev, struct rte_vhost_vring *vq)
 {
 	uint16_t reqs[32];
 	uint16_t reqs_cnt, i;
@@ -562,7 +644,7 @@ process_controlq(struct spdk_vhost_scsi_ctrlr *vdev, struct vhost_virtqueue *vq)
 }
 
 static void
-process_requestq(struct spdk_vhost_scsi_ctrlr *vdev, struct vhost_virtqueue *vq)
+process_requestq(struct spdk_vhost_scsi_ctrlr *vdev, struct rte_vhost_vring *vq)
 {
 	uint16_t reqs[32];
 	uint16_t reqs_cnt, i;
@@ -600,7 +682,7 @@ vdev_controlq_worker(void *arg)
 {
 	struct spdk_vhost_scsi_ctrlr *vdev = arg;
 
-	process_controlq(vdev, vdev->dev->virtqueue[VIRTIO_SCSI_CONTROLQ]);
+	process_controlq(vdev, &vdev->dev->virtqueue[VIRTIO_SCSI_CONTROLQ]);
 }
 
 static void
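The ring helpers retyped above exploit the fact that virtio rings are power-of-two sized, so `idx & (size - 1)` replaces a modulo. A sketch of the used-ring enqueue that vq_used_ring_enqueue() performs is below; it assumes the SPDK copy's extended struct rte_vhost_vring, which carries a last_used_idx shadow index (upstream DPDK's does not), and it omits the write barrier that the real code needs before publishing the new index.

#include <stdint.h>
#include <linux/virtio_ring.h>
#include <rte_vhost.h>

/* Illustrative used-ring enqueue: post one completed descriptor chain. */
static void
example_used_ring_enqueue(struct rte_vhost_vring *vq, uint16_t id, uint32_t len)
{
	struct vring_used *used = vq->used;
	uint16_t size_mask = vq->size - 1;	/* ring size is a power of two */
	uint16_t slot = vq->last_used_idx & size_mask;

	used->ring[slot].id = id;	/* head index of the descriptor chain */
	used->ring[slot].len = len;	/* bytes written into the chain */

	/* A write barrier belongs here so the entry is visible before idx. */
	vq->last_used_idx++;
	used->idx = vq->last_used_idx;
}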
@@ -610,7 +692,7 @@ vdev_worker(void *arg)
 {
 	struct spdk_vhost_scsi_ctrlr *vdev = arg;
 	uint32_t q_idx;
 
 	for (q_idx = VIRTIO_SCSI_REQUESTQ; q_idx < vdev->dev->num_queues; q_idx++) {
-		process_requestq(vdev, vdev->dev->virtqueue[q_idx]);
+		process_requestq(vdev, &vdev->dev->virtqueue[q_idx]);
 	}
 }
@@ -653,9 +735,7 @@ static void
 add_vdev_cb(void *arg1, void *arg2)
 {
 	struct spdk_vhost_scsi_ctrlr *vdev = arg1;
-	struct virtio_memory_region *region;
-	struct spdk_vaddr_region *vregion;
-	uint64_t start, end, len;
+	struct rte_vhost_mem_region *region;
 	uint32_t i;
 
 	for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) {
@@ -665,22 +745,18 @@ add_vdev_cb(void *arg1, void *arg2)
 		spdk_scsi_dev_allocate_io_channels(vdev->scsi_dev[i]);
 	}
 	SPDK_NOTICELOG("Started poller for vhost controller %s on lcore %d\n", vdev->name, vdev->lcore);
-	vdev->nregions = vdev->dev->mem->nregions;
-	for (i = 0; i < vdev->nregions; i++) {
+
+	for (i = 0; i < vdev->dev->mem->nregions; i++) {
+		uint64_t start, end, len;
 		region = &vdev->dev->mem->regions[i];
 		start = FLOOR_2MB(region->mmap_addr);
 		end = CEIL_2MB(region->mmap_addr + region->mmap_size);
 		len = end - start;
-		vregion = &vdev->region[i];
-		vregion->vaddr = (void *)start;
-		vregion->len = len;
-		vregion->host_user_addr = region->host_user_addr;
-		vregion->host_user_size = region->size;
+		SPDK_NOTICELOG("Registering VM memory for vtophys translation - 0x%jx len:0x%jx\n",
+			       start, len);
+		spdk_mem_register((void *)start, len);
+		spdk_iommu_mem_register(region->host_user_addr, region->size);
 
-		SPDK_NOTICELOG("Registering VM memory for vtophys translation - %p len:0x%jx\n",
-			       vdev->region[i].vaddr, vdev->region[i].len);
-		spdk_mem_register(vregion->vaddr, vregion->len);
-		spdk_iommu_mem_register(vregion->host_user_addr, vregion->host_user_size);
 	}
 
 	spdk_poller_register(&vdev->requestq_poller, vdev_worker, vdev, vdev->lcore, 0);
@@ -693,7 +769,7 @@ static void
 remove_vdev_cb(void *arg1, void *arg2)
 {
 	struct spdk_vhost_scsi_ctrlr *vdev = arg1;
-	struct spdk_vaddr_region *reg;
+	struct rte_vhost_mem_region *region;
 	uint32_t i;
 
 	for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) {
@@ -704,25 +780,30 @@ remove_vdev_cb(void *arg1, void *arg2)
 	}
 
 	SPDK_NOTICELOG("Stopping poller for vhost controller %s\n", vdev->name);
-	for (i = 0; i < vdev->nregions; i++) {
-		reg = &vdev->region[i];
-		spdk_iommu_mem_unregister(reg->host_user_addr, reg->host_user_size);
-		spdk_mem_unregister(reg->vaddr, reg->len);
+	for (i = 0; i < vdev->dev->mem->nregions; i++) {
+		uint64_t start, end, len;
+		region = &vdev->dev->mem->regions[i];
+		start = FLOOR_2MB(region->mmap_addr);
+		end = CEIL_2MB(region->mmap_addr + region->mmap_size);
+		len = end - start;
+		spdk_iommu_mem_unregister(region->host_user_addr, region->size);
+		spdk_mem_unregister((void *)start, len);
 	}
 
-	vdev->nregions = 0;
-
 	sem_post((sem_t *)arg2);
 }
 
 static void
 destroy_device(int vid)
 {
-	struct spdk_vhost_scsi_ctrlr *vdev = dpdk_vid_mapping[vid];
+	struct spdk_vhost_scsi_ctrlr *vdev;
 	struct spdk_event *event;
 	sem_t done_sem;
 	uint32_t i;
 
+	assert(vid < MAX_VHOST_DEVICE);
+	vdev = dpdk_vid_mapping[vid];
+
 	event = vhost_sem_event_alloc(vdev->lcore, vdev_event_done_cb, NULL, &done_sem);
 	spdk_poller_unregister(&vdev->requestq_poller, event);
 	if (vhost_sem_timedwait(&done_sem, 1))
@@ -749,6 +830,8 @@ destroy_device(int vid)
 	g_num_ctrlrs[vdev->lcore]--;
 	vdev->lcore = -1;
 
+	spdk_vhost_dev_destruct(vdev->dev);
 	vdev->dev = NULL;
 
 	dpdk_vid_mapping[vid] = NULL;
 }
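The registration loops above round each guest memory region out to 2 MB hugepage boundaries before handing it to spdk_mem_register(). A small sketch of that arithmetic follows; FLOOR_2MB/CEIL_2MB are SPDK-internal macros, so the definitions below are illustrative equivalents rather than SPDK's exact ones.

#include <stdint.h>

/* Illustrative stand-ins for SPDK's FLOOR_2MB()/CEIL_2MB() macros. */
#define EX_VALUE_2MB	(2ULL * 1024 * 1024)
#define EX_FLOOR_2MB(x)	((uint64_t)(x) & ~(EX_VALUE_2MB - 1))
#define EX_CEIL_2MB(x)	(((uint64_t)(x) + EX_VALUE_2MB - 1) & ~(EX_VALUE_2MB - 1))

/* Compute the 2 MB-aligned span covering one mmapped guest region, as
 * add_vdev_cb()/remove_vdev_cb() do before (un)registering the memory. */
static void
example_region_to_2mb_span(uint64_t mmap_addr, uint64_t mmap_size,
			   uint64_t *start, uint64_t *len)
{
	uint64_t end;

	*start = EX_FLOOR_2MB(mmap_addr);		/* round down */
	end = EX_CEIL_2MB(mmap_addr + mmap_size);	/* round up */
	*len = end - *start;
}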
@@ -781,6 +864,18 @@ spdk_vhost_scsi_ctrlr_find(const char *ctrlr_name)
 	return NULL;
 }
 
+static int new_device(int vid);
+static void destroy_device(int vid);
+
+/*
+ * These callbacks allow devices to be added to the data core when configuration
+ * has been fully completed.
+ */
+static const struct vhost_device_ops spdk_vhost_scsi_device_ops = {
+	.new_device = new_device,
+	.destroy_device = destroy_device,
+};
+
 int
 spdk_vhost_scsi_ctrlr_construct(const char *name, uint64_t cpumask)
 {
@@ -836,17 +931,36 @@ spdk_vhost_scsi_ctrlr_construct(const char *name, uint64_t cpumask)
 		SPDK_ERRLOG("Check if domain socket %s already exists\n", path);
 		return -EIO;
 	}
+
+	if (rte_vhost_driver_set_features(path, SPDK_VHOST_SCSI_FEATURES) ||
+	    rte_vhost_driver_disable_features(path, SPDK_VHOST_SCSI_DISABLED_FEATURES)) {
+		SPDK_ERRLOG("Couldn't set vhost features for controller %s\n", name);
+		return -EINVAL;
+	}
 
-	vdev = rte_zmalloc(NULL, sizeof(*vdev), RTE_CACHE_LINE_SIZE);
+	if (rte_vhost_driver_callback_register(path, &spdk_vhost_scsi_device_ops) != 0) {
+		SPDK_ERRLOG("Couldn't register callbacks for controller %s\n", name);
+		return -ENOENT;
+	}
+
+	vdev = spdk_zmalloc(sizeof(*vdev), RTE_CACHE_LINE_SIZE, NULL);
 	if (vdev == NULL) {
 		SPDK_ERRLOG("Couldn't allocate memory for vhost dev\n");
 		return -ENOMEM;
 	}
 
-	spdk_vhost_ctrlrs[ctrlr_num] = vdev;
 	vdev->name = strdup(name);
 	vdev->cpumask = cpumask;
 	vdev->lcore = -1;
+
+	if (rte_vhost_driver_start(path) != 0) {
+		SPDK_ERRLOG("Failed to start vhost driver for controller %s (%d): %s\n", name, errno,
+			    strerror(errno));
+		free(vdev->name);
+		spdk_free(vdev);
+		return -EIO;
+	}
+
+	spdk_vhost_ctrlrs[ctrlr_num] = vdev;
+
 	SPDK_NOTICELOG("Controller %s: new controller added\n", name);
 	return 0;
 }
@@ -1086,34 +1200,37 @@ spdk_vhost_scsi_allocate_reactor(uint64_t cpumask)
 static int
 new_device(int vid)
 {
-	struct virtio_net *dev = vhost_devices[vid];
 	struct spdk_vhost_scsi_ctrlr *vdev = NULL;
 	struct spdk_event *event;
-	sem_t added;
-	uint32_t i;
 
-	vdev = spdk_vhost_scsi_ctrlr_find(dev->ifname);
+	char ifname[PATH_MAX];
+	sem_t added;
+
+	assert(vid < MAX_VHOST_DEVICE);
+
+	if (rte_vhost_get_ifname(vid, ifname, PATH_MAX) < 0) {
+		SPDK_ERRLOG("Couldn't get a valid ifname for device %d\n", vid);
+		return -1;
+	}
+
+	vdev = spdk_vhost_scsi_ctrlr_find(ifname);
 	if (vdev == NULL) {
-		SPDK_ERRLOG("Controller %s not found.\n", dev->ifname);
+		SPDK_ERRLOG("Controller %s not found.\n", ifname);
 		return -1;
 	}
 
 	if (vdev->lcore != -1) {
-		SPDK_ERRLOG("Controller %s already connected.\n", dev->ifname);
+		SPDK_ERRLOG("Controller %s already connected.\n", ifname);
+		return -1;
+	}
+
+	assert(vdev->dev == NULL);
+	vdev->dev = spdk_vhost_dev_create(vid);
+	if (vdev->dev == NULL) {
 		return -1;
 	}
 
 	dpdk_vid_mapping[vid] = vdev;
-	vdev->dev = dev;
-
-	/* Disable notifications. */
-	for (i = 0; i < dev->num_queues; i++) {
-		rte_vhost_enable_guest_notification(vid, i, 0);
-	}
-
-	dev->flags |= VIRTIO_DEV_RUNNING;
-	vdev->dev = dev;
-
 	vdev->lcore = spdk_vhost_scsi_allocate_reactor(vdev->cpumask);
 
 	event = vhost_sem_event_alloc(vdev->lcore, add_vdev_cb, vdev, &added);
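Taken together, spdk_vhost_scsi_ctrlr_construct() now performs the DPDK 17.05-style per-socket bring-up condensed in the sketch below. This is illustrative rather than part of the patch; the bare `0` flags argument to rte_vhost_driver_register() and the collapsed error handling are assumptions (the patch registers the socket earlier, outside the hunks shown).

#include <stdint.h>
#include <rte_vhost.h>

/* Illustrative per-socket bring-up: register the socket, constrain the
 * feature bits, attach the device ops, then start the vhost driver for
 * this path. No global session thread is needed with this API. */
static int
example_vhost_socket_start(const char *path, uint64_t features,
			   uint64_t disabled_features,
			   const struct vhost_device_ops *ops)
{
	if (rte_vhost_driver_register(path, 0) != 0)
		return -1;

	if (rte_vhost_driver_set_features(path, features) != 0 ||
	    rte_vhost_driver_disable_features(path, disabled_features) != 0)
		return -1;

	if (rte_vhost_driver_callback_register(path, ops) != 0)
		return -1;

	return rte_vhost_driver_start(path);
}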
@@ -1123,27 +1240,10 @@ new_device(int vid)
 	return 0;
 }
 
-/*
- * These callback allow devices to be added to the data core when configuration
- * has been fully complete.
- */
-static const struct virtio_net_device_ops virtio_net_device_ops = {
-	.new_device = new_device,
-	.destroy_device = destroy_device,
-};
-
-static void *
-session_start(void *arg)
-{
-	rte_vhost_driver_session_start();
-	return NULL;
-}
-
 void
 spdk_vhost_startup(void *arg1, void *arg2)
 {
 	int ret;
-	pthread_t tid;
 	const char *basename = arg1;
 
 	if (basename && strlen(basename) > 0) {
@@ -1161,12 +1261,6 @@ spdk_vhost_startup(void *arg1, void *arg2)
 	ret = spdk_vhost_scsi_controller_construct();
 	if (ret != 0)
 		rte_exit(EXIT_FAILURE, "Cannot construct vhost controllers\n");
-
-	rte_vhost_driver_callback_register(&virtio_net_device_ops);
-
-	if (pthread_create(&tid, NULL, &session_start, NULL) < 0)
-		rte_panic("Failed to start session poller thread (%d): %s", errno, strerror(errno));
-	pthread_detach(tid);
 }
 
 static void *