From 1dbf53eebfb4ef0884ebb59f5944b972b5f483f2 Mon Sep 17 00:00:00 2001 From: Piotr Pelplinski Date: Thu, 2 Mar 2017 15:12:20 +0100 Subject: [PATCH] vhost: add a library and app for userspace vhost-scsi processing This patch adds a library, application and test scripts for extending SPDK to present virtio-scsi controllers to QEMU-based VMs and process I/O submitted to devices attached to those controllers. This functionality is dependent on QEMU patches to enable vhost-scsi in userspace - those patches are currently working their way through the QEMU mailing list, but temporary patches to enable this functionality in QEMU will be made available shortly through the SPDK github repository. Signed-off-by: Jim Harris Signed-off-by: Krzysztof Jakimiak Signed-off-by: Michal Kosciowski Signed-off-by: Karol Latecki Signed-off-by: Piotr Pelplinski Signed-off-by: Daniel Verkamp Signed-off-by: Pawel Wodkowski Signed-off-by: Tomasz Zawadzki Signed-off-by: Krzysztof Jakimiak Change-Id: I138e4021f0ac4b1cd9a6e4041783cdf06e6f0efb --- app/Makefile | 2 +- app/vhost/Makefile | 62 + app/vhost/vhost.c | 164 +++ autotest.sh | 4 + etc/spdk/vhost.conf.in | 133 ++ include/spdk/vhost.h | 70 + lib/Makefile | 2 +- lib/vhost/Makefile | 46 + lib/vhost/rte_vhost/Makefile | 44 + lib/vhost/rte_vhost/fd_man.c | 299 +++++ lib/vhost/rte_vhost/fd_man.h | 67 + lib/vhost/rte_vhost/rte_virtio_net.h | 193 +++ lib/vhost/rte_vhost/socket.c | 619 +++++++++ lib/vhost/rte_vhost/vhost.c | 429 ++++++ lib/vhost/rte_vhost/vhost.h | 294 ++++ lib/vhost/rte_vhost/vhost_user.c | 1042 +++++++++++++++ lib/vhost/rte_vhost/vhost_user.h | 128 ++ lib/vhost/rte_vhost/virtio_net.c | 1186 +++++++++++++++++ lib/vhost/task.c | 162 +++ lib/vhost/task.h | 69 + lib/vhost/vhost.c | 1161 ++++++++++++++++ lib/vhost/vhost_rpc.c | 215 +++ mk/spdk.app.mk | 3 +- scripts/check_format.sh | 8 +- scripts/rpc.py | 13 +- test/vhost/ext4test/ext4connect.sh | 55 + test/vhost/ext4test/ext4start.sh | 97 ++ test/vhost/ext4test/spdk_vm_base.xml | 69 + test/vhost/ext4test/spdk_vnet_base.xml | 11 + test/vhost/ext4test/vhost.conf | 47 + test/vhost/fiotest/README | 85 ++ test/vhost/fiotest/autotest.config | 5 + test/vhost/fiotest/autotest.sh | 257 ++++ test/vhost/fiotest/common.sh | 756 +++++++++++ .../fiotest/fio_jobs/default_integrity.job | 18 + .../fiotest/fio_jobs/default_performance.job | 15 + test/vhost/fiotest/run_fio.py | 312 +++++ test/vhost/fiotest/run_vhost.sh | 49 + test/vhost/fiotest/vhost.conf | 41 + test/vhost/fiotest/vm_run.sh | 48 + test/vhost/fiotest/vm_setup.sh | 78 ++ test/vhost/fiotest/vm_shutdown.sh | 65 + test/vhost/fiotest/vm_ssh.sh | 58 + test/vhost/spdk_vhost.sh | 40 + 44 files changed, 8510 insertions(+), 11 deletions(-) create mode 100644 app/vhost/Makefile create mode 100644 app/vhost/vhost.c create mode 100644 etc/spdk/vhost.conf.in create mode 100644 include/spdk/vhost.h create mode 100644 lib/vhost/Makefile create mode 100644 lib/vhost/rte_vhost/Makefile create mode 100644 lib/vhost/rte_vhost/fd_man.c create mode 100644 lib/vhost/rte_vhost/fd_man.h create mode 100644 lib/vhost/rte_vhost/rte_virtio_net.h create mode 100644 lib/vhost/rte_vhost/socket.c create mode 100644 lib/vhost/rte_vhost/vhost.c create mode 100644 lib/vhost/rte_vhost/vhost.h create mode 100644 lib/vhost/rte_vhost/vhost_user.c create mode 100644 lib/vhost/rte_vhost/vhost_user.h create mode 100644 lib/vhost/rte_vhost/virtio_net.c create mode 100644 lib/vhost/task.c create mode 100644 lib/vhost/task.h create mode 100644 lib/vhost/vhost.c create mode 100644 lib/vhost/vhost_rpc.c 
create mode 100755 test/vhost/ext4test/ext4connect.sh create mode 100755 test/vhost/ext4test/ext4start.sh create mode 100644 test/vhost/ext4test/spdk_vm_base.xml create mode 100644 test/vhost/ext4test/spdk_vnet_base.xml create mode 100644 test/vhost/ext4test/vhost.conf create mode 100644 test/vhost/fiotest/README create mode 100644 test/vhost/fiotest/autotest.config create mode 100755 test/vhost/fiotest/autotest.sh create mode 100644 test/vhost/fiotest/common.sh create mode 100644 test/vhost/fiotest/fio_jobs/default_integrity.job create mode 100644 test/vhost/fiotest/fio_jobs/default_performance.job create mode 100755 test/vhost/fiotest/run_fio.py create mode 100755 test/vhost/fiotest/run_vhost.sh create mode 100644 test/vhost/fiotest/vhost.conf create mode 100755 test/vhost/fiotest/vm_run.sh create mode 100755 test/vhost/fiotest/vm_setup.sh create mode 100755 test/vhost/fiotest/vm_shutdown.sh create mode 100755 test/vhost/fiotest/vm_ssh.sh create mode 100755 test/vhost/spdk_vhost.sh diff --git a/app/Makefile b/app/Makefile index e5698ce41..31555099a 100644 --- a/app/Makefile +++ b/app/Makefile @@ -38,7 +38,7 @@ DIRS-y += trace DIRS-y += nvmf_tgt DIRS-y += iscsi_top ifeq ($(OS),Linux) -DIRS-y += iscsi_tgt +DIRS-y += iscsi_tgt vhost endif .PHONY: all clean $(DIRS-y) diff --git a/app/vhost/Makefile b/app/vhost/Makefile new file mode 100644 index 000000000..aff865fb5 --- /dev/null +++ b/app/vhost/Makefile @@ -0,0 +1,62 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +include $(SPDK_ROOT_DIR)/mk/spdk.app.mk +include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk + +APP = vhost + +CFLAGS += $(ENV_CFLAGS) + +C_SRCS := vhost.c + +SPDK_LIB_LIST = jsonrpc json rpc bdev_rpc bdev scsi net copy trace conf +SPDK_LIB_LIST += util log log_rpc event app_rpc +SPDK_LIB_LIST += vhost rte_vhost + +LIBS += $(BLOCKDEV_MODULES_LINKER_ARGS) \ + $(COPY_MODULES_LINKER_ARGS) +LIBS += $(SPDK_LIB_LINKER_ARGS) +LIBS += $(ENV_LINKER_ARGS) + +all : $(APP) + +$(APP) : $(OBJS) $(SPDK_LIB_FILES) $(ENV_LIBS) $(BLOCKDEV_MODULES_FILES) $(COPY_MODULES_FILES) + $(LINK_C) + +clean : + $(CLEAN_C) $(APP) + +include $(SPDK_ROOT_DIR)/mk/spdk.deps.mk diff --git a/app/vhost/vhost.c b/app/vhost/vhost.c new file mode 100644 index 000000000..a5cc795e3 --- /dev/null +++ b/app/vhost/vhost.c @@ -0,0 +1,164 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include + +#include "spdk/log.h" +#include "spdk/conf.h" +#include "spdk/event.h" + +#include "spdk/vhost.h" + + +#define SPDK_VHOST_DEFAULT_CONFIG "/usr/local/etc/spdk/vhost.conf" +#define SPDK_VHOST_DEFAULT_ENABLE_COREDUMP true +#define SPDK_VHOST_DEFAULT_MEM_SIZE 1024 + +static void +vhost_app_opts_init(struct spdk_app_opts *opts) +{ + spdk_app_opts_init(opts); + opts->name = "vhost"; + opts->config_file = SPDK_VHOST_DEFAULT_CONFIG; + opts->dpdk_mem_size = SPDK_VHOST_DEFAULT_MEM_SIZE; +} + +static void +usage(char *executable_name) +{ + struct spdk_app_opts defaults; + + vhost_app_opts_init(&defaults); + + printf("%s [options]\n", executable_name); + printf("options:\n"); + printf(" -c config config file (default: %s)\n", defaults.config_file); + printf(" -e mask tracepoint group mask for spdk trace buffers (default: 0x0)\n"); + printf(" -m mask reactor core mask (default: 0x1)\n"); + printf(" -l facility use specific syslog facility (default: %s)\n", defaults.log_facility); + printf(" -n channel number of memory channels used for DPDK\n"); + printf(" -p core master (primary) core for DPDK\n"); + printf(" -s size memory size in MB for DPDK (default: %dMB)\n", defaults.dpdk_mem_size); + printf(" -S dir directory where to create vhost sockets (default: pwd)\n"); + spdk_tracelog_usage(stdout, "-t"); + printf(" -h show this usage\n"); + printf(" -d disable coredump file enabling\n"); + printf(" -q disable notice level logging to stderr\n"); +} + +int +main(int argc, char *argv[]) +{ + struct spdk_app_opts opts = {}; + char ch; + int rc; + const char *socket_path = NULL; + + vhost_app_opts_init(&opts); + + while ((ch = getopt(argc, argv, "c:de:l:m:p:qs:S:t:h")) != -1) { + switch (ch) { + case 'c': + opts.config_file = optarg; + break; + case 'd': + opts.enable_coredump = false; + break; + case 'e': + opts.tpoint_group_mask = optarg; + break; + case 'h': + usage(argv[0]); + exit(EXIT_SUCCESS); + case 'l': + opts.log_facility = optarg; + break; + case 'm': + opts.reactor_mask = optarg; + break; + case 'p': + opts.dpdk_master_core = strtoul(optarg, NULL, 10); + break; + case 'q': + spdk_g_notice_stderr_flag = 0; + break; + case 's': + opts.dpdk_mem_size = strtoul(optarg, NULL, 10); + break; + case 'S': + socket_path = optarg; + break; + case 't': + rc = spdk_log_set_trace_flag(optarg); + if (rc < 0) { + fprintf(stderr, "unknown flag\n"); + usage(argv[0]); + exit(EXIT_FAILURE); + } +#ifndef DEBUG + fprintf(stderr, "%s must be rebuilt with CONFIG_DEBUG=y for -t flag.\n", + argv[0]); + usage(argv[0]); + exit(EXIT_FAILURE); +#endif + break; + default: + fprintf(stderr, "%s Unknown option '-%c'.\n", argv[0], ch); + usage(argv[0]); + exit(EXIT_FAILURE); + } + } + + if (spdk_g_notice_stderr_flag == 1 && + isatty(STDERR_FILENO) && + !strncmp(ttyname(STDERR_FILENO), "/dev/tty", strlen("/dev/tty"))) { + printf("Warning: printing stderr to console terminal without -q option specified.\n"); + printf("Suggest using -q to disable logging to stderr and monitor syslog, or\n"); + printf("redirect stderr to a file.\n"); + printf("(Delaying for 10 seconds...)\n"); + sleep(10); + } + + opts.shutdown_cb = spdk_vhost_shutdown_cb; + spdk_app_init(&opts); + + /* Blocks until the application is exiting */ + rc = spdk_app_start(spdk_vhost_startup, (void *)socket_path, NULL); + + spdk_app_fini(); + + return rc; +} diff --git a/autotest.sh b/autotest.sh index 1e7294042..2483ba20a 100755 --- a/autotest.sh +++ b/autotest.sh @@ -137,6 +137,10 @@ timing_exit host timing_exit nvmf 
+timing_enter vhost +run_test ./test/vhost/spdk_vhost.sh --integrity +timing_exit vhost + timing_enter cleanup rbd_cleanup ./scripts/setup.sh reset diff --git a/etc/spdk/vhost.conf.in b/etc/spdk/vhost.conf.in new file mode 100644 index 000000000..fb258e3ad --- /dev/null +++ b/etc/spdk/vhost.conf.in @@ -0,0 +1,133 @@ +# SPDK vhost configuration file +# +# Please write all parameters using ASCII. +# The parameter must be quoted if it includes whitespace. + +# Configuration syntax: +# Leading whitespace is ignored. +# Lines starting with '#' are comments. +# Lines ending with '\' are concatenated with the next line. +# Bracketed ([]) names define sections + +[Global] + # Instance ID for multi-process support + # Default: 0 + #InstanceID 0 + + # Users can restrict work items to only run on certain cores by + # specifying a ReactorMask. Default is to allow work items to run + # on core 0. + #ReactorMask 0xFFFF + + # Tracepoint group mask for spdk trace buffers + # Default: 0x0 (all tracepoint groups disabled) + # Set to 0xFFFFFFFFFFFFFFFF to enable all tracepoint groups. + #TpointGroupMask 0x0 + + # syslog facility + LogFacility "local7" + +[Rpc] + # Defines whether SPDK vhost will enable configuration via RPC. + # Default is disabled. Note that the RPC interface is not + # authenticated, so users should be careful about enabling + # RPC in non-trusted environments. + Enable No + # Listen address for the RPC service. + # May be an IP address or an absolute path to a Unix socket. + Listen 127.0.0.1 + +# Users may not want to use offload even if it is available. +# Users may use the whitelist to initialize specified devices, IDS +# uses BUS:DEVICE.FUNCTION to identify each Ioat channel. +[Ioat] + Disable Yes + #Whitelist 00:04.0 + #Whitelist 00:04.1 + +# Users must change this section to match the /dev/sdX devices to be +# exported as vhost scsi drives. The devices are accessed using Linux AIO. +[AIO] + #AIO /dev/sdb + #AIO /dev/sdc + +# Users may change this section to create a different number or size of +# malloc LUNs. +# If the system has a hardware DMA engine, it will use an IOAT +# (i.e. Crystal Beach DMA) channel to do the copy instead of memcpy. +# Of course, users can disable offload even if it is available. +[Malloc] + # Number of Malloc targets + NumberOfLuns 3 + # Malloc targets are 128M + LunSizeInMB 128 + # Block size. Default is 512 bytes. + BlockSize 4096 + +# NVMe configuration options +[Nvme] + # NVMe Device Whitelist + # Users may specify which NVMe devices to claim by their PCI + # domain, bus, device, and function. The format is dddd:bb:dd.f, which is + # the same format displayed by lspci or in /sys/bus/pci/devices. The second + # argument is a "name" for the device that can be anything. The name + # is referenced later in the Subsystem section. + # + # Alternatively, the user can specify ClaimAllDevices. All + # NVMe devices will be claimed and named Nvme0, Nvme1, etc. + #BDF 0000:81:00.0 Nvme0 + #BDF 0000:01:00.0 Nvme1 + ClaimAllDevices + + # The number of attempts per I/O when an I/O fails. Do not include + # this key to get the default behavior. + NvmeRetryCount 4 + # The maximum number of NVMe controllers to claim. Do not include this key to + # claim all of them. + NumControllers 2 + # Registers the application to receive timeout callback and to reset the controller. + ResetControllerOnTimeout Yes + # Timeout value. + NvmeTimeoutValue 30 + # Set how often the admin queue is polled for asynchronous events. + # Units in microseconds.
+ AdminPollRate 100000 + +# The Split virtual block device slices block devices into multiple smaller bdevs. +[Split] + # Syntax: + # Split <bdev> <count> [<size_in_megabytes>] + # + # Split Nvme1n1 into two equally-sized portions, Nvme1n1p0 and Nvme1n1p1 + #Split Nvme1n1 2 + + # Split Malloc2 into eight 1-megabyte portions, Malloc2p0 ... Malloc2p7, + # leaving the rest of the device inaccessible + #Split Malloc2 8 1 + +# Vhost scsi controller configuration +# Users should change the VhostScsi section(s) below to match the desired +# vhost configuration. +# Name is the minimum required parameter +[VhostScsi0] + # Define name for controller + Name vhost.0 + # Assign devices from backend + # Use the first malloc device + Dev0 Malloc0 + # Use the first AIO device + #Dev1 AIO0 + # Use the first Nvme device + #Dev2 Nvme0n1 + # Use the third partition from the second Nvme device + #Dev3 Nvme1n1p2 + + # Start the poller for this vhost controller on one of the cores in + # this cpumask. By default, if not specified, any core in the + # SPDK process will be used. + #Cpumask 0x1 + +#[VhostScsi1] +# Name vhost.1 +# Dev0 AIO1 +# Cpumask 0x1 diff --git a/include/spdk/vhost.h b/include/spdk/vhost.h new file mode 100644 index 000000000..75e362f57 --- /dev/null +++ b/include/spdk/vhost.h @@ -0,0 +1,70 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * SPDK vhost + */ + +#ifndef SPDK_VHOST_H +#define SPDK_VHOST_H + +#include "spdk/event.h" + +#define SPDK_VHOST_SCSI_CTRLR_MAX_DEVS 8 + +/** + * \param event event object. event arg1 is optional path to vhost socket. + */ +void spdk_vhost_startup(void *arg1, void *arg2); +void spdk_vhost_shutdown_cb(void); + +/* Forward declaration */ +struct spdk_vhost_scsi_ctrlr; + +/** + * Get handle to next controller. + * \param prev Previous controller or NULL to get first one. + * \return handle to next controller or NULL if prev was the last one.
+ */ +struct spdk_vhost_scsi_ctrlr *spdk_vhost_scsi_ctrlr_next(struct spdk_vhost_scsi_ctrlr *prev); + +const char *spdk_vhost_scsi_ctrlr_get_name(struct spdk_vhost_scsi_ctrlr *ctrl); +uint64_t spdk_vhost_scsi_ctrlr_get_cpumask(struct spdk_vhost_scsi_ctrlr *ctrl); +int spdk_vhost_scsi_ctrlr_construct(const char *name, uint64_t cpumask); +int spdk_vhost_parse_core_mask(const char *mask, uint64_t *cpumask); +struct spdk_scsi_dev *spdk_vhost_scsi_ctrlr_get_dev(struct spdk_vhost_scsi_ctrlr *ctrl, + uint8_t num); +int spdk_vhost_scsi_ctrlr_add_dev(const char *name, unsigned scsi_dev_num, const char *lun_name); + +#endif /* SPDK_VHOST_H */ diff --git a/lib/Makefile b/lib/Makefile index 3f86753b9..b8f54e0d9 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -37,7 +37,7 @@ include $(SPDK_ROOT_DIR)/mk/spdk.common.mk DIRS-y += bdev conf copy cunit event json jsonrpc \ log env_dpdk net rpc trace util nvme nvmf scsi ioat ifeq ($(OS),Linux) -DIRS-y += iscsi +DIRS-y += iscsi vhost endif diff --git a/lib/vhost/Makefile b/lib/vhost/Makefile new file mode 100644 index 000000000..bbf38fc3d --- /dev/null +++ b/lib/vhost/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +CFLAGS += -Irte_vhost +CFLAGS += $(ENV_CFLAGS) + +C_SRCS = task.c vhost.c vhost_rpc.c + +LIBNAME = vhost + +DIRS-y += rte_vhost + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/lib/vhost/rte_vhost/Makefile b/lib/vhost/rte_vhost/Makefile new file mode 100644 index 000000000..336425818 --- /dev/null +++ b/lib/vhost/rte_vhost/Makefile @@ -0,0 +1,44 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +CFLAGS += $(ENV_CFLAGS) + +# These are the DPDK vhost files copied (for now) into SPDK +C_SRCS += fd_man.c socket.c vhost_user.c virtio_net.c vhost.c + +LIBNAME = rte_vhost + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/lib/vhost/rte_vhost/fd_man.c b/lib/vhost/rte_vhost/fd_man.c new file mode 100644 index 000000000..2d3eeb7d7 --- /dev/null +++ b/lib/vhost/rte_vhost/fd_man.c @@ -0,0 +1,299 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "fd_man.h" + +/** + * Returns the index in the fdset for a given fd. 
+ * If fd is -1, it means to search for a free entry. + * @return + * index for the fd, or -1 if fd isn't in the fdset. + */ +static int +fdset_find_fd(struct fdset *pfdset, int fd) +{ + int i; + + if (pfdset == NULL) + return -1; + + for (i = 0; i < MAX_FDS && pfdset->fd[i].fd != fd; i++) + ; + + return i == MAX_FDS ? -1 : i; +} + +static int +fdset_find_free_slot(struct fdset *pfdset) +{ + return fdset_find_fd(pfdset, -1); +} + +static int +fdset_add_fd(struct fdset *pfdset, int idx, int fd, + fd_cb rcb, fd_cb wcb, void *dat) +{ + struct fdentry *pfdentry; + + if (pfdset == NULL || idx >= MAX_FDS || fd >= FD_SETSIZE) + return -1; + + pfdentry = &pfdset->fd[idx]; + pfdentry->fd = fd; + pfdentry->rcb = rcb; + pfdentry->wcb = wcb; + pfdentry->dat = dat; + + return 0; +} + +/** + * Fill the read/write fd_set with the fds in the fdset. + * @return + * the maximum fds filled in the read/write fd_set. + */ +static int +fdset_fill(fd_set *rfset, fd_set *wfset, struct fdset *pfdset) +{ + struct fdentry *pfdentry; + int i, maxfds = -1; + int num = MAX_FDS; + + if (pfdset == NULL) + return -1; + + for (i = 0; i < num; i++) { + pfdentry = &pfdset->fd[i]; + if (pfdentry->fd != -1) { + int added = 0; + if (pfdentry->rcb && rfset) { + FD_SET(pfdentry->fd, rfset); + added = 1; + } + if (pfdentry->wcb && wfset) { + FD_SET(pfdentry->fd, wfset); + added = 1; + } + if (added) + maxfds = pfdentry->fd < maxfds ? + maxfds : pfdentry->fd; + } + } + return maxfds; +} + +void +fdset_init(struct fdset *pfdset) +{ + int i; + + if (pfdset == NULL) + return; + + for (i = 0; i < MAX_FDS; i++) { + pfdset->fd[i].fd = -1; + pfdset->fd[i].dat = NULL; + } + pfdset->num = 0; +} + +/** + * Register the fd in the fdset with read/write handler and context. + */ +int +fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat) +{ + int i; + + if (pfdset == NULL || fd == -1) + return -1; + + pthread_mutex_lock(&pfdset->fd_mutex); + + /* Find a free slot in the list. */ + i = fdset_find_free_slot(pfdset); + if (i == -1 || fdset_add_fd(pfdset, i, fd, rcb, wcb, dat) < 0) { + pthread_mutex_unlock(&pfdset->fd_mutex); + return -2; + } + + pfdset->num++; + + pthread_mutex_unlock(&pfdset->fd_mutex); + + return 0; +} + +/** + * Unregister the fd from the fdset. + * Returns context of a given fd or NULL. + */ +void * +fdset_del(struct fdset *pfdset, int fd) +{ + int i; + void *dat = NULL; + + if (pfdset == NULL || fd == -1) + return NULL; + + do { + pthread_mutex_lock(&pfdset->fd_mutex); + + i = fdset_find_fd(pfdset, fd); + if (i != -1 && pfdset->fd[i].busy == 0) { + /* busy indicates r/wcb is executing! */ + dat = pfdset->fd[i].dat; + pfdset->fd[i].fd = -1; + pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL; + pfdset->fd[i].dat = NULL; + pfdset->num--; + i = -1; + } + pthread_mutex_unlock(&pfdset->fd_mutex); + } while (i != -1); + + return dat; +} + +/** + * Unregister the fd at the specified slot from the fdset. + */ +static void +fdset_del_slot(struct fdset *pfdset, int index) +{ + if (pfdset == NULL || index < 0 || index >= MAX_FDS) + return; + + pthread_mutex_lock(&pfdset->fd_mutex); + + pfdset->fd[index].fd = -1; + pfdset->fd[index].rcb = pfdset->fd[index].wcb = NULL; + pfdset->fd[index].dat = NULL; + pfdset->num--; + + pthread_mutex_unlock(&pfdset->fd_mutex); +} + +/** + * This functions runs in infinite blocking loop until there is no fd in + * pfdset. It calls corresponding r/w handler if there is event on the fd. 
+ * + * Before the callback is called, we set the flag to busy status; If other + * thread(now rte_vhost_driver_unregister) calls fdset_del concurrently, it + * will wait until the flag is reset to zero(which indicates the callback is + * finished), then it could free the context after fdset_del. + */ +void +fdset_event_dispatch(struct fdset *pfdset) +{ + fd_set rfds, wfds; + int i, maxfds; + struct fdentry *pfdentry; + int num = MAX_FDS; + fd_cb rcb, wcb; + void *dat; + int fd; + int remove1, remove2; + int ret; + + if (pfdset == NULL) + return; + + while (1) { + struct timeval tv; + tv.tv_sec = 1; + tv.tv_usec = 0; + FD_ZERO(&rfds); + FD_ZERO(&wfds); + pthread_mutex_lock(&pfdset->fd_mutex); + + maxfds = fdset_fill(&rfds, &wfds, pfdset); + + pthread_mutex_unlock(&pfdset->fd_mutex); + + /* + * When select is blocked, other threads might unregister + * listenfds from and register new listenfds into fdset. + * When select returns, the entries for listenfds in the fdset + * might have been updated. It is ok if there is unwanted call + * for new listenfds. + */ + ret = select(maxfds + 1, &rfds, &wfds, NULL, &tv); + if (ret <= 0) + continue; + + for (i = 0; i < num; i++) { + remove1 = remove2 = 0; + pthread_mutex_lock(&pfdset->fd_mutex); + pfdentry = &pfdset->fd[i]; + fd = pfdentry->fd; + rcb = pfdentry->rcb; + wcb = pfdentry->wcb; + dat = pfdentry->dat; + pfdentry->busy = 1; + pthread_mutex_unlock(&pfdset->fd_mutex); + if (fd >= 0 && FD_ISSET(fd, &rfds) && rcb) + rcb(fd, dat, &remove1); + if (fd >= 0 && FD_ISSET(fd, &wfds) && wcb) + wcb(fd, dat, &remove2); + pfdentry->busy = 0; + /* + * fdset_del needs to check busy flag. + * We don't allow fdset_del to be called in callback + * directly. + */ + /* + * When we are to clean up the fd from fdset, + * because the fd is closed in the cb, + * the old fd val could be reused by when creates new + * listen fd in another thread, we couldn't call + * fd_set_del. + */ + if (remove1 || remove2) + fdset_del_slot(pfdset, i); + } + } +} diff --git a/lib/vhost/rte_vhost/fd_man.h b/lib/vhost/rte_vhost/fd_man.h new file mode 100644 index 000000000..bd66ed1c5 --- /dev/null +++ b/lib/vhost/rte_vhost/fd_man.h @@ -0,0 +1,67 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _FD_MAN_H_ +#define _FD_MAN_H_ +#include +#include + +#define MAX_FDS 1024 + +typedef void (*fd_cb)(int fd, void *dat, int *remove); + +struct fdentry { + int fd; /* -1 indicates this entry is empty */ + fd_cb rcb; /* callback when this fd is readable. */ + fd_cb wcb; /* callback when this fd is writeable.*/ + void *dat; /* fd context */ + int busy; /* whether this entry is being used in cb. */ +}; + +struct fdset { + struct fdentry fd[MAX_FDS]; + pthread_mutex_t fd_mutex; + int num; /* current fd number of this fdset */ +}; + + +void fdset_init(struct fdset *pfdset); + +int fdset_add(struct fdset *pfdset, int fd, + fd_cb rcb, fd_cb wcb, void *dat); + +void *fdset_del(struct fdset *pfdset, int fd); + +void fdset_event_dispatch(struct fdset *pfdset); + +#endif diff --git a/lib/vhost/rte_vhost/rte_virtio_net.h b/lib/vhost/rte_vhost/rte_virtio_net.h new file mode 100644 index 000000000..926039c5a --- /dev/null +++ b/lib/vhost/rte_vhost/rte_virtio_net.h @@ -0,0 +1,193 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _VIRTIO_NET_H_ +#define _VIRTIO_NET_H_ + +/** + * @file + * Interface to vhost net + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define RTE_VHOST_USER_CLIENT (1ULL << 0) +#define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1) +#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2) + +/* Enum for virtqueue management. */ +enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; + +/** + * Device and vring operations. + */ +struct virtio_net_device_ops { + int (*new_device)(int vid); /**< Add device. */ + void (*destroy_device)(int vid); /**< Remove device. */ + + int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */ + + void *reserved[5]; /**< Reserved for future extension */ +}; + +/** + * Disable features in feature_mask. Returns 0 on success. + */ +int rte_vhost_feature_disable(uint64_t feature_mask); + +/** + * Enable features in feature_mask. Returns 0 on success. + */ +int rte_vhost_feature_enable(uint64_t feature_mask); + +/* Returns currently supported vhost features */ +uint64_t rte_vhost_feature_get(void); + +int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable); + +/** + * Register vhost driver. path could be different for multiple + * instance support. + */ +int rte_vhost_driver_register(const char *path, uint64_t flags); + +/* Unregister vhost driver. This is only meaningful to vhost user. */ +int rte_vhost_driver_unregister(const char *path); + +/* Register callbacks. */ +int rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const); +/* Start vhost driver session blocking loop. */ +int rte_vhost_driver_session_start(void); + +/** + * Get the numa node from which the virtio net device's memory + * is allocated. + * + * @param vid + * virtio-net device ID + * + * @return + * The numa node, -1 on failure + */ +int rte_vhost_get_numa_node(int vid); + +/** + * Get the number of queues the device supports. + * + * @param vid + * virtio-net device ID + * + * @return + * The number of queues, 0 on failure + */ +uint32_t rte_vhost_get_queue_num(int vid); + +/** + * Get the virtio net device's ifname, which is the vhost-user socket + * file path. + * + * @param vid + * virtio-net device ID + * @param buf + * The buffer to stored the queried ifname + * @param len + * The length of buf + * + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_ifname(int vid, char *buf, size_t len); + +/** + * Get how many avail entries are left in the queue + * + * @param vid + * virtio-net device ID + * @param queue_id + * virtio queue index + * + * @return + * num of avail entires left + */ +uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id); + +/** + * This function adds buffers to the virtio devices RX virtqueue. Buffers can + * be received from the physical port or from another virtual device. A packet + * count is returned to indicate the number of packets that were succesfully + * added to the RX queue. 
+ * @param vid + * virtio-net device ID + * @param queue_id + * virtio queue index in mq case + * @param pkts + * array to contain packets to be enqueued + * @param count + * packets num to be enqueued + * @return + * num of packets enqueued + */ +uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id, + struct rte_mbuf **pkts, uint16_t count); + +/** + * This function gets guest buffers from the virtio device TX virtqueue, + * construct host mbufs, copies guest buffer content to host mbufs and + * store them in pkts to be processed. + * @param vid + * virtio-net device + * @param queue_id + * virtio queue index in mq case + * @param mbuf_pool + * mbuf_pool where host mbuf is allocated. + * @param pkts + * array to contain packets to be dequeued + * @param count + * packets num to be dequeued + * @return + * num of packets dequeued + */ +uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count); + +#endif /* _VIRTIO_NET_H_ */ diff --git a/lib/vhost/rte_vhost/socket.c b/lib/vhost/rte_vhost/socket.c new file mode 100644 index 000000000..9276ce58c --- /dev/null +++ b/lib/vhost/rte_vhost/socket.c @@ -0,0 +1,619 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "fd_man.h" +#include "vhost.h" +#include "vhost_user.h" + +/* + * Every time rte_vhost_driver_register() is invoked, an associated + * vhost_user_socket struct will be created. 
+ */ +struct vhost_user_socket { + char *path; + int listenfd; + int connfd; + bool is_server; + bool reconnect; + bool dequeue_zero_copy; +}; + +struct vhost_user_connection { + struct vhost_user_socket *vsocket; + int vid; +}; + +#define MAX_VHOST_SOCKET 1024 +struct vhost_user { + struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET]; + struct fdset fdset; + int vsocket_cnt; + pthread_mutex_t mutex; +}; + +#define MAX_VIRTIO_BACKLOG 128 + +static void vhost_user_server_new_connection(int fd, void *data, int *remove); +static void vhost_user_read_cb(int fd, void *dat, int *remove); +static int vhost_user_create_client(struct vhost_user_socket *vsocket); + +static struct vhost_user vhost_user = { + .fdset = { + .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} }, + .fd_mutex = PTHREAD_MUTEX_INITIALIZER, + .num = 0 + }, + .vsocket_cnt = 0, + .mutex = PTHREAD_MUTEX_INITIALIZER, +}; + +/* return bytes# of read on success or negative val on failure. */ +int +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + struct iovec iov; + struct msghdr msgh; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + ret = recvmsg(sockfd, &msgh, 0); + if (ret <= 0) { + RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n"); + return ret; + } + + if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n"); + return -1; + } + + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + if ((cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_RIGHTS)) { + memcpy(fds, CMSG_DATA(cmsg), fdsize); + break; + } + } + + return ret; +} + +int +send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + + struct iovec iov; + struct msghdr msgh; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + if (fds && fd_num > 0) { + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + cmsg = CMSG_FIRSTHDR(&msgh); + if (cmsg == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, "null cmsg\n"); + return -1; + } + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), fds, fdsize); + } else { + msgh.msg_control = NULL; + msgh.msg_controllen = 0; + } + + do { + ret = sendmsg(sockfd, &msgh, 0); + } while (ret < 0 && errno == EINTR); + + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n"); + return ret; + } + + return ret; +} + +static void +vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) +{ + int vid; + size_t size; + struct vhost_user_connection *conn; + int ret; + + conn = malloc(sizeof(*conn)); + if (conn == NULL) { + close(fd); + return; + } + + vid = vhost_new_device(); + if (vid == -1) { + close(fd); + free(conn); + return; + } + + size = strnlen(vsocket->path, PATH_MAX); + vhost_set_ifname(vid, vsocket->path, size); + + if (vsocket->dequeue_zero_copy) + vhost_enable_dequeue_zero_copy(vid); + + RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid); + + vsocket->connfd = fd; + conn->vsocket = vsocket; + conn->vid = vid; + ret = fdset_add(&vhost_user.fdset, 
fd, vhost_user_read_cb, + NULL, conn); + if (ret < 0) { + vsocket->connfd = -1; + free(conn); + close(fd); + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add fd %d into vhost server fdset\n", + fd); + } +} + +/* call back when there is new vhost-user connection from client */ +static void +vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused) +{ + struct vhost_user_socket *vsocket = dat; + + fd = accept(fd, NULL, NULL); + if (fd < 0) + return; + + RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd); + vhost_user_add_connection(fd, vsocket); +} + +static void +vhost_user_read_cb(int connfd, void *dat, int *remove) +{ + struct vhost_user_connection *conn = dat; + struct vhost_user_socket *vsocket = conn->vsocket; + int ret; + + ret = vhost_user_msg_handler(conn->vid, connfd); + if (ret < 0) { + vsocket->connfd = -1; + close(connfd); + *remove = 1; + vhost_destroy_device(conn->vid); + free(conn); + + if (vsocket->reconnect) + vhost_user_create_client(vsocket); + } +} + +static int +create_unix_socket(const char *path, struct sockaddr_un *un, bool is_server) +{ + int fd; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return -1; + RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n", + is_server ? "server" : "client", fd); + + if (!is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) { + RTE_LOG(ERR, VHOST_CONFIG, + "vhost-user: can't set nonblocking mode for socket, fd: " + "%d (%s)\n", fd, strerror(errno)); + close(fd); + return -1; + } + + memset(un, 0, sizeof(*un)); + un->sun_family = AF_UNIX; + strncpy(un->sun_path, path, sizeof(un->sun_path)); + un->sun_path[sizeof(un->sun_path) - 1] = '\0'; + + return fd; +} + +static int +vhost_user_create_server(struct vhost_user_socket *vsocket) +{ + int fd; + int ret; + struct sockaddr_un un; + const char *path = vsocket->path; + + fd = create_unix_socket(path, &un, vsocket->is_server); + if (fd < 0) + return -1; + + ret = bind(fd, (struct sockaddr *)&un, sizeof(un)); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to bind to %s: %s; remove it and try again\n", + path, strerror(errno)); + goto err; + } + RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path); + + ret = listen(fd, MAX_VIRTIO_BACKLOG); + if (ret < 0) + goto err; + + vsocket->listenfd = fd; + ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection, + NULL, vsocket); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add listen fd %d to vhost server fdset\n", + fd); + goto err; + } + + return 0; + +err: + close(fd); + return -1; +} + +struct vhost_user_reconnect { + struct sockaddr_un un; + int fd; + struct vhost_user_socket *vsocket; + + TAILQ_ENTRY(vhost_user_reconnect) next; +}; + +TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect); +struct vhost_user_reconnect_list { + struct vhost_user_reconnect_tailq_list head; + pthread_mutex_t mutex; +}; + +static struct vhost_user_reconnect_list reconn_list; +static pthread_t reconn_tid; + +static int +vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz) +{ + int ret, flags; + + ret = connect(fd, un, sz); + if (ret < 0 && errno != EISCONN) + return -1; + + flags = fcntl(fd, F_GETFL, 0); + if (flags < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "can't get flags for connfd %d\n", fd); + return -2; + } + if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) { + RTE_LOG(ERR, VHOST_CONFIG, + "can't disable nonblocking on fd %d\n", fd); + return -2; + } + return 0; +} + +static void * +vhost_user_client_reconnect(void *arg 
__rte_unused) +{ + int ret; + struct vhost_user_reconnect *reconn, *next; + + while (1) { + pthread_mutex_lock(&reconn_list.mutex); + + /* + * An equal implementation of TAILQ_FOREACH_SAFE, + * which does not exist on all platforms. + */ + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + ret = vhost_user_connect_nonblock(reconn->fd, + (struct sockaddr *)&reconn->un, + sizeof(reconn->un)); + if (ret == -2) { + close(reconn->fd); + RTE_LOG(ERR, VHOST_CONFIG, + "reconnection for fd %d failed\n", + reconn->fd); + goto remove_fd; + } + if (ret == -1) + continue; + + RTE_LOG(INFO, VHOST_CONFIG, + "%s: connected\n", reconn->vsocket->path); + vhost_user_add_connection(reconn->fd, reconn->vsocket); +remove_fd: + TAILQ_REMOVE(&reconn_list.head, reconn, next); + free(reconn); + } + + pthread_mutex_unlock(&reconn_list.mutex); + sleep(1); + } + + return NULL; +} + +static int +vhost_user_reconnect_init(void) +{ + int ret; + + pthread_mutex_init(&reconn_list.mutex, NULL); + TAILQ_INIT(&reconn_list.head); + + ret = pthread_create(&reconn_tid, NULL, + vhost_user_client_reconnect, NULL); + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread"); + + return ret; +} + +static int +vhost_user_create_client(struct vhost_user_socket *vsocket) +{ + int fd; + int ret; + struct sockaddr_un un; + const char *path = vsocket->path; + struct vhost_user_reconnect *reconn; + + fd = create_unix_socket(path, &un, vsocket->is_server); + if (fd < 0) + return -1; + + ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&un, + sizeof(un)); + if (ret == 0) { + vhost_user_add_connection(fd, vsocket); + return 0; + } + + RTE_LOG(ERR, VHOST_CONFIG, + "failed to connect to %s: %s\n", + path, strerror(errno)); + + if (ret == -2 || !vsocket->reconnect) { + close(fd); + return -1; + } + + RTE_LOG(ERR, VHOST_CONFIG, "%s: reconnecting...\n", path); + reconn = malloc(sizeof(*reconn)); + if (reconn == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for reconnect\n"); + close(fd); + return -1; + } + reconn->un = un; + reconn->fd = fd; + reconn->vsocket = vsocket; + pthread_mutex_lock(&reconn_list.mutex); + TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next); + pthread_mutex_unlock(&reconn_list.mutex); + + return 0; +} + +/* + * Register a new vhost-user socket; here we could act as server + * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag + * is set. 
+ */ +int +rte_vhost_driver_register(const char *path, uint64_t flags) +{ + int ret = -1; + struct vhost_user_socket *vsocket; + + if (!path) + return -1; + + pthread_mutex_lock(&vhost_user.mutex); + + if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) { + RTE_LOG(ERR, VHOST_CONFIG, + "error: the number of vhost sockets reaches maximum\n"); + goto out; + } + + vsocket = malloc(sizeof(struct vhost_user_socket)); + if (!vsocket) + goto out; + memset(vsocket, 0, sizeof(struct vhost_user_socket)); + vsocket->path = strdup(path); + vsocket->connfd = -1; + vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY; + + if ((flags & RTE_VHOST_USER_CLIENT) != 0) { + vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT); + if (vsocket->reconnect && reconn_tid == 0) { + if (vhost_user_reconnect_init() < 0) { + free(vsocket->path); + free(vsocket); + goto out; + } + } + ret = vhost_user_create_client(vsocket); + } else { + vsocket->is_server = true; + ret = vhost_user_create_server(vsocket); + } + if (ret < 0) { + free(vsocket->path); + free(vsocket); + goto out; + } + + vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket; + +out: + pthread_mutex_unlock(&vhost_user.mutex); + + return ret; +} + +static bool +vhost_user_remove_reconnect(struct vhost_user_socket *vsocket) +{ + int found = false; + struct vhost_user_reconnect *reconn, *next; + + pthread_mutex_lock(&reconn_list.mutex); + + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + if (reconn->vsocket == vsocket) { + TAILQ_REMOVE(&reconn_list.head, reconn, next); + close(reconn->fd); + free(reconn); + found = true; + break; + } + } + pthread_mutex_unlock(&reconn_list.mutex); + return found; +} + +/** + * Unregister the specified vhost socket + */ +int +rte_vhost_driver_unregister(const char *path) +{ + int i; + int count; + struct vhost_user_connection *conn; + + pthread_mutex_lock(&vhost_user.mutex); + + for (i = 0; i < vhost_user.vsocket_cnt; i++) { + struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; + + if (!strcmp(vsocket->path, path)) { + if (vsocket->is_server) { + fdset_del(&vhost_user.fdset, vsocket->listenfd); + close(vsocket->listenfd); + unlink(path); + } else if (vsocket->reconnect) { + vhost_user_remove_reconnect(vsocket); + } + + conn = fdset_del(&vhost_user.fdset, vsocket->connfd); + if (conn) { + RTE_LOG(INFO, VHOST_CONFIG, + "free connfd = %d for device '%s'\n", + vsocket->connfd, path); + close(vsocket->connfd); + vhost_destroy_device(conn->vid); + free(conn); + } + + free(vsocket->path); + free(vsocket); + + count = --vhost_user.vsocket_cnt; + vhost_user.vsockets[i] = vhost_user.vsockets[count]; + vhost_user.vsockets[count] = NULL; + pthread_mutex_unlock(&vhost_user.mutex); + + return 0; + } + } + pthread_mutex_unlock(&vhost_user.mutex); + + return -1; +} + +int +rte_vhost_driver_session_start(void) +{ + fdset_event_dispatch(&vhost_user.fdset); + return 0; +} diff --git a/lib/vhost/rte_vhost/vhost.c b/lib/vhost/rte_vhost/vhost.c new file mode 100644 index 000000000..5270410b6 --- /dev/null +++ b/lib/vhost/rte_vhost/vhost.c @@ -0,0 +1,429 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#ifdef RTE_LIBRTE_VHOST_NUMA +#include +#endif + +#include +#include +#include +#include +#include +#include + +#include "vhost.h" + +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +/* Features supported by this lib. */ +#define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ + (1ULL << VIRTIO_NET_F_CTRL_VQ) | \ + (1ULL << VIRTIO_NET_F_CTRL_RX) | \ + (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \ + (VHOST_SUPPORTS_MQ) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VHOST_F_LOG_ALL) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO4) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO6) | \ + (1ULL << VIRTIO_NET_F_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO6)) + +uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES; + +struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; + +/* device ops to add/remove device to/from data core. */ +struct virtio_net_device_ops const *notify_ops; + +struct virtio_net * +get_device(int vid) +{ + struct virtio_net *dev = vhost_devices[vid]; + + if (unlikely(!dev)) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) device not found.\n", vid); + } + + return dev; +} + +static void +cleanup_vq(struct vhost_virtqueue *vq, int destroy) +{ + if ((vq->callfd >= 0) && (destroy != 0)) + close(vq->callfd); + if (vq->kickfd >= 0) + close(vq->kickfd); +} + +/* + * Unmap any memory, close any file descriptors and + * free any memory owned by a device. + */ +void +cleanup_device(struct virtio_net *dev, int destroy) +{ + uint32_t i; + + vhost_backend_cleanup(dev); + + for (i = 0; i < dev->virt_qp_nb; i++) { + cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ], destroy); + cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ], destroy); + } +} + +/* + * Release virtqueues and device memory. 
+ */ +static void +free_device(struct virtio_net *dev) +{ + uint32_t i; + struct vhost_virtqueue *rxq, *txq; + + for (i = 0; i < dev->virt_qp_nb; i++) { + rxq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ]; + txq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ]; + + rte_free(rxq->shadow_used_ring); + rte_free(txq->shadow_used_ring); + + /* rxq and txq are allocated together as queue-pair */ + rte_free(rxq); + } + + rte_free(dev); +} + +static void +init_vring_queue(struct vhost_virtqueue *vq, int qp_idx) +{ + memset(vq, 0, sizeof(struct vhost_virtqueue)); + + vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; + + /* Backends are set to -1 indicating an inactive device. */ + vq->backend = -1; + + /* always set the default vq pair to enabled */ + if (qp_idx == 0) + vq->enabled = 1; + + TAILQ_INIT(&vq->zmbuf_list); +} + +static void +init_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx) +{ + uint32_t base_idx = qp_idx * VIRTIO_QNUM; + + init_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx); + init_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx); +} + +static void +reset_vring_queue(struct vhost_virtqueue *vq, int qp_idx) +{ + int callfd; + + callfd = vq->callfd; + init_vring_queue(vq, qp_idx); + vq->callfd = callfd; +} + +static void +reset_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx) +{ + uint32_t base_idx = qp_idx * VIRTIO_QNUM; + + reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx); + reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx); +} + +int +alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx) +{ + struct vhost_virtqueue *virtqueue = NULL; + uint32_t virt_rx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_RXQ; + uint32_t virt_tx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_TXQ; + + virtqueue = rte_malloc(NULL, + sizeof(struct vhost_virtqueue) * VIRTIO_QNUM, 0); + if (virtqueue == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for virt qp:%d.\n", qp_idx); + return -1; + } + + dev->virtqueue[virt_rx_q_idx] = virtqueue; + dev->virtqueue[virt_tx_q_idx] = virtqueue + VIRTIO_TXQ; + + init_vring_queue_pair(dev, qp_idx); + + dev->virt_qp_nb += 1; + + return 0; +} + +/* + * Reset some variables in device structure, while keeping few + * others untouched, such as vid, ifname, virt_qp_nb: they + * should be same unless the device is removed. + */ +void +reset_device(struct virtio_net *dev) +{ + uint32_t i; + + dev->features = 0; + dev->protocol_features = 0; + dev->flags = 0; + + for (i = 0; i < dev->virt_qp_nb; i++) + reset_vring_queue_pair(dev, i); +} + +/* + * Invoked when there is a new vhost-user connection established (when + * there is a new virtio device being attached). + */ +int +vhost_new_device(void) +{ + struct virtio_net *dev; + int i; + + dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0); + if (dev == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for new dev.\n"); + return -1; + } + + for (i = 0; i < MAX_VHOST_DEVICE; i++) { + if (vhost_devices[i] == NULL) + break; + } + if (i == MAX_VHOST_DEVICE) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to find a free slot for new device.\n"); + return -1; + } + + vhost_devices[i] = dev; + dev->vid = i; + + return i; +} + +/* + * Invoked when there is the vhost-user connection is broken (when + * the virtio device is being detached). 
+ */ +void +vhost_destroy_device(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + notify_ops->destroy_device(vid); + } + + cleanup_device(dev, 1); + free_device(dev); + + vhost_devices[vid] = NULL; +} + +void +vhost_set_ifname(int vid, const char *if_name, unsigned int if_len) +{ + struct virtio_net *dev; + unsigned int len; + + dev = get_device(vid); + if (dev == NULL) + return; + + len = if_len > sizeof(dev->ifname) ? + sizeof(dev->ifname) : if_len; + + strncpy(dev->ifname, if_name, len); + dev->ifname[sizeof(dev->ifname) - 1] = '\0'; +} + +void +vhost_enable_dequeue_zero_copy(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + dev->dequeue_zero_copy = 1; +} + +int +rte_vhost_get_numa_node(int vid) +{ +#ifdef RTE_LIBRTE_VHOST_NUMA + struct virtio_net *dev = get_device(vid); + int numa_node; + int ret; + + if (dev == NULL) + return -1; + + ret = get_mempolicy(&numa_node, NULL, 0, dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to query numa node: %d\n", vid, ret); + return -1; + } + + return numa_node; +#else + RTE_SET_USED(vid); + return -1; +#endif +} + +uint32_t +rte_vhost_get_queue_num(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return 0; + + return dev->virt_qp_nb; +} + +int +rte_vhost_get_ifname(int vid, char *buf, size_t len) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + len = RTE_MIN(len, sizeof(dev->ifname)); + + strncpy(buf, dev->ifname, len); + buf[len - 1] = '\0'; + + return 0; +} + +uint16_t +rte_vhost_avail_entries(int vid, uint16_t queue_id) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return 0; + + vq = dev->virtqueue[queue_id]; + if (!vq->enabled) + return 0; + + return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx; +} + +int +rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + if (enable) { + RTE_LOG(ERR, VHOST_CONFIG, + "guest notification isn't supported.\n"); + return -1; + } + + dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY; + return 0; +} + +uint64_t rte_vhost_feature_get(void) +{ + return VHOST_FEATURES; +} + +int rte_vhost_feature_disable(uint64_t feature_mask) +{ + VHOST_FEATURES = VHOST_FEATURES & ~feature_mask; + return 0; +} + +int rte_vhost_feature_enable(uint64_t feature_mask) +{ + if ((feature_mask & VHOST_SUPPORTED_FEATURES) == feature_mask) { + VHOST_FEATURES = VHOST_FEATURES | feature_mask; + return 0; + } + return -1; +} + +/* + * Register ops so that we can add/remove device to data core. + */ +int +rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const ops) +{ + notify_ops = ops; + + return 0; +} diff --git a/lib/vhost/rte_vhost/vhost.h b/lib/vhost/rte_vhost/vhost.h new file mode 100644 index 000000000..0c297d4bf --- /dev/null +++ b/lib/vhost/rte_vhost/vhost.h @@ -0,0 +1,294 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. 
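[Editor's illustrative aside: rte_vhost_feature_disable() and rte_vhost_feature_enable() above adjust the global VHOST_FEATURES mask that is offered during negotiation, so a backend masks out anything it cannot support before any guest connects. The function name and the exact feature bits below are assumptions about one hypothetical backend, not part of this patch.]

/*
 * Sketch only: drop mergeable RX buffers and host TSO from the offered
 * feature set ahead of the first connection.
 */
static void
restrict_supported_features(void)
{
	rte_vhost_feature_disable((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
				  (1ULL << VIRTIO_NET_F_HOST_TSO4) |
				  (1ULL << VIRTIO_NET_F_HOST_TSO6));
}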
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_NET_CDEV_H_ +#define _VHOST_NET_CDEV_H_ +#include +#include +#include +#include +#include +#include + +#include + +#include "rte_virtio_net.h" + +/* Used to indicate that the device is running on a data core */ +#define VIRTIO_DEV_RUNNING 1 + +/* Backend value set by guest. */ +#define VIRTIO_DEV_STOPPED -1 + +#define BUF_VECTOR_MAX 256 + +/** + * Structure contains buffer address, length and descriptor index + * from vring to do scatter RX. + */ +struct buf_vector { + uint64_t buf_addr; + uint32_t buf_len; + uint32_t desc_idx; +}; + +/* + * A structure to hold some fields needed in zero copy code path, + * mainly for associating an mbuf with the right desc_idx. + */ +struct zcopy_mbuf { + struct rte_mbuf *mbuf; + uint32_t desc_idx; + uint16_t in_use; + + TAILQ_ENTRY(zcopy_mbuf) next; +}; +TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf); + +/** + * Structure contains variables relevant to RX/TX virtqueues. + */ +struct vhost_virtqueue { + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint32_t size; + + uint16_t last_avail_idx; + uint16_t last_used_idx; +#define VIRTIO_INVALID_EVENTFD (-1) +#define VIRTIO_UNINITIALIZED_EVENTFD (-2) + + /* Backend value to determine if device should started/stopped */ + int backend; + /* Used to notify the guest (trigger interrupt) */ + int callfd; + /* Currently unused as polling mode is enabled */ + int kickfd; + int enabled; + + /* Physical address of used ring, for logging */ + uint64_t log_guest_addr; + + uint16_t nr_zmbuf; + uint16_t zmbuf_size; + uint16_t last_zmbuf_idx; + struct zcopy_mbuf *zmbufs; + struct zcopy_mbuf_list zmbuf_list; + + struct vring_used_elem *shadow_used_ring; + uint16_t shadow_used_idx; +} __rte_cache_aligned; + +/* Old kernels have no such macro defined */ +#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE + #define VIRTIO_NET_F_GUEST_ANNOUNCE 21 +#endif + + +/* + * Make an extra wrapper for VIRTIO_NET_F_MQ and + * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX as they are + * introduced since kernel v3.8. 
This makes our + * code buildable for older kernel. + */ +#ifdef VIRTIO_NET_F_MQ + #define VHOST_MAX_QUEUE_PAIRS VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX + #define VHOST_SUPPORTS_MQ (1ULL << VIRTIO_NET_F_MQ) +#else + #define VHOST_MAX_QUEUE_PAIRS 1 + #define VHOST_SUPPORTS_MQ 0 +#endif + +/* + * Define virtio 1.0 for older kernels + */ +#ifndef VIRTIO_F_VERSION_1 + #define VIRTIO_F_VERSION_1 32 +#endif + +struct guest_page { + uint64_t guest_phys_addr; + uint64_t host_phys_addr; + uint64_t size; +}; + +/** + * Device structure contains all configuration information relating + * to the device. + */ +struct virtio_net { + /* Frontend (QEMU) memory and memory region information */ + struct virtio_memory *mem; + uint64_t features; + uint64_t protocol_features; + int vid; + uint32_t flags; + uint16_t vhost_hlen; + /* to tell if we need broadcast rarp packet */ + rte_atomic16_t broadcast_rarp; + uint32_t virt_qp_nb; + uint32_t num_queues; + int dequeue_zero_copy; + struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; +#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ) + char ifname[IF_NAME_SZ]; + uint64_t log_size; + uint64_t log_base; + uint64_t log_addr; + struct ether_addr mac; + + uint32_t nr_guest_pages; + uint32_t max_guest_pages; + struct guest_page *guest_pages; +} __rte_cache_aligned; + +/** + * Information relating to memory regions including offsets to + * addresses in QEMUs memory file. + */ +struct virtio_memory_region { + uint64_t guest_phys_addr; + uint64_t guest_user_addr; + uint64_t host_user_addr; + uint64_t size; + void *mmap_addr; + uint64_t mmap_size; + int fd; +}; + + +/** + * Memory structure includes region and mapping information. + */ +struct virtio_memory { + uint32_t nregions; + struct virtio_memory_region regions[0]; +}; + + +/* Macros for printing using RTE_LOG */ +#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 +#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 + +#ifdef RTE_LIBRTE_VHOST_DEBUG +#define VHOST_MAX_PRINT_BUFF 6072 +#define LOG_LEVEL RTE_LOG_DEBUG +#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args) +#define PRINT_PACKET(device, addr, size, header) do { \ + char *pkt_addr = (char *)(addr); \ + unsigned int index; \ + char packet[VHOST_MAX_PRINT_BUFF]; \ + \ + if ((header)) \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \ + else \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \ + for (index = 0; index < (size); index++) { \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \ + "%02hhx ", pkt_addr[index]); \ + } \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \ + \ + LOG_DEBUG(VHOST_DATA, "%s", packet); \ +} while (0) +#else +#define LOG_LEVEL RTE_LOG_INFO +#define LOG_DEBUG(log_type, fmt, args...) 
do {} while (0) +#define PRINT_PACKET(device, addr, size, header) do {} while (0) +#endif + +extern uint64_t VHOST_FEATURES; +#define MAX_VHOST_DEVICE 1024 +extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; + +/* Convert guest physical Address to host virtual address */ +static inline uint64_t __attribute__((always_inline)) +gpa_to_vva(struct virtio_net *dev, uint64_t gpa) +{ + struct virtio_memory_region *reg; + uint32_t i; + + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + if (gpa >= reg->guest_phys_addr && + gpa < reg->guest_phys_addr + reg->size) { + return gpa - reg->guest_phys_addr + + reg->host_user_addr; + } + } + + return 0; +} + +/* Convert guest physical address to host physical address */ +static inline phys_addr_t __attribute__((always_inline)) +gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size) +{ + uint32_t i; + struct guest_page *page; + + for (i = 0; i < dev->nr_guest_pages; i++) { + page = &dev->guest_pages[i]; + + if (gpa >= page->guest_phys_addr && + gpa + size < page->guest_phys_addr + page->size) { + return gpa - page->guest_phys_addr + + page->host_phys_addr; + } + } + + return 0; +} + +extern struct virtio_net_device_ops const *notify_ops; +struct virtio_net *get_device(int vid); + +int vhost_new_device(void); +void cleanup_device(struct virtio_net *dev, int destroy); +void reset_device(struct virtio_net *dev); +void vhost_destroy_device(int); + +int alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx); + +void vhost_set_ifname(int, const char *if_name, unsigned int if_len); +void vhost_enable_dequeue_zero_copy(int vid); + +/* + * Backend-specific cleanup. + * + * TODO: fix it; we have one backend now + */ +void vhost_backend_cleanup(struct virtio_net *dev); + +#endif /* _VHOST_NET_CDEV_H_ */ diff --git a/lib/vhost/rte_vhost/vhost_user.c b/lib/vhost/rte_vhost/vhost_user.c new file mode 100644 index 000000000..7693af71d --- /dev/null +++ b/lib/vhost/rte_vhost/vhost_user.c @@ -0,0 +1,1042 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
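[Editor's illustrative aside: both translation helpers above return 0 when an address is not covered by any registered region, so callers must treat 0 as a failed lookup rather than a valid mapping. The helper name desc_to_host_ptr below is an assumption for illustration; the pattern itself is the one used throughout virtio_net.c later in this patch.]

/*
 * Sketch only: translate one descriptor's guest physical buffer address
 * and reject a zero result before dereferencing it.
 */
static inline void *
desc_to_host_ptr(struct virtio_net *dev, const struct vring_desc *desc)
{
	uint64_t vva = gpa_to_vva(dev, desc->addr);

	if (vva == 0)
		return NULL;	/* not backed by any guest memory region */

	return (void *)(uintptr_t)vva;
}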
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef RTE_LIBRTE_VHOST_NUMA +#include +#endif + +#include +#include +#include + +#include "vhost.h" +#include "vhost_user.h" + +static const char *vhost_message_str[VHOST_USER_MAX] = { + [VHOST_USER_NONE] = "VHOST_USER_NONE", + [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", + [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", + [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", + [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", + [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", + [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", + [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", + [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", + [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", + [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", + [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", + [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", + [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", + [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", + [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", + [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", + [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", + [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", + [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", +}; + +static uint64_t +get_blk_size(int fd) +{ + struct stat stat; + int ret; + + ret = fstat(fd, &stat); + return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; +} + +static void +free_mem_region(struct virtio_net *dev) +{ + uint32_t i; + struct virtio_memory_region *reg; + + if (!dev || !dev->mem) + return; + + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + if (reg->host_user_addr) { + munmap(reg->mmap_addr, reg->mmap_size); + close(reg->fd); + } + } +} + +void +vhost_backend_cleanup(struct virtio_net *dev) +{ + if (dev->mem) { + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + } + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + dev->log_addr = 0; + } +} + +/* + * This function just returns success at the moment unless + * the device hasn't been initialised. + */ +static int +vhost_user_set_owner(void) +{ + return 0; +} + +static int +vhost_user_reset_owner(struct virtio_net *dev) +{ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + notify_ops->destroy_device(dev->vid); + } + + cleanup_device(dev, 0); + reset_device(dev); + return 0; +} + +/* + * The features that we support are requested. + */ +static uint64_t +vhost_user_get_features(void) +{ + return VHOST_FEATURES; +} + +/* + * We receive the negotiated features supported by us and the virtio device. 
+ */ +static int +vhost_user_set_features(struct virtio_net *dev, uint64_t features) +{ + if (features & ~VHOST_FEATURES) + return -1; + + dev->features = features; + if (dev->features & + ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) { + dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } else { + dev->vhost_hlen = sizeof(struct virtio_net_hdr); + } + LOG_DEBUG(VHOST_CONFIG, + "(%d) mergeable RX buffers %s, virtio 1 %s\n", + dev->vid, + (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off", + (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off"); + + return 0; +} + +/* + * The virtio device sends us the size of the descriptor ring. + */ +static int +vhost_user_set_vring_num(struct virtio_net *dev, + VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; + + vq->size = msg->payload.state.num; + + if (dev->dequeue_zero_copy) { + vq->nr_zmbuf = 0; + vq->last_zmbuf_idx = 0; + vq->zmbuf_size = vq->size; + vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size * + sizeof(struct zcopy_mbuf), 0); + if (vq->zmbufs == NULL) { + RTE_LOG(WARNING, VHOST_CONFIG, + "failed to allocate mem for zero copy; " + "zero copy is force disabled\n"); + dev->dequeue_zero_copy = 0; + } + } + + vq->shadow_used_ring = rte_malloc(NULL, + vq->size * sizeof(struct vring_used_elem), + RTE_CACHE_LINE_SIZE); + if (!vq->shadow_used_ring) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for shadow used ring.\n"); + return -1; + } + + return 0; +} + +/* + * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the + * same numa node as the memory of vring descriptor. + */ +#ifdef RTE_LIBRTE_VHOST_NUMA +static struct virtio_net* +numa_realloc(struct virtio_net *dev, int index) +{ + int oldnode, newnode; + struct virtio_net *old_dev; + struct vhost_virtqueue *old_vq, *vq; + int ret; + + /* + * vq is allocated on pairs, we should try to do realloc + * on first queue of one queue pair only. + */ + if (index % VIRTIO_QNUM != 0) + return dev; + + old_dev = dev; + vq = old_vq = dev->virtqueue[index]; + + ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc, + MPOL_F_NODE | MPOL_F_ADDR); + + /* check if we need to reallocate vq */ + ret |= get_mempolicy(&oldnode, NULL, 0, old_vq, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "Unable to get vq numa information.\n"); + return dev; + } + if (oldnode != newnode) { + RTE_LOG(INFO, VHOST_CONFIG, + "reallocate vq from %d to %d node\n", oldnode, newnode); + vq = rte_malloc_socket(NULL, sizeof(*vq) * VIRTIO_QNUM, 0, + newnode); + if (!vq) + return dev; + + memcpy(vq, old_vq, sizeof(*vq) * VIRTIO_QNUM); + rte_free(old_vq); + } + + /* check if we need to reallocate dev */ + ret = get_mempolicy(&oldnode, NULL, 0, old_dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "Unable to get dev numa information.\n"); + goto out; + } + if (oldnode != newnode) { + RTE_LOG(INFO, VHOST_CONFIG, + "reallocate dev from %d to %d node\n", + oldnode, newnode); + dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode); + if (!dev) { + dev = old_dev; + goto out; + } + + memcpy(dev, old_dev, sizeof(*dev)); + rte_free(old_dev); + } + +out: + dev->virtqueue[index] = vq; + dev->virtqueue[index + 1] = vq + 1; + vhost_devices[dev->vid] = dev; + + return dev; +} +#else +static struct virtio_net* +numa_realloc(struct virtio_net *dev, int index __rte_unused) +{ + return dev; +} +#endif + +/* + * Converts QEMU virtual address to Vhost virtual address. 
This function is + * used to convert the ring addresses to our address space. + */ +static uint64_t +qva_to_vva(struct virtio_net *dev, uint64_t qva) +{ + struct virtio_memory_region *reg; + uint32_t i; + + /* Find the region where the address lives. */ + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + + if (qva >= reg->guest_user_addr && + qva < reg->guest_user_addr + reg->size) { + return qva - reg->guest_user_addr + + reg->host_user_addr; + } + } + + return 0; +} + +/* + * The virtio device sends us the desc, used and avail ring addresses. + * This function then converts these to our address space. + */ +static int +vhost_user_set_vring_addr(struct virtio_net *dev, VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq; + + if (dev->mem == NULL) + return -1; + + /* addr->index refers to the queue index. The txq 1, rxq is 0. */ + vq = dev->virtqueue[msg->payload.addr.index]; + + /* The addresses are converted from QEMU virtual to Vhost virtual. */ + vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev, + msg->payload.addr.desc_user_addr); + if (vq->desc == 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find desc ring address.\n", + dev->vid); + return -1; + } + + dev = numa_realloc(dev, msg->payload.addr.index); + vq = dev->virtqueue[msg->payload.addr.index]; + + vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev, + msg->payload.addr.avail_user_addr); + if (vq->avail == 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find avail ring address.\n", + dev->vid); + return -1; + } + + vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev, + msg->payload.addr.used_user_addr); + if (vq->used == 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find used ring address.\n", + dev->vid); + return -1; + } + + if (vq->last_used_idx != vq->used->idx) { + RTE_LOG(WARNING, VHOST_CONFIG, + "last_used_idx (%u) and vq->used->idx (%u) mismatches; " + "some packets maybe resent for Tx and dropped for Rx\n", + vq->last_used_idx, vq->used->idx); + vq->last_used_idx = vq->used->idx; + vq->last_avail_idx = vq->used->idx; + } + + vq->log_guest_addr = msg->payload.addr.log_guest_addr; + + LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n", + dev->vid, vq->desc); + LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n", + dev->vid, vq->avail); + LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n", + dev->vid, vq->used); + LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n", + dev->vid, vq->log_guest_addr); + + return 0; +} + +/* + * The virtio device sends us the available ring last used index. 
+ */ +static int +vhost_user_set_vring_base(struct virtio_net *dev, + VhostUserMsg *msg) +{ + dev->virtqueue[msg->payload.state.index]->last_used_idx = msg->payload.state.num; + dev->virtqueue[msg->payload.state.index]->last_avail_idx = msg->payload.state.num; + + return 0; +} + +static void +add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, + uint64_t host_phys_addr, uint64_t size) +{ + struct guest_page *page, *last_page; + + if (dev->nr_guest_pages == dev->max_guest_pages && + dev->nr_guest_pages > 0) { + dev->max_guest_pages *= 2; + dev->guest_pages = realloc(dev->guest_pages, + dev->max_guest_pages * sizeof(*page)); + } + + if (dev->nr_guest_pages > 0) { + last_page = &dev->guest_pages[dev->nr_guest_pages - 1]; + /* merge if the two pages are continuous */ + if (host_phys_addr == last_page->host_phys_addr + + last_page->size) { + last_page->size += size; + return; + } + } + + page = &dev->guest_pages[dev->nr_guest_pages++]; + page->guest_phys_addr = guest_phys_addr; + page->host_phys_addr = host_phys_addr; + page->size = size; +} + +static void +add_guest_pages(struct virtio_net *dev, struct virtio_memory_region *reg, + uint64_t page_size) +{ + uint64_t reg_size = reg->size; + uint64_t host_user_addr = reg->host_user_addr; + uint64_t guest_phys_addr = reg->guest_phys_addr; + uint64_t host_phys_addr; + uint64_t size; + + host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr); + size = page_size - (guest_phys_addr & (page_size - 1)); + size = RTE_MIN(size, reg_size); + + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); + host_user_addr += size; + guest_phys_addr += size; + reg_size -= size; + + while (reg_size > 0) { + host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t) + host_user_addr); + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, + page_size); + + host_user_addr += page_size; + guest_phys_addr += page_size; + reg_size -= page_size; + } +} + +#ifdef RTE_LIBRTE_VHOST_DEBUG +/* TODO: enable it only in debug mode? */ +static void +dump_guest_pages(struct virtio_net *dev) +{ + uint32_t i; + struct guest_page *page; + + for (i = 0; i < dev->nr_guest_pages; i++) { + page = &dev->guest_pages[i]; + + RTE_LOG(INFO, VHOST_CONFIG, + "guest physical page region %u\n" + "\t guest_phys_addr: %" PRIx64 "\n" + "\t host_phys_addr : %" PRIx64 "\n" + "\t size : %" PRIx64 "\n", + i, + page->guest_phys_addr, + page->host_phys_addr, + page->size); + } +} +#else +#define dump_guest_pages(dev) +#endif + +static int +vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct VhostUserMemory memory = pmsg->payload.memory; + struct virtio_memory_region *reg; + void *mmap_addr; + uint64_t mmap_size; + uint64_t mmap_offset; + uint64_t alignment; + uint32_t i; + int fd; + + /* Remove from the data plane. 
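[Editor's illustrative aside: the guest_pages table filled in by add_one_guest_page()/add_guest_pages() above is what backs gpa_to_hpa() from vhost.h; contiguous host-physical ranges are merged so that a single lookup can cover a whole buffer. The wrapper below, including its name translate_for_dma, is an assumed usage sketch of how a storage backend might obtain a DMA-capable address, not code from this patch.]

/*
 * Sketch only: gpa_to_hpa() returns 0 when the range is unmapped or spans
 * more than one tracked page region, so treat 0 as failure.
 */
static int
translate_for_dma(struct virtio_net *dev, uint64_t gpa, uint64_t len,
		  uint64_t *dma_addr)
{
	uint64_t hpa = gpa_to_hpa(dev, gpa, len);

	if (hpa == 0)
		return -1;

	*dma_addr = hpa;
	return 0;
}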
*/ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + notify_ops->destroy_device(dev->vid); + } + + if (dev->mem) { + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + } + + dev->nr_guest_pages = 0; + if (!dev->guest_pages) { + dev->max_guest_pages = 8; + dev->guest_pages = malloc(dev->max_guest_pages * + sizeof(struct guest_page)); + } + + dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct virtio_memory) + + sizeof(struct virtio_memory_region) * memory.nregions, 0); + if (dev->mem == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to allocate memory for dev->mem\n", + dev->vid); + return -1; + } + dev->mem->nregions = memory.nregions; + + for (i = 0; i < memory.nregions; i++) { + fd = pmsg->fds[i]; + reg = &dev->mem->regions[i]; + + reg->guest_phys_addr = memory.regions[i].guest_phys_addr; + reg->guest_user_addr = memory.regions[i].userspace_addr; + reg->size = memory.regions[i].memory_size; + reg->fd = fd; + + mmap_offset = memory.regions[i].mmap_offset; + mmap_size = reg->size + mmap_offset; + + /* mmap() without flag of MAP_ANONYMOUS, should be called + * with length argument aligned with hugepagesz at older + * longterm version Linux, like 2.6.32 and 3.2.72, or + * mmap() will fail with EINVAL. + * + * to avoid failure, make sure in caller to keep length + * aligned. + */ + alignment = get_blk_size(fd); + if (alignment == (uint64_t)-1) { + RTE_LOG(ERR, VHOST_CONFIG, + "couldn't get hugepage size through fstat\n"); + goto err_mmap; + } + mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); + + mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + + if (mmap_addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, + "mmap region %u failed.\n", i); + goto err_mmap; + } + + reg->mmap_addr = mmap_addr; + reg->mmap_size = mmap_size; + reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + + mmap_offset; + + add_guest_pages(dev, reg, alignment); + + RTE_LOG(INFO, VHOST_CONFIG, + "guest memory region %u, size: 0x%" PRIx64 "\n" + "\t guest physical addr: 0x%" PRIx64 "\n" + "\t guest virtual addr: 0x%" PRIx64 "\n" + "\t host virtual addr: 0x%" PRIx64 "\n" + "\t mmap addr : 0x%" PRIx64 "\n" + "\t mmap size : 0x%" PRIx64 "\n" + "\t mmap align: 0x%" PRIx64 "\n" + "\t mmap off : 0x%" PRIx64 "\n", + i, reg->size, + reg->guest_phys_addr, + reg->guest_user_addr, + reg->host_user_addr, + (uint64_t)(uintptr_t)mmap_addr, + mmap_size, + alignment, + mmap_offset); + } + + dump_guest_pages(dev); + + return 0; + +err_mmap: + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + return -1; +} + +static int +vq_is_ready(struct vhost_virtqueue *vq) +{ + return vq && vq->desc && + vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && + vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD; +} + +static int +virtio_is_ready(struct virtio_net *dev) +{ + struct vhost_virtqueue *vq; + uint32_t i; + + for (i = 0; i < dev->num_queues; i++) { + vq = dev->virtqueue[i]; + + if (!vq_is_ready(vq)) { + RTE_LOG(INFO, VHOST_CONFIG, + "virtio is not ready for processing.\n"); + return 0; + } + } + + RTE_LOG(INFO, VHOST_CONFIG, + "virtio is now ready for processing.\n"); + return 1; +} + +static void +vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + struct vhost_virtqueue *vq; + uint32_t cur_qp_idx; + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.fd = VIRTIO_INVALID_EVENTFD; + else + file.fd = pmsg->fds[0]; + 
RTE_LOG(INFO, VHOST_CONFIG, + "vring call idx:%d file:%d\n", file.index, file.fd); + + if (file.index + 1 > dev->num_queues) { + dev->num_queues = file.index + 1; + } + + /* + * FIXME: VHOST_SET_VRING_CALL is the first per-vring message + * we get, so we do vring queue pair allocation here. + */ + cur_qp_idx = file.index / VIRTIO_QNUM; + if (cur_qp_idx + 1 > dev->virt_qp_nb) { + if (alloc_vring_queue_pair(dev, cur_qp_idx) < 0) + return; + } + + vq = dev->virtqueue[file.index]; + assert(vq != NULL); + + if (vq->callfd >= 0) + close(vq->callfd); + + vq->callfd = file.fd; + + if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) { + notify_ops->new_device(dev->vid); + } +} + +/* + * In vhost-user, when we receive kick message, will test whether virtio + * device is ready for packet processing. + */ +static void +vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + struct vhost_virtqueue *vq; + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.fd = VIRTIO_INVALID_EVENTFD; + else + file.fd = pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring kick idx:%d file:%d\n", file.index, file.fd); + + vq = dev->virtqueue[file.index]; + if (vq->kickfd >= 0) + close(vq->kickfd); + vq->kickfd = file.fd; + + if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) { + if (dev->dequeue_zero_copy) { + RTE_LOG(INFO, VHOST_CONFIG, + "dequeue zero copy is enabled\n"); + } + + if (notify_ops->new_device(dev->vid) == 0) + dev->flags |= VIRTIO_DEV_RUNNING; + } +} + +static void +free_zmbufs(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *zmbuf, *next; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + rte_pktmbuf_free(zmbuf->mbuf); + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + } + + rte_free(vq->zmbufs); +} + +/* + * when virtio is stopped, qemu will send us the GET_VRING_BASE message. + */ +static int +vhost_user_get_vring_base(struct virtio_net *dev, + VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; + + /* We have to stop the queue (virtio) if it is running. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + notify_ops->destroy_device(dev->vid); + } + + /* Here we are safe to get the last used index */ + msg->payload.state.num = vq->last_used_idx; + + RTE_LOG(INFO, VHOST_CONFIG, + "vring base idx:%d file:%d\n", msg->payload.state.index, msg->payload.state.num); + /* + * Based on current qemu vhost-user implementation, this message is + * sent and only sent in vhost_vring_stop. + * TODO: cleanup the vring, it isn't usable since here. + */ + if (vq->kickfd >= 0) + close(vq->kickfd); + + vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; + + if (dev->dequeue_zero_copy) + free_zmbufs(vq); + rte_free(vq->shadow_used_ring); + vq->shadow_used_ring = NULL; + + return 0; +} + +/* + * when virtio queues are ready to work, qemu will send us to + * enable the virtio queue pair. 
+ */ +static int +vhost_user_set_vring_enable(struct virtio_net *dev, + VhostUserMsg *msg) +{ + int enable = (int)msg->payload.state.num; + + RTE_LOG(INFO, VHOST_CONFIG, + "set queue enable: %d to qp idx: %d\n", + enable, msg->payload.state.index); + + if (notify_ops->vring_state_changed) + notify_ops->vring_state_changed(dev->vid, msg->payload.state.index, enable); + + dev->virtqueue[msg->payload.state.index]->enabled = enable; + + return 0; +} + +static void +vhost_user_set_protocol_features(struct virtio_net *dev, + uint64_t protocol_features) +{ + if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES) + return; + + dev->protocol_features = protocol_features; +} + +static int +vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + int fd = msg->fds[0]; + uint64_t size, off; + void *addr; + + if (fd < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd); + return -1; + } + + if (msg->size != sizeof(VhostUserLog)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid log base msg size: %"PRId32" != %d\n", + msg->size, (int)sizeof(VhostUserLog)); + return -1; + } + + size = msg->payload.log.mmap_size; + off = msg->payload.log.mmap_offset; + RTE_LOG(INFO, VHOST_CONFIG, + "log mmap size: %"PRId64", offset: %"PRId64"\n", + size, off); + + /* + * mmap from 0 to workaround a hugepage mmap bug: mmap will + * fail when offset is not page size aligned. + */ + addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n"); + return -1; + } + + /* + * Free previously mapped log memory on occasionally + * multiple VHOST_USER_SET_LOG_BASE. + */ + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + } + dev->log_addr = (uint64_t)(uintptr_t)addr; + dev->log_base = dev->log_addr + off; + dev->log_size = size; + + return 0; +} + +/* + * An rarp packet is constructed and broadcasted to notify switches about + * the new location of the migrated VM, so that packets from outside will + * not be lost after migration. + * + * However, we don't actually "send" a rarp packet here, instead, we set + * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. + */ +static int +vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + uint8_t *mac = (uint8_t *)&msg->payload.u64; + + RTE_LOG(DEBUG, VHOST_CONFIG, + ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); + memcpy(dev->mac.addr_bytes, mac, 6); + + /* + * Set the flag to inject a RARP broadcast packet at + * rte_vhost_dequeue_burst(). + * + * rte_smp_wmb() is for making sure the mac is copied + * before the flag is set. + */ + rte_smp_wmb(); + rte_atomic16_set(&dev->broadcast_rarp, 1); + + return 0; +} + +/* return bytes# of read on success or negative val on failure. 
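[Editor's illustrative aside: the log area mapped by vhost_user_set_log_base() above is a bitmap over guest memory in which one bit covers one 4 KiB page; vhost_log_page() in virtio_net.c later in this patch performs exactly this indexing. The helper name log_mark_dirty below is an assumption for illustration.]

/*
 * Sketch of the dirty-log indexing, assuming the 4 KiB VHOST_LOG_PAGE used
 * by virtio_net.c: bit N of the bitmap covers guest physical page N.
 */
static void
log_mark_dirty(uint8_t *log_base, uint64_t guest_phys_addr)
{
	uint64_t page = guest_phys_addr / 4096;

	log_base[page / 8] |= (uint8_t)(1 << (page % 8));
}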
*/ +static int +read_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, + msg->fds, VHOST_MEMORY_MAX_NREGIONS); + if (ret <= 0) + return ret; + + if (msg && msg->size) { + if (msg->size > sizeof(msg->payload)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid msg size: %d\n", msg->size); + return -1; + } + ret = read(sockfd, &msg->payload, msg->size); + if (ret <= 0) + return ret; + if (ret != (int)msg->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "read control message failed\n"); + return -1; + } + } + + return ret; +} + +static int +send_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + if (!msg) + return 0; + + msg->flags &= ~VHOST_USER_VERSION_MASK; + msg->flags |= VHOST_USER_VERSION; + msg->flags |= VHOST_USER_REPLY_MASK; + + ret = send_fd_message(sockfd, (char *)msg, + VHOST_USER_HDR_SIZE + msg->size, NULL, 0); + + return ret; +} + +int +vhost_user_msg_handler(int vid, int fd) +{ + struct virtio_net *dev; + struct VhostUserMsg msg; + int ret; + + dev = get_device(vid); + if (dev == NULL) + return -1; + + ret = read_vhost_message(fd, &msg); + if (ret <= 0 || msg.request >= VHOST_USER_MAX) { + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, + "vhost read message failed\n"); + else if (ret == 0) + RTE_LOG(INFO, VHOST_CONFIG, + "vhost peer closed\n"); + else + RTE_LOG(ERR, VHOST_CONFIG, + "vhost read incorrect message\n"); + + return -1; + } + + RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n", + vhost_message_str[msg.request]); + switch (msg.request) { + case VHOST_USER_GET_FEATURES: + msg.payload.u64 = vhost_user_get_features(); + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_FEATURES: + vhost_user_set_features(dev, msg.payload.u64); + break; + + case VHOST_USER_GET_PROTOCOL_FEATURES: + msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_PROTOCOL_FEATURES: + vhost_user_set_protocol_features(dev, msg.payload.u64); + break; + + case VHOST_USER_SET_OWNER: + vhost_user_set_owner(); + break; + case VHOST_USER_RESET_OWNER: + vhost_user_reset_owner(dev); + break; + + case VHOST_USER_SET_MEM_TABLE: + vhost_user_set_mem_table(dev, &msg); + break; + + case VHOST_USER_SET_LOG_BASE: + vhost_user_set_log_base(dev, &msg); + + /* it needs a reply */ + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_LOG_FD: + close(msg.fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); + break; + + case VHOST_USER_SET_VRING_NUM: + vhost_user_set_vring_num(dev, &msg); + break; + case VHOST_USER_SET_VRING_ADDR: + vhost_user_set_vring_addr(dev, &msg); + break; + case VHOST_USER_SET_VRING_BASE: + vhost_user_set_vring_base(dev, &msg); + break; + + case VHOST_USER_GET_VRING_BASE: + vhost_user_get_vring_base(dev, &msg); + msg.size = sizeof(msg.payload.state); + send_vhost_message(fd, &msg); + break; + + case VHOST_USER_SET_VRING_KICK: + vhost_user_set_vring_kick(dev, &msg); + break; + case VHOST_USER_SET_VRING_CALL: + vhost_user_set_vring_call(dev, &msg); + break; + + case VHOST_USER_SET_VRING_ERR: + if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) + close(msg.fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); + break; + + case VHOST_USER_GET_QUEUE_NUM: + msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + + case VHOST_USER_SET_VRING_ENABLE: + 
vhost_user_set_vring_enable(dev, &msg); + break; + case VHOST_USER_SEND_RARP: + vhost_user_send_rarp(dev, &msg); + break; + + default: + break; + + } + + return 0; +} diff --git a/lib/vhost/rte_vhost/vhost_user.h b/lib/vhost/rte_vhost/vhost_user.h new file mode 100644 index 000000000..ba78d3268 --- /dev/null +++ b/lib/vhost/rte_vhost/vhost_user.h @@ -0,0 +1,128 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _VHOST_NET_USER_H +#define _VHOST_NET_USER_H + +#include +#include + +#include "rte_virtio_net.h" + +/* refer to hw/virtio/vhost-user.c */ + +#define VHOST_MEMORY_MAX_NREGIONS 8 + +#define VHOST_USER_PROTOCOL_F_MQ 0 +#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 +#define VHOST_USER_PROTOCOL_F_RARP 2 + +#define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ + (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\ + (1ULL << VHOST_USER_PROTOCOL_F_RARP)) + +typedef enum VhostUserRequest { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_MAX +} VhostUserRequest; + +typedef struct VhostUserMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemoryRegion; + +typedef struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; +} VhostUserMemory; + +typedef struct VhostUserLog { + uint64_t mmap_size; + uint64_t mmap_offset; +} VhostUserLog; + +typedef struct VhostUserMsg { + VhostUserRequest request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) + uint32_t flags; + uint32_t size; /* the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + VhostUserLog log; + } payload; + int fds[VHOST_MEMORY_MAX_NREGIONS]; +} __attribute((packed)) VhostUserMsg; + +#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION 0x1 + + +/* vhost_user.c */ +int vhost_user_msg_handler(int vid, int fd); + +/* socket.c */ +int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); +int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); + +#endif diff --git a/lib/vhost/rte_vhost/virtio_net.c b/lib/vhost/rte_vhost/virtio_net.c new file mode 100644 index 000000000..e0df0b972 --- /dev/null +++ b/lib/vhost/rte_vhost/virtio_net.c @@ -0,0 +1,1186 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
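[Editor's illustrative aside: each VhostUserRequest above selects one member of the payload union. read_vhost_message() only bounds msg.size by the size of the whole union; the sketch below shows the per-request payload each message implies, which a stricter consumer could validate against. The function name expected_payload_size is an assumption, not part of this patch.]

/*
 * Sketch only: expected payload for the fixed-size vhost-user requests.
 */
static uint32_t
expected_payload_size(VhostUserRequest req)
{
	switch (req) {
	case VHOST_USER_SET_VRING_NUM:
	case VHOST_USER_SET_VRING_BASE:
	case VHOST_USER_GET_VRING_BASE:
	case VHOST_USER_SET_VRING_ENABLE:
		return sizeof(struct vhost_vring_state);
	case VHOST_USER_SET_VRING_ADDR:
		return sizeof(struct vhost_vring_addr);
	case VHOST_USER_SET_LOG_BASE:
		return sizeof(VhostUserLog);
	case VHOST_USER_SET_FEATURES:
	case VHOST_USER_SET_PROTOCOL_FEATURES:
	case VHOST_USER_SET_VRING_KICK:
	case VHOST_USER_SET_VRING_CALL:
	case VHOST_USER_SET_VRING_ERR:
		return sizeof(uint64_t);
	default:
		return 0;	/* variable-sized (SET_MEM_TABLE) or no payload */
	}
}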
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vhost.h" + +#define MAX_PKT_BURST 32 +#define VHOST_LOG_PAGE 4096 + +static inline void __attribute__((always_inline)) +vhost_log_page(uint8_t *log_base, uint64_t page) +{ + log_base[page / 8] |= 1 << (page % 8); +} + +static inline void __attribute__((always_inline)) +vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) +{ + uint64_t page; + + if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) || + !dev->log_base || !len)) + return; + + if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) + return; + + /* To make sure guest memory updates are committed before logging */ + rte_smp_wmb(); + + page = addr / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < addr + len) { + vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); + page += 1; + } +} + +static inline void __attribute__((always_inline)) +vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t offset, uint64_t len) +{ + vhost_log_write(dev, vq->log_guest_addr + offset, len); +} + +static bool +is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb) +{ + return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM; +} + +static inline void __attribute__((always_inline)) +do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint16_t to, uint16_t from, uint16_t size) +{ + rte_memcpy(&vq->used->ring[to], + &vq->shadow_used_ring[from], + size * sizeof(struct vring_used_elem)); + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, ring[to]), + size * sizeof(struct vring_used_elem)); +} + +static inline void __attribute__((always_inline)) +flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + uint16_t used_idx = vq->last_used_idx & (vq->size - 1); + + if (used_idx + vq->shadow_used_idx <= vq->size) { + do_flush_shadow_used_ring(dev, vq, used_idx, 0, + vq->shadow_used_idx); + } else { + uint16_t size; + + /* update used ring interval [used_idx, vq->size] */ + size = vq->size - used_idx; + do_flush_shadow_used_ring(dev, vq, used_idx, 0, size); + + /* update the left half used ring interval [0, left_size] */ + do_flush_shadow_used_ring(dev, vq, 0, size, + vq->shadow_used_idx - size); + } + vq->last_used_idx += vq->shadow_used_idx; + + rte_smp_wmb(); + + *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx; + vhost_log_used_vring(dev, vq, offsetof(struct vring_used, 
idx), + sizeof(vq->used->idx)); +} + +static inline void __attribute__((always_inline)) +update_shadow_used_ring(struct vhost_virtqueue *vq, + uint16_t desc_idx, uint16_t len) +{ + uint16_t i = vq->shadow_used_idx++; + + vq->shadow_used_ring[i].id = desc_idx; + vq->shadow_used_ring[i].len = len; +} + +static void +virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr) +{ + if (m_buf->ol_flags & PKT_TX_L4_MASK) { + net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len; + + switch (m_buf->ol_flags & PKT_TX_L4_MASK) { + case PKT_TX_TCP_CKSUM: + net_hdr->csum_offset = (offsetof(struct tcp_hdr, + cksum)); + break; + case PKT_TX_UDP_CKSUM: + net_hdr->csum_offset = (offsetof(struct udp_hdr, + dgram_cksum)); + break; + case PKT_TX_SCTP_CKSUM: + net_hdr->csum_offset = (offsetof(struct sctp_hdr, + cksum)); + break; + } + } + + if (m_buf->ol_flags & PKT_TX_TCP_SEG) { + if (m_buf->ol_flags & PKT_TX_IPV4) + net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + else + net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + net_hdr->gso_size = m_buf->tso_segsz; + net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len + + m_buf->l4_len; + } +} + +static inline void +copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr, + struct virtio_net_hdr_mrg_rxbuf hdr) +{ + if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf)) + *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr; + else + *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr; +} + +static inline int __attribute__((always_inline)) +copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs, + struct rte_mbuf *m, uint16_t desc_idx, uint32_t size) +{ + uint32_t desc_avail, desc_offset; + uint32_t mbuf_avail, mbuf_offset; + uint32_t cpy_len; + struct vring_desc *desc; + uint64_t desc_addr; + struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0}; + + desc = &descs[desc_idx]; + desc_addr = gpa_to_vva(dev, desc->addr); + /* + * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid + * performance issue with some versions of gcc (4.8.4 and 5.3.0) which + * otherwise stores offset on the stack instead of in a register. 
+ */ + if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr) + return -1; + + rte_prefetch0((void *)(uintptr_t)desc_addr); + + virtio_enqueue_offload(m, &virtio_hdr.hdr); + copy_virtio_net_hdr(dev, desc_addr, virtio_hdr); + vhost_log_write(dev, desc->addr, dev->vhost_hlen); + PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0); + + desc_offset = dev->vhost_hlen; + desc_avail = desc->len - dev->vhost_hlen; + + mbuf_avail = rte_pktmbuf_data_len(m); + mbuf_offset = 0; + while (mbuf_avail != 0 || m->next != NULL) { + /* done with current mbuf, fetch next */ + if (mbuf_avail == 0) { + m = m->next; + + mbuf_offset = 0; + mbuf_avail = rte_pktmbuf_data_len(m); + } + + /* done with current desc buf, fetch next */ + if (desc_avail == 0) { + if ((desc->flags & VRING_DESC_F_NEXT) == 0) { + /* Room in vring buffer is not enough */ + return -1; + } + if (unlikely(desc->next >= size)) + return -1; + + desc = &descs[desc->next]; + desc_addr = gpa_to_vva(dev, desc->addr); + if (unlikely(!desc_addr)) + return -1; + + desc_offset = 0; + desc_avail = desc->len; + } + + cpy_len = RTE_MIN(desc_avail, mbuf_avail); + rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)), + rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), + cpy_len); + vhost_log_write(dev, desc->addr + desc_offset, cpy_len); + PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), + cpy_len, 0); + + mbuf_avail -= cpy_len; + mbuf_offset += cpy_len; + desc_avail -= cpy_len; + desc_offset += cpy_len; + } + + return 0; +} + +/** + * This function adds buffers to the virtio devices RX virtqueue. Buffers can + * be received from the physical port or from another virtio device. A packet + * count is returned to indicate the number of packets that are succesfully + * added to the RX queue. This function works when the mbuf is scattered, but + * it doesn't support the mergeable feature. + */ +static inline uint32_t __attribute__((always_inline)) +virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t count) +{ + struct vhost_virtqueue *vq; + uint16_t avail_idx, free_entries, start_idx; + uint16_t desc_indexes[MAX_PKT_BURST]; + struct vring_desc *descs; + uint16_t used_idx; + uint32_t i, sz; + + LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); + if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) { + RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, queue_id); + return 0; + } + + vq = dev->virtqueue[queue_id]; + if (unlikely(vq->enabled == 0)) + return 0; + + avail_idx = *((volatile uint16_t *)&vq->avail->idx); + start_idx = vq->last_used_idx; + free_entries = avail_idx - start_idx; + count = RTE_MIN(count, free_entries); + count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST); + if (count == 0) + return 0; + + LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n", + dev->vid, start_idx, start_idx + count); + + /* Retrieve all of the desc indexes first to avoid caching issues. 
*/ + rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]); + for (i = 0; i < count; i++) { + used_idx = (start_idx + i) & (vq->size - 1); + desc_indexes[i] = vq->avail->ring[used_idx]; + vq->used->ring[used_idx].id = desc_indexes[i]; + vq->used->ring[used_idx].len = pkts[i]->pkt_len + + dev->vhost_hlen; + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, ring[used_idx]), + sizeof(vq->used->ring[used_idx])); + } + + rte_prefetch0(&vq->desc[desc_indexes[0]]); + for (i = 0; i < count; i++) { + uint16_t desc_idx = desc_indexes[i]; + int err; + + if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) { + descs = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev, + vq->desc[desc_idx].addr); + if (unlikely(!descs)) { + count = i; + break; + } + + desc_idx = 0; + sz = vq->desc[desc_idx].len / sizeof(*descs); + } else { + descs = vq->desc; + sz = vq->size; + } + + err = copy_mbuf_to_desc(dev, descs, pkts[i], desc_idx, sz); + if (unlikely(err)) { + used_idx = (start_idx + i) & (vq->size - 1); + vq->used->ring[used_idx].len = dev->vhost_hlen; + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, ring[used_idx]), + sizeof(vq->used->ring[used_idx])); + } + + if (i + 1 < count) + rte_prefetch0(&vq->desc[desc_indexes[i+1]]); + } + + rte_smp_wmb(); + + *(volatile uint16_t *)&vq->used->idx += count; + vq->last_used_idx += count; + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, idx), + sizeof(vq->used->idx)); + + /* flush used->idx update before we read avail->flags. */ + rte_mb(); + + /* Kick the guest if necessary. */ + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) + && (vq->callfd >= 0)) + eventfd_write(vq->callfd, (eventfd_t)1); + return count; +} + +static inline int __attribute__((always_inline)) +fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint32_t avail_idx, uint32_t *vec_idx, + struct buf_vector *buf_vec, uint16_t *desc_chain_head, + uint16_t *desc_chain_len) +{ + uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)]; + uint32_t vec_id = *vec_idx; + uint32_t len = 0; + struct vring_desc *descs = vq->desc; + + *desc_chain_head = idx; + + if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) { + descs = (struct vring_desc *)(uintptr_t) + gpa_to_vva(dev, vq->desc[idx].addr); + if (unlikely(!descs)) + return -1; + + idx = 0; + } + + while (1) { + if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size)) + return -1; + + len += descs[idx].len; + buf_vec[vec_id].buf_addr = descs[idx].addr; + buf_vec[vec_id].buf_len = descs[idx].len; + buf_vec[vec_id].desc_idx = idx; + vec_id++; + + if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0) + break; + + idx = descs[idx].next; + } + + *desc_chain_len = len; + *vec_idx = vec_id; + + return 0; +} + +/* + * Returns -1 on fail, 0 on success + */ +static inline int +reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint32_t size, struct buf_vector *buf_vec, + uint16_t *num_buffers, uint16_t avail_head) +{ + uint16_t cur_idx; + uint32_t vec_idx = 0; + uint16_t tries = 0; + + uint16_t head_idx = 0; + uint16_t len = 0; + + *num_buffers = 0; + cur_idx = vq->last_avail_idx; + + while (size > 0) { + if (unlikely(cur_idx == avail_head)) + return -1; + + if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec, + &head_idx, &len) < 0)) + return -1; + len = RTE_MIN(len, size); + update_shadow_used_ring(vq, head_idx, len); + size -= len; + + cur_idx++; + tries++; + *num_buffers += 1; + + /* + * if we tried all available ring items, and still + * can't get enough buf, it means 
something abnormal + * happened. + */ + if (unlikely(tries >= vq->size)) + return -1; + } + + return 0; +} + +static inline int __attribute__((always_inline)) +copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m, + struct buf_vector *buf_vec, uint16_t num_buffers) +{ + struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0}; + uint32_t vec_idx = 0; + uint64_t desc_addr; + uint32_t mbuf_offset, mbuf_avail; + uint32_t desc_offset, desc_avail; + uint32_t cpy_len; + uint64_t hdr_addr, hdr_phys_addr; + struct rte_mbuf *hdr_mbuf; + + if (unlikely(m == NULL)) + return -1; + + desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr); + if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr) + return -1; + + hdr_mbuf = m; + hdr_addr = desc_addr; + hdr_phys_addr = buf_vec[vec_idx].buf_addr; + rte_prefetch0((void *)(uintptr_t)hdr_addr); + + virtio_hdr.num_buffers = num_buffers; + LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n", + dev->vid, num_buffers); + + desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen; + desc_offset = dev->vhost_hlen; + + mbuf_avail = rte_pktmbuf_data_len(m); + mbuf_offset = 0; + while (mbuf_avail != 0 || m->next != NULL) { + /* done with current desc buf, get the next one */ + if (desc_avail == 0) { + vec_idx++; + desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr); + if (unlikely(!desc_addr)) + return -1; + + /* Prefetch buffer address. */ + rte_prefetch0((void *)(uintptr_t)desc_addr); + desc_offset = 0; + desc_avail = buf_vec[vec_idx].buf_len; + } + + /* done with current mbuf, get the next one */ + if (mbuf_avail == 0) { + m = m->next; + + mbuf_offset = 0; + mbuf_avail = rte_pktmbuf_data_len(m); + } + + if (hdr_addr) { + virtio_enqueue_offload(hdr_mbuf, &virtio_hdr.hdr); + copy_virtio_net_hdr(dev, hdr_addr, virtio_hdr); + vhost_log_write(dev, hdr_phys_addr, dev->vhost_hlen); + PRINT_PACKET(dev, (uintptr_t)hdr_addr, + dev->vhost_hlen, 0); + + hdr_addr = 0; + } + + cpy_len = RTE_MIN(desc_avail, mbuf_avail); + rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)), + rte_pktmbuf_mtod_offset(m, void *, mbuf_offset), + cpy_len); + vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset, + cpy_len); + PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), + cpy_len, 0); + + mbuf_avail -= cpy_len; + mbuf_offset += cpy_len; + desc_avail -= cpy_len; + desc_offset += cpy_len; + } + + return 0; +} + +static inline uint32_t __attribute__((always_inline)) +virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t count) +{ + struct vhost_virtqueue *vq; + uint32_t pkt_idx = 0; + uint16_t num_buffers; + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t avail_head; + + LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); + if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) { + RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, queue_id); + return 0; + } + + vq = dev->virtqueue[queue_id]; + if (unlikely(vq->enabled == 0)) + return 0; + + count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); + if (count == 0) + return 0; + + rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); + + vq->shadow_used_idx = 0; + avail_head = *((volatile uint16_t *)&vq->avail->idx); + for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { + uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; + + if (unlikely(reserve_avail_buf_mergeable(dev, vq, + pkt_len, buf_vec, &num_buffers, + avail_head) < 0)) { + LOG_DEBUG(VHOST_DATA, + "(%d) failed to 
get enough desc from vring\n", + dev->vid); + vq->shadow_used_idx -= num_buffers; + break; + } + + LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, vq->last_avail_idx, + vq->last_avail_idx + num_buffers); + + if (pkt_len > 0 && + copy_mbuf_to_desc_mergeable(dev, pkts[pkt_idx], buf_vec, num_buffers) < 0) { + vq->shadow_used_idx -= num_buffers; + break; + } + + vq->last_avail_idx += num_buffers; + } + + if (likely(vq->shadow_used_idx)) { + flush_shadow_used_ring(dev, vq); + + /* flush used->idx update before we read avail->flags. */ + rte_mb(); + + /* Kick the guest if necessary. */ + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) + && (vq->callfd >= 0)) + eventfd_write(vq->callfd, (eventfd_t)1); + } + + return pkt_idx; +} + +uint16_t +rte_vhost_enqueue_burst(int vid, uint16_t queue_id, + struct rte_mbuf **pkts, uint16_t count) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return 0; + + if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) + return virtio_dev_merge_rx(dev, queue_id, pkts, count); + else + return virtio_dev_rx(dev, queue_id, pkts, count); +} + +static inline bool +virtio_net_with_host_offload(struct virtio_net *dev) +{ + if (dev->features & + (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_ECN | + VIRTIO_NET_F_HOST_TSO4 | VIRTIO_NET_F_HOST_TSO6 | + VIRTIO_NET_F_HOST_UFO)) + return true; + + return false; +} + +static void +parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr) +{ + struct ipv4_hdr *ipv4_hdr; + struct ipv6_hdr *ipv6_hdr; + void *l3_hdr = NULL; + struct ether_hdr *eth_hdr; + uint16_t ethertype; + + eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); + + m->l2_len = sizeof(struct ether_hdr); + ethertype = rte_be_to_cpu_16(eth_hdr->ether_type); + + if (ethertype == ETHER_TYPE_VLAN) { + struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1); + + m->l2_len += sizeof(struct vlan_hdr); + ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto); + } + + l3_hdr = (char *)eth_hdr + m->l2_len; + + switch (ethertype) { + case ETHER_TYPE_IPv4: + ipv4_hdr = (struct ipv4_hdr *)l3_hdr; + *l4_proto = ipv4_hdr->next_proto_id; + m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4; + *l4_hdr = (char *)l3_hdr + m->l3_len; + m->ol_flags |= PKT_TX_IPV4; + break; + case ETHER_TYPE_IPv6: + ipv6_hdr = (struct ipv6_hdr *)l3_hdr; + *l4_proto = ipv6_hdr->proto; + m->l3_len = sizeof(struct ipv6_hdr); + *l4_hdr = (char *)l3_hdr + m->l3_len; + m->ol_flags |= PKT_TX_IPV6; + break; + default: + m->l3_len = 0; + *l4_proto = 0; + break; + } +} + +static inline void __attribute__((always_inline)) +vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m) +{ + uint16_t l4_proto = 0; + void *l4_hdr = NULL; + struct tcp_hdr *tcp_hdr = NULL; + + if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE) + return; + + parse_ethernet(m, &l4_proto, &l4_hdr); + if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) { + if (hdr->csum_start == (m->l2_len + m->l3_len)) { + switch (hdr->csum_offset) { + case (offsetof(struct tcp_hdr, cksum)): + if (l4_proto == IPPROTO_TCP) + m->ol_flags |= PKT_TX_TCP_CKSUM; + break; + case (offsetof(struct udp_hdr, dgram_cksum)): + if (l4_proto == IPPROTO_UDP) + m->ol_flags |= PKT_TX_UDP_CKSUM; + break; + case (offsetof(struct sctp_hdr, cksum)): + if (l4_proto == IPPROTO_SCTP) + m->ol_flags |= PKT_TX_SCTP_CKSUM; + break; + default: + break; + } + } + } + + if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { + switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + case VIRTIO_NET_HDR_GSO_TCPV6: 
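+		/*
+		 * The guest requested TCP segmentation offload: mark the mbuf
+		 * for TSO and recover the MSS (gso_size) and the TCP header
+		 * length from the virtio-net header and the TCP data offset.
+		 */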
+ if (l4_hdr == NULL) { + RTE_LOG(ERR, VHOST_DATA, "l4_hdr is NULL\n"); + break; + } + tcp_hdr = (struct tcp_hdr *)l4_hdr; + m->ol_flags |= PKT_TX_TCP_SEG; + m->tso_segsz = hdr->gso_size; + m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2; + break; + default: + RTE_LOG(WARNING, VHOST_DATA, + "unsupported gso type %u.\n", hdr->gso_type); + break; + } + } +} + +#define RARP_PKT_SIZE 64 + +static int +make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac) +{ + struct ether_hdr *eth_hdr; + struct arp_hdr *rarp; + + if (rarp_mbuf->buf_len < 64) { + RTE_LOG(WARNING, VHOST_DATA, + "failed to make RARP; mbuf size too small %u (< %d)\n", + rarp_mbuf->buf_len, RARP_PKT_SIZE); + return -1; + } + + /* Ethernet header. */ + eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0); + memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN); + ether_addr_copy(mac, ð_hdr->s_addr); + eth_hdr->ether_type = htons(ETHER_TYPE_RARP); + + /* RARP header. */ + rarp = (struct arp_hdr *)(eth_hdr + 1); + rarp->arp_hrd = htons(ARP_HRD_ETHER); + rarp->arp_pro = htons(ETHER_TYPE_IPv4); + rarp->arp_hln = ETHER_ADDR_LEN; + rarp->arp_pln = 4; + rarp->arp_op = htons(ARP_OP_REVREQUEST); + + ether_addr_copy(mac, &rarp->arp_data.arp_sha); + ether_addr_copy(mac, &rarp->arp_data.arp_tha); + memset(&rarp->arp_data.arp_sip, 0x00, 4); + memset(&rarp->arp_data.arp_tip, 0x00, 4); + + rarp_mbuf->pkt_len = rarp_mbuf->data_len = RARP_PKT_SIZE; + + return 0; +} + +static inline void __attribute__((always_inline)) +put_zmbuf(struct zcopy_mbuf *zmbuf) +{ + zmbuf->in_use = 0; +} + +static inline int __attribute__((always_inline)) +copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs, + uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx, + struct rte_mempool *mbuf_pool) +{ + struct vring_desc *desc; + uint64_t desc_addr; + uint32_t desc_avail, desc_offset; + uint32_t mbuf_avail, mbuf_offset; + uint32_t cpy_len; + struct rte_mbuf *cur = m, *prev = m; + struct virtio_net_hdr *hdr = NULL; + /* A counter to avoid desc dead loop chain */ + uint32_t nr_desc = 1; + + desc = &descs[desc_idx]; + if (unlikely((desc->len < dev->vhost_hlen)) || + (desc->flags & VRING_DESC_F_INDIRECT)) + return -1; + + desc_addr = gpa_to_vva(dev, desc->addr); + if (unlikely(!desc_addr)) + return -1; + + if (virtio_net_with_host_offload(dev)) { + hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr); + rte_prefetch0(hdr); + } + + /* + * A virtio driver normally uses at least 2 desc buffers + * for Tx: the first for storing the header, and others + * for storing the data. + */ + if (likely((desc->len == dev->vhost_hlen) && + (desc->flags & VRING_DESC_F_NEXT) != 0)) { + desc = &descs[desc->next]; + if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) + return -1; + + desc_addr = gpa_to_vva(dev, desc->addr); + if (unlikely(!desc_addr)) + return -1; + + desc_offset = 0; + desc_avail = desc->len; + nr_desc += 1; + } else { + desc_avail = desc->len - dev->vhost_hlen; + desc_offset = dev->vhost_hlen; + } + + rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset)); + + PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), desc_avail, 0); + + mbuf_offset = 0; + mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM; + while (1) { + uint64_t hpa; + + cpy_len = RTE_MIN(desc_avail, mbuf_avail); + + /* + * A desc buf might across two host physical pages that are + * not continuous. In such case (gpa_to_hpa returns 0), data + * will be copied even though zero copy is enabled. 
+ */ + if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev, + desc->addr + desc_offset, cpy_len)))) { + cur->data_len = cpy_len; + cur->data_off = 0; + cur->buf_addr = (void *)(uintptr_t)desc_addr; + cur->buf_physaddr = hpa; + + /* + * In zero copy mode, one mbuf can only reference data + * for one or partial of one desc buff. + */ + mbuf_avail = cpy_len; + } else { + rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, + mbuf_offset), + (void *)((uintptr_t)(desc_addr + desc_offset)), + cpy_len); + } + + mbuf_avail -= cpy_len; + mbuf_offset += cpy_len; + desc_avail -= cpy_len; + desc_offset += cpy_len; + + /* This desc reaches to its end, get the next one */ + if (desc_avail == 0) { + if ((desc->flags & VRING_DESC_F_NEXT) == 0) + break; + + if (unlikely(desc->next >= max_desc || + ++nr_desc > max_desc)) + return -1; + desc = &descs[desc->next]; + if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) + return -1; + + desc_addr = gpa_to_vva(dev, desc->addr); + if (unlikely(!desc_addr)) + return -1; + + rte_prefetch0((void *)(uintptr_t)desc_addr); + + desc_offset = 0; + desc_avail = desc->len; + + PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0); + } + + /* + * This mbuf reaches to its end, get a new one + * to hold more data. + */ + if (mbuf_avail == 0) { + cur = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(cur == NULL)) { + RTE_LOG(ERR, VHOST_DATA, "Failed to " + "allocate memory for mbuf.\n"); + return -1; + } + + prev->next = cur; + prev->data_len = mbuf_offset; + m->nb_segs += 1; + m->pkt_len += mbuf_offset; + prev = cur; + + mbuf_offset = 0; + mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM; + } + } + + prev->data_len = mbuf_offset; + m->pkt_len += mbuf_offset; + + if (hdr) + vhost_dequeue_offload(hdr, m); + + return 0; +} + +static inline void __attribute__((always_inline)) +update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint32_t used_idx, uint32_t desc_idx) +{ + vq->used->ring[used_idx].id = desc_idx; + vq->used->ring[used_idx].len = 0; + vhost_log_used_vring(dev, vq, + offsetof(struct vring_used, ring[used_idx]), + sizeof(vq->used->ring[used_idx])); +} + +static inline void __attribute__((always_inline)) +update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint32_t count) +{ + if (unlikely(count == 0)) + return; + + rte_smp_wmb(); + rte_smp_rmb(); + + vq->used->idx += count; + vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), + sizeof(vq->used->idx)); + + /* Kick guest if required. 
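+	 * The guest may suppress this notification by setting
+	 * VRING_AVAIL_F_NO_INTERRUPT in avail->flags.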
*/ + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) + && (vq->callfd >= 0)) + eventfd_write(vq->callfd, (eventfd_t)1); +} + +static inline struct zcopy_mbuf *__attribute__((always_inline)) +get_zmbuf(struct vhost_virtqueue *vq) +{ + uint16_t i; + uint16_t last; + int tries = 0; + + /* search [last_zmbuf_idx, zmbuf_size) */ + i = vq->last_zmbuf_idx; + last = vq->zmbuf_size; + +again: + for (; i < last; i++) { + if (vq->zmbufs[i].in_use == 0) { + vq->last_zmbuf_idx = i + 1; + vq->zmbufs[i].in_use = 1; + return &vq->zmbufs[i]; + } + } + + tries++; + if (tries == 1) { + /* search [0, last_zmbuf_idx) */ + i = 0; + last = vq->last_zmbuf_idx; + goto again; + } + + return NULL; +} + +static inline bool __attribute__((always_inline)) +mbuf_is_consumed(struct rte_mbuf *m) +{ + while (m) { + if (rte_mbuf_refcnt_read(m) > 1) + return false; + m = m->next; + } + + return true; +} + +uint16_t +rte_vhost_dequeue_burst(int vid, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) +{ + struct virtio_net *dev; + struct rte_mbuf *rarp_mbuf = NULL; + struct vhost_virtqueue *vq; + uint32_t desc_indexes[MAX_PKT_BURST]; + uint32_t used_idx; + uint32_t i = 0; + uint16_t free_entries; + uint16_t avail_idx; + + dev = get_device(vid); + if (!dev) + return 0; + + if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) { + RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, queue_id); + return 0; + } + + vq = dev->virtqueue[queue_id]; + if (unlikely(vq->enabled == 0)) + return 0; + + if (unlikely(dev->dequeue_zero_copy)) { + struct zcopy_mbuf *zmbuf, *next; + int nr_updated = 0; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + if (mbuf_is_consumed(zmbuf->mbuf)) { + used_idx = vq->last_used_idx++ & (vq->size - 1); + update_used_ring(dev, vq, used_idx, + zmbuf->desc_idx); + nr_updated += 1; + + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + rte_pktmbuf_free(zmbuf->mbuf); + put_zmbuf(zmbuf); + vq->nr_zmbuf -= 1; + } + } + + update_used_idx(dev, vq, nr_updated); + } + + /* + * Construct a RARP broadcast packet, and inject it to the "pkts" + * array, to looks like that guest actually send such packet. + * + * Check user_send_rarp() for more information. + */ + if (unlikely(rte_atomic16_cmpset((volatile uint16_t *) + &dev->broadcast_rarp.cnt, 1, 0))) { + rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool); + if (rarp_mbuf == NULL) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); + return 0; + } + + if (make_rarp_packet(rarp_mbuf, &dev->mac)) { + rte_pktmbuf_free(rarp_mbuf); + rarp_mbuf = NULL; + } else { + count -= 1; + } + } + + free_entries = *((volatile uint16_t *)&vq->avail->idx) - + vq->last_avail_idx; + if (free_entries == 0) + goto out; + + LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); + + /* Prefetch available and used ring */ + avail_idx = vq->last_avail_idx & (vq->size - 1); + used_idx = vq->last_used_idx & (vq->size - 1); + rte_prefetch0(&vq->avail->ring[avail_idx]); + rte_prefetch0(&vq->used->ring[used_idx]); + + count = RTE_MIN(count, MAX_PKT_BURST); + count = RTE_MIN(count, free_entries); + LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", + dev->vid, count); + + /* Retrieve all of the head indexes first to avoid caching issues. 
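+	 * In the non zero-copy case the used ring entries are updated in the
+	 * same pass; with zero copy they are updated only once the mbuf has
+	 * been consumed.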
*/ + for (i = 0; i < count; i++) { + avail_idx = (vq->last_avail_idx + i) & (vq->size - 1); + used_idx = (vq->last_used_idx + i) & (vq->size - 1); + desc_indexes[i] = vq->avail->ring[avail_idx]; + + if (likely(dev->dequeue_zero_copy == 0)) + update_used_ring(dev, vq, used_idx, desc_indexes[i]); + } + + /* Prefetch descriptor index. */ + rte_prefetch0(&vq->desc[desc_indexes[0]]); + for (i = 0; i < count; i++) { + struct vring_desc *desc; + uint16_t sz, idx; + int err; + + if (likely(i + 1 < count)) + rte_prefetch0(&vq->desc[desc_indexes[i + 1]]); + + if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) { + desc = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev, + vq->desc[desc_indexes[i]].addr); + if (unlikely(!desc)) + break; + + rte_prefetch0(desc); + sz = vq->desc[desc_indexes[i]].len / sizeof(*desc); + idx = 0; + } else { + desc = vq->desc; + sz = vq->size; + idx = desc_indexes[i]; + } + + pkts[i] = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(pkts[i] == NULL)) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); + break; + } + + err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool); + if (unlikely(err)) { + rte_pktmbuf_free(pkts[i]); + break; + } + + if (unlikely(dev->dequeue_zero_copy)) { + struct zcopy_mbuf *zmbuf; + + zmbuf = get_zmbuf(vq); + if (!zmbuf) { + rte_pktmbuf_free(pkts[i]); + break; + } + zmbuf->mbuf = pkts[i]; + zmbuf->desc_idx = desc_indexes[i]; + + /* + * Pin lock the mbuf; we will check later to see + * whether the mbuf is freed (when we are the last + * user) or not. If that's the case, we then could + * update the used ring safely. + */ + rte_mbuf_refcnt_update(pkts[i], 1); + + vq->nr_zmbuf += 1; + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + } + } + vq->last_avail_idx += i; + + if (likely(dev->dequeue_zero_copy == 0)) { + vq->last_used_idx += i; + update_used_idx(dev, vq, i); + } + +out: + if (unlikely(rarp_mbuf != NULL)) { + /* + * Inject it to the head of "pkts" array, so that switch's mac + * learning table will get updated first. + */ + memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *)); + pkts[0] = rarp_mbuf; + i += 1; + } + + return i; +} diff --git a/lib/vhost/task.c b/lib/vhost/task.c new file mode 100644 index 000000000..c9a27c6b2 --- /dev/null +++ b/lib/vhost/task.c @@ -0,0 +1,162 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "spdk_internal/log.h" +#include "spdk_internal/event.h" +#include "spdk/env.h" +#include "spdk/queue.h" +#include "task.h" + +#undef container_of +#define container_of(ptr, type, member) ({ \ + typeof(((type *)0)->member) *__mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); }) + +typedef TAILQ_HEAD(, spdk_vhost_task) need_iovecs_tailq_t; + +static struct rte_mempool *g_task_pool; +static struct rte_mempool *g_iov_buffer_pool; + +need_iovecs_tailq_t g_need_iovecs[RTE_MAX_LCORE]; + +void +spdk_vhost_task_put(struct spdk_vhost_task *task) +{ + assert(&task->scsi.iov == task->scsi.iovs); + assert(task->scsi.iovcnt == 1); + spdk_scsi_task_put(&task->scsi); +} + +static void +spdk_vhost_task_free_cb(struct spdk_scsi_task *scsi_task) +{ + struct spdk_vhost_task *task = container_of(scsi_task, struct spdk_vhost_task, scsi); + + rte_mempool_put(g_task_pool, task); +} + +struct spdk_vhost_task * +spdk_vhost_task_get(uint32_t *owner_task_ctr) +{ + struct spdk_vhost_task *task; + int rc; + + rc = rte_mempool_get(g_task_pool, (void **)&task); + if ((rc < 0) || !task) { + SPDK_ERRLOG("Unable to get task\n"); + rte_panic("no memory\n"); + } + + memset(task, 0, sizeof(*task)); + spdk_scsi_task_construct(&task->scsi, owner_task_ctr, NULL); + task->scsi.free_fn = spdk_vhost_task_free_cb; + + return task; +} + +void +spdk_vhost_enqueue_task(struct spdk_vhost_task *task) +{ + need_iovecs_tailq_t *tailq = &g_need_iovecs[rte_lcore_id()]; + + TAILQ_INSERT_TAIL(tailq, task, iovecs_link); +} + +struct spdk_vhost_task * +spdk_vhost_dequeue_task(void) +{ + need_iovecs_tailq_t *tailq = &g_need_iovecs[rte_lcore_id()]; + struct spdk_vhost_task *task; + + if (TAILQ_EMPTY(tailq)) + return NULL; + + task = TAILQ_FIRST(tailq); + TAILQ_REMOVE(tailq, task, iovecs_link); + + return task; +} + +struct iovec * +spdk_vhost_iovec_alloc(void) +{ + struct iovec *iov = NULL; + + rte_mempool_get(g_iov_buffer_pool, (void **)&iov); + return iov; +} + +void +spdk_vhost_iovec_free(struct iovec *iov) +{ + rte_mempool_put(g_iov_buffer_pool, iov); +} + +static int +spdk_vhost_subsystem_init(void) +{ + g_task_pool = rte_mempool_create("vhost task pool", 16384, sizeof(struct spdk_vhost_task), + 128, 0, NULL, NULL, NULL, NULL, SOCKET_ID_ANY, 0); + if (!g_task_pool) { + SPDK_ERRLOG("create task pool failed\n"); + return -1; + } + + g_iov_buffer_pool = rte_mempool_create("vhost iov buffer pool", 2048, + VHOST_SCSI_IOVS_LEN * sizeof(struct iovec), + 128, 0, NULL, NULL, NULL, NULL, SOCKET_ID_ANY, 0); + if (!g_iov_buffer_pool) { + SPDK_ERRLOG("create iov buffer pool failed\n"); + return -1; + } + + for (int i = 0; i < RTE_MAX_LCORE; i++) { + TAILQ_INIT(&g_need_iovecs[i]); + } + + return 0; +} + +static int +spdk_vhost_subsystem_fini(void) +{ + return 0; +} + +SPDK_SUBSYSTEM_REGISTER(vhost, spdk_vhost_subsystem_init, spdk_vhost_subsystem_fini, NULL) +SPDK_SUBSYSTEM_DEPEND(vhost, scsi) diff --git a/lib/vhost/task.h b/lib/vhost/task.h new file mode 
100644 index 000000000..c60d867de --- /dev/null +++ b/lib/vhost/task.h @@ -0,0 +1,69 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_VHOST_TASK_H +#define SPDK_VHOST_TASK_H + +#include "spdk/scsi.h" + +/* Allocated iovec buffer len */ +#define VHOST_SCSI_IOVS_LEN 128 + +struct spdk_vhost_task { + struct spdk_scsi_task scsi; + + union { + struct virtio_scsi_cmd_resp *resp; + struct virtio_scsi_ctrl_tmf_resp *tmf_resp; + }; + + struct spdk_vhost_scsi_ctrlr *vdev; + struct spdk_scsi_dev *scsi_dev; + + int req_idx; + + struct vhost_virtqueue *vq; + + TAILQ_ENTRY(spdk_vhost_task) iovecs_link; +}; + +void spdk_vhost_enqueue_task(struct spdk_vhost_task *task); +struct spdk_vhost_task *spdk_vhost_dequeue_task(void); + +void spdk_vhost_task_put(struct spdk_vhost_task *task); +struct spdk_vhost_task *spdk_vhost_task_get(uint32_t *owner_task_ctr); + +void spdk_vhost_iovec_free(struct iovec *iov); +struct iovec *spdk_vhost_iovec_alloc(void); + +#endif /* SPDK_VHOST_TASK_H */ diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c new file mode 100644 index 000000000..a8d7ea7a9 --- /dev/null +++ b/lib/vhost/vhost.c @@ -0,0 +1,1161 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "spdk_internal/log.h" +#include "spdk/env.h" +#include "spdk/scsi.h" +#include "spdk/conf.h" +#include "spdk/event.h" +#include "spdk/scsi_spec.h" + +#include "spdk/vhost.h" +#include "task.h" + +static uint32_t g_num_ctrlrs[RTE_MAX_LCORE]; + +#define CONTROLQ_POLL_PERIOD_US (1000 * 5) + +#define VIRTIO_SCSI_CONTROLQ 0 +#define VIRTIO_SCSI_EVENTQ 1 +#define VIRTIO_SCSI_REQUESTQ 2 + +/* Path to folder where character device will be created. Can be set by user. */ +static char dev_dirname[PATH_MAX] = ""; + +struct spdk_vaddr_region { + void *vaddr; + uint64_t len; +}; + +/* + * Device linked list structure for data path. + */ +struct spdk_vhost_scsi_ctrlr { + char *name; + /**< Pointer to device created by vhost lib. */ + struct virtio_net *dev; + + struct spdk_vaddr_region region[VHOST_MEMORY_MAX_NREGIONS]; + uint32_t nregions; + + /**< TODO make this an array of spdk_scsi_devs. The vhost scsi + * request will tell us which scsi_dev to use. + */ + struct spdk_scsi_dev *scsi_dev[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS]; + + int task_cnt; + + struct spdk_poller *requestq_poller; + struct spdk_poller *controlq_poller; + + int32_t lcore; + + uint64_t cpumask; +} __rte_cache_aligned; + +/* This maps from the integer index passed by DPDK to the our controller representation. */ +struct spdk_vhost_scsi_ctrlr *dpdk_vid_mapping[MAX_VHOST_DEVICE]; /* MAX_VHOST_DEVICE from DPDK. */ + +/* + * Get available requests from avail ring. + */ +static uint16_t +vq_avail_ring_get(struct vhost_virtqueue *vq, uint16_t *reqs, uint16_t reqs_len) +{ + struct vring_avail *avail = vq->avail; + uint16_t size_mask = vq->size - 1; + uint16_t last_idx = vq->last_avail_idx, avail_idx = avail->idx; + uint16_t count = RTE_MIN((avail_idx - last_idx) & size_mask, reqs_len); + uint16_t i; + + vq->last_avail_idx += count; + for (i = 0; i < count; i++) { + reqs[i] = vq->avail->ring[(last_idx + i) & size_mask]; + } + + SPDK_TRACELOG(SPDK_TRACE_VHOST_RING, + "AVAIL: last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n", + last_idx, avail_idx, count); + + return count; +} + +/* + * Enqueue id and len to used ring. 
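+ * The used element is written first, a compiler barrier keeps the
+ * used->idx update from being reordered ahead of it, and the guest is
+ * then kicked through the callfd eventfd.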
+ */ +static void +vq_used_ring_enqueue(struct vhost_virtqueue *vq, uint16_t id, uint32_t len) +{ + struct vring_used *used = vq->used; + uint16_t size_mask = vq->size - 1; + uint16_t last_idx = vq->last_used_idx; + + SPDK_TRACELOG(SPDK_TRACE_VHOST_RING, "USED: last_idx=%"PRIu16" req id=%"PRIu16" len=%"PRIu32"\n", + last_idx, id, len); + + vq->last_used_idx++; + last_idx &= size_mask; + + used->ring[last_idx].id = id; + used->ring[last_idx].len = len; + + rte_compiler_barrier(); + + vq->used->idx = vq->last_used_idx; + eventfd_write(vq->callfd, (eventfd_t)1); +} + +static bool +vring_desc_has_next(struct vring_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_NEXT); +} + +static struct vring_desc * +vring_desc_get_next(struct vring_desc *vq_desc, struct vring_desc *cur_desc) +{ + assert(vring_desc_has_next(cur_desc)); + return &vq_desc[cur_desc->next]; +} + +static bool +vring_desc_is_wr(struct vring_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_WRITE); +} + +static void task_submit(struct spdk_vhost_task *task); +static int process_request(struct spdk_vhost_task *task); +static void invalid_request(struct spdk_vhost_task *task); + +static void +submit_completion(struct spdk_vhost_task *task) +{ + struct iovec *iovs = NULL; + int result; + + vq_used_ring_enqueue(task->vq, task->req_idx, task->scsi.data_transferred); + SPDK_TRACELOG(SPDK_TRACE_VHOST, "Finished task (%p) req_idx=%d\n", task, task->req_idx); + + if (task->scsi.iovs != &task->scsi.iov) { + iovs = task->scsi.iovs; + task->scsi.iovs = &task->scsi.iov; + task->scsi.iovcnt = 1; + } + + spdk_vhost_task_put(task); + + if (!iovs) { + return; + } + + while (1) { + task = spdk_vhost_dequeue_task(); + if (!task) { + spdk_vhost_iovec_free(iovs); + break; + } + + /* Set iovs so underlying functions will not try to alloc IOV */ + task->scsi.iovs = iovs; + task->scsi.iovcnt = VHOST_SCSI_IOVS_LEN; + + result = process_request(task); + if (result == 0) { + task_submit(task); + break; + } else { + task->scsi.iovs = &task->scsi.iov; + task->scsi.iovcnt = 1; + invalid_request(task); + } + } +} + +static void +process_mgmt_task_completion(void *arg1, void *arg2) +{ + struct spdk_vhost_task *task = arg1; + + submit_completion(task); +} + +static void +process_task_completion(void *arg1, void *arg2) +{ + struct spdk_vhost_task *task = arg1; + + /* The SCSI task has completed. Do final processing and then post + notification to the virtqueue's "used" ring. + */ + task->resp->status = task->scsi.status; + + if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) { + memcpy(task->resp->sense, task->scsi.sense_data, task->scsi.sense_data_len); + task->resp->sense_len = task->scsi.sense_data_len; + } + task->resp->resid = task->scsi.transfer_len - task->scsi.data_transferred; + + submit_completion(task); +} + +static void +task_submit(struct spdk_vhost_task *task) +{ + /* The task is ready to be submitted. First create the callback event that + will be invoked when the SCSI command is completed. See process_task_completion() + for what SPDK vhost-scsi does when the task is completed. 
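+	   The event is allocated on the current lcore, so completion processing
+	   runs on the same core that polls this controller's virtqueues.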
+ */ + + task->resp->response = VIRTIO_SCSI_S_OK; + task->scsi.cb_event = spdk_event_allocate(rte_lcore_id(), + process_task_completion, + task, NULL); + spdk_scsi_dev_queue_task(task->scsi_dev, &task->scsi); +} + +static void +mgmt_task_submit(struct spdk_vhost_task *task) +{ + task->tmf_resp->response = VIRTIO_SCSI_S_OK; + task->scsi.cb_event = spdk_event_allocate(rte_lcore_id(), + process_mgmt_task_completion, + task, NULL); + spdk_scsi_dev_queue_mgmt_task(task->scsi_dev, &task->scsi); +} + +static void +invalid_request(struct spdk_vhost_task *task) +{ + vq_used_ring_enqueue(task->vq, task->req_idx, 0); + spdk_vhost_task_put(task); + + SPDK_TRACELOG(SPDK_TRACE_VHOST, "Invalid request (status=%" PRIu8")\n", + task->resp ? task->resp->response : -1); +} + +static struct spdk_scsi_dev * +get_scsi_dev(struct spdk_vhost_scsi_ctrlr *vdev, const __u8 *lun) +{ + SPDK_TRACEDUMP(SPDK_TRACE_VHOST_QUEUE, "LUN", lun, 8); + /* First byte must be 1 and second is target */ + if (lun[0] != 1 || lun[1] >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) + return NULL; + + return vdev->scsi_dev[lun[1]]; +} + +static struct spdk_scsi_lun * +get_scsi_lun(struct spdk_scsi_dev *scsi_dev, const __u8 *lun) +{ + uint16_t lun_id = (((uint16_t)lun[2] << 8) | lun[3]) & 0x3FFF; + + /* For now only one LUN per controller is allowed so no need to search LUN IDs*/ + return likely(scsi_dev != NULL && lun_id < scsi_dev->maxlun) ? scsi_dev->lun[lun_id] : NULL; +} + +static void +process_ctrl_request(struct spdk_vhost_scsi_ctrlr *vdev, struct vhost_virtqueue *controlq, + uint16_t req_idx) +{ + struct spdk_vhost_task *task; + + struct vring_desc *desc; + struct virtio_scsi_ctrl_tmf_req *ctrl_req; + struct virtio_scsi_ctrl_an_resp *an_resp; + + desc = &controlq->desc[req_idx]; + ctrl_req = (void *)gpa_to_vva(vdev->dev, desc->addr); + + SPDK_TRACELOG(SPDK_TRACE_VHOST_QUEUE, + "Processing controlq descriptor: desc %d/%p, desc_addr %p, len %d, flags %d, last_used_idx %d; enabled %d; kickfd %d; size %d\n", + req_idx, desc, (void *)desc->addr, desc->len, desc->flags, controlq->last_used_idx, + controlq->enabled, controlq->kickfd, controlq->size); + SPDK_TRACEDUMP(SPDK_TRACE_VHOST_QUEUE, "Request desriptor", (uint8_t *)ctrl_req, + desc->len); + + task = spdk_vhost_task_get(&vdev->task_cnt); + task->vq = controlq; + task->vdev = vdev; + task->req_idx = req_idx; + task->scsi_dev = get_scsi_dev(task->vdev, ctrl_req->lun); + + /* Process the TMF request */ + switch (ctrl_req->type) { + case VIRTIO_SCSI_T_TMF: + /* Get the response buffer */ + assert(vring_desc_has_next(desc)); + desc = vring_desc_get_next(controlq->desc, desc); + task->tmf_resp = (void *)gpa_to_vva(vdev->dev, desc->addr); + + /* Check if we are processing a valid request */ + if (task->scsi_dev == NULL) { + task->tmf_resp->response = VIRTIO_SCSI_S_BAD_TARGET; + break; + } + + switch (ctrl_req->subtype) { + case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: + /* Handle LUN reset */ + SPDK_TRACELOG(SPDK_TRACE_VHOST_QUEUE, "LUN reset\n"); + task->scsi.type = SPDK_SCSI_TASK_TYPE_MANAGE; + task->scsi.function = SPDK_SCSI_TASK_FUNC_LUN_RESET; + task->scsi.lun = get_scsi_lun(task->scsi_dev, ctrl_req->lun); + + mgmt_task_submit(task); + return; + default: + task->tmf_resp->response = VIRTIO_SCSI_S_ABORTED; + /* Unsupported command */ + SPDK_TRACELOG(SPDK_TRACE_VHOST_QUEUE, "Unsupported TMF command %x\n", ctrl_req->subtype); + break; + } + break; + case VIRTIO_SCSI_T_AN_QUERY: + case VIRTIO_SCSI_T_AN_SUBSCRIBE: { + desc = vring_desc_get_next(controlq->desc, desc); + an_resp = (void 
*)gpa_to_vva(vdev->dev, desc->addr); + an_resp->response = VIRTIO_SCSI_S_ABORTED; + break; + } + default: + SPDK_TRACELOG(SPDK_TRACE_VHOST_QUEUE, "Unsupported control command %x\n", ctrl_req->type); + break; + } + + vq_used_ring_enqueue(controlq, req_idx, 0); + spdk_vhost_task_put(task); +} + +/* + * Process task's descriptor chain and setup data related fields. + * Return + * -1 if request is invalid and must be aborted, + * 0 if all data are set, + * 1 if it was not possible to allocate IO vector for this task. + */ +static int +task_data_setup(struct spdk_vhost_task *task, + struct virtio_scsi_cmd_req **req) +{ + struct vhost_virtqueue *vq = task->vq; + struct virtio_net *dev = task->vdev->dev; + struct vring_desc *desc = &task->vq->desc[task->req_idx]; + struct iovec *iovs = task->scsi.iovs; + uint16_t iovcnt = 0, iovcnt_max = task->scsi.iovcnt; + uint32_t len = 0; + + assert(iovcnt_max == 1 || iovcnt_max == VHOST_SCSI_IOVS_LEN); + + /* Sanity check. First descriptor must be readable and must have next one. */ + if (unlikely(vring_desc_is_wr(desc) || !vring_desc_has_next(desc))) { + SPDK_WARNLOG("Invalid first (request) descriptor.\n"); + task->resp = NULL; + goto abort_task; + } + + *req = (void *)gpa_to_vva(dev, desc->addr); + + desc = vring_desc_get_next(vq->desc, desc); + task->scsi.dxfer_dir = vring_desc_is_wr(desc) ? SPDK_SCSI_DIR_FROM_DEV : SPDK_SCSI_DIR_TO_DEV; + + if (task->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) { + /* + * FROM_DEV (READ): [RD_req][WR_resp][WR_buf0]...[WR_bufN] + */ + task->resp = (void *)gpa_to_vva(dev, desc->addr); + if (!vring_desc_has_next(desc)) { + /* + * TEST UNIT READY command and some others might not contain any payload and this is not an error. + */ + SPDK_TRACELOG(SPDK_TRACE_VHOST_DATA, + "No payload descriptors for FROM DEV command req_idx=%"PRIu16".\n", task->req_idx); + SPDK_TRACEDUMP(SPDK_TRACE_VHOST_DATA, "CDB=", (*req)->cdb, VIRTIO_SCSI_CDB_SIZE); + task->scsi.iovcnt = 1; + task->scsi.iovs[0].iov_len = 0; + task->scsi.length = 0; + task->scsi.transfer_len = 0; + return 0; + } + + desc = vring_desc_get_next(vq->desc, desc); + if (iovcnt_max != VHOST_SCSI_IOVS_LEN && vring_desc_has_next(desc)) { + iovs = spdk_vhost_iovec_alloc(); + if (iovs == NULL) { + return 1; + } + + iovcnt_max = VHOST_SCSI_IOVS_LEN; + } + + /* All remaining descriptors are data. */ + while (iovcnt < iovcnt_max) { + iovs[iovcnt].iov_base = (void *)gpa_to_vva(dev, desc->addr); + iovs[iovcnt].iov_len = desc->len; + len += desc->len; + iovcnt++; + + if (!vring_desc_has_next(desc)) + break; + + desc = vring_desc_get_next(vq->desc, desc); + if (unlikely(!vring_desc_is_wr(desc))) { + SPDK_WARNLOG("FROM DEV cmd: descriptor nr %" PRIu16" in payload chain is read only.\n", iovcnt); + task->resp = NULL; + goto abort_task; + } + } + } else { + SPDK_TRACELOG(SPDK_TRACE_VHOST_DATA, "TO DEV"); + /* + * TO_DEV (WRITE):[RD_req][RD_buf0]...[RD_bufN][WR_resp] + * No need to check descriptor WR flag as this is done while setting scsi.dxfer_dir. + */ + + if (iovcnt_max != VHOST_SCSI_IOVS_LEN && vring_desc_has_next(desc)) { + /* If next descriptor is not for response, allocate iovs. */ + if (!vring_desc_is_wr(vring_desc_get_next(vq->desc, desc))) { + iovs = spdk_vhost_iovec_alloc(); + + if (iovs == NULL) { + return 1; + } + + iovcnt_max = VHOST_SCSI_IOVS_LEN; + } + } + + /* Process descriptors up to response. 
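+	 * Every readable descriptor is collected into the iovec list; the first
+	 * write-only descriptor terminates the data and holds the response.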
*/ + while (!vring_desc_is_wr(desc) && iovcnt < iovcnt_max) { + iovs[iovcnt].iov_base = (void *)gpa_to_vva(dev, desc->addr); + iovs[iovcnt].iov_len = desc->len; + len += desc->len; + iovcnt++; + + if (!vring_desc_has_next(desc)) { + SPDK_WARNLOG("TO_DEV cmd: no response descriptor.\n"); + task->resp = NULL; + goto abort_task; + } + + desc = vring_desc_get_next(vq->desc, desc); + } + + task->resp = (void *)gpa_to_vva(dev, desc->addr); + if (vring_desc_has_next(desc)) { + SPDK_WARNLOG("TO_DEV cmd: ignoring unexpected descriptors after response descriptor.\n"); + } + } + + if (iovcnt_max > 1 && iovcnt == iovcnt_max) { + SPDK_WARNLOG("Too many IO vectors in chain!\n"); + goto abort_task; + } + + task->scsi.iovs = iovs; + task->scsi.iovcnt = iovcnt; + task->scsi.length = len; + task->scsi.transfer_len = len; + return 0; + +abort_task: + if (iovs != task->scsi.iovs) { + spdk_vhost_iovec_free(iovs); + } + + if (task->resp) { + task->resp->response = VIRTIO_SCSI_S_ABORTED; + } + + return -1; +} + +static int +process_request(struct spdk_vhost_task *task) +{ + struct virtio_scsi_cmd_req *req; + int result; + + result = task_data_setup(task, &req); + if (result) { + return result; + } + + task->scsi_dev = get_scsi_dev(task->vdev, req->lun); + if (unlikely(task->scsi_dev == NULL)) { + task->resp->response = VIRTIO_SCSI_S_BAD_TARGET; + return -1; + } + + task->scsi.lun = get_scsi_lun(task->scsi_dev, req->lun); + task->scsi.cdb = req->cdb; + task->scsi.target_port = spdk_scsi_dev_find_port_by_id(task->scsi_dev, 0); + SPDK_TRACEDUMP(SPDK_TRACE_VHOST_DATA, "request CDB", req->cdb, VIRTIO_SCSI_CDB_SIZE); + return 0; +} + +static void +process_controlq(struct spdk_vhost_scsi_ctrlr *vdev, struct vhost_virtqueue *vq) +{ + uint16_t reqs[32]; + uint16_t reqs_cnt, i; + + reqs_cnt = vq_avail_ring_get(vq, reqs, RTE_DIM(reqs)); + for (i = 0; i < reqs_cnt; i++) { + process_ctrl_request(vdev, vq, reqs[i]); + } +} + +static void +process_requestq(struct spdk_vhost_scsi_ctrlr *vdev, struct vhost_virtqueue *vq) +{ + uint16_t reqs[32]; + uint16_t reqs_cnt, i; + struct spdk_vhost_task *task; + int result; + + reqs_cnt = vq_avail_ring_get(vq, reqs, RTE_DIM(reqs)); + for (i = 0; i < reqs_cnt; i++) { + task = spdk_vhost_task_get(&vdev->task_cnt); + + SPDK_TRACELOG(SPDK_TRACE_VHOST, "====== Starting processing request idx %"PRIu16"======\n", + reqs[i]); + task->vq = vq; + task->vdev = vdev; + task->req_idx = reqs[i]; + result = process_request(task); + if (likely(result == 0)) { + task_submit(task); + SPDK_TRACELOG(SPDK_TRACE_VHOST, "====== Task %p req_idx %d submitted ======\n", task, + task->req_idx); + } else if (result > 0) { + spdk_vhost_enqueue_task(task); + SPDK_TRACELOG(SPDK_TRACE_VHOST, "====== Task %p req_idx %d deferred ======\n", task, task->req_idx); + } else { + invalid_request(task); + SPDK_TRACELOG(SPDK_TRACE_VHOST, "====== Task %p req_idx %d failed ======\n", task, task->req_idx); + } + } +} + +static void +vdev_controlq_worker(void *arg) +{ + struct spdk_vhost_scsi_ctrlr *vdev = arg; + + process_controlq(vdev, vdev->dev->virtqueue[VIRTIO_SCSI_CONTROLQ]); +} + +static void +vdev_worker(void *arg) +{ + struct spdk_vhost_scsi_ctrlr *vdev = arg; + uint32_t q_idx; + + for (q_idx = VIRTIO_SCSI_REQUESTQ; q_idx < vdev->dev->num_queues; q_idx++) { + process_requestq(vdev, vdev->dev->virtqueue[q_idx]); + } +} + +#define SHIFT_2MB 21 +#define SIZE_2MB (1ULL << SHIFT_2MB) +#define FLOOR_2MB(x) (((uintptr_t)x) / SIZE_2MB) << SHIFT_2MB +#define CEIL_2MB(x) ((((uintptr_t)x) + SIZE_2MB - 1) / SIZE_2MB) << SHIFT_2MB + 
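+/*
+ * The 2 MB helpers above are used by add_vdev_cb() to align each guest
+ * memory region to hugepage boundaries before it is registered with
+ * spdk_vtophys_register() for vtophys translation.
+ */
+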
+static void +vdev_event_done_cb(void *arg1, void *arg2) +{ + sem_post((sem_t *)arg2); +} + +static struct spdk_event * +vhost_sem_event_alloc(uint32_t core, spdk_event_fn fn, void *arg1, sem_t *sem) +{ + if (sem_init(sem, 0, 0) < 0) + rte_panic("Failed to initialize semaphore."); + + return spdk_event_allocate(core, fn, arg1, sem); +} + +static int +vhost_sem_timedwait(sem_t *sem, unsigned sec) +{ + struct timespec timeout; + int rc; + + clock_gettime(CLOCK_REALTIME, &timeout); + timeout.tv_sec += sec; + + rc = sem_timedwait(sem, &timeout); + sem_destroy(sem); + + return rc; +} + +static void +add_vdev_cb(void *arg1, void *arg2) +{ + struct spdk_vhost_scsi_ctrlr *vdev = arg1; + struct virtio_memory_region *region; + uint32_t i; + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + if (vdev->scsi_dev[i] == NULL) { + continue; + } + spdk_scsi_dev_allocate_io_channels(vdev->scsi_dev[i]); + } + SPDK_NOTICELOG("Started poller for vhost controller %s on lcore %d\n", vdev->name, vdev->lcore); + vdev->nregions = vdev->dev->mem->nregions; + for (i = 0; i < vdev->nregions; i++) { + uint64_t start, end, len; + region = &vdev->dev->mem->regions[i]; + start = FLOOR_2MB(region->mmap_addr); + end = CEIL_2MB(region->mmap_addr + region->mmap_size); + len = end - start; + vdev->region[i].vaddr = (void *)start; + vdev->region[i].len = len; + SPDK_NOTICELOG("Registering VM memory for vtophys translation - 0x%jx len:0x%jx\n", + start, len); + spdk_vtophys_register(vdev->region[i].vaddr, vdev->region[i].len); + } + + spdk_poller_register(&vdev->requestq_poller, vdev_worker, vdev, vdev->lcore, 0); + spdk_poller_register(&vdev->controlq_poller, vdev_controlq_worker, vdev, vdev->lcore, + CONTROLQ_POLL_PERIOD_US); + sem_post((sem_t *)arg2); +} + +static void +remove_vdev_cb(void *arg1, void *arg2) +{ + struct spdk_vhost_scsi_ctrlr *vdev = arg1; + uint32_t i; + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + if (vdev->scsi_dev[i] == NULL) { + continue; + } + spdk_scsi_dev_free_io_channels(vdev->scsi_dev[i]); + } + + SPDK_NOTICELOG("Stopping poller for vhost controller %s\n", vdev->name); + for (i = 0; i < vdev->nregions; i++) { + spdk_vtophys_unregister(vdev->region[i].vaddr, vdev->region[i].len); + } + + vdev->nregions = 0; + + sem_post((sem_t *)arg2); +} + +static void +destroy_device(int vid) +{ + struct spdk_vhost_scsi_ctrlr *vdev = dpdk_vid_mapping[vid]; + struct spdk_event *event; + sem_t done_sem; + uint32_t i; + + event = vhost_sem_event_alloc(vdev->lcore, vdev_event_done_cb, NULL, &done_sem); + spdk_poller_unregister(&vdev->requestq_poller, event); + if (vhost_sem_timedwait(&done_sem, 1)) + rte_panic("%s: failed to unregister request queue poller.\n", vdev->name); + + event = vhost_sem_event_alloc(vdev->lcore, vdev_event_done_cb, NULL, &done_sem); + spdk_poller_unregister(&vdev->controlq_poller, event); + if (vhost_sem_timedwait(&done_sem, 1)) + rte_panic("%s: failed to unregister control queue poller.\n", vdev->name); + + /* Wait for all tasks to finish */ + for (i = 1000; i && vdev->task_cnt > 0; i--) { + usleep(1000); + } + + if (vdev->task_cnt > 0) { + rte_panic("%s: pending tasks did not finish in 1s.\n", vdev->name); + } + + event = vhost_sem_event_alloc(vdev->lcore, remove_vdev_cb, vdev, &done_sem); + spdk_event_call(event); + if (vhost_sem_timedwait(&done_sem, 1)) + rte_panic("%s: failed to unregister poller.\n", vdev->name); + + g_num_ctrlrs[vdev->lcore]--; + vdev->lcore = -1; + vdev->dev = NULL; + dpdk_vid_mapping[vid] = NULL; +} + +#define LUN_DEV_NAME_SIZE 8 +#define 
MAX_SCSI_CTRLRS 15 + +static struct spdk_vhost_scsi_ctrlr *spdk_vhost_ctrlrs[MAX_SCSI_CTRLRS]; + +static struct spdk_vhost_scsi_ctrlr * +spdk_vhost_scsi_ctrlr_find(const char *ctrlr_name) +{ + unsigned i; + size_t dev_dirname_len = strlen(dev_dirname); + + if (strncmp(ctrlr_name, dev_dirname, dev_dirname_len) == 0) { + ctrlr_name += dev_dirname_len; + } + + for (i = 0; i < MAX_SCSI_CTRLRS; i++) { + if (spdk_vhost_ctrlrs[i] == NULL) { + continue; + } + + if (strcmp(spdk_vhost_ctrlrs[i]->name, ctrlr_name) == 0) { + return spdk_vhost_ctrlrs[i]; + } + } + + return NULL; +} + +int +spdk_vhost_scsi_ctrlr_construct(const char *name, uint64_t cpumask) +{ + struct spdk_vhost_scsi_ctrlr *vdev; + unsigned ctrlr_num; + char path[PATH_MAX]; + + if (name == NULL) { + SPDK_ERRLOG("Can't add controller with no name\n"); + return -EINVAL; + } + + if ((cpumask & spdk_app_get_core_mask()) != cpumask) { + SPDK_ERRLOG("cpumask 0x%jx not a subset of app mask 0x%jx\n", + cpumask, spdk_app_get_core_mask()); + return -EINVAL; + } + + if (spdk_vhost_scsi_ctrlr_find(name)) { + SPDK_ERRLOG("vhost scsi controller %s already exists.\n", name); + return -EEXIST; + } + + for (ctrlr_num = 0; ctrlr_num < MAX_SCSI_CTRLRS; ctrlr_num++) { + if (spdk_vhost_ctrlrs[ctrlr_num] == NULL) { + break; + } + } + + if (ctrlr_num == MAX_SCSI_CTRLRS) { + SPDK_ERRLOG("Max scsi controllers reached (%d).\n", MAX_SCSI_CTRLRS); + return -ENOSPC; + } + + vdev = rte_zmalloc(NULL, sizeof(*vdev), RTE_CACHE_LINE_SIZE); + if (vdev == NULL) { + SPDK_ERRLOG("Couldn't allocate memory for vhost dev\n"); + return -ENOMEM; + } + + snprintf(path, sizeof(path), "%s%s", dev_dirname, name); + /* Register vhost(cuse or user) driver to handle vhost messages. */ + if (access(path, F_OK) != -1) { + if (unlink(path) != 0) + rte_exit(EXIT_FAILURE, "Cannot remove %s.\n", path); + } + + if (rte_vhost_driver_register(path, 0) != 0) { + SPDK_ERRLOG("Could not register controller %s with vhost library\n", name); + SPDK_ERRLOG("Check if domain socket %s already exists\n", path); + return -EIO; + } + + spdk_vhost_ctrlrs[ctrlr_num] = vdev; + vdev->name = strdup(name); + vdev->cpumask = cpumask; + vdev->lcore = -1; + SPDK_NOTICELOG("Controller %s: new controller added\n", name); + return 0; +} + +int +spdk_vhost_parse_core_mask(const char *mask, uint64_t *cpumask) +{ + char *end; + + if (mask == NULL || cpumask == NULL) { + return -1; + } + + errno = 0; + *cpumask = strtoull(mask, &end, 16); + + if (*end != '\0' || errno || !*cpumask || + ((*cpumask & spdk_app_get_core_mask()) != *cpumask)) { + + SPDK_ERRLOG("cpumask %s not a subset of app mask 0x%jx\n", + mask, spdk_app_get_core_mask()); + return -1; + } + + return 0; +} + +struct spdk_scsi_dev * +spdk_vhost_scsi_ctrlr_get_dev(struct spdk_vhost_scsi_ctrlr *ctrlr, uint8_t num) +{ + assert(ctrlr != NULL); + assert(num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + return ctrlr->scsi_dev[num]; +} + +int +spdk_vhost_scsi_ctrlr_add_dev(const char *ctrlr_name, unsigned scsi_dev_num, const char *lun_name) +{ + struct spdk_vhost_scsi_ctrlr *vdev; + char dev_name[SPDK_SCSI_DEV_MAX_NAME]; + int lun_id_list[1]; + char *lun_names_list[1]; + + if (ctrlr_name == NULL) { + SPDK_ERRLOG("No controller name\n"); + return -EINVAL; + } + + if (scsi_dev_num > SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + SPDK_ERRLOG("Controller %d device num too big (max %d)\n", scsi_dev_num, + SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + return -EINVAL; + } + + if (lun_name == NULL) { + SPDK_ERRLOG("No lun name specified \n"); + return -EINVAL; + } + + vdev = 
spdk_vhost_scsi_ctrlr_find(ctrlr_name); + if (vdev == NULL) { + SPDK_ERRLOG("Controller %s is not defined\n", ctrlr_name); + return -ENODEV; + } + + if (vdev->lcore != -1) { + SPDK_ERRLOG("Controller %s is in use and hotplug is not supported\n", ctrlr_name); + return -ENODEV; + } + + if (vdev->scsi_dev[scsi_dev_num] != NULL) { + SPDK_ERRLOG("Controller %s dev %u already occupied\n", ctrlr_name, scsi_dev_num); + return -EEXIST; + } + + /* + * At this stage only one LUN per device + */ + snprintf(dev_name, sizeof(dev_name), "Dev%u", scsi_dev_num); + lun_id_list[0] = 0; + lun_names_list[0] = (char *)lun_name; + + vdev->scsi_dev[scsi_dev_num] = spdk_scsi_dev_construct(dev_name, lun_names_list, lun_id_list, 1); + if (vdev->scsi_dev[scsi_dev_num] == NULL) { + SPDK_ERRLOG("Couldn't create spdk SCSI device '%s' using lun device '%s' in controller: %s\n", + dev_name, lun_name, vdev->name); + return -EINVAL; + } + + spdk_scsi_dev_add_port(vdev->scsi_dev[scsi_dev_num], 0, "vhost"); + SPDK_NOTICELOG("Controller %s: defined device '%s' using lun '%s'\n", + vdev->name, dev_name, lun_name); + return 0; +} + +struct spdk_vhost_scsi_ctrlr * +spdk_vhost_scsi_ctrlr_next(struct spdk_vhost_scsi_ctrlr *prev) +{ + int i = 0; + + if (prev != NULL) { + for (; i < MAX_SCSI_CTRLRS; i++) { + if (spdk_vhost_ctrlrs[i] == prev) { + break; + } + } + + i++; + } + + for (; i < MAX_SCSI_CTRLRS; i++) { + if (spdk_vhost_ctrlrs[i] == NULL) { + continue; + } + + return spdk_vhost_ctrlrs[i]; + } + + return NULL; +} + +const char * +spdk_vhost_scsi_ctrlr_get_name(struct spdk_vhost_scsi_ctrlr *ctrlr) +{ + assert(ctrlr != NULL); + return ctrlr->name; +} + +uint64_t +spdk_vhost_scsi_ctrlr_get_cpumask(struct spdk_vhost_scsi_ctrlr *ctrlr) +{ + assert(ctrlr != NULL); + return ctrlr->cpumask; +} + +static int spdk_vhost_scsi_controller_construct(void) +{ + struct spdk_conf_section *sp = spdk_conf_first_section(NULL); + int i; + unsigned ctrlr_num = 0; + char *lun_name, dev_name[LUN_DEV_NAME_SIZE]; + char *cpumask_str; + char *name; + uint64_t cpumask; + + while (sp != NULL) { + if (!spdk_conf_section_match_prefix(sp, "VhostScsi")) { + sp = spdk_conf_next_section(sp); + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VhostScsi%u", &ctrlr_num) != 1) { + SPDK_WARNLOG("Ignoring section that don't match VhostScsi controller template: %s\n", + spdk_conf_section_get_name(sp)); + continue; + } + + name = spdk_conf_section_get_val(sp, "Name"); + cpumask_str = spdk_conf_section_get_val(sp, "Cpumask"); + if (cpumask_str == NULL) { + cpumask = spdk_app_get_core_mask(); + } else if (spdk_vhost_parse_core_mask(cpumask_str, &cpumask)) { + SPDK_ERRLOG("Error parsing cpumask while creating controller\n"); + return -1; + } + + if (spdk_vhost_scsi_ctrlr_construct(name, cpumask) < 0) { + return -1; + } + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + snprintf(dev_name, sizeof(dev_name), "Dev%d", i); + lun_name = spdk_conf_section_get_val(sp, dev_name); + if (lun_name == NULL) { + continue; + } + + if (spdk_vhost_scsi_ctrlr_add_dev(name, i, lun_name) < 0) { + return -1; + } + } + + sp = spdk_conf_next_section(sp); + + } + + return 0; +} + +static uint32_t +spdk_vhost_scsi_allocate_reactor(uint64_t cpumask) +{ + uint32_t i, selected_core; + uint32_t min_ctrlrs; + + cpumask &= spdk_app_get_core_mask(); + + if (cpumask == 0) { + return 0; + } + + min_ctrlrs = INT_MAX; + selected_core = 0; + + for (i = 0; i < RTE_MAX_LCORE && i < 64; i++) { + if (!((1ULL << i) & cpumask)) { + continue; + } + + if (g_num_ctrlrs[i] < min_ctrlrs) { + 
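+			/* Remember the allowed core with the fewest controllers so far. */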
selected_core = i; + min_ctrlrs = g_num_ctrlrs[i]; + } + } + + g_num_ctrlrs[selected_core]++; + return selected_core; +} + +/* + * A new device is added to a data core. First the device is added to the main linked list + * and then allocated to a specific data core. + */ +static int +new_device(int vid) +{ + struct virtio_net *dev = vhost_devices[vid]; + struct spdk_vhost_scsi_ctrlr *vdev = NULL; + struct spdk_event *event; + sem_t added; + uint32_t i; + + vdev = spdk_vhost_scsi_ctrlr_find(dev->ifname); + if (vdev == NULL) { + SPDK_ERRLOG("Controller %s not found.\n", dev->ifname); + return -1; + } + + if (vdev->lcore != -1) { + SPDK_ERRLOG("Controller %s already connected.\n", dev->ifname); + return -1; + } + + dpdk_vid_mapping[vid] = vdev; + vdev->dev = dev; + + /* Disable notifications. */ + for (i = 0; i < dev->num_queues; i++) { + rte_vhost_enable_guest_notification(vid, i, 0); + } + + dev->flags |= VIRTIO_DEV_RUNNING; + vdev->dev = dev; + + vdev->lcore = spdk_vhost_scsi_allocate_reactor(vdev->cpumask); + + event = vhost_sem_event_alloc(vdev->lcore, add_vdev_cb, vdev, &added); + spdk_event_call(event); + if (vhost_sem_timedwait(&added, 1)) + rte_panic("Failed to register new device '%s'\n", vdev->name); + return 0; +} + +/* + * These callback allow devices to be added to the data core when configuration + * has been fully complete. + */ +static const struct virtio_net_device_ops virtio_net_device_ops = { + .new_device = new_device, + .destroy_device = destroy_device, +}; + +static void * +session_start(void *arg) +{ + rte_vhost_driver_session_start(); + return NULL; +} + +void +spdk_vhost_startup(void *arg1, void *arg2) +{ + int ret; + pthread_t tid; + const char *basename = arg1; + + if (basename) { + ret = snprintf(dev_dirname, sizeof(dev_dirname) - 2, "%s", basename); + if ((size_t)ret >= sizeof(dev_dirname) - 2) + rte_exit(EXIT_FAILURE, "Char dev dir path length %d is too long\n", ret); + + if (dev_dirname[ret - 1] != '/') { + dev_dirname[ret] = '/'; + dev_dirname[ret + 1] = '\0'; + } + } + + ret = spdk_vhost_scsi_controller_construct(); + if (ret != 0) + rte_exit(EXIT_FAILURE, "Cannot construct vhost controllers\n"); + + rte_vhost_driver_callback_register(&virtio_net_device_ops); + + if (pthread_create(&tid, NULL, &session_start, NULL) < 0) + rte_panic("Failed to start session poller thread (%d): %s", errno, strerror(errno)); + pthread_detach(tid); +} + +static void * +session_shutdown(void *arg) +{ + struct spdk_vhost_scsi_ctrlr *vdev = NULL; + int i; + + for (i = 0; i < MAX_SCSI_CTRLRS; i++) { + vdev = spdk_vhost_ctrlrs[i]; + if (vdev == NULL) { + continue; + } + rte_vhost_driver_unregister(vdev->name); + } + + SPDK_NOTICELOG("Exiting\n"); + spdk_app_stop(0); + return NULL; +} + +/* + * When we receive a INT signal. Execute shutdown in separate thread to avoid deadlock. 
diff --git a/lib/vhost/vhost_rpc.c b/lib/vhost/vhost_rpc.c
new file mode 100644
index 000000000..493b19be2
--- /dev/null
+++ b/lib/vhost/vhost_rpc.c
@@ -0,0 +1,215 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +#include + +#include "spdk_internal/log.h" +#include "spdk/rpc.h" + +#include "spdk/vhost.h" +#include "task.h" + +static void +json_scsi_dev_write(struct spdk_json_write_ctx *ctx, struct spdk_scsi_dev *dev) +{ + int l; + + spdk_json_write_name(ctx, "id"); + spdk_json_write_int32(ctx, (int32_t)dev->id); + + spdk_json_write_name(ctx, "device_name"); + spdk_json_write_string(ctx, dev->name); + + spdk_json_write_name(ctx, "luns"); + spdk_json_write_array_begin(ctx); + for (l = 0; l < dev->maxlun; l++) { + if (NULL == dev->lun[l]) + continue; + + spdk_json_write_object_begin(ctx); + + spdk_json_write_name(ctx, "id"); + spdk_json_write_int32(ctx, (int32_t)dev->lun[l]->id); + + spdk_json_write_name(ctx, "name"); + spdk_json_write_string(ctx, dev->lun[l]->name); + + spdk_json_write_object_end(ctx); + } + spdk_json_write_array_end(ctx); +} + +static void +spdk_rpc_get_vhost_scsi_controllers(struct spdk_jsonrpc_server_conn *conn, + const struct spdk_json_val *params, + const struct spdk_json_val *id) +{ + struct spdk_json_write_ctx *w; + struct spdk_vhost_scsi_ctrlr *ctrlr = NULL; + struct spdk_scsi_dev *dev; + uint32_t i; + char buf[32]; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(conn, id, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "get_vhost_scsi_controllers requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(conn, id); + spdk_json_write_array_begin(w); + while ((ctrlr = spdk_vhost_scsi_ctrlr_next(ctrlr)) != NULL) { + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "ctrlr"); + spdk_json_write_string(w, spdk_vhost_scsi_ctrlr_get_name(ctrlr)); + + spdk_json_write_name(w, "cpu_mask"); + snprintf(buf, sizeof(buf), "%#" PRIx64, spdk_vhost_scsi_ctrlr_get_cpumask(ctrlr)); + spdk_json_write_string(w, buf); + + spdk_json_write_name(w, "scsi_devs"); + spdk_json_write_array_begin(w); + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + dev = spdk_vhost_scsi_ctrlr_get_dev(ctrlr, i); + if (!dev) + continue; + + spdk_json_write_object_begin(w); + spdk_json_write_name(w, "scsi_dev_num"); + spdk_json_write_uint32(w, i); + json_scsi_dev_write(w, dev); + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); // devs + + spdk_json_write_object_end(w); // ctrl + } + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(conn, w); + return; +} +SPDK_RPC_REGISTER("get_vhost_scsi_controllers", spdk_rpc_get_vhost_scsi_controllers) + +struct rpc_vhost_scsi_ctrlr { + char *ctrlr; + char *cpumask; +}; + +static const struct spdk_json_object_decoder rpc_construct_vhost_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_vhost_scsi_ctrlr, ctrlr), spdk_json_decode_string }, + {"cpumask", offsetof(struct rpc_vhost_scsi_ctrlr, cpumask), spdk_json_decode_string, true}, +}; + +static void +spdk_rpc_construct_vhost_scsi_controller(struct spdk_jsonrpc_server_conn *conn, + const struct spdk_json_val *params, + const struct spdk_json_val *id) +{ + struct rpc_vhost_scsi_ctrlr req = {0}; + struct spdk_json_write_ctx *w; + int rc; + uint64_t cpumask; + + if (spdk_json_decode_object(params, rpc_construct_vhost_ctrlr, + sizeof(rpc_construct_vhost_ctrlr) / sizeof(*rpc_construct_vhost_ctrlr), + &req)) { + SPDK_TRACELOG(SPDK_TRACE_DEBUG, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + cpumask = spdk_app_get_core_mask(); + if (req.cpumask != NULL && spdk_vhost_parse_core_mask(req.cpumask, &cpumask)) { + rc = -EINVAL; + goto invalid; + } + + rc = spdk_vhost_scsi_ctrlr_construct(req.ctrlr, cpumask); + if (rc < 0) { + goto invalid; + } + + w = 
spdk_jsonrpc_begin_result(conn, id); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(conn, w); + return; +invalid: + spdk_jsonrpc_send_error_response(conn, id, SPDK_JSONRPC_ERROR_INVALID_PARAMS, strerror(-rc)); +} +SPDK_RPC_REGISTER("construct_vhost_scsi_controller", spdk_rpc_construct_vhost_scsi_controller) + +struct rpc_add_vhost_scsi_ctrlr_lun { + char *ctrlr; + uint32_t scsi_dev_num; + char *lun_name; +}; + +static const struct spdk_json_object_decoder rpc_vhost_add_lun[] = { + {"ctrlr", offsetof(struct rpc_add_vhost_scsi_ctrlr_lun, ctrlr), spdk_json_decode_string }, + {"scsi_dev_num", offsetof(struct rpc_add_vhost_scsi_ctrlr_lun, scsi_dev_num), spdk_json_decode_uint32}, + {"lun_name", offsetof(struct rpc_add_vhost_scsi_ctrlr_lun, lun_name), spdk_json_decode_string }, +}; + +static void +spdk_rpc_add_vhost_scsi_lun(struct spdk_jsonrpc_server_conn *conn, + const struct spdk_json_val *params, + const struct spdk_json_val *id) +{ + struct rpc_add_vhost_scsi_ctrlr_lun req = {0}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_vhost_add_lun, + sizeof(rpc_vhost_add_lun) / sizeof(*rpc_vhost_add_lun), + &req)) { + SPDK_TRACELOG(SPDK_TRACE_DEBUG, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + rc = spdk_vhost_scsi_ctrlr_add_dev(req.ctrlr, req.scsi_dev_num, req.lun_name); + if (rc < 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(conn, id); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(conn, w); + return; +invalid: + spdk_jsonrpc_send_error_response(conn, id, SPDK_JSONRPC_ERROR_INVALID_PARAMS, strerror(-rc)); +} +SPDK_RPC_REGISTER("add_vhost_scsi_lun", spdk_rpc_add_vhost_scsi_lun) diff --git a/mk/spdk.app.mk b/mk/spdk.app.mk index 30eeb9e17..5ae15f80d 100644 --- a/mk/spdk.app.mk +++ b/mk/spdk.app.mk @@ -35,12 +35,13 @@ # separately and wrapped in whole-archive linker args SPDK_RPC_LIB_LIST = $(filter %_rpc,$(SPDK_LIB_LIST)) -# Currently the iscsi, net, and scsi libraries contain their respective RPC methods +# Currently some libraries contain their respective RPC methods # rather than breaking them out into separate libraries. So we must also include # these directories in the RPC library list. SPDK_RPC_LIB_LIST += $(filter iscsi,$(SPDK_LIB_LIST)) SPDK_RPC_LIB_LIST += $(filter net,$(SPDK_LIB_LIST)) SPDK_RPC_LIB_LIST += $(filter scsi,$(SPDK_LIB_LIST)) +SPDK_RPC_LIB_LIST += $(filter vhost,$(SPDK_LIB_LIST)) SPDK_REMAINING_LIB_LIST = $(filter-out $(SPDK_RPC_LIB_LIST),$(SPDK_LIB_LIST)) diff --git a/scripts/check_format.sh b/scripts/check_format.sh index 689e7382e..a81239ed8 100755 --- a/scripts/check_format.sh +++ b/scripts/check_format.sh @@ -12,9 +12,13 @@ if hash astyle; then echo -n "Checking coding style..." rm -f astyle.log touch astyle.log - astyle --options=.astylerc "*.c" >> astyle.log + # Exclude rte_vhost code imported from DPDK - we want to keep the original code + # as-is to enable ongoing work to synch with a generic upstream DPDK vhost library, + # rather than making diffs more complicated by a lot of changes to follow SPDK + # coding standards. 
+ astyle --options=.astylerc "*.c" --exclude="rte_vhost" >> astyle.log astyle --options=.astylerc --exclude=test/cpp_headers "*.cpp" >> astyle.log - astyle --options=.astylerc "*.h" >> astyle.log + astyle --options=.astylerc "*.h" --exclude="rte_vhost" >> astyle.log if grep -q "^Formatted" astyle.log; then echo " errors detected" git diff diff --git a/scripts/rpc.py b/scripts/rpc.py index fd97f14a7..7c0055904 100755 --- a/scripts/rpc.py +++ b/scripts/rpc.py @@ -417,15 +417,16 @@ p = subparsers.add_parser('get_vhost_scsi_controllers', help='List vhost control p.set_defaults(func=get_vhost_scsi_controllers) def construct_vhost_scsi_controller(args): - params = { - 'ctrlr': args.ctrlr, - 'cpumask': args.cpu_mask - } + params = {'ctrlr': args.ctrlr} + + if args.cpumask: + params['cpumask'] = args.cpumask + jsonrpc_call('construct_vhost_scsi_controller', params) p = subparsers.add_parser('construct_vhost_scsi_controller', help='Add new vhost controller') -p.add_argument('ctrlr', help='conntroller name') -p.add_argument('cpumask', help='cpu mask for this controller') +p.add_argument('ctrlr', help='controller name') +p.add_argument('--cpumask', help='cpu mask for this controller') p.set_defaults(func=construct_vhost_scsi_controller) def add_vhost_scsi_lun(args): diff --git a/test/vhost/ext4test/ext4connect.sh b/test/vhost/ext4test/ext4connect.sh new file mode 100755 index 000000000..6c0980d47 --- /dev/null +++ b/test/vhost/ext4test/ext4connect.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash + +testdir=$(readlink -f $(dirname $0)) +rootdir=$testdir/../../.. +source $rootdir/scripts/autotest_common.sh + +script='shopt -s nullglob; \ + for entry in /sys/block/sd*; do \ + disk_type="$(cat $entry/device/vendor)"; \ + if [[ $disk_type == Intel* ]] || [[ $disk_type == RAWSCSI* ]] || [[ $disk_type == LIO-ORG* ]]; then \ + fname=$(basename $entry); \ + echo -n "$fname "; \ + fi; \ + done' + +devs="$(echo "$script" | bash -s)" + +timing_enter ext4test + +trap "exit 1" SIGINT SIGTERM EXIT + +for dev in $devs; do + mkfs.ext4 -F /dev/$dev + mkdir -p /mnt/${dev}dir + mount -o sync /dev/$dev /mnt/${dev}dir + rsync -qav --exclude=".git" $rootdir/ /mnt/${dev}dir/spdk + sleep 2 + make -C /mnt/${dev}dir/spdk -j8 clean + make -C /mnt/${dev}dir/spdk -j8 + + # Print out space consumed on target device to help decide + # if/when we need to increase the size of the malloc LUN + df -h /dev/$dev + rm -rf /mnt/${dev}dir/spdk +done + +for dev in $devs; do + umount /mnt/${dev}dir + rm -rf /mnt/${dev}dir + + stats=( $(cat /sys/block/$dev/stat) ) + echo "" + echo "$dev stats" + printf "READ IO cnt: % 8u merges: % 8u sectors: % 8u ticks: % 8u\n" \ + ${stats[0]} ${stats[1]} ${stats[2]} ${stats[3]} + printf "WRITE IO cnt: % 8u merges: % 8u sectors: % 8u ticks: % 8u\n" \ + ${stats[4]} ${stats[5]} ${stats[6]} ${stats[7]} + printf "in flight: % 8u io ticks: % 8u time in queue: % 8u\n" \ + ${stats[8]} ${stats[9]} ${stats[10]} + echo "" +done + +trap - SIGINT SIGTERM EXIT + +timing_exit ext4test diff --git a/test/vhost/ext4test/ext4start.sh b/test/vhost/ext4test/ext4start.sh new file mode 100755 index 000000000..283df48e9 --- /dev/null +++ b/test/vhost/ext4test/ext4start.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash + +testdir=$(readlink -f $(dirname $0)) +rootdir=$testdir/../../.. 
+source $rootdir/scripts/autotest_common.sh + +if [ -z "$VM_IMG" ]; then + echo "VM_IMG: path to qcow2 image not provided - not running" + exit 1 +fi + +if [ -z "$VM_QEMU" ]; then + echo "VM_QEMU: path to qemu binary not provided - not running" + exit 1 +fi + +HOST_IP=192.168.122.1 +VM_IP=192.168.122.254 +VM_UNAME="root" +VM_PASS="root" +VM_NAME="ext4test_vm" +VM_NET_NAME="test_net" +VM_MAC="02:de:ad:de:ad:01" +VM_BAK_IMG="/tmp/ext4test_backing.img" +TIMEO=60 +SSHCMD="sshpass -p $VM_PASS ssh" +SCPCMD="sshpass -p $VM_PASS scp" + +function cleanup_virsh() { + virsh destroy $VM_NAME + virsh net-destroy $VM_NET_NAME + rm $VM_BAK_IMG +} + +timing_enter ext4test + +qemu-img create -f qcow2 -o backing_file=$VM_IMG $VM_BAK_IMG + +cp $testdir/spdk_vm_base.xml $testdir/spdk_vm.xml +cp $testdir/spdk_vnet_base.xml $testdir/spdk_vnet.xml + +sed -i "s@@$VM_NAME@g" $testdir/spdk_vm.xml +sed -i "s@source file=''@source file='$VM_BAK_IMG'@g" $testdir/spdk_vm.xml +sed -i "s@@$VM_QEMU@g" $testdir/spdk_vm.xml +sed -i "s@@$VM_NET_NAME@g" $testdir/spdk_vnet.xml + +trap "cleanup_virsh; killprocess $pid; exit 1" SIGINT SIGTERM EXIT + +virsh net-create $testdir/spdk_vnet.xml + +# Change directory and ownership because virsh has issues with +# paths that are in /root tree +cd /tmp +$rootdir/app/vhost/vhost -c $testdir/vhost.conf & +pid=$! +echo "Process pid: $pid" +sleep 10 +chmod 777 /tmp/naa.123 + +tar --exclude '.git' --exclude 'spdk.tgz' --exclude '*.d' --exclude '*.o' -zcf /tmp/spdk_host.tgz $rootdir + +virsh create $testdir/spdk_vm.xml +virsh net-update $VM_NET_NAME add ip-dhcp-host "" + +# Wait for VM to boot, disable trap temporarily +# so that we don't exit on first fail +echo "Trying to connect to virtual machine..." +trap - SIGINT SIGTERM EXIT +set +xe +rc=-1 +while [[ $TIMEO -gt 0 && rc -ne 0 ]]; do + $SSHCMD root@$VM_IP -q -oStrictHostKeyChecking=no 'echo Hello' + rc=$? + ((TIMEO-=1)) +done +set -xe +trap "cleanup_virsh; killprocess $pid; exit 1" SIGINT SIGTERM EXIT + +if [[ $TIMEO -eq 0 || rc -ne 0 ]]; then + echo "VM did not boot properly, exiting" + exit 1 +fi + +$SSHCMD root@$VM_IP 'mkdir -p /tmp/spdk' +$SCPCMD -r /tmp/spdk_host.tgz root@$VM_IP:/tmp/spdk +$SSHCMD root@$VM_IP 'cd /tmp/spdk; tar xf spdk_host.tgz' +$SSHCMD root@$VM_IP '/tmp/spdk/test/vhost/ext4test/ext4connect.sh' + +#read -p "Hit enter to exit..." + +trap - SIGINT SIGTERM EXIT + +cleanup_virsh +rm $testdir/spdk_vm.xml +rm $testdir/spdk_vnet.xml +killprocess $pid +timing_exit ext4test diff --git a/test/vhost/ext4test/spdk_vm_base.xml b/test/vhost/ext4test/spdk_vm_base.xml new file mode 100644 index 000000000..4df40a3a0 --- /dev/null +++ b/test/vhost/ext4test/spdk_vm_base.xml @@ -0,0 +1,69 @@ + + + + 2 + 2 + 4 + + hvm + + + + + + + + + + + + destroy + restart + destroy + + + + + + + +
[remainder of test/vhost/ext4test/spdk_vm_base.xml: libvirt domain device definitions; element markup not preserved in this copy]