diff --git a/lib/bdev/uring/Makefile b/lib/bdev/uring/Makefile new file mode 100644 index 000000000..53cc7324b --- /dev/null +++ b/lib/bdev/uring/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = bdev_uring.c +LIBNAME = bdev_uring +LOCAL_SYS_LIBS = -luring + +ifneq ($(strip $(CONFIG_URING_PATH)),) +CFLAGS += -I$(CONFIG_URING_PATH) +LDFLAGS += -L$(CONFIG_URING_PATH) +endif + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/lib/bdev/uring/bdev_uring.c b/lib/bdev/uring/bdev_uring.c new file mode 100644 index 000000000..97ee8449f --- /dev/null +++ b/lib/bdev/uring/bdev_uring.c @@ -0,0 +1,577 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "bdev_uring.h" + +#include "spdk/stdinc.h" + +#include "spdk/barrier.h" +#include "spdk/bdev.h" +#include "spdk/conf.h" +#include "spdk/env.h" +#include "spdk/fd.h" +#include "spdk/likely.h" +#include "spdk/thread.h" +#include "spdk/json.h" +#include "spdk/util.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#include + +struct bdev_uring_io_channel { + struct bdev_uring_group_channel *group_ch; +}; + +struct bdev_uring_group_channel { + uint64_t io_inflight; + uint64_t io_pending; + struct spdk_poller *poller; + struct io_uring uring; +}; + +struct bdev_uring_task { + uint64_t len; + struct bdev_uring_io_channel *ch; + TAILQ_ENTRY(bdev_uring_task) link; +}; + +struct bdev_uring { + struct spdk_bdev bdev; + char *filename; + int fd; + TAILQ_ENTRY(bdev_uring) link; +}; + +static int bdev_uring_init(void); +static void bdev_uring_fini(void); +static void uring_free_bdev(struct bdev_uring *uring); +static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head; + +#define SPDK_URING_QUEUE_DEPTH 512 +#define MAX_EVENTS_PER_POLL 32 + +static int +bdev_uring_get_ctx_size(void) +{ + return sizeof(struct bdev_uring_task); +} + +static struct spdk_bdev_module uring_if = { + .name = "uring", + .module_init = bdev_uring_init, + .module_fini = bdev_uring_fini, + .config_text = NULL, + .get_ctx_size = bdev_uring_get_ctx_size, +}; + +SPDK_BDEV_MODULE_REGISTER(uring, &uring_if) + +static int +bdev_uring_open(struct bdev_uring *bdev) +{ + int fd; + + fd = open(bdev->filename, O_NOATIME | O_DIRECT); + if (fd < 0) { + SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n", + bdev->filename, errno, spdk_strerror(errno)); + bdev->fd = -1; + return -1; + } + + bdev->fd = fd; + + return 0; +} + +static int +bdev_uring_close(struct bdev_uring *bdev) +{ + int rc; + + if (bdev->fd == -1) { + return 0; + } + + rc = close(bdev->fd); + if (rc < 0) { + SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n", + bdev->fd, errno, spdk_strerror(errno)); + return -1; + } + + bdev->fd = -1; + + return 0; +} + +static int64_t +bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch, + struct bdev_uring_task *uring_task, + struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) +{ + struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); + struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; + struct io_uring_sqe *sqe; + + sqe = io_uring_get_sqe(&group_ch->uring); + io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset); + io_uring_sqe_set_data(sqe, uring_task); + uring_task->len = nbytes; + uring_task->ch = uring_ch; + + SPDK_DEBUGLOG(SPDK_LOG_URING, "read %d iovs size %lu to off: %#lx\n", + iovcnt, nbytes, offset); + + group_ch->io_pending++; + return nbytes; +} + +static int64_t +bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch, + struct bdev_uring_task *uring_task, + struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset) +{ + struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); + struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; + struct io_uring_sqe *sqe; + + sqe = io_uring_get_sqe(&group_ch->uring); + io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset); + io_uring_sqe_set_data(sqe, uring_task); + uring_task->ch = uring_ch; + + SPDK_DEBUGLOG(SPDK_LOG_URING, "write %d iovs size %lu from off: %#lx\n", + iovcnt, nbytes, offset); + + group_ch->io_pending++; + return nbytes; +} + +static int +bdev_uring_destruct(void *ctx) +{ + struct bdev_uring *uring = ctx; + int rc = 0; + + TAILQ_REMOVE(&g_uring_bdev_head, uring, link); + rc = bdev_uring_close(uring); + if (rc < 0) { + SPDK_ERRLOG("bdev_uring_close() failed\n"); + } + spdk_io_device_unregister(uring, NULL); + uring_free_bdev(uring); + return rc; +} + +static int +bdev_uring_reap(struct io_uring *ring, int max) +{ + int i, count, ret; + struct io_uring_cqe *cqe; + struct bdev_uring_task *uring_task; + enum spdk_bdev_io_status status; + + count = 0; + for (i = 0; i < max; i++) { + ret = io_uring_get_completion(ring, &cqe); + if (ret != 0) { + return ret; + } + + if (cqe == NULL) { + return count; + } + + uring_task = (struct bdev_uring_task *)cqe->user_data; + if (cqe->res != (signed)uring_task->len) { + status = SPDK_BDEV_IO_STATUS_FAILED; + } else { + status = SPDK_BDEV_IO_STATUS_SUCCESS; + } + + uring_task->ch->group_ch->io_inflight--; + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status); + count++; + } + + return count; +} + +static int +bdev_uring_group_poll(void *arg) +{ + struct bdev_uring_group_channel *group_ch = arg; + int to_complete, to_submit; + int count, ret; + + to_submit = group_ch->io_pending; + to_complete = group_ch->io_inflight; + + ret = 0; + if (to_submit > 0) { + /* If there are I/O to submit, use io_uring_submit here. + * It will automatically call io_uring_enter appropriately. */ + ret = io_uring_submit(&group_ch->uring); + group_ch->io_pending = 0; + group_ch->io_inflight += to_submit; + } else if (to_complete > 0) { + /* If there are I/O in flight but none to submit, we need to + * call io_uring_enter ourselves. */ + ret = io_uring_enter(group_ch->uring.ring_fd, 0, 0, + IORING_ENTER_GETEVENTS, NULL); + } + + if (ret < 0) { + return 1; + } + + count = 0; + if (to_complete > 0) { + count = bdev_uring_reap(&group_ch->uring, to_complete); + } + + return (count + to_submit); +} + +static void bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, + bool success) +{ + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt, + ch, + (struct bdev_uring_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt, + ch, + (struct bdev_uring_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); + break; + default: + SPDK_ERRLOG("Wrong io type\n"); + break; + } +} + +static int _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + /* Read and write operations must be performed on buffers aligned to + * bdev->required_alignment. If user specified unaligned buffers, + * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */ + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + return 0; + default: + return -1; + } +} + +static void bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + if (_bdev_uring_submit_request(ch, bdev_io) < 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static bool +bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + return true; + default: + return false; + } +} + +static int +bdev_uring_create_cb(void *io_device, void *ctx_buf) +{ + struct bdev_uring_io_channel *ch = ctx_buf; + + ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if)); + + return 0; +} + +static void +bdev_uring_destroy_cb(void *io_device, void *ctx_buf) +{ + struct bdev_uring_io_channel *ch = ctx_buf; + + spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); +} + +static struct spdk_io_channel * +bdev_uring_get_io_channel(void *ctx) +{ + struct bdev_uring *uring = ctx; + + return spdk_get_io_channel(uring); +} + + +static const struct spdk_bdev_fn_table uring_fn_table = { + .destruct = bdev_uring_destruct, + .submit_request = bdev_uring_submit_request, + .io_type_supported = bdev_uring_io_type_supported, + .get_io_channel = bdev_uring_get_io_channel, +}; + +static void uring_free_bdev(struct bdev_uring *uring) +{ + if (uring == NULL) { + return; + } + free(uring->filename); + free(uring->bdev.name); + free(uring); +} + +static int +bdev_uring_group_create_cb(void *io_device, void *ctx_buf) +{ + struct bdev_uring_group_channel *ch = ctx_buf; + + if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, IORING_SETUP_IOPOLL) < 0) { + SPDK_ERRLOG("uring I/O context setup failure\n"); + return -1; + } + + ch->poller = spdk_poller_register(bdev_uring_group_poll, ch, 0); + return 0; +} + +static void +bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf) +{ + struct bdev_uring_group_channel *ch = ctx_buf; + + close(ch->uring.ring_fd); + io_uring_queue_exit(&ch->uring); + + spdk_poller_unregister(&ch->poller); +} + +struct spdk_bdev * +create_uring_bdev(const char *name, const char *filename) +{ + struct bdev_uring *uring; + uint32_t block_size; + uint64_t bdev_size; + int rc; + + uring = calloc(1, sizeof(*uring)); + if (!uring) { + SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n"); + return NULL; + } + + uring->filename = strdup(filename); + if (!uring->filename) { + goto error_return; + } + + if (bdev_uring_open(uring)) { + SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno); + goto error_return; + } + + bdev_size = spdk_fd_get_size(uring->fd); + + uring->bdev.name = strdup(name); + if (!uring->bdev.name) { + goto error_return; + } + uring->bdev.product_name = "URING bdev"; + uring->bdev.module = &uring_if; + + uring->bdev.write_cache = 1; + + block_size = spdk_fd_get_blocklen(uring->fd); + if (block_size == 0) { + SPDK_ERRLOG("Block size could not be auto-detected\n"); + goto error_return; + } + + if (block_size < 512) { + SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); + goto error_return; + } + + if (!spdk_u32_is_pow2(block_size)) { + SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); + goto error_return; + } + + uring->bdev.blocklen = block_size; + uring->bdev.required_alignment = spdk_u32log2(block_size); + + if (bdev_size % uring->bdev.blocklen != 0) { + SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", + bdev_size, uring->bdev.blocklen); + goto error_return; + } + + uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen; + uring->bdev.ctxt = uring; + + uring->bdev.fn_table = &uring_fn_table; + + spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb, + sizeof(struct bdev_uring_io_channel), + uring->bdev.name); + rc = spdk_bdev_register(&uring->bdev); + if (rc) { + spdk_io_device_unregister(uring, NULL); + goto error_return; + } + + TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link); + return &uring->bdev; + +error_return: + bdev_uring_close(uring); + uring_free_bdev(uring); + return NULL; +} + +struct delete_uring_bdev_ctx { + spdk_delete_uring_complete cb_fn; + void *cb_arg; +}; + +static void +uring_bdev_unregister_cb(void *arg, int bdeverrno) +{ + struct delete_uring_bdev_ctx *ctx = arg; + + ctx->cb_fn(ctx->cb_arg, bdeverrno); + free(ctx); +} + +void +delete_uring_bdev(struct spdk_bdev *bdev, spdk_delete_uring_complete cb_fn, void *cb_arg) +{ + struct delete_uring_bdev_ctx *ctx; + + if (!bdev || bdev->module != &uring_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + spdk_bdev_unregister(bdev, uring_bdev_unregister_cb, ctx); +} + +static int +bdev_uring_init(void) +{ + size_t i; + struct spdk_conf_section *sp; + struct spdk_bdev *bdev; + + TAILQ_INIT(&g_uring_bdev_head); + spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb, + sizeof(struct bdev_uring_group_channel), + "uring_module"); + + sp = spdk_conf_find_section(NULL, "URING"); + if (!sp) { + return 0; + } + + i = 0; + while (true) { + const char *file; + const char *name; + + file = spdk_conf_section_get_nmval(sp, "URING", i, 0); + if (!file) { + break; + } + + name = spdk_conf_section_get_nmval(sp, "URING", i, 1); + if (!name) { + SPDK_ERRLOG("No name provided for URING bdev with file %s\n", file); + i++; + continue; + } + + bdev = create_uring_bdev(name, file); + if (!bdev) { + SPDK_ERRLOG("Unable to create URING bdev from file %s\n", file); + i++; + continue; + } + + i++; + } + + return 0; +} + +static void +bdev_uring_fini(void) +{ + spdk_io_device_unregister(&uring_if, NULL); +} + +SPDK_LOG_REGISTER_COMPONENT("uring", SPDK_LOG_URING) diff --git a/lib/bdev/uring/bdev_uring.h b/lib/bdev/uring/bdev_uring.h new file mode 100644 index 000000000..fb3a653c8 --- /dev/null +++ b/lib/bdev/uring/bdev_uring.h @@ -0,0 +1,50 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_URING_H +#define SPDK_BDEV_URING_H + +#include "spdk/stdinc.h" + +#include "spdk/queue.h" +#include "spdk/bdev.h" + +#include "spdk/bdev_module.h" + +typedef void (*spdk_delete_uring_complete)(void *cb_arg, int bdeverrno); + +struct spdk_bdev *create_uring_bdev(const char *name, const char *filename); + +void delete_uring_bdev(struct spdk_bdev *bdev, spdk_delete_uring_complete cb_fn, void *cb_arg); + +#endif /* SPDK_BDEV_URING_H */ diff --git a/mk/spdk.modules.mk b/mk/spdk.modules.mk index 4813f57a8..3a23280b2 100644 --- a/mk/spdk.modules.mk +++ b/mk/spdk.modules.mk @@ -61,6 +61,15 @@ SYS_LIBS += -L/usr/lib64/iscsi -liscsi endif endif +ifeq ($(CONFIG_URING),y) +BLOCKDEV_MODULES_LIST += bdev_uring +SYS_LIBS += -luring +ifneq ($(strip $(CONFIG_URING_PATH)),) +CFLAGS += -I$(CONFIG_URING_PATH) +LDFLAGS += -L$(CONFIG_URING_PATH) +endif +endif + ifeq ($(CONFIG_RBD),y) BLOCKDEV_MODULES_LIST += bdev_rbd SYS_LIBS += -lrados -lrbd