Spdk/module/bdev/uring/bdev_uring.c
Maciej Szwed eb05cbd677 pollers: Fix pollers to return correct busy status
A poller should return a status > 0 when it did some work
(i.e., CPU time was actually spent), marking its call as busy
CPU time.

Active pollers should return the BUSY status only if they
did meaningful work beyond checking a few conditions
(e.g., processing requests or performing complicated operations).
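
As a minimal illustration of the convention (the poller below and its helper are
hypothetical and not part of this patch; only SPDK_POLLER_BUSY and
SPDK_POLLER_IDLE come from SPDK):

static int
example_poller(void *arg)
{
        struct example_ctx *ctx = arg;                 /* hypothetical context */
        int count = example_process_completions(ctx);  /* hypothetical helper doing real work */

        /* Report busy only if something was actually processed; otherwise idle. */
        return (count > 0) ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}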

Signed-off-by: Maciej Szwed <maciej.szwed@intel.com>
Change-Id: Id4636a0997489b129cecfe785592cc97b50992ba
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/2164
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Broadcom CI
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
2020-07-07 07:29:31 +00:00


/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "bdev_uring.h"

#include "spdk/stdinc.h"
#include "spdk/barrier.h"
#include "spdk/bdev.h"
#include "spdk/conf.h"
#include "spdk/env.h"
#include "spdk/fd.h"
#include "spdk/likely.h"
#include "spdk/thread.h"
#include "spdk/json.h"
#include "spdk/util.h"
#include "spdk/string.h"

#include "spdk_internal/log.h"
#include "spdk_internal/uring.h"

struct bdev_uring_io_channel {
        struct bdev_uring_group_channel *group_ch;
};

struct bdev_uring_group_channel {
        uint64_t io_inflight;
        uint64_t io_pending;
        struct spdk_poller *poller;
        struct io_uring uring;
};

struct bdev_uring_task {
        uint64_t len;
        struct bdev_uring_io_channel *ch;
        TAILQ_ENTRY(bdev_uring_task) link;
};

struct bdev_uring {
        struct spdk_bdev bdev;
        char *filename;
        int fd;
        TAILQ_ENTRY(bdev_uring) link;
};

static int bdev_uring_init(void);
static void bdev_uring_fini(void);
static void uring_free_bdev(struct bdev_uring *uring);
static void bdev_uring_get_spdk_running_config(FILE *fp);

static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head;

#define SPDK_URING_QUEUE_DEPTH 512
#define MAX_EVENTS_PER_POLL 32

static int
bdev_uring_get_ctx_size(void)
{
        return sizeof(struct bdev_uring_task);
}

static struct spdk_bdev_module uring_if = {
        .name = "uring",
        .module_init = bdev_uring_init,
        .module_fini = bdev_uring_fini,
        .config_text = bdev_uring_get_spdk_running_config,
        .get_ctx_size = bdev_uring_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(uring, &uring_if)
static int
bdev_uring_open(struct bdev_uring *bdev)
{
        int fd;

        fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME);
        if (fd < 0) {
                /* Try without O_DIRECT for non-disk files */
                fd = open(bdev->filename, O_RDWR | O_NOATIME);
                if (fd < 0) {
                        SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
                                    bdev->filename, errno, spdk_strerror(errno));
                        bdev->fd = -1;
                        return -1;
                }
        }

        bdev->fd = fd;

        return 0;
}

static int
bdev_uring_close(struct bdev_uring *bdev)
{
        int rc;

        if (bdev->fd == -1) {
                return 0;
        }

        rc = close(bdev->fd);
        if (rc < 0) {
                SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
                            bdev->fd, errno, spdk_strerror(errno));
                return -1;
        }

        bdev->fd = -1;

        return 0;
}
static int64_t
bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch,
                 struct bdev_uring_task *uring_task,
                 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
{
        struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
        struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(&group_ch->uring);
        io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset);
        io_uring_sqe_set_data(sqe, uring_task);
        uring_task->len = nbytes;
        uring_task->ch = uring_ch;

        SPDK_DEBUGLOG(SPDK_LOG_URING, "read %d iovs size %lu to off: %#lx\n",
                      iovcnt, nbytes, offset);

        group_ch->io_pending++;
        return nbytes;
}

static int64_t
bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch,
                  struct bdev_uring_task *uring_task,
                  struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset)
{
        struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
        struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(&group_ch->uring);
        io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset);
        io_uring_sqe_set_data(sqe, uring_task);
        uring_task->len = nbytes;
        uring_task->ch = uring_ch;

        SPDK_DEBUGLOG(SPDK_LOG_URING, "write %d iovs size %lu from off: %#lx\n",
                      iovcnt, nbytes, offset);

        group_ch->io_pending++;
        return nbytes;
}
static int
bdev_uring_destruct(void *ctx)
{
        struct bdev_uring *uring = ctx;
        int rc = 0;

        TAILQ_REMOVE(&g_uring_bdev_head, uring, link);
        rc = bdev_uring_close(uring);
        if (rc < 0) {
                SPDK_ERRLOG("bdev_uring_close() failed\n");
        }
        spdk_io_device_unregister(uring, NULL);
        uring_free_bdev(uring);
        return rc;
}

static int
bdev_uring_reap(struct io_uring *ring, int max)
{
        int i, count, ret;
        struct io_uring_cqe *cqe;
        struct bdev_uring_task *uring_task;
        enum spdk_bdev_io_status status;

        count = 0;
        for (i = 0; i < max; i++) {
                ret = io_uring_peek_cqe(ring, &cqe);
                if (ret != 0) {
                        return ret;
                }

                if (cqe == NULL) {
                        return count;
                }

                uring_task = (struct bdev_uring_task *)cqe->user_data;
                if (cqe->res != (signed)uring_task->len) {
                        status = SPDK_BDEV_IO_STATUS_FAILED;
                } else {
                        status = SPDK_BDEV_IO_STATUS_SUCCESS;
                }

                uring_task->ch->group_ch->io_inflight--;
                io_uring_cqe_seen(ring, cqe);
                spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status);
                count++;
        }

        return count;
}
static int
bdev_uring_group_poll(void *arg)
{
        struct bdev_uring_group_channel *group_ch = arg;
        int to_complete, to_submit;
        int count, ret;

        to_submit = group_ch->io_pending;
        to_complete = group_ch->io_inflight;
        ret = 0;

        if (to_submit > 0) {
                /* If there are I/O to submit, use io_uring_submit here.
                 * It will automatically call spdk_io_uring_enter appropriately. */
                ret = io_uring_submit(&group_ch->uring);
                group_ch->io_pending = 0;
                group_ch->io_inflight += to_submit;
        } else if (to_complete > 0) {
                /* If there are I/O in flight but none to submit, we need to
                 * call io_uring_enter ourselves. */
                ret = spdk_io_uring_enter(group_ch->uring.ring_fd, 0, 0,
                                          IORING_ENTER_GETEVENTS);
        }

        if (ret < 0) {
                return SPDK_POLLER_BUSY;
        }

        count = 0;
        if (to_complete > 0) {
                count = bdev_uring_reap(&group_ch->uring, to_complete);
        }

        if (count + to_submit > 0) {
                return SPDK_POLLER_BUSY;
        } else {
                return SPDK_POLLER_IDLE;
        }
}
static void
bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
                      bool success)
{
        if (!success) {
                spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
                return;
        }

        switch (bdev_io->type) {
        case SPDK_BDEV_IO_TYPE_READ:
                bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt,
                                 ch,
                                 (struct bdev_uring_task *)bdev_io->driver_ctx,
                                 bdev_io->u.bdev.iovs,
                                 bdev_io->u.bdev.iovcnt,
                                 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
                                 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
                break;
        case SPDK_BDEV_IO_TYPE_WRITE:
                bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt,
                                  ch,
                                  (struct bdev_uring_task *)bdev_io->driver_ctx,
                                  bdev_io->u.bdev.iovs,
                                  bdev_io->u.bdev.iovcnt,
                                  bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
                                  bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
                break;
        default:
                SPDK_ERRLOG("Wrong io type\n");
                break;
        }
}

static int
_bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
        switch (bdev_io->type) {
        /* Read and write operations must be performed on buffers aligned to
         * bdev->required_alignment. If user specified unaligned buffers,
         * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
        case SPDK_BDEV_IO_TYPE_READ:
        case SPDK_BDEV_IO_TYPE_WRITE:
                spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb,
                                     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
                return 0;
        default:
                return -1;
        }
}

static void
bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
        if (_bdev_uring_submit_request(ch, bdev_io) < 0) {
                spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
        }
}
static bool
bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
        switch (io_type) {
        case SPDK_BDEV_IO_TYPE_READ:
        case SPDK_BDEV_IO_TYPE_WRITE:
                return true;
        default:
                return false;
        }
}

static int
bdev_uring_create_cb(void *io_device, void *ctx_buf)
{
        struct bdev_uring_io_channel *ch = ctx_buf;

        ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if));

        return 0;
}

static void
bdev_uring_destroy_cb(void *io_device, void *ctx_buf)
{
        struct bdev_uring_io_channel *ch = ctx_buf;

        spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
}

static struct spdk_io_channel *
bdev_uring_get_io_channel(void *ctx)
{
        struct bdev_uring *uring = ctx;

        return spdk_get_io_channel(uring);
}

static int
bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
        struct bdev_uring *uring = ctx;

        spdk_json_write_named_object_begin(w, "uring");
        spdk_json_write_named_string(w, "filename", uring->filename);
        spdk_json_write_object_end(w);

        return 0;
}

static void
bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
        struct bdev_uring *uring = bdev->ctxt;

        spdk_json_write_object_begin(w);

        spdk_json_write_named_string(w, "method", "bdev_uring_create");

        spdk_json_write_named_object_begin(w, "params");
        spdk_json_write_named_string(w, "name", bdev->name);
        spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
        spdk_json_write_named_string(w, "filename", uring->filename);
        spdk_json_write_object_end(w);

        spdk_json_write_object_end(w);
}

static const struct spdk_bdev_fn_table uring_fn_table = {
        .destruct = bdev_uring_destruct,
        .submit_request = bdev_uring_submit_request,
        .io_type_supported = bdev_uring_io_type_supported,
        .get_io_channel = bdev_uring_get_io_channel,
        .dump_info_json = bdev_uring_dump_info_json,
        .write_config_json = bdev_uring_write_json_config,
};

static void
uring_free_bdev(struct bdev_uring *uring)
{
        if (uring == NULL) {
                return;
        }
        free(uring->filename);
        free(uring->bdev.name);
        free(uring);
}
static int
bdev_uring_group_create_cb(void *io_device, void *ctx_buf)
{
        struct bdev_uring_group_channel *ch = ctx_buf;

        if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, IORING_SETUP_IOPOLL) < 0) {
                SPDK_ERRLOG("uring I/O context setup failure\n");
                return -1;
        }

        ch->poller = SPDK_POLLER_REGISTER(bdev_uring_group_poll, ch, 0);
        return 0;
}

static void
bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf)
{
        struct bdev_uring_group_channel *ch = ctx_buf;

        io_uring_queue_exit(&ch->uring);
        spdk_poller_unregister(&ch->poller);
}
struct spdk_bdev *
create_uring_bdev(const char *name, const char *filename, uint32_t block_size)
{
        struct bdev_uring *uring;
        uint32_t detected_block_size;
        uint64_t bdev_size;
        int rc;

        uring = calloc(1, sizeof(*uring));
        if (!uring) {
                SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n");
                return NULL;
        }

        uring->filename = strdup(filename);
        if (!uring->filename) {
                goto error_return;
        }

        if (bdev_uring_open(uring)) {
                SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno);
                goto error_return;
        }

        bdev_size = spdk_fd_get_size(uring->fd);

        uring->bdev.name = strdup(name);
        if (!uring->bdev.name) {
                goto error_return;
        }
        uring->bdev.product_name = "URING bdev";
        uring->bdev.module = &uring_if;

        uring->bdev.write_cache = 1;

        detected_block_size = spdk_fd_get_blocklen(uring->fd);
        if (block_size == 0) {
                /* User did not specify block size - use autodetected block size. */
                if (detected_block_size == 0) {
                        SPDK_ERRLOG("Block size could not be auto-detected\n");
                        goto error_return;
                }
                block_size = detected_block_size;
        } else {
                if (block_size < detected_block_size) {
                        SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
                                    "auto-detected block size %" PRIu32 "\n",
                                    block_size, detected_block_size);
                        goto error_return;
                } else if (detected_block_size != 0 && block_size != detected_block_size) {
                        SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
                                     "auto-detected block size %" PRIu32 "\n",
                                     block_size, detected_block_size);
                }
        }

        if (block_size < 512) {
                SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
                goto error_return;
        }

        if (!spdk_u32_is_pow2(block_size)) {
                SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
                goto error_return;
        }

        uring->bdev.blocklen = block_size;
        uring->bdev.required_alignment = spdk_u32log2(block_size);

        if (bdev_size % uring->bdev.blocklen != 0) {
                SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
                            bdev_size, uring->bdev.blocklen);
                goto error_return;
        }

        uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen;
        uring->bdev.ctxt = uring;

        uring->bdev.fn_table = &uring_fn_table;

        spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb,
                                sizeof(struct bdev_uring_io_channel),
                                uring->bdev.name);
        rc = spdk_bdev_register(&uring->bdev);
        if (rc) {
                spdk_io_device_unregister(uring, NULL);
                goto error_return;
        }

        TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link);
        return &uring->bdev;

error_return:
        bdev_uring_close(uring);
        uring_free_bdev(uring);
        return NULL;
}
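
/*
 * Usage note (illustrative only): a caller such as an RPC handler would
 * typically create and delete uring bdevs through the public helpers defined
 * in this file. The name and filename below are hypothetical; passing a block
 * size of 0 requests auto-detection, as handled above.
 *
 *      struct spdk_bdev *bdev;
 *
 *      bdev = create_uring_bdev("uring0", "/dev/nvme0n1", 0);
 *      if (bdev == NULL) {
 *              ... handle failure; create_uring_bdev() already logged the error ...
 *      }
 */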
struct delete_uring_bdev_ctx {
        spdk_delete_uring_complete cb_fn;
        void *cb_arg;
};

static void
uring_bdev_unregister_cb(void *arg, int bdeverrno)
{
        struct delete_uring_bdev_ctx *ctx = arg;

        ctx->cb_fn(ctx->cb_arg, bdeverrno);
        free(ctx);
}

void
delete_uring_bdev(struct spdk_bdev *bdev, spdk_delete_uring_complete cb_fn, void *cb_arg)
{
        struct delete_uring_bdev_ctx *ctx;

        if (!bdev || bdev->module != &uring_if) {
                cb_fn(cb_arg, -ENODEV);
                return;
        }

        ctx = calloc(1, sizeof(*ctx));
        if (ctx == NULL) {
                cb_fn(cb_arg, -ENOMEM);
                return;
        }

        ctx->cb_fn = cb_fn;
        ctx->cb_arg = cb_arg;
        spdk_bdev_unregister(bdev, uring_bdev_unregister_cb, ctx);
}
static int
bdev_uring_init(void)
{
        size_t i;
        struct spdk_conf_section *sp;
        struct spdk_bdev *bdev;

        TAILQ_INIT(&g_uring_bdev_head);
        spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb,
                                sizeof(struct bdev_uring_group_channel),
                                "uring_module");

        sp = spdk_conf_find_section(NULL, "URING");
        if (!sp) {
                return 0;
        }

        i = 0;
        while (true) {
                const char *file;
                const char *name;
                const char *block_size_str;
                uint32_t block_size = 0;
                long int tmp;

                file = spdk_conf_section_get_nmval(sp, "URING", i, 0);
                if (!file) {
                        break;
                }

                name = spdk_conf_section_get_nmval(sp, "URING", i, 1);
                if (!name) {
                        SPDK_ERRLOG("No name provided for URING bdev with file %s\n", file);
                        i++;
                        continue;
                }

                block_size_str = spdk_conf_section_get_nmval(sp, "URING", i, 2);
                if (block_size_str) {
                        tmp = spdk_strtol(block_size_str, 10);
                        if (tmp < 0) {
                                SPDK_ERRLOG("Invalid block size for URING bdev with file %s\n", file);
                                i++;
                                continue;
                        }
                        block_size = (uint32_t)tmp;
                }

                bdev = create_uring_bdev(name, file, block_size);
                if (!bdev) {
                        SPDK_ERRLOG("Unable to create URING bdev from file %s\n", file);
                        i++;
                        continue;
                }

                i++;
        }

        return 0;
}

static void
bdev_uring_fini(void)
{
        spdk_io_device_unregister(&uring_if, NULL);
}

static void
bdev_uring_get_spdk_running_config(FILE *fp)
{
        char *file;
        char *name;
        uint32_t block_size;
        struct bdev_uring *uring;

        fprintf(fp,
                "\n"
                "# Users must change this section to match the /dev/sdX devices to be\n"
                "# exported as iSCSI LUNs. The devices are accessed using io_uring.\n"
                "# The format is:\n"
                "# URING <file name> <bdev name> [<block size>]\n"
                "# The file name is the backing device\n"
                "# The bdev name can be referenced from elsewhere in the configuration file.\n"
                "# Block size may be omitted to automatically detect the block size of a bdev.\n"
                "[URING]\n");

        TAILQ_FOREACH(uring, &g_uring_bdev_head, link) {
                file = uring->filename;
                name = uring->bdev.name;
                block_size = uring->bdev.blocklen;

                fprintf(fp, " URING %s %s %d\n", file, name, block_size);
        }

        fprintf(fp, "\n");
}

SPDK_LOG_REGISTER_COMPONENT("uring", SPDK_LOG_URING)