spdk/module/bdev/delay/vbdev_delay.c

/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2019 Intel Corporation.
 * All rights reserved.
 * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "vbdev_delay.h"

#include "spdk/rpc.h"
#include "spdk/env.h"
#include "spdk/endian.h"
#include "spdk/string.h"
#include "spdk/thread.h"
#include "spdk/util.h"
#include "spdk/bdev_module.h"
#include "spdk/log.h"

static int vbdev_delay_init(void);
static int vbdev_delay_get_ctx_size(void);
static void vbdev_delay_examine(struct spdk_bdev *bdev);
static void vbdev_delay_finish(void);
static int vbdev_delay_config_json(struct spdk_json_write_ctx *w);
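
/* Module interface: init/fini hooks, the per-I/O context size, the examine callback,
 * and the config dump used to recreate the current delay bdevs over RPC.
 */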
static struct spdk_bdev_module delay_if = {
	.name = "delay",
	.module_init = vbdev_delay_init,
	.get_ctx_size = vbdev_delay_get_ctx_size,
	.examine_config = vbdev_delay_examine,
	.module_fini = vbdev_delay_finish,
	.config_json = vbdev_delay_config_json
};

SPDK_BDEV_MODULE_REGISTER(delay, &delay_if)

/* Associative list to be used in examine */
struct bdev_association {
	char *vbdev_name;
	char *bdev_name;
	uint64_t avg_read_latency;
	uint64_t p99_read_latency;
	uint64_t avg_write_latency;
	uint64_t p99_write_latency;
	TAILQ_ENTRY(bdev_association) link;
};
static TAILQ_HEAD(, bdev_association) g_bdev_associations = TAILQ_HEAD_INITIALIZER(
			g_bdev_associations);

/* List of virtual bdevs and associated info for each. */
struct vbdev_delay {
	struct spdk_bdev *base_bdev; /* the thing we're attaching to */
	struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */
	struct spdk_bdev delay_bdev; /* the delay virtual bdev */
	uint64_t average_read_latency_ticks; /* the average read delay */
	uint64_t p99_read_latency_ticks; /* the p99 read delay */
	uint64_t average_write_latency_ticks; /* the average write delay */
	uint64_t p99_write_latency_ticks; /* the p99 write delay */
	TAILQ_ENTRY(vbdev_delay) link;
	struct spdk_thread *thread; /* thread where base device is opened */
};
static TAILQ_HEAD(, vbdev_delay) g_delay_nodes = TAILQ_HEAD_INITIALIZER(g_delay_nodes);

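/* Per-I/O context; lives in each bdev_io's driver_ctx area (its size is reported to
 * the bdev layer by vbdev_delay_get_ctx_size()). Records which delay list the I/O
 * belongs on and the tick at which it becomes due.
 */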
struct delay_bdev_io {
	int status;
	uint64_t completion_tick;
	enum delay_io_type type;
	struct spdk_io_channel *ch;
	struct spdk_bdev_io_wait_entry bdev_io_wait;
	struct spdk_bdev_io *zcopy_bdev_io;
	STAILQ_ENTRY(delay_bdev_io) link;
};

struct delay_io_channel {
	struct spdk_io_channel *base_ch; /* IO channel of base device */
	STAILQ_HEAD(, delay_bdev_io) avg_read_io;
	STAILQ_HEAD(, delay_bdev_io) p99_read_io;
	STAILQ_HEAD(, delay_bdev_io) avg_write_io;
	STAILQ_HEAD(, delay_bdev_io) p99_write_io;
	struct spdk_poller *io_poller;
	unsigned int rand_seed;
};

static void vbdev_delay_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io);

/* Callback for unregistering the IO device. */
static void
_device_unregister_cb(void *io_device)
{
	struct vbdev_delay *delay_node = io_device;

	/* Done with this delay_node. */
	free(delay_node->delay_bdev.name);
	free(delay_node);
}

static void
_vbdev_delay_destruct(void *ctx)
{
	struct spdk_bdev_desc *desc = ctx;

	spdk_bdev_close(desc);
}

static int
vbdev_delay_destruct(void *ctx)
{
	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;

	/* It is important to follow this exact sequence of steps for destroying
	 * a vbdev...
	 */

	TAILQ_REMOVE(&g_delay_nodes, delay_node, link);

	/* Unclaim the underlying bdev. */
	spdk_bdev_module_release_bdev(delay_node->base_bdev);

	/* Close the underlying bdev on its same opened thread. */
	if (delay_node->thread && delay_node->thread != spdk_get_thread()) {
		spdk_thread_send_msg(delay_node->thread, _vbdev_delay_destruct, delay_node->base_desc);
	} else {
		spdk_bdev_close(delay_node->base_desc);
	}

	/* Unregister the io_device. */
	spdk_io_device_unregister(delay_node, _device_unregister_cb);

	return 0;
}

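/* Walk one delay list and complete every I/O whose due tick has passed, stopping at
 * the first entry that is not yet due. Returns the number of completions so the
 * poller can report busy vs. idle.
 */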
static int
_process_io_stailq(void *arg, uint64_t ticks)
{
	STAILQ_HEAD(, delay_bdev_io) *head = arg;
	struct delay_bdev_io *io_ctx, *tmp;
	int completions = 0;

	STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) {
		if (io_ctx->completion_tick <= ticks) {
			STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link);
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), io_ctx->status);
			completions++;
		} else {
			/* In the general case, I/O will become ready in FIFO order. When latencies are
			 * dynamically changed, this is not necessarily the case. However, the normal
			 * behavior will be restored after the I/O outstanding at the time of the change
			 * have completed. This essentially means that moving from a high to a low latency
			 * creates a dam for the new I/O submitted after the latency change. This is
			 * considered desirable behavior for the use case where we are trying to trigger
			 * a pre-defined timeout on an initiator.
			 */
			break;
		}
	}

	return completions;
}

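/* Per-channel poller: drain all four delay lists against the current tick count. */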
static int
_delay_finish_io(void *arg)
{
	struct delay_io_channel *delay_ch = arg;
	uint64_t ticks = spdk_get_ticks();
	int completions = 0;

	completions += _process_io_stailq(&delay_ch->avg_read_io, ticks);
	completions += _process_io_stailq(&delay_ch->avg_write_io, ticks);
	completions += _process_io_stailq(&delay_ch->p99_read_io, ticks);
	completions += _process_io_stailq(&delay_ch->p99_write_io, ticks);

	return completions == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

/* Completion callback for I/O that was issued from this bdev. The original bdev_io
 * is passed in as an arg so we'll complete that one with the appropriate status
 * and then free the one that this module issued.
 */
static void
_delay_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *orig_io = cb_arg;
	struct vbdev_delay *delay_node = SPDK_CONTAINEROF(orig_io->bdev, struct vbdev_delay, delay_bdev);
	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)orig_io->driver_ctx;
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);

	io_ctx->status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_ZCOPY && bdev_io->u.bdev.zcopy.start && success) {
		io_ctx->zcopy_bdev_io = bdev_io;
	} else {
		assert(io_ctx->zcopy_bdev_io == NULL || io_ctx->zcopy_bdev_io == bdev_io);
		io_ctx->zcopy_bdev_io = NULL;
		spdk_bdev_free_io(bdev_io);
	}

	/* Put the I/O into the proper list for processing by the channel poller. */
	switch (io_ctx->type) {
	case DELAY_AVG_READ:
		io_ctx->completion_tick = spdk_get_ticks() + delay_node->average_read_latency_ticks;
		STAILQ_INSERT_TAIL(&delay_ch->avg_read_io, io_ctx, link);
		break;
	case DELAY_AVG_WRITE:
		io_ctx->completion_tick = spdk_get_ticks() + delay_node->average_write_latency_ticks;
		STAILQ_INSERT_TAIL(&delay_ch->avg_write_io, io_ctx, link);
		break;
	case DELAY_P99_READ:
		io_ctx->completion_tick = spdk_get_ticks() + delay_node->p99_read_latency_ticks;
		STAILQ_INSERT_TAIL(&delay_ch->p99_read_io, io_ctx, link);
		break;
	case DELAY_P99_WRITE:
		io_ctx->completion_tick = spdk_get_ticks() + delay_node->p99_write_latency_ticks;
		STAILQ_INSERT_TAIL(&delay_ch->p99_write_io, io_ctx, link);
		break;
	case DELAY_NONE:
	default:
		spdk_bdev_io_complete(orig_io, io_ctx->status);
		break;
	}
}

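/* Retry path: when the base bdev returns -ENOMEM, park the I/O on its wait queue and
 * resubmit it from the start once the base bdev has resources again.
 */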
static void
vbdev_delay_resubmit_io(void *arg)
{
	struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg;
	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;

	vbdev_delay_submit_request(io_ctx->ch, bdev_io);
}

static void
vbdev_delay_queue_io(struct spdk_bdev_io *bdev_io)
{
	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
	int rc;

	io_ctx->bdev_io_wait.bdev = bdev_io->bdev;
	io_ctx->bdev_io_wait.cb_fn = vbdev_delay_resubmit_io;
	io_ctx->bdev_io_wait.cb_arg = bdev_io;

	rc = spdk_bdev_queue_io_wait(bdev_io->bdev, delay_ch->base_ch, &io_ctx->bdev_io_wait);
	if (rc != 0) {
		SPDK_ERRLOG("Queue io failed in vbdev_delay_queue_io, rc=%d.\n", rc);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

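/* Reads first pass through spdk_bdev_io_get_buf(); this callback runs once data
 * buffers are available and forwards the read to the base bdev.
 */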
static void
delay_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
{
	struct vbdev_delay *delay_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_delay,
					 delay_bdev);
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
	int rc;

	if (!success) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	rc = spdk_bdev_readv_blocks_ext(delay_node->base_desc, delay_ch->base_ch, bdev_io->u.bdev.iovs,
					bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
					bdev_io->u.bdev.num_blocks, _delay_complete_io,
					bdev_io, bdev_io->u.bdev.ext_opts);

	if (rc == -ENOMEM) {
		SPDK_ERRLOG("No memory, start to queue io for delay.\n");
		vbdev_delay_queue_io(bdev_io);
	} else if (rc != 0) {
		SPDK_ERRLOG("ERROR on bdev_io submission!\n");
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

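/* Completion callback for spdk_for_each_channel(): every channel has aborted its
 * delayed I/O, so the reset can now be forwarded to the base bdev.
 */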
static void
vbdev_delay_reset_dev(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);
	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
	struct vbdev_delay *delay_node = spdk_io_channel_iter_get_io_device(i);
	int rc;

	rc = spdk_bdev_reset(delay_node->base_desc, delay_ch->base_ch,
			     _delay_complete_io, bdev_io);

	if (rc == -ENOMEM) {
		SPDK_ERRLOG("No memory, start to queue io for delay.\n");
		vbdev_delay_queue_io(bdev_io);
	} else if (rc != 0) {
		SPDK_ERRLOG("ERROR on bdev_io submission!\n");
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
abort_zcopy_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
}

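/* Abort every I/O currently parked on one delay list, rolling back any held zcopy
 * starts along the way.
 */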
static void
_abort_all_delayed_io(void *arg)
{
	STAILQ_HEAD(, delay_bdev_io) *head = arg;
	struct delay_bdev_io *io_ctx, *tmp;

	STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) {
		STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link);
		if (io_ctx->zcopy_bdev_io != NULL) {
			spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, false, abort_zcopy_io, NULL);
		}
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), SPDK_BDEV_IO_STATUS_ABORTED);
	}
}

static void
vbdev_delay_reset_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);

	_abort_all_delayed_io(&delay_ch->avg_read_io);
	_abort_all_delayed_io(&delay_ch->avg_write_io);
	_abort_all_delayed_io(&delay_ch->p99_read_io);
	_abort_all_delayed_io(&delay_ch->p99_write_io);

	spdk_for_each_channel_continue(i, 0);
}

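/* Try to find the victim I/O on one of this channel's delay lists; if it has not yet
 * reached the base bdev, the abort can be completed entirely within this module.
 * Otherwise the abort is forwarded down the stack.
 */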
static bool
abort_delayed_io(void *_head, struct spdk_bdev_io *bio_to_abort)
{
	STAILQ_HEAD(, delay_bdev_io) *head = _head;
	struct delay_bdev_io *io_ctx_to_abort = (struct delay_bdev_io *)bio_to_abort->driver_ctx;
	struct delay_bdev_io *io_ctx;

	STAILQ_FOREACH(io_ctx, head, link) {
		if (io_ctx == io_ctx_to_abort) {
			STAILQ_REMOVE(head, io_ctx_to_abort, delay_bdev_io, link);
			if (io_ctx->zcopy_bdev_io != NULL) {
				spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, false, abort_zcopy_io, NULL);
			}
			spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
			return true;
		}
	}

	return false;
}

static int
vbdev_delay_abort(struct vbdev_delay *delay_node, struct delay_io_channel *delay_ch,
		  struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;

	if (abort_delayed_io(&delay_ch->avg_read_io, bio_to_abort) ||
	    abort_delayed_io(&delay_ch->avg_write_io, bio_to_abort) ||
	    abort_delayed_io(&delay_ch->p99_read_io, bio_to_abort) ||
	    abort_delayed_io(&delay_ch->p99_write_io, bio_to_abort)) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;
	}

	return spdk_bdev_abort(delay_node->base_desc, delay_ch->base_ch, bio_to_abort,
			       _delay_complete_io, bdev_io);
}

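/* Main submission entry point. Roughly 1 in 100 I/O (chosen via rand_r) is tagged with
 * the p99 latency; the rest get the average latency. The delay itself is applied in
 * _delay_complete_io once the base bdev finishes the I/O.
 */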
static void
vbdev_delay_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct vbdev_delay *delay_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_delay, delay_bdev);
	struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
	struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
	int rc = 0;
	bool is_p99;

	is_p99 = rand_r(&delay_ch->rand_seed) % 100 == 0 ? true : false;

	io_ctx->ch = ch;
	io_ctx->type = DELAY_NONE;
	if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY || bdev_io->u.bdev.zcopy.start) {
		io_ctx->zcopy_bdev_io = NULL;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		io_ctx->type = is_p99 ? DELAY_P99_READ : DELAY_AVG_READ;
		spdk_bdev_io_get_buf(bdev_io, delay_read_get_buf_cb,
				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		io_ctx->type = is_p99 ? DELAY_P99_WRITE : DELAY_AVG_WRITE;
		rc = spdk_bdev_writev_blocks_ext(delay_node->base_desc, delay_ch->base_ch, bdev_io->u.bdev.iovs,
						 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
						 bdev_io->u.bdev.num_blocks, _delay_complete_io,
						 bdev_io, bdev_io->u.bdev.ext_opts);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		rc = spdk_bdev_write_zeroes_blocks(delay_node->base_desc, delay_ch->base_ch,
						   bdev_io->u.bdev.offset_blocks,
						   bdev_io->u.bdev.num_blocks,
						   _delay_complete_io, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		rc = spdk_bdev_unmap_blocks(delay_node->base_desc, delay_ch->base_ch,
					    bdev_io->u.bdev.offset_blocks,
					    bdev_io->u.bdev.num_blocks,
					    _delay_complete_io, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_FLUSH:
		rc = spdk_bdev_flush_blocks(delay_node->base_desc, delay_ch->base_ch,
					    bdev_io->u.bdev.offset_blocks,
					    bdev_io->u.bdev.num_blocks,
					    _delay_complete_io, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_RESET:
		/* During reset, the generic bdev layer aborts all new I/Os and queues all new resets.
		 * Hence we can simply abort all I/Os delayed to complete.
		 */
		spdk_for_each_channel(delay_node, vbdev_delay_reset_channel, bdev_io,
				      vbdev_delay_reset_dev);
		break;
	case SPDK_BDEV_IO_TYPE_ABORT:
		rc = vbdev_delay_abort(delay_node, delay_ch, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		if (bdev_io->u.bdev.zcopy.commit) {
			io_ctx->type = is_p99 ? DELAY_P99_WRITE : DELAY_AVG_WRITE;
		} else if (bdev_io->u.bdev.zcopy.populate) {
			io_ctx->type = is_p99 ? DELAY_P99_READ : DELAY_AVG_READ;
		}
		if (bdev_io->u.bdev.zcopy.start) {
			rc = spdk_bdev_zcopy_start(delay_node->base_desc, delay_ch->base_ch,
						   bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						   bdev_io->u.bdev.offset_blocks,
						   bdev_io->u.bdev.num_blocks,
						   bdev_io->u.bdev.zcopy.populate,
						   _delay_complete_io, bdev_io);
		} else {
			rc = spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, bdev_io->u.bdev.zcopy.commit,
						 _delay_complete_io, bdev_io);
		}
		break;
	default:
		SPDK_ERRLOG("delay: unknown I/O type %d\n", bdev_io->type);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	if (rc == -ENOMEM) {
		SPDK_ERRLOG("No memory, start to queue io for delay.\n");
		vbdev_delay_queue_io(bdev_io);
	} else if (rc != 0) {
		SPDK_ERRLOG("ERROR on bdev_io submission!\n");
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

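/* Capability and channel queries: supported I/O types mirror whatever the base bdev
 * supports, and channels come from the io_device registered per delay node.
 */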
static bool
vbdev_delay_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;

	return spdk_bdev_io_type_supported(delay_node->base_bdev, io_type);
}

static struct spdk_io_channel *
vbdev_delay_get_io_channel(void *ctx)
{
	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;
	struct spdk_io_channel *delay_ch = NULL;

	delay_ch = spdk_get_io_channel(delay_node);

	return delay_ch;
}

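/* Helper shared by the info and config JSON paths; converts the stored tick counts
 * back to microseconds for display.
 */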
static void
_delay_write_conf_values(struct vbdev_delay *delay_node, struct spdk_json_write_ctx *w)
{
	spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&delay_node->delay_bdev));
	spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(delay_node->base_bdev));
	spdk_json_write_named_int64(w, "avg_read_latency",
				    delay_node->average_read_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
	spdk_json_write_named_int64(w, "p99_read_latency",
				    delay_node->p99_read_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
	spdk_json_write_named_int64(w, "avg_write_latency",
				    delay_node->average_write_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
	spdk_json_write_named_int64(w, "p99_write_latency",
				    delay_node->p99_write_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
}

static int
vbdev_delay_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;

	spdk_json_write_name(w, "delay");
	spdk_json_write_object_begin(w);
	_delay_write_conf_values(delay_node, w);
	spdk_json_write_object_end(w);

	return 0;
}

/* This is used to generate JSON that can configure this module to its current state. */
static int
vbdev_delay_config_json(struct spdk_json_write_ctx *w)
{
	struct vbdev_delay *delay_node;

	TAILQ_FOREACH(delay_node, &g_delay_nodes, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_delay_create");
		spdk_json_write_named_object_begin(w, "params");
		_delay_write_conf_values(delay_node, w);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}

	return 0;
}

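/* Each node is emitted as one replayable RPC object, e.g. (names and values
 * illustrative, latencies in microseconds):
 *   {
 *     "method": "bdev_delay_create",
 *     "params": {
 *       "name": "delay0",
 *       "base_bdev_name": "Null0",
 *       "avg_read_latency": 100,
 *       "p99_read_latency": 1000,
 *       "avg_write_latency": 100,
 *       "p99_write_latency": 1000
 *     }
 *   }
 */
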
/* We provide this callback for the SPDK channel code to create a channel using
 * the channel struct we provided in our module get_io_channel() entry point. Here
 * we get and save off an underlying base channel of the device below us so that
 * we can communicate with the base bdev on a per channel basis. We also register
 * the per-channel poller that completes delayed I/O once their latency has elapsed.
 */
static int
delay_bdev_ch_create_cb(void *io_device, void *ctx_buf)
{
	struct delay_io_channel *delay_ch = ctx_buf;
	struct vbdev_delay *delay_node = io_device;

	STAILQ_INIT(&delay_ch->avg_read_io);
	STAILQ_INIT(&delay_ch->p99_read_io);
	STAILQ_INIT(&delay_ch->avg_write_io);
	STAILQ_INIT(&delay_ch->p99_write_io);

	delay_ch->io_poller = SPDK_POLLER_REGISTER(_delay_finish_io, delay_ch, 0);
	delay_ch->base_ch = spdk_bdev_get_io_channel(delay_node->base_desc);
	delay_ch->rand_seed = time(NULL);

	return 0;
}

/* We provide this callback for the SPDK channel code to destroy a channel
 * created with our create callback. We just need to undo anything we did
 * when we created: unregister the channel poller and release the base channel.
 */
static void
delay_bdev_ch_destroy_cb(void *io_device, void *ctx_buf)
{
	struct delay_io_channel *delay_ch = ctx_buf;

	spdk_poller_unregister(&delay_ch->io_poller);
	spdk_put_io_channel(delay_ch->base_ch);
}

/* Create the delay association from the bdev and vbdev name and insert
 * on the global list.
 */
static int
vbdev_delay_insert_association(const char *bdev_name, const char *vbdev_name,
			       uint64_t avg_read_latency, uint64_t p99_read_latency,
			       uint64_t avg_write_latency, uint64_t p99_write_latency)
{
	struct bdev_association *assoc;

	TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
		if (strcmp(vbdev_name, assoc->vbdev_name) == 0) {
			SPDK_ERRLOG("delay bdev %s already exists\n", vbdev_name);
			return -EEXIST;
		}
	}

	assoc = calloc(1, sizeof(struct bdev_association));
	if (!assoc) {
		SPDK_ERRLOG("could not allocate bdev_association\n");
		return -ENOMEM;
	}

	assoc->bdev_name = strdup(bdev_name);
	if (!assoc->bdev_name) {
		SPDK_ERRLOG("could not allocate assoc->bdev_name\n");
		free(assoc);
		return -ENOMEM;
	}

	assoc->vbdev_name = strdup(vbdev_name);
	if (!assoc->vbdev_name) {
		SPDK_ERRLOG("could not allocate assoc->vbdev_name\n");
		free(assoc->bdev_name);
		free(assoc);
		return -ENOMEM;
	}

	assoc->avg_read_latency = avg_read_latency;
	assoc->p99_read_latency = p99_read_latency;
	assoc->avg_write_latency = avg_write_latency;
	assoc->p99_write_latency = p99_write_latency;

	TAILQ_INSERT_TAIL(&g_bdev_associations, assoc, link);

	return 0;
}

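/* Convert a latency given in microseconds to TSC ticks and store it on the named
 * delay node. For example, on a 3 GHz TSC, spdk_get_ticks_hz() is 3000000000, so
 * ticks_mhz is 3000 ticks per microsecond and a 100 us latency becomes 300000 ticks.
 */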
int
vbdev_delay_update_latency_value(char *delay_name, uint64_t latency_us, enum delay_io_type type)
{
	struct vbdev_delay *delay_node;
	uint64_t ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;

	TAILQ_FOREACH(delay_node, &g_delay_nodes, link) {
		if (strcmp(delay_node->delay_bdev.name, delay_name) == 0) {
			break;
		}
	}

	if (delay_node == NULL) {
		return -ENODEV;
	}

	switch (type) {
	case DELAY_AVG_READ:
		delay_node->average_read_latency_ticks = ticks_mhz * latency_us;
		break;
	case DELAY_AVG_WRITE:
		delay_node->average_write_latency_ticks = ticks_mhz * latency_us;
		break;
	case DELAY_P99_READ:
		delay_node->p99_read_latency_ticks = ticks_mhz * latency_us;
		break;
	case DELAY_P99_WRITE:
		delay_node->p99_write_latency_ticks = ticks_mhz * latency_us;
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

static int
vbdev_delay_init(void)
{
	/* Not allowing for .ini style configuration. */
	return 0;
}

static void
vbdev_delay_finish(void)
{
	struct bdev_association *assoc;

	while ((assoc = TAILQ_FIRST(&g_bdev_associations))) {
		TAILQ_REMOVE(&g_bdev_associations, assoc, link);
		free(assoc->bdev_name);
		free(assoc->vbdev_name);
		free(assoc);
	}
}

static int
vbdev_delay_get_ctx_size(void)
{
	return sizeof(struct delay_bdev_io);
}

static void
vbdev_delay_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	/* No config per bdev needed */
}

static int
vbdev_delay_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
{
	struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;

	/* Delay bdev doesn't work with data buffers, so it supports any memory domain used by base_bdev */
	return spdk_bdev_get_memory_domains(delay_node->base_bdev, domains, array_size);
}

/* When we register our bdev this is how we specify our entry points. */
static const struct spdk_bdev_fn_table vbdev_delay_fn_table = {
	.destruct = vbdev_delay_destruct,
	.submit_request = vbdev_delay_submit_request,
	.io_type_supported = vbdev_delay_io_type_supported,
	.get_io_channel = vbdev_delay_get_io_channel,
	.dump_info_json = vbdev_delay_dump_info_json,
	.write_config_json = vbdev_delay_write_config_json,
	.get_memory_domains = vbdev_delay_get_memory_domains,
};

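/* Hot-remove handling: unregister every delay vbdev stacked on the disappearing
 * base bdev.
 */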
static void
vbdev_delay_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find)
{
	struct vbdev_delay *delay_node, *tmp;

	TAILQ_FOREACH_SAFE(delay_node, &g_delay_nodes, link, tmp) {
		if (bdev_find == delay_node->base_bdev) {
			spdk_bdev_unregister(&delay_node->delay_bdev, NULL, NULL);
		}
	}
}

/* Called when the underlying base bdev triggers an asynchronous event such as bdev removal. */
static void
vbdev_delay_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
			       void *event_ctx)
{
	switch (type) {
	case SPDK_BDEV_EVENT_REMOVE:
		vbdev_delay_base_bdev_hotremove_cb(bdev);
		break;
	default:
		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
		break;
	}
}

/* Create and register the delay vbdev if we find it in our list of bdev names.
 * This can be called either by the examine path or RPC method.
 */
static int
vbdev_delay_register(const char *bdev_name)
{
	struct bdev_association *assoc;
	struct vbdev_delay *delay_node;
	struct spdk_bdev *bdev;
	uint64_t ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
	int rc = 0;

	/* Check our list of names from config versus this bdev and if
	 * there's a match, create the delay_node & bdev accordingly.
	 */
	TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
		if (strcmp(assoc->bdev_name, bdev_name) != 0) {
			continue;
		}

		delay_node = calloc(1, sizeof(struct vbdev_delay));
		if (!delay_node) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate delay_node\n");
			break;
		}
		delay_node->delay_bdev.name = strdup(assoc->vbdev_name);
		if (!delay_node->delay_bdev.name) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate delay_bdev name\n");
			free(delay_node);
			break;
		}
		delay_node->delay_bdev.product_name = "delay";

		/* The base bdev that we're attaching to. */
		rc = spdk_bdev_open_ext(bdev_name, true, vbdev_delay_base_bdev_event_cb,
					NULL, &delay_node->base_desc);
		if (rc) {
			if (rc != -ENODEV) {
				SPDK_ERRLOG("could not open bdev %s\n", bdev_name);
			}
			free(delay_node->delay_bdev.name);
			free(delay_node);
			break;
		}

		bdev = spdk_bdev_desc_get_bdev(delay_node->base_desc);
		delay_node->base_bdev = bdev;

		delay_node->delay_bdev.write_cache = bdev->write_cache;
		delay_node->delay_bdev.required_alignment = bdev->required_alignment;
		delay_node->delay_bdev.optimal_io_boundary = bdev->optimal_io_boundary;
		delay_node->delay_bdev.blocklen = bdev->blocklen;
		delay_node->delay_bdev.blockcnt = bdev->blockcnt;

		delay_node->delay_bdev.ctxt = delay_node;
		delay_node->delay_bdev.fn_table = &vbdev_delay_fn_table;
		delay_node->delay_bdev.module = &delay_if;

		/* Store the number of ticks you need to add to get the I/O expiration time. */
		delay_node->average_read_latency_ticks = ticks_mhz * assoc->avg_read_latency;
		delay_node->p99_read_latency_ticks = ticks_mhz * assoc->p99_read_latency;
		delay_node->average_write_latency_ticks = ticks_mhz * assoc->avg_write_latency;
		delay_node->p99_write_latency_ticks = ticks_mhz * assoc->p99_write_latency;

		spdk_io_device_register(delay_node, delay_bdev_ch_create_cb, delay_bdev_ch_destroy_cb,
					sizeof(struct delay_io_channel),
					assoc->vbdev_name);

		/* Save the thread where the base device is opened */
		delay_node->thread = spdk_get_thread();

		rc = spdk_bdev_module_claim_bdev(bdev, delay_node->base_desc, delay_node->delay_bdev.module);
		if (rc) {
			SPDK_ERRLOG("could not claim bdev %s\n", bdev_name);
			goto error_close;
		}

		rc = spdk_bdev_register(&delay_node->delay_bdev);
		if (rc) {
			SPDK_ERRLOG("could not register delay_bdev\n");
			spdk_bdev_module_release_bdev(delay_node->base_bdev);
			goto error_close;
		}

		TAILQ_INSERT_TAIL(&g_delay_nodes, delay_node, link);
	}

	return rc;

error_close:
	spdk_bdev_close(delay_node->base_desc);
	spdk_io_device_unregister(delay_node, NULL);
	free(delay_node->delay_bdev.name);
	free(delay_node);
	return rc;
}

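/* Public entry point used by the bdev_delay_create RPC. Latencies are given in
 * microseconds; a p99 latency below the corresponding average is rejected up front.
 */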
int
create_delay_disk(const char *bdev_name, const char *vbdev_name, uint64_t avg_read_latency,
		  uint64_t p99_read_latency, uint64_t avg_write_latency, uint64_t p99_write_latency)
{
	int rc = 0;

	if (p99_read_latency < avg_read_latency || p99_write_latency < avg_write_latency) {
		SPDK_ERRLOG("Unable to create a delay bdev where p99 latency is less than average latency.\n");
		return -EINVAL;
	}

	rc = vbdev_delay_insert_association(bdev_name, vbdev_name, avg_read_latency, p99_read_latency,
					    avg_write_latency, p99_write_latency);
	if (rc) {
		return rc;
	}

	rc = vbdev_delay_register(bdev_name);
	if (rc == -ENODEV) {
		/* This is not an error, we tracked the name above and it still
		 * may show up later.
		 */
		SPDK_NOTICELOG("vbdev creation deferred pending base bdev arrival\n");
		rc = 0;
	}

	return rc;
}

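/* Public delete entry point: unregister the vbdev and, on success, drop its name
 * association so the examine path will not recreate it.
 */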
void
delete_delay_disk(const char *vbdev_name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
{
	struct bdev_association *assoc;
	int rc;

	rc = spdk_bdev_unregister_by_name(vbdev_name, &delay_if, cb_fn, cb_arg);
	if (rc == 0) {
		TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
			if (strcmp(assoc->vbdev_name, vbdev_name) == 0) {
				TAILQ_REMOVE(&g_bdev_associations, assoc, link);
				free(assoc->bdev_name);
				free(assoc->vbdev_name);
				free(assoc);
				break;
			}
		}
	} else {
		cb_fn(cb_arg, rc);
	}
}

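/* examine_config callback: every newly registered bdev is offered to this module so a
 * deferred delay vbdev can be created as soon as its base arrives.
 */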
static void
vbdev_delay_examine(struct spdk_bdev *bdev)
{
	vbdev_delay_register(bdev->name);

	spdk_bdev_module_examine_done(&delay_if);
}

SPDK_LOG_REGISTER_COMPONENT(vbdev_delay)