diff --git a/etc/spdk/iscsi.conf.in b/etc/spdk/iscsi.conf.in index 6891ebf21..b96edd8a7 100644 --- a/etc/spdk/iscsi.conf.in +++ b/etc/spdk/iscsi.conf.in @@ -120,7 +120,7 @@ # Of course, users can disable offload even it is available. [Malloc] # Number of Malloc targets - NumberOfLuns 1 + NumberOfLuns 3 # Malloc targets are 128M LunSizeInMB 128 # Block size. Default is 512 bytes. @@ -140,6 +140,18 @@ AIO /dev/sdb AIO /dev/sdc +# The Split virtual block device slices block devices into multiple smaller bdevs. +[Split] + # Syntax: + # Split <bdev> <count> [<size_in_megabytes>] + + # Split Malloc1 into two equally-sized portions, Malloc1p0 and Malloc1p1 + Split Malloc1 2 + + # Split Malloc2 into eight 1-megabyte portions, Malloc2p0 ... Malloc2p7, + # leaving the rest of the device inaccessible + Split Malloc2 8 1 + # Users should change the TargetNode section(s) below to match the # desired iSCSI target node configuration. # TargetName, Mapping, LUN0 are minimum required diff --git a/etc/spdk/nvmf.conf.in b/etc/spdk/nvmf.conf.in index 777648690..747e8a56f 100644 --- a/etc/spdk/nvmf.conf.in +++ b/etc/spdk/nvmf.conf.in @@ -76,6 +76,18 @@ # Units in microseconds. AdminPollRate 100000 +# The Split virtual block device slices block devices into multiple smaller bdevs. +[Split] + # Syntax: + # Split <bdev> <count> [<size_in_megabytes>] + + # Split Malloc2 into two equally-sized portions, Malloc2p0 and Malloc2p1 + Split Malloc2 2 + + # Split Malloc3 into eight 1-megabyte portions, Malloc3p0 ... Malloc3p7, + # leaving the rest of the device inaccessible + Split Malloc3 8 1 + # Define an NVMf Subsystem. # - NQN is required and must be unique. # - Core may be set or not. 
If set, the specified subsystem will run on
diff --git a/include/spdk_internal/bdev.h b/include/spdk_internal/bdev.h
index a168b8dee..295c15ab0 100644
--- a/include/spdk_internal/bdev.h
+++ b/include/spdk_internal/bdev.h
@@ -157,6 +157,7 @@ struct spdk_bdev_io *spdk_bdev_get_child_io(struct spdk_bdev_io *parent,
 					    struct spdk_bdev *bdev,
 					    spdk_bdev_io_completion_cb cb,
 					    void *cb_arg);
+void spdk_bdev_io_resubmit(struct spdk_bdev_io *bdev_io, struct spdk_bdev *new_bdev);
 void spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io,
 			   enum spdk_bdev_io_status status);
 
diff --git a/lib/bdev/Makefile b/lib/bdev/Makefile
index 07afabd33..b88285f96 100644
--- a/lib/bdev/Makefile
+++ b/lib/bdev/Makefile
@@ -38,7 +38,7 @@ CFLAGS += $(ENV_CFLAGS) -I.
 C_SRCS = bdev.c
 LIBNAME = bdev
 
-DIRS-y += malloc nvme rpc
+DIRS-y += malloc nvme rpc split
 
 ifeq ($(OS),Linux)
 DIRS-y += aio
diff --git a/lib/bdev/bdev.c b/lib/bdev/bdev.c
index 1c15512a6..0fd59aa0e 100644
--- a/lib/bdev/bdev.c
+++ b/lib/bdev/bdev.c
@@ -430,6 +430,23 @@ spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
 	return 0;
 }
 
+/* Point a still-pending I/O at new_bdev and submit it to that bdev. */
+void
+spdk_bdev_io_resubmit(struct spdk_bdev_io *bdev_io, struct spdk_bdev *new_bdev)
+{
+	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);
+
+	/*
+	 * gencnt and ctx are normally set during spdk_bdev_io_init(); because the bdev
+	 * is being switched, they are re-read from new_bdev along with the bdev pointer.
+	 */
+	bdev_io->ctx = new_bdev->ctxt;
+	bdev_io->gencnt = new_bdev->gencnt;
+	bdev_io->bdev = new_bdev;
+
+	__submit_request(new_bdev, bdev_io);
+}
+
 static void
 spdk_bdev_io_init(struct spdk_bdev_io *bdev_io, struct spdk_bdev *bdev,
 		  void *cb_arg,
diff --git a/lib/bdev/split/Makefile b/lib/bdev/split/Makefile
new file mode 100644
index 000000000..fe81db9ca
--- /dev/null
+++ b/lib/bdev/split/Makefile
@@ -0,0 +1,41 @@
+#
+#  BSD LICENSE
+#
+#  Copyright (c) Intel Corporation.
+#  All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#    * Neither the name of Intel Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+CFLAGS += $(ENV_CFLAGS) -I$(SPDK_ROOT_DIR)/lib/bdev/
+C_SRCS = vbdev_split.c
+LIBNAME = vbdev_split
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/lib/bdev/split/vbdev_split.c b/lib/bdev/split/vbdev_split.c
new file mode 100644
index 000000000..2f83655dd
--- /dev/null
+++ b/lib/bdev/split/vbdev_split.c
@@ -0,0 +1,438 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright (c) Intel Corporation.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This is a simple example of a virtual block device that takes a single
+ * bdev and slices it into multiple smaller bdevs.
+ */
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "spdk/conf.h"
+#include "spdk/endian.h"
+
+#include "spdk_internal/bdev.h"
+#include "spdk_internal/log.h"
+
+/* Base block device split context, reference-counted by the split disks using it */
+struct split_base {
+	struct spdk_bdev	*base_bdev;
+	uint32_t		ref;
+};
+
+/* Context for each split virtual bdev */
+struct split_disk {
+	struct spdk_bdev	disk;
+	struct spdk_bdev	*base_bdev;
+	struct split_base	*split_base;
+	uint64_t		offset_blocks;
+	uint64_t		offset_bytes;
+	TAILQ_ENTRY(split_disk)	tailq;
+};
+
+static TAILQ_HEAD(, split_disk) g_split_disks = TAILQ_HEAD_INITIALIZER(g_split_disks);
+
+/* Shift a read's byte offset by the split's starting offset within the base bdev. */
+static void
+split_read(struct split_disk *split_disk, struct spdk_bdev_io *bdev_io)
+{
+	bdev_io->u.read.offset += split_disk->offset_bytes;
+}
+
+/* Shift a write's byte offset by the split's starting offset within the base bdev. */
+static void
+split_write(struct split_disk *split_disk, struct spdk_bdev_io *bdev_io)
+{
+	bdev_io->u.write.offset += split_disk->offset_bytes;
+}
+
+/* Shift the big-endian LBA of every unmap descriptor by the split's starting block. */
+static void
+split_unmap(struct split_disk *split_disk, struct spdk_bdev_io *bdev_io)
+{
+	uint16_t i;
+	uint64_t lba;
+
+	for (i = 0; i < bdev_io->u.unmap.bdesc_count; i++) {
+		lba = from_be64(&bdev_io->u.unmap.unmap_bdesc[i].lba);
+		lba += split_disk->offset_blocks;
+		to_be64(&bdev_io->u.unmap.unmap_bdesc[i].lba, lba);
+	}
+}
+
+/* Shift a flush's byte offset by the split's starting offset within the base bdev. */
+static void
+split_flush(struct split_disk *split_disk, struct spdk_bdev_io *bdev_io)
+{
+	bdev_io->u.flush.offset += split_disk->offset_bytes;
+}
+
+static void
+split_reset(struct split_disk *split_disk, struct spdk_bdev_io *bdev_io)
+{
+	/*
+	 * No offset to modify for reset - pass the I/O through unmodified.
+	 *
+	 * However, we do need to increment the generation count for the split bdev,
+	 * since the spdk_bdev_io_complete() path that normally updates it will not execute
+	 * after we resubmit the I/O to the base_bdev.
+	 */
+	if (bdev_io->u.reset.type == SPDK_BDEV_RESET_HARD) {
+		split_disk->disk.gencnt++;
+	}
+}
+
+/*
+ * submit_request callback: rebase the I/O onto the base bdev's address space and
+ * resubmit it to the base bdev. Unknown I/O types are completed as failed.
+ */
+static void
+vbdev_split_submit_request(struct spdk_bdev_io *bdev_io)
+{
+	struct split_disk *split_disk = bdev_io->ctx;
+
+	/* Modify the I/O to adjust for the offset within the base bdev. */
+	switch (bdev_io->type) {
+	case SPDK_BDEV_IO_TYPE_READ:
+		split_read(split_disk, bdev_io);
+		break;
+	case SPDK_BDEV_IO_TYPE_WRITE:
+		split_write(split_disk, bdev_io);
+		break;
+	case SPDK_BDEV_IO_TYPE_UNMAP:
+		split_unmap(split_disk, bdev_io);
+		break;
+	case SPDK_BDEV_IO_TYPE_FLUSH:
+		split_flush(split_disk, bdev_io);
+		break;
+	case SPDK_BDEV_IO_TYPE_RESET:
+		split_reset(split_disk, bdev_io);
+		break;
+	default:
+		SPDK_ERRLOG("split: unknown I/O type %d\n", bdev_io->type);
+		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+		return;
+	}
+
+	/* Submit the modified I/O to the underlying bdev. */
+	spdk_bdev_io_resubmit(bdev_io, split_disk->base_bdev);
+}
+
+/* Take a reference on the shared base context and record it in split_disk. */
+static void
+vbdev_split_base_get_ref(struct split_base *split_base, struct split_disk *split_disk)
+{
+	__sync_fetch_and_add(&split_base->ref, 1);
+	split_disk->split_base = split_base;
+}
+
+/* Drop a reference; the last one unclaims the base bdev and frees the context. */
+static void
+vbdev_split_base_put_ref(struct split_base *split_base)
+{
+	if (__sync_sub_and_fetch(&split_base->ref, 1) == 0) {
+		spdk_bdev_unclaim(split_base->base_bdev);
+		free(split_base);
+	}
+}
+
+/* Unlink a split disk from the global list, free it and drop its base reference. */
+static void
+vbdev_split_free(struct split_disk *split_disk)
+{
+	struct split_base *split_base;
+
+	if (!split_disk) {
+		return;
+	}
+
+	split_base = split_disk->split_base;
+
+	TAILQ_REMOVE(&g_split_disks, split_disk, tailq);
+	free(split_disk);
+
+	if (split_base) {
+		vbdev_split_base_put_ref(split_base);
+	}
+}
+
+/* destruct callback: free the split disk that embeds this bdev. Always returns 0. */
+static int
+vbdev_split_destruct(struct spdk_bdev *bdev)
+{
+	struct split_disk *split_disk = (struct split_disk *)bdev;
+
+	vbdev_split_free(split_disk);
+	return 0;
+}
+
+/*
+ * io_type_supported callback: defer to the base bdev. The base bdev's callback is
+ * called with the base bdev itself, not with the split bdev.
+ */
+static bool
+vbdev_split_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
+{
+	struct split_disk *split_disk = (struct split_disk *)bdev;
+	struct spdk_bdev *base_bdev = split_disk->base_bdev;
+
+	return base_bdev->fn_table->io_type_supported(base_bdev, io_type);
+}
+
+/*
+ * get_io_channel callback: return the base bdev's channel. The base bdev's callback
+ * is called with the base bdev itself, not with the split bdev.
+ */
+static struct spdk_io_channel *
+vbdev_split_get_io_channel(struct spdk_bdev *bdev, uint32_t priority)
+{
+	struct split_disk *split_disk = (struct split_disk *)bdev;
+	struct spdk_bdev *base_bdev = split_disk->base_bdev;
+
+	return base_bdev->fn_table->get_io_channel(base_bdev, priority);
+}
+
+static struct spdk_bdev_fn_table vbdev_split_fn_table = {
+	.destruct		= vbdev_split_destruct,
+	.io_type_supported	= vbdev_split_io_type_supported,
+	.submit_request		= vbdev_split_submit_request,
+	.get_io_channel		= vbdev_split_get_io_channel,
+};
+
+/*
+ * Claim base_bdev and register split_count split bdevs on top of it, each named
+ * after the base bdev with a "pN" suffix.
+ *
+ * split_size_mb is the size of each split in megabytes; 0 divides the base bdev
+ * evenly. A split_count that does not fit is clamped with a warning.
+ *
+ * Returns 0 on success and -1 on failure. The claim on base_bdev is released again
+ * whenever no split disk ends up holding a reference to the base context.
+ */
+static int
+vbdev_split_create(struct spdk_bdev *base_bdev, uint64_t split_count, uint64_t split_size_mb)
+{
+	uint64_t split_size_bytes, split_size_blocks, offset_bytes, offset_blocks;
+	uint64_t max_split_count;
+	uint64_t mb = 1024 * 1024;
+	uint64_t i;
+	int rc = -1;
+	struct split_base *split_base = NULL;
+
+	if (!spdk_bdev_claim(base_bdev)) {
+		SPDK_ERRLOG("Split bdev %s is already claimed\n", base_bdev->name);
+		return -1;
+	}
+
+	if (split_size_mb) {
+		if (((split_size_mb * mb) % base_bdev->blocklen) != 0) {
+			SPDK_ERRLOG("Split size %" PRIu64 " MB is not possible with block size "
+				    "%" PRIu32 "\n",
+				    split_size_mb, base_bdev->blocklen);
+			goto cleanup;
+		}
+		split_size_blocks = (split_size_mb * mb) / base_bdev->blocklen;
+		SPDK_TRACELOG(SPDK_TRACE_VBDEV_SPLIT, "Split size %" PRIu64 " MB specified by user\n",
+			      split_size_mb);
+	} else {
+		split_size_blocks = base_bdev->blockcnt / split_count;
+		SPDK_TRACELOG(SPDK_TRACE_VBDEV_SPLIT, "Split size not specified by user\n");
+	}
+
+	if (split_size_blocks == 0) {
+		/* Too many splits for this bdev; also guards the division below. */
+		SPDK_ERRLOG("Bdev %s is too small to be split into %" PRIu64 " parts\n",
+			    base_bdev->name, split_count);
+		goto cleanup;
+	}
+
+	split_size_bytes = split_size_blocks * base_bdev->blocklen;
+
+	max_split_count = base_bdev->blockcnt / split_size_blocks;
+	if (split_count > max_split_count) {
+		SPDK_WARNLOG("Split count %" PRIu64 " is greater than maximum possible split count "
+			     "%" PRIu64 " - clamping\n", split_count, max_split_count);
+		split_count = max_split_count;
+	}
+
+	SPDK_TRACELOG(SPDK_TRACE_VBDEV_SPLIT, "base_bdev: %s split_count: %" PRIu64
+		      " split_size_bytes: %" PRIu64 "\n",
+		      base_bdev->name, split_count, split_size_bytes);
+
+	split_base = calloc(1, sizeof(*split_base));
+	if (!split_base) {
+		SPDK_ERRLOG("Memory allocation failure\n");
+		goto cleanup;
+	}
+	split_base->base_bdev = base_bdev;
+	split_base->ref = 0;
+
+	offset_bytes = 0;
+	offset_blocks = 0;
+	for (i = 0; i < split_count; i++) {
+		struct split_disk *d;
+
+		d = calloc(1, sizeof(*d));
+		if (!d) {
+			SPDK_ERRLOG("Memory allocation failure\n");
+			goto cleanup;
+		}
+
+		/* Copy properties of the base bdev */
+		d->disk.blocklen = base_bdev->blocklen;
+		d->disk.write_cache = base_bdev->write_cache;
+		d->disk.need_aligned_buffer = base_bdev->need_aligned_buffer;
+		d->disk.thin_provisioning = base_bdev->thin_provisioning;
+
+		/* Append partition number to the base bdev's name, e.g. Malloc0 -> Malloc0p0 */
+		snprintf(d->disk.name, sizeof(d->disk.name), "%sp%" PRIu64, base_bdev->name, i);
+		snprintf(d->disk.product_name, sizeof(d->disk.product_name), "Split Disk");
+		d->base_bdev = base_bdev;
+		d->offset_bytes = offset_bytes;
+		d->offset_blocks = offset_blocks;
+		d->disk.blockcnt = split_size_blocks;
+		d->disk.ctxt = d;
+		d->disk.fn_table = &vbdev_split_fn_table;
+
+		SPDK_TRACELOG(SPDK_TRACE_VBDEV_SPLIT, "Split vbdev %s: base bdev: %s offset_bytes: "
+			      "%" PRIu64 " offset_blocks: %" PRIu64 "\n",
+			      d->disk.name, base_bdev->name, d->offset_bytes, d->offset_blocks);
+
+		vbdev_split_base_get_ref(split_base, d);
+
+		spdk_bdev_register(&d->disk);
+
+		TAILQ_INSERT_TAIL(&g_split_disks, d, tailq);
+
+		offset_bytes += split_size_bytes;
+		offset_blocks += split_size_blocks;
+	}
+
+	rc = 0;
+
+cleanup:
+	if (!split_base || split_base->ref == 0) {
+		/*
+		 * No split_disk holds the base context, so nothing would ever release it:
+		 * free the context (if allocated) and give the claim back here.
+		 */
+		free(split_base);
+		spdk_bdev_unclaim(base_bdev);
+	}
+
+	return rc;
+}
+
+/*
+ * Module init: create split vbdevs for every "Split" entry (bdev name, split count,
+ * optional split size in MB) of the [Split] config section.
+ * Returns 0 on success or when the section is absent, -1 on the first error.
+ */
+static int
+vbdev_split_init(void)
+{
+	struct spdk_conf_section *sp;
+	const char *base_bdev_name;
+	const char *split_count_str;
+	const char *split_size_str;
+	int i, split_count, split_size;
+	struct spdk_bdev *base_bdev;
+
+	sp = spdk_conf_find_section(NULL, "Split");
+	if (sp == NULL) {
+		return 0;
+	}
+
+	for (i = 0; ; i++) {
+		if (!spdk_conf_section_get_nval(sp, "Split", i)) {
+			break;
+		}
+
+		base_bdev_name = spdk_conf_section_get_nmval(sp, "Split", i, 0);
+		if (!base_bdev_name) {
+			SPDK_ERRLOG("Split configuration missing blockdev name\n");
+			return -1;
+		}
+
+		base_bdev = spdk_bdev_get_by_name(base_bdev_name);
+		if (!base_bdev) {
+			SPDK_ERRLOG("Could not find Split bdev %s\n", base_bdev_name);
+			return -1;
+		}
+
+		split_count_str = spdk_conf_section_get_nmval(sp, "Split", i, 1);
+		if (!split_count_str) {
+			SPDK_ERRLOG("Split configuration missing split count\n");
+			return -1;
+		}
+
+		split_count = atoi(split_count_str);
+		if (split_count < 1) {
+			SPDK_ERRLOG("Invalid Split count %d\n", split_count);
+			return -1;
+		}
+
+		/* Optional split size in MB */
+		split_size = 0;
+		split_size_str = spdk_conf_section_get_nmval(sp, "Split", i, 2);
+		if (split_size_str) {
+			split_size = atoi(split_size_str);
+			if (split_size <= 0) {
+				SPDK_ERRLOG("Invalid Split size %d\n", split_size);
+				return -1;
+			}
+		}
+
+		if (vbdev_split_create(base_bdev, split_count, split_size)) {
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* Module teardown: free every split disk still on the global list. */
+static void
+vbdev_split_fini(void)
+{
+	struct split_disk *split_disk, *tmp;
+
+	TAILQ_FOREACH_SAFE(split_disk, &g_split_disks, tailq, tmp) {
+		vbdev_split_free(split_disk);
+	}
+}
+
+SPDK_VBDEV_MODULE_REGISTER(vbdev_split_init, vbdev_split_fini, NULL, NULL)
+SPDK_LOG_REGISTER_TRACE_FLAG("vbdev_split", SPDK_TRACE_VBDEV_SPLIT)
diff --git a/mk/spdk.modules.mk b/mk/spdk.modules.mk
index 02490c64e..df4a0f705 100644
--- a/mk/spdk.modules.mk
+++ b/mk/spdk.modules.mk
@@ -31,7 +31,7 @@
 #  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -BLOCKDEV_MODULES_LIST = bdev_malloc bdev_nvme nvme +BLOCKDEV_MODULES_LIST = bdev_malloc bdev_nvme nvme vbdev_split ifeq ($(CONFIG_RDMA),y) BLOCKDEV_MODULES_DEPS += -libverbs -lrdmacm diff --git a/test/lib/bdev/bdev.conf b/test/lib/bdev/bdev.conf index 01c9e517a..2f9fd76c8 100644 --- a/test/lib/bdev/bdev.conf +++ b/test/lib/bdev/bdev.conf @@ -7,6 +7,14 @@ NumberOfLuns 5 LunSizeInMB 32 +[Split] + # Split Malloc1 into two auto-sized halves + Split Malloc1 2 + + # Split Malloc2 into eight 1-megabyte pieces, + # leaving the rest of the device inaccessible + Split Malloc2 8 1 + [AIO] # skip these blockdevs if the /dev/ramX nodes do not exist # so that the blockdev tests can still run on systems that