per Intel policy to include file commit date using git cmd below. The policy does not apply to non-Intel (C) notices. git log --follow -C90% --format=%ad --date default <file> | tail -1 and then pull just the 4 digit year from the result. Intel copyrights were not added to files where Intel either had no contribution ot the contribution lacked substance (ie license header updates, formatting changes, etc). Contribution date used "--follow -C95%" to get the most accurate date. Note that several files in this patch didn't end the license/(c) block with a blank comment line so these were added as the vast majority of files do have this last blank line. Simply there for consistency. Signed-off-by: paul luse <paul.e.luse@intel.com> Change-Id: Id5b7ce4f658fe87132f14139ead58d6e285c04d4 Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15192 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Jim Harris <james.r.harris@intel.com> Reviewed-by: Ben Walker <benjamin.walker@intel.com> Community-CI: Mellanox Build Bot
900 lines
24 KiB
C
900 lines
24 KiB
C
/* SPDX-License-Identifier: BSD-3-Clause
|
|
* Copyright (C) 2019 Intel Corporation.
|
|
* Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES.
|
|
* All rights reserved.
|
|
*/
|
|
|
|
#include "spdk/stdinc.h"
|
|
|
|
#include "vbdev_zone_block.h"
|
|
|
|
#include "spdk/config.h"
|
|
#include "spdk/nvme.h"
|
|
#include "spdk/bdev_zone.h"
|
|
|
|
#include "spdk/log.h"
|
|
|
|
/* Module callback prototypes, wired into bdev_zoned_if below. */
static int zone_block_init(void);
static int zone_block_get_ctx_size(void);
static void zone_block_finish(void);
static int zone_block_config_json(struct spdk_json_write_ctx *w);
static void zone_block_examine(struct spdk_bdev *bdev);
|
|
|
|
/* Module interface: the hooks this vbdev registers with the SPDK bdev layer. */
static struct spdk_bdev_module bdev_zoned_if = {
	.name = "bdev_zoned_block",
	.module_init = zone_block_init,
	.module_fini = zone_block_finish,
	.config_json = zone_block_config_json,
	.examine_config = zone_block_examine,
	.get_ctx_size = zone_block_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(bdev_zoned_block, &bdev_zoned_if)
|
|
|
|
/* List of block vbdev names and their base bdevs via configuration file.
 * Used so we can parse the conf once at init and use this list in examine().
 */
struct bdev_zone_block_config {
	char *vbdev_name;		/* name of the zoned vbdev to create */
	char *bdev_name;		/* name of the base bdev to stack on */
	uint64_t zone_capacity;		/* requested zone capacity, in blocks */
	uint64_t optimal_open_zones;	/* value to report via bdev.optimal_open_zones */
	TAILQ_ENTRY(bdev_zone_block_config) link;
};
static TAILQ_HEAD(, bdev_zone_block_config) g_bdev_configs = TAILQ_HEAD_INITIALIZER(g_bdev_configs);
|
|
|
|
/* Per-zone state. The spinlock guards zone_info (state and write pointer),
 * which may be touched from multiple IO channels concurrently.
 */
struct block_zone {
	struct spdk_bdev_zone_info zone_info;
	pthread_spinlock_t lock;
};
|
|
|
|
/* List of block vbdevs and associated info for each. */
struct bdev_zone_block {
	struct spdk_bdev bdev; /* the block zoned bdev */
	struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */
	struct block_zone *zones; /* array of zones */
	uint64_t num_zones; /* number of zones */
	uint64_t zone_capacity; /* zone capacity */
	uint64_t zone_shift; /* log2 of zone_size */
	TAILQ_ENTRY(bdev_zone_block) link;
	struct spdk_thread *thread; /* thread where base device is opened */
};
static TAILQ_HEAD(, bdev_zone_block) g_bdev_nodes = TAILQ_HEAD_INITIALIZER(g_bdev_nodes);
|
|
|
|
/* Per-thread channel context: holds the channel used to forward IO to the base bdev. */
struct zone_block_io_channel {
	struct spdk_io_channel *base_ch; /* IO channel of base device */
};

/* Per-IO context allocated with each spdk_bdev_io (size reported by get_ctx_size). */
struct zone_block_io {
	/* vbdev to which IO was issued */
	struct bdev_zone_block *bdev_zone_block;
};
|
|
|
|
/* Module init hook: nothing to set up at module load time. */
static int
zone_block_init(void)
{
	return 0;
}
|
|
|
|
static void
|
|
zone_block_remove_config(struct bdev_zone_block_config *name)
|
|
{
|
|
TAILQ_REMOVE(&g_bdev_configs, name, link);
|
|
free(name->bdev_name);
|
|
free(name->vbdev_name);
|
|
free(name);
|
|
}
|
|
|
|
static void
|
|
zone_block_finish(void)
|
|
{
|
|
struct bdev_zone_block_config *name;
|
|
|
|
while ((name = TAILQ_FIRST(&g_bdev_configs))) {
|
|
zone_block_remove_config(name);
|
|
}
|
|
}
|
|
|
|
static int
|
|
zone_block_get_ctx_size(void)
|
|
{
|
|
return sizeof(struct zone_block_io);
|
|
}
|
|
|
|
static int
|
|
zone_block_config_json(struct spdk_json_write_ctx *w)
|
|
{
|
|
struct bdev_zone_block *bdev_node;
|
|
struct spdk_bdev *base_bdev = NULL;
|
|
|
|
TAILQ_FOREACH(bdev_node, &g_bdev_nodes, link) {
|
|
base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);
|
|
spdk_json_write_object_begin(w);
|
|
spdk_json_write_named_string(w, "method", "bdev_zone_block_create");
|
|
spdk_json_write_named_object_begin(w, "params");
|
|
spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
|
|
spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
|
|
spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
|
|
spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
|
|
spdk_json_write_object_end(w);
|
|
spdk_json_write_object_end(w);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* Callback for unregistering the IO device. */
|
|
static void
|
|
_device_unregister_cb(void *io_device)
|
|
{
|
|
struct bdev_zone_block *bdev_node = io_device;
|
|
uint64_t i;
|
|
|
|
free(bdev_node->bdev.name);
|
|
for (i = 0; i < bdev_node->num_zones; i++) {
|
|
pthread_spin_destroy(&bdev_node->zones[i].lock);
|
|
}
|
|
free(bdev_node->zones);
|
|
free(bdev_node);
|
|
}
|
|
|
|
/* Thread-message trampoline: close the base bdev descriptor on the thread
 * that originally opened it (SPDK requires close on the opening thread).
 */
static void
_zone_block_destruct(void *ctx)
{
	spdk_bdev_close((struct spdk_bdev_desc *)ctx);
}
|
|
|
|
/* fn_table.destruct: tear down one vbdev. Unlinks it from the global list,
 * releases the module claim on the base bdev, closes the base descriptor
 * (bounced to its opening thread if needed) and unregisters the io_device;
 * memory is freed later in _device_unregister_cb.
 */
static int
zone_block_destruct(void *ctx)
{
	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;

	TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);

	/* Unclaim the underlying bdev. */
	spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(bdev_node->base_desc));

	/* Close the underlying bdev on its same opened thread. */
	if (bdev_node->thread && bdev_node->thread != spdk_get_thread()) {
		spdk_thread_send_msg(bdev_node->thread, _zone_block_destruct, bdev_node->base_desc);
	} else {
		spdk_bdev_close(bdev_node->base_desc);
	}

	/* Unregister the io_device. */
	spdk_io_device_unregister(bdev_node, _device_unregister_cb);

	return 0;
}
|
|
|
|
static struct block_zone *
|
|
zone_block_get_zone_containing_lba(struct bdev_zone_block *bdev_node, uint64_t lba)
|
|
{
|
|
size_t index = lba >> bdev_node->zone_shift;
|
|
|
|
if (index >= bdev_node->num_zones) {
|
|
return NULL;
|
|
}
|
|
|
|
return &bdev_node->zones[index];
|
|
}
|
|
|
|
static struct block_zone *
|
|
zone_block_get_zone_by_slba(struct bdev_zone_block *bdev_node, uint64_t start_lba)
|
|
{
|
|
struct block_zone *zone = zone_block_get_zone_containing_lba(bdev_node, start_lba);
|
|
|
|
if (zone && zone->zone_info.zone_id == start_lba) {
|
|
return zone;
|
|
} else {
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
static int
|
|
zone_block_get_zone_info(struct bdev_zone_block *bdev_node, struct spdk_bdev_io *bdev_io)
|
|
{
|
|
struct block_zone *zone;
|
|
struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf;
|
|
uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
|
|
size_t i;
|
|
|
|
/* User can request info for more zones than exist, need to check both internal and user
|
|
* boundaries
|
|
*/
|
|
for (i = 0; i < bdev_io->u.zone_mgmt.num_zones; i++, zone_id += bdev_node->bdev.zone_size) {
|
|
zone = zone_block_get_zone_by_slba(bdev_node, zone_id);
|
|
if (!zone) {
|
|
return -EINVAL;
|
|
}
|
|
memcpy(&zone_info[i], &zone->zone_info, sizeof(*zone_info));
|
|
}
|
|
|
|
spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
zone_block_open_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
|
|
{
|
|
pthread_spin_lock(&zone->lock);
|
|
|
|
switch (zone->zone_info.state) {
|
|
case SPDK_BDEV_ZONE_STATE_EMPTY:
|
|
case SPDK_BDEV_ZONE_STATE_OPEN:
|
|
case SPDK_BDEV_ZONE_STATE_CLOSED:
|
|
zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
|
|
pthread_spin_unlock(&zone->lock);
|
|
spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
|
|
return 0;
|
|
default:
|
|
pthread_spin_unlock(&zone->lock);
|
|
return -EINVAL;
|
|
}
|
|
}
|
|
|
|
static void
|
|
_zone_block_complete_unmap(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
|
|
{
|
|
struct spdk_bdev_io *orig_io = cb_arg;
|
|
int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
|
|
|
|
/* Complete the original IO and then free the one that we created here
|
|
* as a result of issuing an IO via submit_request.
|
|
*/
|
|
spdk_bdev_io_complete(orig_io, status);
|
|
spdk_bdev_free_io(bdev_io);
|
|
}
|
|
|
|
/* ZONE_RESET: return the zone to EMPTY with the write pointer rewound to the
 * zone start, then best-effort unmap the zone's blocks on the base bdev.
 * Resetting an already-EMPTY zone is a successful no-op; states other than
 * EMPTY/OPEN/FULL/CLOSED are rejected with -EINVAL. Note the lock is dropped
 * before any completion or base-bdev call is made.
 */
static int
zone_block_reset_zone(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
		      struct block_zone *zone, struct spdk_bdev_io *bdev_io)
{
	pthread_spin_lock(&zone->lock);

	switch (zone->zone_info.state) {
	case SPDK_BDEV_ZONE_STATE_EMPTY:
		pthread_spin_unlock(&zone->lock);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;
	case SPDK_BDEV_ZONE_STATE_OPEN:
	case SPDK_BDEV_ZONE_STATE_FULL:
	case SPDK_BDEV_ZONE_STATE_CLOSED:
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_EMPTY;
		zone->zone_info.write_pointer = zone->zone_info.zone_id;
		pthread_spin_unlock(&zone->lock);

		/* The unmap isn't necessary, so if the base bdev doesn't support it, we're done */
		if (!spdk_bdev_io_type_supported(spdk_bdev_desc_get_bdev(bdev_node->base_desc),
						 SPDK_BDEV_IO_TYPE_UNMAP)) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
			return 0;
		}

		/* The IO completes from _zone_block_complete_unmap once the unmap finishes. */
		return spdk_bdev_unmap_blocks(bdev_node->base_desc, ch->base_ch,
					      zone->zone_info.zone_id, zone->zone_info.capacity,
					      _zone_block_complete_unmap, bdev_io);
	default:
		pthread_spin_unlock(&zone->lock);
		return -EINVAL;
	}
}
|
|
|
|
static int
|
|
zone_block_close_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
|
|
{
|
|
pthread_spin_lock(&zone->lock);
|
|
|
|
switch (zone->zone_info.state) {
|
|
case SPDK_BDEV_ZONE_STATE_OPEN:
|
|
case SPDK_BDEV_ZONE_STATE_CLOSED:
|
|
zone->zone_info.state = SPDK_BDEV_ZONE_STATE_CLOSED;
|
|
pthread_spin_unlock(&zone->lock);
|
|
spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
|
|
return 0;
|
|
default:
|
|
pthread_spin_unlock(&zone->lock);
|
|
return -EINVAL;
|
|
}
|
|
}
|
|
|
|
static int
|
|
zone_block_finish_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
|
|
{
|
|
pthread_spin_lock(&zone->lock);
|
|
|
|
zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
|
|
zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
|
|
|
|
pthread_spin_unlock(&zone->lock);
|
|
spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
zone_block_zone_management(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
|
|
struct spdk_bdev_io *bdev_io)
|
|
{
|
|
struct block_zone *zone;
|
|
|
|
zone = zone_block_get_zone_by_slba(bdev_node, bdev_io->u.zone_mgmt.zone_id);
|
|
if (!zone) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
switch (bdev_io->u.zone_mgmt.zone_action) {
|
|
case SPDK_BDEV_ZONE_RESET:
|
|
return zone_block_reset_zone(bdev_node, ch, zone, bdev_io);
|
|
case SPDK_BDEV_ZONE_OPEN:
|
|
return zone_block_open_zone(zone, bdev_io);
|
|
case SPDK_BDEV_ZONE_CLOSE:
|
|
return zone_block_close_zone(zone, bdev_io);
|
|
case SPDK_BDEV_ZONE_FINISH:
|
|
return zone_block_finish_zone(zone, bdev_io);
|
|
default:
|
|
return -EINVAL;
|
|
}
|
|
}
|
|
|
|
static void
|
|
_zone_block_complete_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
|
|
{
|
|
struct spdk_bdev_io *orig_io = cb_arg;
|
|
int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
|
|
|
|
if (success && orig_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND) {
|
|
orig_io->u.bdev.offset_blocks = bdev_io->u.bdev.offset_blocks;
|
|
}
|
|
|
|
/* Complete the original IO and then free the one that we created here
|
|
* as a result of issuing an IO via submit_request.
|
|
*/
|
|
spdk_bdev_io_complete(orig_io, status);
|
|
spdk_bdev_free_io(bdev_io);
|
|
}
|
|
|
|
/* Handle WRITE and ZONE_APPEND IOs.
 * Looks up the target zone (appends address a zone by its start LBA, writes
 * by any contained LBA), validates zone state and write-pointer semantics,
 * advances the write pointer (marking the zone FULL at capacity), and
 * forwards the data to the base bdev.
 * NOTE(review): the write pointer is advanced before the base write is
 * submitted, so a failed base IO leaves it advanced anyway.
 */
static int
zone_block_write(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
		 struct spdk_bdev_io *bdev_io)
{
	struct block_zone *zone;
	uint64_t len = bdev_io->u.bdev.num_blocks;
	uint64_t lba = bdev_io->u.bdev.offset_blocks;
	uint64_t num_blocks_left, wp;
	int rc = 0;
	bool is_append = bdev_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND;

	if (is_append) {
		zone = zone_block_get_zone_by_slba(bdev_node, lba);
	} else {
		zone = zone_block_get_zone_containing_lba(bdev_node, lba);
	}
	if (!zone) {
		SPDK_ERRLOG("Trying to write to invalid zone (lba 0x%" PRIx64 ")\n", lba);
		return -EINVAL;
	}

	pthread_spin_lock(&zone->lock);

	switch (zone->zone_info.state) {
	case SPDK_BDEV_ZONE_STATE_OPEN:
	case SPDK_BDEV_ZONE_STATE_EMPTY:
	case SPDK_BDEV_ZONE_STATE_CLOSED:
		/* A write implicitly opens the zone. */
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
		break;
	default:
		SPDK_ERRLOG("Trying to write to zone in invalid state %u\n", zone->zone_info.state);
		rc = -EINVAL;
		goto write_fail;
	}

	wp = zone->zone_info.write_pointer;
	if (is_append) {
		/* Appends always land at the current write pointer. */
		lba = wp;
	} else {
		/* Ordinary writes must target exactly the write pointer. */
		if (lba != wp) {
			SPDK_ERRLOG("Trying to write to zone with invalid address (lba 0x%" PRIx64 ", wp 0x%" PRIx64 ")\n",
				    lba, wp);
			rc = -EINVAL;
			goto write_fail;
		}
	}

	num_blocks_left = zone->zone_info.zone_id + zone->zone_info.capacity - wp;
	if (len > num_blocks_left) {
		SPDK_ERRLOG("Write exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ", wp 0x%" PRIx64
			    ")\n", lba, len, wp);
		rc = -EINVAL;
		goto write_fail;
	}

	zone->zone_info.write_pointer += bdev_io->u.bdev.num_blocks;
	assert(zone->zone_info.write_pointer <= zone->zone_info.zone_id + zone->zone_info.capacity);
	if (zone->zone_info.write_pointer == zone->zone_info.zone_id + zone->zone_info.capacity) {
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
	}
	pthread_spin_unlock(&zone->lock);

	rc = spdk_bdev_writev_blocks_with_md(bdev_node->base_desc, ch->base_ch,
					     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					     bdev_io->u.bdev.md_buf,
					     lba, bdev_io->u.bdev.num_blocks,
					     _zone_block_complete_write, bdev_io);

	return rc;

write_fail:
	pthread_spin_unlock(&zone->lock);
	return rc;
}
|
|
|
|
static void
|
|
_zone_block_complete_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
|
|
{
|
|
struct spdk_bdev_io *orig_io = cb_arg;
|
|
int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
|
|
|
|
/* Complete the original IO and then free the one that we created here
|
|
* as a result of issuing an IO via submit_request.
|
|
*/
|
|
spdk_bdev_io_complete(orig_io, status);
|
|
spdk_bdev_free_io(bdev_io);
|
|
}
|
|
|
|
static int
|
|
zone_block_read(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
|
|
struct spdk_bdev_io *bdev_io)
|
|
{
|
|
struct block_zone *zone;
|
|
uint64_t len = bdev_io->u.bdev.num_blocks;
|
|
uint64_t lba = bdev_io->u.bdev.offset_blocks;
|
|
int rc;
|
|
|
|
zone = zone_block_get_zone_containing_lba(bdev_node, lba);
|
|
if (!zone) {
|
|
SPDK_ERRLOG("Trying to read from invalid zone (lba 0x%" PRIx64 ")\n", lba);
|
|
return -EINVAL;
|
|
}
|
|
|
|
if ((lba + len) > (zone->zone_info.zone_id + zone->zone_info.capacity)) {
|
|
SPDK_ERRLOG("Read exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ")\n", lba, len);
|
|
return -EINVAL;
|
|
}
|
|
|
|
rc = spdk_bdev_readv_blocks_with_md(bdev_node->base_desc, ch->base_ch,
|
|
bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
|
|
bdev_io->u.bdev.md_buf,
|
|
lba, len,
|
|
_zone_block_complete_read, bdev_io);
|
|
|
|
return rc;
|
|
}
|
|
|
|
static void
|
|
zone_block_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
|
|
{
|
|
struct bdev_zone_block *bdev_node = SPDK_CONTAINEROF(bdev_io->bdev, struct bdev_zone_block, bdev);
|
|
struct zone_block_io_channel *dev_ch = spdk_io_channel_get_ctx(ch);
|
|
int rc = 0;
|
|
|
|
switch (bdev_io->type) {
|
|
case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
|
|
rc = zone_block_get_zone_info(bdev_node, bdev_io);
|
|
break;
|
|
case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
|
|
rc = zone_block_zone_management(bdev_node, dev_ch, bdev_io);
|
|
break;
|
|
case SPDK_BDEV_IO_TYPE_WRITE:
|
|
case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
|
|
rc = zone_block_write(bdev_node, dev_ch, bdev_io);
|
|
break;
|
|
case SPDK_BDEV_IO_TYPE_READ:
|
|
rc = zone_block_read(bdev_node, dev_ch, bdev_io);
|
|
break;
|
|
default:
|
|
SPDK_ERRLOG("vbdev_block: unknown I/O type %u\n", bdev_io->type);
|
|
rc = -ENOTSUP;
|
|
break;
|
|
}
|
|
|
|
if (rc != 0) {
|
|
if (rc == -ENOMEM) {
|
|
SPDK_WARNLOG("ENOMEM, start to queue io for vbdev.\n");
|
|
spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
|
|
} else {
|
|
SPDK_ERRLOG("ERROR on bdev_io submission!\n");
|
|
spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
|
|
}
|
|
}
|
|
}
|
|
|
|
static bool
|
|
zone_block_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
|
|
{
|
|
switch (io_type) {
|
|
case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
|
|
case SPDK_BDEV_IO_TYPE_WRITE:
|
|
case SPDK_BDEV_IO_TYPE_READ:
|
|
case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/* fn_table.get_io_channel: hand out a per-thread channel keyed on this node
 * as the io_device.
 */
static struct spdk_io_channel *
zone_block_get_io_channel(void *ctx)
{
	return spdk_get_io_channel((struct bdev_zone_block *)ctx);
}
|
|
|
|
static int
|
|
zone_block_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
|
|
{
|
|
struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;
|
|
struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);
|
|
|
|
spdk_json_write_name(w, "zoned_block");
|
|
spdk_json_write_object_begin(w);
|
|
spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
|
|
spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
|
|
spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
|
|
spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
|
|
spdk_json_write_object_end(w);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* When we register our vbdev this is how we specify our entry points. */
static const struct spdk_bdev_fn_table zone_block_fn_table = {
	.destruct = zone_block_destruct,
	.submit_request = zone_block_submit_request,
	.io_type_supported = zone_block_io_type_supported,
	.get_io_channel = zone_block_get_io_channel,
	.dump_info_json = zone_block_dump_info_json,
};
|
|
|
|
static void
|
|
zone_block_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find)
|
|
{
|
|
struct bdev_zone_block *bdev_node, *tmp;
|
|
|
|
TAILQ_FOREACH_SAFE(bdev_node, &g_bdev_nodes, link, tmp) {
|
|
if (bdev_find == spdk_bdev_desc_get_bdev(bdev_node->base_desc)) {
|
|
spdk_bdev_unregister(&bdev_node->bdev, NULL, NULL);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
zone_block_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
|
|
void *event_ctx)
|
|
{
|
|
switch (type) {
|
|
case SPDK_BDEV_EVENT_REMOVE:
|
|
zone_block_base_bdev_hotremove_cb(bdev);
|
|
break;
|
|
default:
|
|
SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
|
|
break;
|
|
}
|
|
}
|
|
|
|
static int
|
|
_zone_block_ch_create_cb(void *io_device, void *ctx_buf)
|
|
{
|
|
struct zone_block_io_channel *bdev_ch = ctx_buf;
|
|
struct bdev_zone_block *bdev_node = io_device;
|
|
|
|
bdev_ch->base_ch = spdk_bdev_get_io_channel(bdev_node->base_desc);
|
|
if (!bdev_ch->base_ch) {
|
|
return -ENOMEM;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void
|
|
_zone_block_ch_destroy_cb(void *io_device, void *ctx_buf)
|
|
{
|
|
struct zone_block_io_channel *bdev_ch = ctx_buf;
|
|
|
|
spdk_put_io_channel(bdev_ch->base_ch);
|
|
}
|
|
|
|
static int
|
|
zone_block_insert_name(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
|
|
uint64_t optimal_open_zones)
|
|
{
|
|
struct bdev_zone_block_config *name;
|
|
|
|
TAILQ_FOREACH(name, &g_bdev_configs, link) {
|
|
if (strcmp(vbdev_name, name->vbdev_name) == 0) {
|
|
SPDK_ERRLOG("block zoned bdev %s already exists\n", vbdev_name);
|
|
return -EEXIST;
|
|
}
|
|
if (strcmp(bdev_name, name->bdev_name) == 0) {
|
|
SPDK_ERRLOG("base bdev %s already claimed\n", bdev_name);
|
|
return -EEXIST;
|
|
}
|
|
}
|
|
|
|
name = calloc(1, sizeof(*name));
|
|
if (!name) {
|
|
SPDK_ERRLOG("could not allocate bdev_names\n");
|
|
return -ENOMEM;
|
|
}
|
|
|
|
name->bdev_name = strdup(bdev_name);
|
|
if (!name->bdev_name) {
|
|
SPDK_ERRLOG("could not allocate name->bdev_name\n");
|
|
free(name);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
name->vbdev_name = strdup(vbdev_name);
|
|
if (!name->vbdev_name) {
|
|
SPDK_ERRLOG("could not allocate name->vbdev_name\n");
|
|
free(name->bdev_name);
|
|
free(name);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
name->zone_capacity = zone_capacity;
|
|
name->optimal_open_zones = optimal_open_zones;
|
|
|
|
TAILQ_INSERT_TAIL(&g_bdev_configs, name, link);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
zone_block_init_zone_info(struct bdev_zone_block *bdev_node)
|
|
{
|
|
size_t i;
|
|
struct block_zone *zone;
|
|
int rc = 0;
|
|
|
|
for (i = 0; i < bdev_node->num_zones; i++) {
|
|
zone = &bdev_node->zones[i];
|
|
zone->zone_info.zone_id = bdev_node->bdev.zone_size * i;
|
|
zone->zone_info.capacity = bdev_node->zone_capacity;
|
|
zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
|
|
zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
|
|
zone->zone_info.type = SPDK_BDEV_ZONE_TYPE_SEQWR;
|
|
if (pthread_spin_init(&zone->lock, PTHREAD_PROCESS_PRIVATE)) {
|
|
SPDK_ERRLOG("pthread_spin_init() failed\n");
|
|
rc = -ENOMEM;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (rc) {
|
|
for (; i > 0; i--) {
|
|
pthread_spin_destroy(&bdev_node->zones[i - 1].lock);
|
|
}
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/* Build and register a zoned vbdev on top of base_bdev_name if a matching
 * config entry exists. On success the base bdev is opened and claimed, the
 * io_device is registered, and the new bdev is published. On any failure the
 * goto chain unwinds exactly the steps completed so far and the config entry
 * is removed; -ENODEV (base bdev not present yet) is returned untouched so
 * the caller can defer to examine().
 */
static int
zone_block_register(const char *base_bdev_name)
{
	struct spdk_bdev_desc *base_desc;
	struct spdk_bdev *base_bdev;
	struct bdev_zone_block_config *name, *tmp;
	struct bdev_zone_block *bdev_node;
	uint64_t zone_size;
	int rc = 0;

	/* Check our list of names from config versus this bdev and if
	 * there's a match, create the bdev_node & bdev accordingly.
	 */
	TAILQ_FOREACH_SAFE(name, &g_bdev_configs, link, tmp) {
		if (strcmp(name->bdev_name, base_bdev_name) != 0) {
			continue;
		}

		rc = spdk_bdev_open_ext(base_bdev_name, true, zone_block_base_bdev_event_cb,
					NULL, &base_desc);
		if (rc == -ENODEV) {
			/* Base bdev not present yet; keep the config for examine(). */
			return -ENODEV;
		} else if (rc) {
			SPDK_ERRLOG("could not open bdev %s\n", base_bdev_name);
			goto free_config;
		}

		base_bdev = spdk_bdev_desc_get_bdev(base_desc);

		if (spdk_bdev_is_zoned(base_bdev)) {
			SPDK_ERRLOG("Base bdev %s is already a zoned bdev\n", base_bdev_name);
			rc = -EEXIST;
			goto zone_exist;
		}

		bdev_node = calloc(1, sizeof(struct bdev_zone_block));
		if (!bdev_node) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate bdev_node\n");
			goto zone_exist;
		}

		bdev_node->base_desc = base_desc;

		/* The base bdev that we're attaching to. */
		bdev_node->bdev.name = strdup(name->vbdev_name);
		if (!bdev_node->bdev.name) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate bdev_node name\n");
			goto strdup_failed;
		}

		/* Zone size is the capacity rounded up to a power of two so
		 * LBA-to-zone lookup can be a shift. */
		zone_size = spdk_align64pow2(name->zone_capacity);
		if (zone_size == 0) {
			rc = -EINVAL;
			SPDK_ERRLOG("invalid zone size\n");
			goto roundup_failed;
		}

		bdev_node->zone_shift = spdk_u64log2(zone_size);
		bdev_node->num_zones = base_bdev->blockcnt / zone_size;

		bdev_node->zones = calloc(bdev_node->num_zones, sizeof(struct block_zone));
		if (!bdev_node->zones) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate zones\n");
			goto calloc_failed;
		}

		bdev_node->bdev.product_name = "zone_block";

		/* Copy some properties from the underlying base bdev. */
		bdev_node->bdev.write_cache = base_bdev->write_cache;
		bdev_node->bdev.required_alignment = base_bdev->required_alignment;
		bdev_node->bdev.optimal_io_boundary = base_bdev->optimal_io_boundary;

		bdev_node->bdev.blocklen = base_bdev->blocklen;
		bdev_node->bdev.blockcnt = bdev_node->num_zones * zone_size;

		if (bdev_node->num_zones * name->zone_capacity != base_bdev->blockcnt) {
			SPDK_DEBUGLOG(vbdev_zone_block,
				      "Lost %" PRIu64 " blocks due to zone capacity and base bdev size misalignment\n",
				      base_bdev->blockcnt - bdev_node->num_zones * name->zone_capacity);
		}

		bdev_node->bdev.write_unit_size = base_bdev->write_unit_size;

		bdev_node->bdev.md_interleave = base_bdev->md_interleave;
		bdev_node->bdev.md_len = base_bdev->md_len;
		bdev_node->bdev.dif_type = base_bdev->dif_type;
		bdev_node->bdev.dif_is_head_of_md = base_bdev->dif_is_head_of_md;
		bdev_node->bdev.dif_check_flags = base_bdev->dif_check_flags;

		bdev_node->bdev.zoned = true;
		bdev_node->bdev.ctxt = bdev_node;
		bdev_node->bdev.fn_table = &zone_block_fn_table;
		bdev_node->bdev.module = &bdev_zoned_if;

		/* bdev specific info */
		bdev_node->bdev.zone_size = zone_size;

		bdev_node->zone_capacity = name->zone_capacity;
		bdev_node->bdev.optimal_open_zones = name->optimal_open_zones;
		bdev_node->bdev.max_open_zones = 0;
		rc = zone_block_init_zone_info(bdev_node);
		if (rc) {
			SPDK_ERRLOG("could not init zone info\n");
			goto zone_info_failed;
		}

		TAILQ_INSERT_TAIL(&g_bdev_nodes, bdev_node, link);

		spdk_io_device_register(bdev_node, _zone_block_ch_create_cb, _zone_block_ch_destroy_cb,
					sizeof(struct zone_block_io_channel),
					name->vbdev_name);

		/* Save the thread where the base device is opened */
		bdev_node->thread = spdk_get_thread();

		rc = spdk_bdev_module_claim_bdev(base_bdev, base_desc, bdev_node->bdev.module);
		if (rc) {
			SPDK_ERRLOG("could not claim bdev %s\n", base_bdev_name);
			goto claim_failed;
		}

		rc = spdk_bdev_register(&bdev_node->bdev);
		if (rc) {
			SPDK_ERRLOG("could not register zoned bdev\n");
			goto register_failed;
		}
	}

	return rc;

	/* Error unwinding: each label undoes exactly the steps completed above it. */
register_failed:
	spdk_bdev_module_release_bdev(&bdev_node->bdev);
claim_failed:
	TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);
	spdk_io_device_unregister(bdev_node, NULL);
zone_info_failed:
	free(bdev_node->zones);
calloc_failed:
roundup_failed:
	free(bdev_node->bdev.name);
strdup_failed:
	free(bdev_node);
zone_exist:
	spdk_bdev_close(base_desc);
free_config:
	zone_block_remove_config(name);
	return rc;
}
|
|
|
|
int
|
|
vbdev_zone_block_create(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
|
|
uint64_t optimal_open_zones)
|
|
{
|
|
int rc = 0;
|
|
|
|
if (zone_capacity == 0) {
|
|
SPDK_ERRLOG("Zone capacity can't be 0\n");
|
|
return -EINVAL;
|
|
}
|
|
|
|
if (optimal_open_zones == 0) {
|
|
SPDK_ERRLOG("Optimal open zones can't be 0\n");
|
|
return -EINVAL;
|
|
}
|
|
|
|
/* Insert the bdev into our global name list even if it doesn't exist yet,
|
|
* it may show up soon...
|
|
*/
|
|
rc = zone_block_insert_name(bdev_name, vbdev_name, zone_capacity, optimal_open_zones);
|
|
if (rc) {
|
|
return rc;
|
|
}
|
|
|
|
rc = zone_block_register(bdev_name);
|
|
if (rc == -ENODEV) {
|
|
/* This is not an error, even though the bdev is not present at this time it may
|
|
* still show up later.
|
|
*/
|
|
rc = 0;
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
void
|
|
vbdev_zone_block_delete(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
|
|
{
|
|
struct bdev_zone_block_config *name_node;
|
|
int rc;
|
|
|
|
rc = spdk_bdev_unregister_by_name(name, &bdev_zoned_if, cb_fn, cb_arg);
|
|
if (rc == 0) {
|
|
TAILQ_FOREACH(name_node, &g_bdev_configs, link) {
|
|
if (strcmp(name_node->vbdev_name, name) == 0) {
|
|
zone_block_remove_config(name_node);
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
cb_fn(cb_arg, rc);
|
|
}
|
|
}
|
|
|
|
/* examine_config hook: attempt to build any configured vbdev on top of the
 * newly discovered bdev, then tell the bdev layer examination is done.
 * The return code of zone_block_register() is not checked here — a bdev with
 * no matching config entry is the common case; NOTE(review): genuine
 * registration failures are therefore also silently dropped.
 */
static void
zone_block_examine(struct spdk_bdev *bdev)
{
	zone_block_register(bdev->name);

	spdk_bdev_module_examine_done(&bdev_zoned_if);
}
|
|
|
|
SPDK_LOG_REGISTER_COMPONENT(vbdev_zone_block)
|