From 8b8401959ec5e714e418deb7ca881a1ff192cd91 Mon Sep 17 00:00:00 2001
From: Indraneel M
Date: Sat, 2 Jul 2022 20:23:02 +0530
Subject: [PATCH] bdev/uring: Add support for zoned io in uring bdev.

Enables the use of uring bdev with ZNS devices.
Uses BLKXXXZONE ioctls for implementing the zone operations.

Signed-off-by: Indraneel M
Change-Id: I440e316138182e25d89eb7224932e19bef9a005f
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/13550
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins
Reviewed-by: Jim Harris
Reviewed-by: Ben Walker
---
 CONFIG                         |   3 +
 configure                      |  16 ++
 doc/bdev.md                    |   4 +
 module/bdev/uring/bdev_uring.c | 338 ++++++++++++++++++++++++++++++++-
 4 files changed, 360 insertions(+), 1 deletion(-)

diff --git a/CONFIG b/CONFIG
index 801f22c1f..3ee114863 100644
--- a/CONFIG
+++ b/CONFIG
@@ -161,6 +161,9 @@ CONFIG_ISAL=y
 # Build with IO_URING support
 CONFIG_URING=n
 
+# Build IO_URING bdev with ZNS support
+CONFIG_URING_ZNS=n
+
 # Path to custom built IO_URING library
 CONFIG_URING_PATH=
 
diff --git a/configure b/configure
index a64571ace..6a78bf681 100755
--- a/configure
+++ b/configure
@@ -97,6 +97,7 @@ function usage() {
 	echo " --without-uring If an argument is provided, it is considered a directory containing"
 	echo " liburing.a and io_uring.h. Otherwise the regular system paths will"
 	echo " be searched."
+	echo " --with-uring-zns Build I/O uring module with ZNS (zoned namespaces) support."
 	echo " --with-openssl[=DIR] Build OPENSSL with custom path. Otherwise the regular system paths will"
 	echo " be searched."
 	echo " --with-fuse Build FUSE components for mounting a blobfs filesystem."
@@ -608,6 +609,9 @@ for i in "$@"; do
 			CONFIG[URING]=n
 			CONFIG[URING_PATH]=
 			;;
+		--with-uring-zns)
+			CONFIG[URING_ZNS]=y
+			;;
 		--with-openssl=*)
 			check_dir "$i"
 			CONFIG[OPENSSL_PATH]=$(readlink -f ${i#*=})
@@ -1122,6 +1126,18 @@ if [[ "${CONFIG[URING]}" = "y" ]]; then
 	fi
 fi
 
+if [[ "${CONFIG[URING_ZNS]}" = "y" ]]; then
+	if [[ "${CONFIG[URING]}" = "n" ]]; then
+		echo "--with-uring-zns requires --with-uring."
+		exit 1
+	fi
+	if ! echo -e '#include<linux/blkzoned.h>\nint main(void) { return BLK_ZONE_REP_CAPACITY; }\n' \
+		| "${BUILD_CMD[@]}" -c - 2> /dev/null; then
+		echo "--with-uring-zns requires blkzoned.h (from kernel >= linux-5.9)."
+		exit 1
+	fi
+fi
+
 if [[ "${CONFIG[FUSE]}" = "y" ]]; then
 	if [[ ! -d /usr/include/fuse3 ]] && [[ ! -d /usr/local/include/fuse3 ]]; then
 		echo "--with-fuse requires libfuse3."
diff --git a/doc/bdev.md b/doc/bdev.md
index 49c1827a3..8e1fc7be3 100644
--- a/doc/bdev.md
+++ b/doc/bdev.md
@@ -605,6 +605,10 @@ The user needs to configure SPDK to include io_uring support:
 
 `configure --with-uring`
 
+To enable uring bdev for ZNS devices use the following:
+
+`configure --with-uring --with-uring-zns`
+
 To create a uring bdev with given filename, bdev name and block size use the `bdev_uring_create` RPC.
 
 `rpc.py bdev_uring_create /path/to/device bdev_u0 512`
diff --git a/module/bdev/uring/bdev_uring.c b/module/bdev/uring/bdev_uring.c
index 2a97f4c3c..a234f7f71 100644
--- a/module/bdev/uring/bdev_uring.c
+++ b/module/bdev/uring/bdev_uring.c
@@ -6,7 +6,7 @@
 #include "bdev_uring.h"
 
 #include "spdk/stdinc.h"
-
+#include "spdk/config.h"
 #include "spdk/barrier.h"
 #include "spdk/bdev.h"
 #include "spdk/env.h"
@@ -20,6 +20,17 @@
 #include "spdk/log.h"
 #include "spdk_internal/uring.h"
 
+#ifdef SPDK_CONFIG_URING_ZNS
+#include <linux/blkzoned.h>
+#define SECTOR_SHIFT 9
+#endif
+
+struct bdev_uring_zoned_dev {
+	uint64_t num_zones;
+	uint32_t zone_shift;
+	uint32_t lba_shift;
+};
+
 struct bdev_uring_io_channel {
 	struct bdev_uring_group_channel *group_ch;
 };
@@ -39,6 +50,7 @@ struct bdev_uring_task {
 
 struct bdev_uring {
 	struct spdk_bdev bdev;
+	struct bdev_uring_zoned_dev zd;
 	char *filename;
 	int fd;
 	TAILQ_ENTRY(bdev_uring) link;
@@ -273,10 +285,325 @@ bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
 	}
 }
 
+#ifdef SPDK_CONFIG_URING_ZNS
+/*
+ * Read the first line of /sys/block/<device>/<attr> into str, where <device> is
+ * the last path component of devname, and strip the trailing newline.
+ * Returns 0 on success or a negative errno value on failure.
+ */
+static int
+bdev_uring_read_sysfs_attr(const char *devname, const char *attr, char *str, int str_len)
+{
+	const char *device;
+	char *path;
+	FILE *file;
+	int ret = 0;
+
+	/* basename() may modify its argument; find the last component by hand instead. */
+	device = strrchr(devname, '/');
+	device = device ? device + 1 : devname;
+
+	path = spdk_sprintf_alloc("/sys/block/%s/%s", device, attr);
+	if (!path) {
+		return -ENOMEM;
+	}
+
+	file = fopen(path, "r");
+	if (!file) {
+		free(path);
+		return -ENOENT;
+	}
+
+	if (!fgets(str, str_len, file)) {
+		ret = -EINVAL;
+		goto close;
+	}
+
+	spdk_str_chomp(str);
+
+close:
+	free(path);
+	fclose(file);
+	return ret;
+}
+
+/*
+ * Read a sysfs attribute (see bdev_uring_read_sysfs_attr) and parse it as a
+ * non-negative base-10 integer.  Returns 0 and stores the value in *val on
+ * success, or a negative errno value on read or parse failure.
+ */
+static int
+bdev_uring_read_sysfs_attr_long(const char *devname, const char *attr, long *val)
+{
+	char str[128];
+	long parsed;
+	int ret;
+
+	ret = bdev_uring_read_sysfs_attr(devname, attr, str, sizeof(str));
+	if (ret) {
+		return ret;
+	}
+
+	/* spdk_strtol() reports malformed or out-of-range input as a negative errno. */
+	parsed = spdk_strtol(str, 10);
+	if (parsed < 0) {
+		return (int)parsed;
+	}
+
+	*val = parsed;
+	return 0;
+}
+
+/*
+ * Translate the kernel zone condition in zones_rep into the matching SPDK zone
+ * state.  Returns 0 on success or -EIO for a condition with no SPDK equivalent.
+ */
+static int
+bdev_uring_fill_zone_state(struct spdk_bdev_zone_info *zone_info, struct blk_zone *zones_rep)
+{
+	switch (zones_rep->cond) {
+	case BLK_ZONE_COND_EMPTY:
+		zone_info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
+		break;
+	case BLK_ZONE_COND_IMP_OPEN:
+		zone_info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
+		break;
+	case BLK_ZONE_COND_EXP_OPEN:
+		zone_info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
+		break;
+	case BLK_ZONE_COND_CLOSED:
+		zone_info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
+		break;
+	case BLK_ZONE_COND_READONLY:
+		zone_info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
+		break;
+	case BLK_ZONE_COND_FULL:
+		zone_info->state = SPDK_BDEV_ZONE_STATE_FULL;
+		break;
+	case BLK_ZONE_COND_OFFLINE:
+		zone_info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
+		break;
+	default:
+		SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", zones_rep->cond);
+		return -EIO;
+	}
+	return 0;
+}
+
+/*
+ * Execute a zone management action (reset/open/close/finish) on the zone that
+ * starts at bdev_io->u.zone_mgmt.zone_id using the matching BLK*ZONE ioctl.
+ * Completes bdev_io and returns 0 on success; returns -EINVAL without
+ * completing it if the action is unknown or the ioctl fails.
+ */
+static int
+bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io)
+{
+	struct bdev_uring *uring;
+	struct blk_zone_range range;
+	long unsigned zone_mgmt_op;
+	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
+
+	uring = (struct bdev_uring *)bdev_io->bdev->ctxt;
+
+	switch (bdev_io->u.zone_mgmt.zone_action) {
+	case SPDK_BDEV_ZONE_RESET:
+		zone_mgmt_op = BLKRESETZONE;
+		break;
+	case SPDK_BDEV_ZONE_OPEN:
+		zone_mgmt_op = BLKOPENZONE;
+		break;
+	case SPDK_BDEV_ZONE_CLOSE:
+		zone_mgmt_op = BLKCLOSEZONE;
+		break;
+	case SPDK_BDEV_ZONE_FINISH:
+		zone_mgmt_op = BLKFINISHZONE;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	range.sector = (zone_id << uring->zd.lba_shift);
+	range.nr_sectors = (uring->bdev.zone_size << uring->zd.lba_shift);
+
+	if (ioctl(uring->fd, zone_mgmt_op, &range)) {
+		SPDK_ERRLOG("Ioctl BLKXXXZONE(%#x) failed errno: %d(%s)\n",
+			    bdev_io->u.zone_mgmt.zone_action, errno, strerror(errno));
+		return -EINVAL;
+	}
+
+	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+
+	return 0;
+}
+
+/*
+ * Fill bdev_io->u.zone_mgmt.buf with information about num_zones consecutive
+ * zones starting at the zone whose first block is zone_id, using BLKREPORTZONE.
+ * Completes bdev_io and returns 0 on success; returns a negative errno value
+ * without completing it on failure.
+ */
+static int
+bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io)
+{
+	struct bdev_uring *uring;
+	struct blk_zone *zones;
+	struct blk_zone_report *rep;
+	struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf;
+	size_t repsize;
+	uint32_t i, shift;
+	uint32_t num_zones = bdev_io->u.zone_mgmt.num_zones;
+	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
+	int rc;
+
+	uring = (struct bdev_uring *)bdev_io->bdev->ctxt;
+	shift = uring->zd.lba_shift;
+
+	if ((num_zones > uring->zd.num_zones) || !num_zones) {
+		return -EINVAL;
+	}
+
+	repsize = sizeof(struct blk_zone_report) + (sizeof(struct blk_zone) * num_zones);
+	rep = malloc(repsize);
+	if (!rep) {
+		return -ENOMEM;
+	}
+
+	zones = (struct blk_zone *)(rep + 1);
+
+	/* Keep reporting while zones are still wanted and zone_id is on the device. */
+	while (num_zones && ((zone_id >> uring->zd.zone_shift) < uring->zd.num_zones)) {
+		memset(rep, 0, repsize);
+		/* The kernel expects 512-byte sectors; zone_id is in logical blocks. */
+		rep->sector = zone_id << shift;
+		rep->nr_zones = num_zones;
+
+		if (ioctl(uring->fd, BLKREPORTZONE, rep)) {
+			SPDK_ERRLOG("Ioctl BLKREPORTZONE failed errno: %d(%s)\n",
+				    errno, strerror(errno));
+			free(rep);
+			return -EINVAL;
+		}
+
+		if (!rep->nr_zones) {
+			break;
+		}
+
+		for (i = 0; i < rep->nr_zones && num_zones; i++) {
+			zone_info->zone_id = ((zones + i)->start >> shift);
+			zone_info->write_pointer = ((zones + i)->wp >> shift);
+			zone_info->capacity = ((zones + i)->capacity >> shift);
+
+			rc = bdev_uring_fill_zone_state(zone_info, zones + i);
+			if (rc) {
+				free(rep);
+				return rc;
+			}
+
+			zone_id = ((zones + i)->start + (zones + i)->len) >> shift;
+			zone_info++;
+			num_zones--;
+		}
+	}
+
+	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+	free(rep);
+	return 0;
+}
+
+/*
+ * Detect whether the device behind uring->fd is a zoned block device and, if so,
+ * populate the zone geometry and limits in uring->bdev and uring->zd.
+ * Must be called after uring->bdev.required_alignment has been set.
+ * Returns 0 for non-zoned or successfully probed devices and -1 on failure.
+ */
+static int
+bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename)
+{
+	char str[128];
+	long int val = 0;
+	uint32_t zinfo;
+	int retval = -1;
+	struct stat sb;
+
+	uring->bdev.zoned = false;
+
+	/* Regular files have no /sys/block entry and can never be zoned. */
+	if (fstat(uring->fd, &sb) == 0 && !S_ISBLK(sb.st_mode)) {
+		return 0;
+	}
+
+	/* Check if this is a zoned block device */
+	if (bdev_uring_read_sysfs_attr(filename, "queue/zoned", str, sizeof(str))) {
+		SPDK_ERRLOG("Unable to open file %s/queue/zoned. errno: %d\n", filename, errno);
+	} else if (strcmp(str, "host-aware") == 0 || strcmp(str, "host-managed") == 0) {
+		/* Only host-aware & host-managed zns devices */
+		uring->bdev.zoned = true;
+
+		if (ioctl(uring->fd, BLKGETNRZONES, &zinfo)) {
+			SPDK_ERRLOG("ioctl BLKNRZONES failed %d (%s)\n", errno, strerror(errno));
+			goto err_ret;
+		}
+		uring->zd.num_zones = zinfo;
+
+		if (ioctl(uring->fd, BLKGETZONESZ, &zinfo)) {
+			SPDK_ERRLOG("ioctl BLKGETZONESZ failed %d (%s)\n", errno, strerror(errno));
+			goto err_ret;
+		}
+
+		uring->zd.lba_shift = uring->bdev.required_alignment - SECTOR_SHIFT;
+		uring->bdev.zone_size = (zinfo >> uring->zd.lba_shift);
+		uring->zd.zone_shift = spdk_u32log2(zinfo >> uring->zd.lba_shift);
+
+		if (bdev_uring_read_sysfs_attr_long(filename, "queue/max_open_zones", &val)) {
+			SPDK_ERRLOG("Failed to get max open zones %d (%s)\n", errno, strerror(errno));
+			goto err_ret;
+		}
+		uring->bdev.max_open_zones = uring->bdev.optimal_open_zones = (uint32_t)val;
+
+		if (bdev_uring_read_sysfs_attr_long(filename, "queue/max_active_zones", &val)) {
+			SPDK_ERRLOG("Failed to get max active zones %d (%s)\n", errno, strerror(errno));
+			goto err_ret;
+		}
+		uring->bdev.max_active_zones = (uint32_t)val;
+		retval = 0;
+	} else {
+		retval = 0;        /* queue/zoned=none */
+	}
+
+err_ret:
+	return retval;
+}
+#else
+/* No support for zoned devices */
+static int
+bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io)
+{
+	return -1;
+}
+
+static int
+bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io)
+{
+	return -1;
+}
+
+static int
+bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename)
+{
+	return 0;
+}
+#endif
+
 static int
 _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
 {
+
 	switch (bdev_io->type) {
+	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
+		return bdev_uring_zone_get_info(bdev_io);
+	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
+		return bdev_uring_zone_management_op(bdev_io);
 	/* Read and write operations must be performed on buffers aligned to
 	 * bdev->required_alignment. If user specified unaligned buffers,
 	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
@@ -302,6 +629,10 @@ static bool
 bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
 {
 	switch (io_type) {
+#ifdef SPDK_CONFIG_URING_ZNS
+	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
+	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
+#endif
 	case SPDK_BDEV_IO_TYPE_READ:
 	case SPDK_BDEV_IO_TYPE_WRITE:
 		return true;
@@ -483,6 +814,11 @@ create_uring_bdev(const char *name, const char *filename, uint32_t block_size)
 	uring->bdev.blocklen = block_size;
 	uring->bdev.required_alignment = spdk_u32log2(block_size);
 
+	rc = bdev_uring_check_zoned_support(uring, name, filename);
+	if (rc) {
+		goto error_return;
+	}
+
 	if (bdev_size % uring->bdev.blocklen != 0) {
 		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
 			    bdev_size, uring->bdev.blocklen);