bdev/uring: Add support for zoned io in uring bdev.

Enables the use of uring bdev with ZNS devices.
Uses BLKXXXZONE ioctls for implementing the zone operations.

Signed-off-by: Indraneel M <Indraneel.Mukherjee@wdc.com>
Change-Id: I440e316138182e25d89eb7224932e19bef9a005f
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/13550
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
This commit is contained in:
Indraneel M 2022-07-02 20:23:02 +05:30 committed by Tomasz Zawadzki
parent 2d37b82e6b
commit 8b8401959e
4 changed files with 306 additions and 1 deletions

3
CONFIG
View File

@ -161,6 +161,9 @@ CONFIG_ISAL=y
# Build with IO_URING support # Build with IO_URING support
CONFIG_URING=n CONFIG_URING=n
# Build IO_URING bdev with ZNS support
CONFIG_URING_ZNS=n
# Path to custom built IO_URING library # Path to custom built IO_URING library
CONFIG_URING_PATH= CONFIG_URING_PATH=

16
configure vendored
View File

@ -97,6 +97,7 @@ function usage() {
echo " --without-uring If an argument is provided, it is considered a directory containing" echo " --without-uring If an argument is provided, it is considered a directory containing"
echo " liburing.a and io_uring.h. Otherwise the regular system paths will" echo " liburing.a and io_uring.h. Otherwise the regular system paths will"
echo " be searched." echo " be searched."
echo " --with-uring-zns Build I/O uring module with ZNS (zoned namespaces) support."
echo " --with-openssl[=DIR] Build OPENSSL with custom path. Otherwise the regular system paths will" echo " --with-openssl[=DIR] Build OPENSSL with custom path. Otherwise the regular system paths will"
echo " be searched." echo " be searched."
echo " --with-fuse Build FUSE components for mounting a blobfs filesystem." echo " --with-fuse Build FUSE components for mounting a blobfs filesystem."
@ -608,6 +609,9 @@ for i in "$@"; do
CONFIG[URING]=n CONFIG[URING]=n
CONFIG[URING_PATH]= CONFIG[URING_PATH]=
;; ;;
--with-uring-zns)
CONFIG[URING_ZNS]=y
;;
--with-openssl=*) --with-openssl=*)
check_dir "$i" check_dir "$i"
CONFIG[OPENSSL_PATH]=$(readlink -f ${i#*=}) CONFIG[OPENSSL_PATH]=$(readlink -f ${i#*=})
@ -1122,6 +1126,18 @@ if [[ "${CONFIG[URING]}" = "y" ]]; then
fi fi
fi fi
# Validate the --with-uring-zns option:
#  1. it is rejected when the base uring module is disabled (CONFIG[URING] = n);
#  2. the toolchain must be able to compile a snippet that includes
#     <linux/blkzoned.h> and references BLK_ZONE_REP_CAPACITY; otherwise
#     configure aborts with a hint about the required kernel headers.
if [[ "${CONFIG[URING_ZNS]}" = "y" ]]; then
if [[ "${CONFIG[URING]}" = "n" ]]; then
echo "--with-uring-zns requires --with-uring."
exit 1
fi
# Compile-only probe fed from stdin; compiler diagnostics are discarded.
if ! echo -e '#include<linux/blkzoned.h>\nint main(void) { return BLK_ZONE_REP_CAPACITY; }\n' \
| "${BUILD_CMD[@]}" -c - 2> /dev/null; then
echo "--with-uring-zns requires blkzoned.h (from kernel >= linux-5.9)."
exit 1
fi
fi
if [[ "${CONFIG[FUSE]}" = "y" ]]; then if [[ "${CONFIG[FUSE]}" = "y" ]]; then
if [[ ! -d /usr/include/fuse3 ]] && [[ ! -d /usr/local/include/fuse3 ]]; then if [[ ! -d /usr/include/fuse3 ]] && [[ ! -d /usr/local/include/fuse3 ]]; then
echo "--with-fuse requires libfuse3." echo "--with-fuse requires libfuse3."

View File

@ -605,6 +605,10 @@ The user needs to configure SPDK to include io_uring support:
`configure --with-uring` `configure --with-uring`
To enable uring bdev for ZNS devices use the following:
`configure --with-uring --with-uring-zns`
To create a uring bdev with given filename, bdev name and block size use the `bdev_uring_create` RPC. To create a uring bdev with given filename, bdev name and block size use the `bdev_uring_create` RPC.
`rpc.py bdev_uring_create /path/to/device bdev_u0 512` `rpc.py bdev_uring_create /path/to/device bdev_u0 512`

View File

@ -6,7 +6,7 @@
#include "bdev_uring.h" #include "bdev_uring.h"
#include "spdk/stdinc.h" #include "spdk/stdinc.h"
#include "spdk/config.h"
#include "spdk/barrier.h" #include "spdk/barrier.h"
#include "spdk/bdev.h" #include "spdk/bdev.h"
#include "spdk/env.h" #include "spdk/env.h"
@ -20,6 +20,17 @@
#include "spdk/log.h" #include "spdk/log.h"
#include "spdk_internal/uring.h" #include "spdk_internal/uring.h"
#ifdef SPDK_CONFIG_URING_ZNS
#include <linux/blkzoned.h>
#define SECTOR_SHIFT 9
#endif
/* Zone geometry cached per uring bdev; only meaningful when the bdev is zoned. */
struct bdev_uring_zoned_dev {
	/* Total number of zones, as reported by the BLKGETNRZONES ioctl. */
	uint64_t	num_zones;
	/* log2 of the zone size expressed in logical blocks. */
	uint32_t	zone_shift;
	/* Shift converting logical blocks to 512-byte sectors
	 * (bdev required_alignment minus SECTOR_SHIFT). */
	uint32_t	lba_shift;
};
struct bdev_uring_io_channel { struct bdev_uring_io_channel {
struct bdev_uring_group_channel *group_ch; struct bdev_uring_group_channel *group_ch;
}; };
@ -39,6 +50,7 @@ struct bdev_uring_task {
struct bdev_uring { struct bdev_uring {
struct spdk_bdev bdev; struct spdk_bdev bdev;
struct bdev_uring_zoned_dev zd;
char *filename; char *filename;
int fd; int fd;
TAILQ_ENTRY(bdev_uring) link; TAILQ_ENTRY(bdev_uring) link;
@ -273,10 +285,271 @@ bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
} }
} }
#ifdef SPDK_CONFIG_URING_ZNS
/*
 * Read the first line of the sysfs attribute /sys/block/<device>/<attr> into
 * str, where <device> is the last path component of devname.
 *
 * devname  path of the block device (e.g. /dev/nvme0n1); not modified.
 * attr     attribute path relative to the device's sysfs directory.
 * str      output buffer; receives the line with the trailing newline chomped.
 * str_len  size of str in bytes.
 *
 * Returns 0 on success, -ENOMEM if devname could not be duplicated, -EINVAL if
 * the sysfs path could not be built or the attribute could not be read, and
 * -ENOENT if the attribute file could not be opened.
 */
static int
bdev_uring_read_sysfs_attr(const char *devname, const char *attr, char *str, int str_len)
{
	char *path = NULL;
	char *device = NULL;
	char *devname_copy;
	FILE *file;
	int ret = 0;

	/* POSIX basename() is allowed to modify its argument, so never hand it
	 * the caller's const string; work on a private copy instead. */
	devname_copy = strdup(devname);
	if (!devname_copy) {
		return -ENOMEM;
	}

	device = basename(devname_copy);
	path = spdk_sprintf_alloc("/sys/block/%s/%s", device, attr);
	/* device may point into devname_copy; it is not used past this point. */
	free(devname_copy);
	if (!path) {
		return -EINVAL;
	}

	file = fopen(path, "r");
	if (!file) {
		free(path);
		return -ENOENT;
	}

	if (!fgets(str, str_len, file)) {
		ret = -EINVAL;
		goto close;
	}

	spdk_str_chomp(str);

close:
	free(path);
	fclose(file);
	return ret;
}
/*
 * Read a sysfs attribute of the block device and parse it as a base-10 long.
 *
 * devname  path of the block device.
 * attr     attribute path relative to the device's sysfs directory.
 * val      receives the parsed value; left untouched on failure.
 *
 * Returns 0 on success, the error from bdev_uring_read_sysfs_attr() if the
 * attribute could not be read, or the negative value returned by
 * spdk_strtol() if the content is not a valid non-negative number.
 */
static int
bdev_uring_read_sysfs_attr_long(const char *devname, const char *attr, long *val)
{
	char str[128];
	long parsed;
	int ret;

	ret = bdev_uring_read_sysfs_attr(devname, attr, str, sizeof(str));
	if (ret) {
		return ret;
	}

	/* spdk_strtol() presumably reports failure as a negative errno (TODO
	 * confirm against spdk/string.h). Do not hand that to the caller as if it
	 * were the attribute value. */
	parsed = spdk_strtol(str, 10);
	if (parsed < 0) {
		return (int)parsed;
	}

	*val = parsed;
	return 0;
}
/*
 * Translate the kernel zone condition found in a zone report entry into the
 * matching SPDK zone state and store it in zone_info->state.
 *
 * Returns 0 on success. For a condition that has no mapping here, logs an
 * error, leaves zone_info->state untouched and returns -EIO.
 */
static int
bdev_uring_fill_zone_state(struct spdk_bdev_zone_info *zone_info, struct blk_zone *zones_rep)
{
	switch (zones_rep->cond) {
	case BLK_ZONE_COND_EMPTY:
		zone_info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
		return 0;
	case BLK_ZONE_COND_IMP_OPEN:
		zone_info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
		return 0;
	case BLK_ZONE_COND_EXP_OPEN:
		zone_info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
		return 0;
	case BLK_ZONE_COND_CLOSED:
		zone_info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
		return 0;
	case BLK_ZONE_COND_READONLY:
		zone_info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
		return 0;
	case BLK_ZONE_COND_FULL:
		zone_info->state = SPDK_BDEV_ZONE_STATE_FULL;
		return 0;
	case BLK_ZONE_COND_OFFLINE:
		zone_info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
		return 0;
	default:
		break;
	}

	/* Any other condition reported by the kernel is treated as an error. */
	SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", zones_rep->cond);
	return -EIO;
}
static int
bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io)
{
struct bdev_uring *uring;
struct blk_zone_range range;
long unsigned zone_mgmt_op;
uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
uring = (struct bdev_uring *)bdev_io->bdev->ctxt;
switch (bdev_io->u.zone_mgmt.zone_action) {
case SPDK_BDEV_ZONE_RESET:
zone_mgmt_op = BLKRESETZONE;
break;
case SPDK_BDEV_ZONE_OPEN:
zone_mgmt_op = BLKOPENZONE;
break;
case SPDK_BDEV_ZONE_CLOSE:
zone_mgmt_op = BLKCLOSEZONE;
break;
case SPDK_BDEV_ZONE_FINISH:
zone_mgmt_op = BLKFINISHZONE;
break;
default:
return -EINVAL;
}
range.sector = (zone_id << uring->zd.lba_shift);
range.nr_sectors = (uring->bdev.zone_size << uring->zd.lba_shift);
if (ioctl(uring->fd, zone_mgmt_op, &range)) {
SPDK_ERRLOG("Ioctl BLKXXXZONE(%#x) failed errno: %d(%s)\n",
bdev_io->u.zone_mgmt.zone_action, errno, strerror(errno));
return -EINVAL;
}
spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
return 0;
}
/*
 * Serve a GET_ZONE_INFO request: query the kernel with BLKREPORTZONE starting
 * at the request's zone_id and fill the caller's spdk_bdev_zone_info array
 * (bdev_io->u.zone_mgmt.buf) with up to num_zones entries.
 *
 * Kernel zone reports use 512-byte sectors while SPDK uses logical blocks;
 * zd.lba_shift converts between the two.
 *
 * Returns 0 after completing bdev_io with SUCCESS status. Returns -EINVAL for
 * an invalid zone count or ioctl failure and -ENOMEM on allocation failure;
 * in those cases bdev_io is not completed here.
 */
static int
bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io)
{
	struct bdev_uring *uring;
	struct blk_zone *zones;
	struct blk_zone_report *rep;
	struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf;
	size_t repsize;
	uint32_t i, shift;
	uint32_t num_zones = bdev_io->u.zone_mgmt.num_zones;
	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;

	uring = (struct bdev_uring *)bdev_io->bdev->ctxt;
	shift = uring->zd.lba_shift;

	if ((num_zones > uring->zd.num_zones) || !num_zones) {
		return -EINVAL;
	}

	repsize = sizeof(struct blk_zone_report) + (sizeof(struct blk_zone) * num_zones);
	rep = malloc(repsize);
	if (!rep) {
		return -ENOMEM;
	}

	/* The zone descriptors follow the report header in the same buffer. */
	zones = (struct blk_zone *)(rep + 1);

	/* Keep querying while entries are still wanted and the next zone index is
	 * within the device. The bound is the device's total zone count, not the
	 * number of entries still requested. */
	while (num_zones && ((zone_id >> uring->zd.zone_shift) < uring->zd.num_zones)) {
		memset(rep, 0, repsize);
		/* zone_id is in logical blocks; the ioctl expects 512-byte sectors. */
		rep->sector = zone_id << shift;
		rep->nr_zones = num_zones;

		if (ioctl(uring->fd, BLKREPORTZONE, rep)) {
			SPDK_ERRLOG("Ioctl BLKREPORTZONE failed errno: %d(%s)\n",
				    errno, strerror(errno));
			free(rep);
			return -EINVAL;
		}

		/* The kernel rewrites nr_zones with the number of entries returned. */
		if (!rep->nr_zones) {
			break;
		}

		for (i = 0; i < rep->nr_zones; i++) {
			zone_info->zone_id = ((zones + i)->start >> shift);
			zone_info->write_pointer = ((zones + i)->wp >> shift);
			zone_info->capacity = ((zones + i)->capacity >> shift);

			/* NOTE(review): the return value is ignored, so an unmapped
			 * zone condition leaves zone_info->state unset. */
			bdev_uring_fill_zone_state(zone_info, zones + i);

			/* Next query resumes right after this zone (in logical blocks). */
			zone_id = ((zones + i)->start + (zones + i)->len) >> shift;
			zone_info++;
			num_zones--;
		}
	}

	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	free(rep);
	return 0;
}
/*
 * Detect whether the device behind filename is a zoned block device and, if
 * so, populate the zoned fields of uring->bdev and uring->zd.
 *
 * The zoned model is read from the queue/zoned sysfs attribute; only
 * "host-aware" and "host-managed" are treated as zoned. For those, the zone
 * count and zone size come from ioctls on uring->fd, and the open/active zone
 * limits from sysfs.
 *
 * Returns 0 when the device is not zoned or was set up successfully, -1 when
 * the sysfs attribute cannot be read or any query fails. name is unused.
 */
static int
bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename)
{
	char zoned_model[128];
	long int sysfs_val = 0;
	uint32_t ioctl_val;

	uring->bdev.zoned = false;

	/* Check if this is a zoned block device */
	if (bdev_uring_read_sysfs_attr(filename, "queue/zoned", zoned_model, sizeof(zoned_model))) {
		SPDK_ERRLOG("Unable to open file %s/queue/zoned. errno: %d\n", filename, errno);
		return -1;
	}

	/* Anything other than host-aware / host-managed (e.g. queue/zoned=none)
	 * is handled as a regular block device. */
	if (strcmp(zoned_model, "host-aware") != 0 && strcmp(zoned_model, "host-managed") != 0) {
		return 0;
	}

	uring->bdev.zoned = true;

	if (ioctl(uring->fd, BLKGETNRZONES, &ioctl_val)) {
		SPDK_ERRLOG("ioctl BLKNRZONES failed %d (%s)\n", errno, strerror(errno));
		return -1;
	}
	uring->zd.num_zones = ioctl_val;

	if (ioctl(uring->fd, BLKGETZONESZ, &ioctl_val)) {
		SPDK_ERRLOG("ioctl BLKGETZONESZ failed %d (%s)\n", errno, strerror(errno));
		return -1;
	}

	/* The ioctl value is shifted right by lba_shift to express the zone size
	 * in logical blocks. */
	uring->zd.lba_shift = uring->bdev.required_alignment - SECTOR_SHIFT;
	uring->bdev.zone_size = (ioctl_val >> uring->zd.lba_shift);
	uring->zd.zone_shift = spdk_u32log2(ioctl_val >> uring->zd.lba_shift);

	if (bdev_uring_read_sysfs_attr_long(filename, "queue/max_open_zones", &sysfs_val)) {
		SPDK_ERRLOG("Failed to get max open zones %d (%s)\n", errno, strerror(errno));
		return -1;
	}
	uring->bdev.optimal_open_zones = (uint32_t)sysfs_val;
	uring->bdev.max_open_zones = (uint32_t)sysfs_val;

	if (bdev_uring_read_sysfs_attr_long(filename, "queue/max_active_zones", &sysfs_val)) {
		SPDK_ERRLOG("Failed to get max active zones %d (%s)\n", errno, strerror(errno));
		return -1;
	}
	uring->bdev.max_active_zones = (uint32_t)sysfs_val;

	return 0;
}
#else
/* No support for zoned devices */
/* Stub for builds without zoned support: every zone management request is
 * rejected with -1. */
static int
bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io)
{
	(void)bdev_io;

	return -1;
}
/* Stub for builds without zoned support: every zone info request is rejected
 * with -1. */
static int
bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io)
{
	(void)bdev_io;

	return -1;
}
/* Stub for builds without zoned support: performs no detection, leaves the
 * bdev untouched and always reports success (0). */
static int
bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename)
{
	(void)uring;
	(void)name;
	(void)filename;

	return 0;
}
#endif
static int static int
_bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{ {
switch (bdev_io->type) { switch (bdev_io->type) {
case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
return bdev_uring_zone_get_info(bdev_io);
case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
return bdev_uring_zone_management_op(bdev_io);
/* Read and write operations must be performed on buffers aligned to /* Read and write operations must be performed on buffers aligned to
* bdev->required_alignment. If user specified unaligned buffers, * bdev->required_alignment. If user specified unaligned buffers,
* get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */ * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
@ -302,6 +575,10 @@ static bool
bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{ {
switch (io_type) { switch (io_type) {
#ifdef SPDK_CONFIG_URING_ZNS
case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
#endif
case SPDK_BDEV_IO_TYPE_READ: case SPDK_BDEV_IO_TYPE_READ:
case SPDK_BDEV_IO_TYPE_WRITE: case SPDK_BDEV_IO_TYPE_WRITE:
return true; return true;
@ -483,6 +760,11 @@ create_uring_bdev(const char *name, const char *filename, uint32_t block_size)
uring->bdev.blocklen = block_size; uring->bdev.blocklen = block_size;
uring->bdev.required_alignment = spdk_u32log2(block_size); uring->bdev.required_alignment = spdk_u32log2(block_size);
rc = bdev_uring_check_zoned_support(uring, name, filename);
if (rc) {
goto error_return;
}
if (bdev_size % uring->bdev.blocklen != 0) { if (bdev_size % uring->bdev.blocklen != 0) {
SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
bdev_size, uring->bdev.blocklen); bdev_size, uring->bdev.blocklen);