From aef00d4420639a8e1abff899f43eda21992dec42 Mon Sep 17 00:00:00 2001 From: Shuhei Matsumoto Date: Mon, 24 Oct 2022 22:26:19 +0900 Subject: [PATCH] bdev/malloc: Support both of interleaved and separated metadata The malloc bdev module supports both of interleaved and separated metadata in this patch. Different from the NULL bdev module, opts->block_size is a data block size and a block size is calculated internally as a sum of opts->block_size and opts->md_size if opts->md_interleave is true, or opts->block_size otherwise. This will be more intuitive. Additionally, opts->md_size accepts only either of 0, 8, 16, 32, 64, or 128. Protection information (T10 DIF/DIX) will be supported in the following patches. Signed-off-by: Shuhei Matsumoto Change-Id: Icd9e92c8ea94e30139e416f8c533ab4cf473d2a8 Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14984 Tested-by: SPDK CI Jenkins Reviewed-by: Changpeng Liu Reviewed-by: Aleksey Marchuk Community-CI: Mellanox Build Bot --- CHANGELOG.md | 4 + doc/jsonrpc.md | 4 +- module/bdev/malloc/bdev_malloc.c | 106 ++++++++++++++++++++++++--- module/bdev/malloc/bdev_malloc.h | 2 + module/bdev/malloc/bdev_malloc_rpc.c | 2 + python/spdk/rpc/bdev.py | 12 ++- scripts/rpc.py | 10 ++- 7 files changed, 123 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e6e051ef7..999d1ddd2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## v23.01: (Upcoming Release) +### bdev + +Both of interleaved and separated metadata are now supported by the malloc bdev module. 
+ ### scheduler Changing scheduler from dynamic back to static is no longer possible, diff --git a/doc/jsonrpc.md b/doc/jsonrpc.md index 7a65be89b..7ba5836dd 100644 --- a/doc/jsonrpc.md +++ b/doc/jsonrpc.md @@ -2790,10 +2790,12 @@ Construct @ref bdev_config_malloc Name | Optional | Type | Description ----------------------- | -------- | ----------- | ----------- name | Optional | string | Bdev name to use -block_size | Required | number | Block size in bytes -must be multiple of 512 +block_size | Required | number | Data block size in bytes -must be multiple of 512 num_blocks | Required | number | Number of blocks uuid | Optional | string | UUID of new bdev optimal_io_boundary | Optional | number | Split on optimal IO boundary, in number of blocks, default 0 +md_size | Optional | number | Metadata size for this bdev (0, 8, 16, 32, 64, or 128). Default is 0. +md_interleave | Optional | boolean | Metadata location, interleaved if true, and separated if false. Default is false. #### Result diff --git a/module/bdev/malloc/bdev_malloc.c b/module/bdev/malloc/bdev_malloc.c index a594b4e90..e1360cfac 100644 --- a/module/bdev/malloc/bdev_malloc.c +++ b/module/bdev/malloc/bdev_malloc.c @@ -17,6 +17,7 @@ struct malloc_disk { struct spdk_bdev disk; void *malloc_buf; + void *malloc_md_buf; TAILQ_ENTRY(malloc_disk) link; }; @@ -90,6 +91,7 @@ malloc_disk_free(struct malloc_disk *malloc_disk) free(malloc_disk->disk.name); spdk_free(malloc_disk->malloc_buf); + spdk_free(malloc_disk->malloc_md_buf); free(malloc_disk); } @@ -122,10 +124,12 @@ bdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes) static void bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch, struct malloc_task *task, - struct iovec *iov, int iovcnt, size_t len, uint64_t offset) + struct iovec *iov, int iovcnt, size_t len, uint64_t offset, + void *md_buf, size_t md_len, uint64_t md_offset) { int64_t res = 0; - void *src = mdisk->malloc_buf + offset; + void *src; + void *md_src; 
int i; if (bdev_malloc_check_iov_len(iov, iovcnt, len)) { @@ -134,11 +138,13 @@ bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch, return; } + task->status = SPDK_BDEV_IO_STATUS_SUCCESS; + task->num_outstanding = 0; + SPDK_DEBUGLOG(bdev_malloc, "read %zu bytes from offset %#" PRIx64 ", iovcnt=%d\n", len, offset, iovcnt); - task->status = SPDK_BDEV_IO_STATUS_SUCCESS; - task->num_outstanding = 0; + src = mdisk->malloc_buf + offset; for (i = 0; i < iovcnt; i++) { task->num_outstanding++; @@ -153,15 +159,34 @@ bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch, src += iov[i].iov_len; len -= iov[i].iov_len; } + + if (md_buf == NULL) { + return; + } + + SPDK_DEBUGLOG(bdev_malloc, "read metadata %zu bytes from offset%#" PRIx64 "\n", + md_len, md_offset); + + md_src = mdisk->malloc_md_buf + md_offset; + + task->num_outstanding++; + res = spdk_accel_submit_copy(ch, md_buf, md_src, md_len, 0, malloc_done, task); + + if (res != 0) { + malloc_done(task, res); + } } static void bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch, struct malloc_task *task, - struct iovec *iov, int iovcnt, size_t len, uint64_t offset) + struct iovec *iov, int iovcnt, size_t len, uint64_t offset, + void *md_buf, size_t md_len, uint64_t md_offset) { + int64_t res = 0; - void *dst = mdisk->malloc_buf + offset; + void *dst; + void *md_dst; int i; if (bdev_malloc_check_iov_len(iov, iovcnt, len)) { @@ -173,6 +198,8 @@ bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch, SPDK_DEBUGLOG(bdev_malloc, "wrote %zu bytes to offset %#" PRIx64 ", iovcnt=%d\n", len, offset, iovcnt); + dst = mdisk->malloc_buf + offset; + task->status = SPDK_BDEV_IO_STATUS_SUCCESS; task->num_outstanding = 0; @@ -188,6 +215,22 @@ bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch, dst += iov[i].iov_len; } + + if (md_buf == NULL) { + return; + } + SPDK_DEBUGLOG(bdev_malloc, "wrote metadata %zu bytes to offset %#" PRIx64 "\n", + 
md_len, md_offset); + + md_dst = mdisk->malloc_md_buf + md_offset; + + task->num_outstanding++; + res = spdk_accel_submit_copy(ch, md_dst, md_buf, md_len, 0, malloc_done, task); + + if (res != 0) { + malloc_done(task, res); + } + } static int @@ -208,6 +251,7 @@ static int _bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bdev_io) { uint32_t block_size = bdev_io->bdev->blocklen; + uint32_t md_size = bdev_io->bdev->md_len; switch (bdev_io->type) { case SPDK_BDEV_IO_TYPE_READ: @@ -228,7 +272,10 @@ _bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bde bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.num_blocks * block_size, - bdev_io->u.bdev.offset_blocks * block_size); + bdev_io->u.bdev.offset_blocks * block_size, + bdev_io->u.bdev.md_buf, + bdev_io->u.bdev.num_blocks * md_size, + bdev_io->u.bdev.offset_blocks * md_size); return 0; case SPDK_BDEV_IO_TYPE_WRITE: @@ -238,7 +285,10 @@ _bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bde bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.num_blocks * block_size, - bdev_io->u.bdev.offset_blocks * block_size); + bdev_io->u.bdev.offset_blocks * block_size, + bdev_io->u.bdev.md_buf, + bdev_io->u.bdev.num_blocks * md_size, + bdev_io->u.bdev.offset_blocks * md_size); return 0; case SPDK_BDEV_IO_TYPE_RESET: @@ -359,7 +409,8 @@ static const struct spdk_bdev_fn_table malloc_fn_table = { int create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts) { - struct malloc_disk *mdisk; + struct malloc_disk *mdisk; + uint32_t block_size; int rc; assert(opts != NULL); @@ -370,10 +421,29 @@ create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts) } if (opts->block_size % 512) { - SPDK_ERRLOG("block size must be 512 bytes aligned\n"); + SPDK_ERRLOG("Data block size must be 512 bytes aligned\n"); return -EINVAL; } + switch (opts->md_size) { + case 0: + case 8: + case 16: + case 32: + case 64: 
+ case 128: + break; + default: + SPDK_ERRLOG("metadata size %u is not supported\n", opts->md_size); + return -EINVAL; + } + + if (opts->md_interleave) { + block_size = opts->block_size + opts->md_size; + } else { + block_size = opts->block_size; + } + mdisk = calloc(1, sizeof(*mdisk)); if (!mdisk) { SPDK_ERRLOG("mdisk calloc() failed\n"); @@ -386,7 +456,7 @@ create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts) * TODO: need to pass a hint so we know which socket to allocate * from on multi-socket systems. */ - mdisk->malloc_buf = spdk_zmalloc(opts->num_blocks * opts->block_size, 2 * 1024 * 1024, NULL, + mdisk->malloc_buf = spdk_zmalloc(opts->num_blocks * block_size, 2 * 1024 * 1024, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); if (!mdisk->malloc_buf) { SPDK_ERRLOG("malloc_buf spdk_zmalloc() failed\n"); @@ -394,6 +464,16 @@ create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts) return -ENOMEM; } + if (!opts->md_interleave && opts->md_size != 0) { + mdisk->malloc_md_buf = spdk_zmalloc(opts->num_blocks * opts->md_size, 2 * 1024 * 1024, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (!mdisk->malloc_md_buf) { + SPDK_ERRLOG("malloc_md_buf spdk_zmalloc() failed\n"); + malloc_disk_free(mdisk); + return -ENOMEM; + } + } + if (opts->name) { mdisk->disk.name = strdup(opts->name); } else { @@ -408,8 +488,10 @@ create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts) mdisk->disk.product_name = "Malloc disk"; mdisk->disk.write_cache = 1; - mdisk->disk.blocklen = opts->block_size; + mdisk->disk.blocklen = block_size; mdisk->disk.blockcnt = opts->num_blocks; + mdisk->disk.md_len = opts->md_size; + mdisk->disk.md_interleave = opts->md_interleave; if (opts->optimal_io_boundary) { mdisk->disk.optimal_io_boundary = opts->optimal_io_boundary; mdisk->disk.split_on_optimal_io_boundary = true; diff --git a/module/bdev/malloc/bdev_malloc.h b/module/bdev/malloc/bdev_malloc.h index 8d01f4f0b..9822e9403 
100644 --- a/module/bdev/malloc/bdev_malloc.h +++ b/module/bdev/malloc/bdev_malloc.h @@ -19,6 +19,8 @@ struct malloc_bdev_opts { uint64_t num_blocks; uint32_t block_size; uint32_t optimal_io_boundary; + uint32_t md_size; + bool md_interleave; }; int create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts); diff --git a/module/bdev/malloc/bdev_malloc_rpc.c b/module/bdev/malloc/bdev_malloc_rpc.c index 6aa16a6a1..a1551b604 100644 --- a/module/bdev/malloc/bdev_malloc_rpc.c +++ b/module/bdev/malloc/bdev_malloc_rpc.c @@ -36,6 +36,8 @@ static const struct spdk_json_object_decoder rpc_construct_malloc_decoders[] = { {"num_blocks", offsetof(struct malloc_bdev_opts, num_blocks), spdk_json_decode_uint64}, {"block_size", offsetof(struct malloc_bdev_opts, block_size), spdk_json_decode_uint32}, {"optimal_io_boundary", offsetof(struct malloc_bdev_opts, optimal_io_boundary), spdk_json_decode_uint32, true}, + {"md_size", offsetof(struct malloc_bdev_opts, md_size), spdk_json_decode_uint32, true}, + {"md_interleave", offsetof(struct malloc_bdev_opts, md_interleave), spdk_json_decode_bool, true}, }; static void diff --git a/python/spdk/rpc/bdev.py b/python/spdk/rpc/bdev.py index e7b7d11d0..8fdc8060e 100644 --- a/python/spdk/rpc/bdev.py +++ b/python/spdk/rpc/bdev.py @@ -234,15 +234,18 @@ def bdev_ocf_set_seqcutoff(client, name, policy, threshold, promotion_count): return client.call('bdev_ocf_set_seqcutoff', params) -def bdev_malloc_create(client, num_blocks, block_size, name=None, uuid=None, optimal_io_boundary=None): +def bdev_malloc_create(client, num_blocks, block_size, name=None, uuid=None, optimal_io_boundary=None, + md_size=None, md_interleave=None): """Construct a malloc block device. 
Args: num_blocks: size of block device in blocks - block_size: block size of device; must be a power of 2 and at least 512 + block_size: Data block size of device; must be a power of 2 and at least 512 name: name of block device (optional) uuid: UUID of block device (optional) optimal_io_boundary: Split on optimal IO boundary, in number of blocks, default 0 (disabled, optional) + md_size: metadata size of device (0, 8, 16, 32, 64, or 128), default 0 (optional) + md_interleave: metadata location, interleaved if set, and separated if omitted (optional) Returns: Name of created block device. @@ -254,6 +257,11 @@ def bdev_malloc_create(client, num_blocks, block_size, name=None, uuid=None, opt params['uuid'] = uuid if optimal_io_boundary: params['optimal_io_boundary'] = optimal_io_boundary + if md_size: + params['md_size'] = md_size + if md_interleave: + params['md_interleave'] = md_interleave + return client.call('bdev_malloc_create', params) diff --git a/scripts/rpc.py b/scripts/rpc.py index 0430961b9..f9aaaba97 100755 --- a/scripts/rpc.py +++ b/scripts/rpc.py @@ -353,15 +353,21 @@ if __name__ == "__main__": block_size=args.block_size, name=args.name, uuid=args.uuid, - optimal_io_boundary=args.optimal_io_boundary)) + optimal_io_boundary=args.optimal_io_boundary, + md_size=args.md_size, + md_interleave=args.md_interleave)) p = subparsers.add_parser('bdev_malloc_create', help='Create a bdev with malloc backend') p.add_argument('-b', '--name', help="Name of the bdev") p.add_argument('-u', '--uuid', help="UUID of the bdev") p.add_argument( 'total_size', help='Size of malloc bdev in MB (float > 0)', type=float) - p.add_argument('block_size', help='Block size for this bdev', type=int) + p.add_argument('block_size', help='Data block size for this bdev', type=int) p.add_argument('-o', '--optimal-io-boundary', help="""Split on optimal IO boundary, in number of blocks, default 0 (disabled)""", type=int) + p.add_argument('-m', '--md-size', type=int, + help='Metadata size for 
this bdev (0, 8, 16, 32, 64, or 128). Default is 0.') + p.add_argument('-i', '--md-interleave', action='store_true', + help='Metadata location, interleaved if set, and separated if omitted.') p.set_defaults(func=bdev_malloc_create) def bdev_malloc_delete(args):