bdev/malloc: Support both interleaved and separated metadata

The malloc bdev module supports both interleaved and separated
metadata as of this patch.

Unlike the NULL bdev module, opts->block_size is the data block
size, and the block size is calculated internally as the sum of
opts->block_size and opts->md_size if opts->md_interleave is true, or
as opts->block_size otherwise. This is more intuitive. Additionally,
opts->md_size accepts only 0, 8, 16, 32, 64, or 128.

Protection information (T10 DIF/DIX) will be supported in the
following patches.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: Icd9e92c8ea94e30139e416f8c533ab4cf473d2a8
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14984
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Community-CI: Mellanox Build Bot
This commit is contained in:
Shuhei Matsumoto 2022-10-24 22:26:19 +09:00 committed by Tomasz Zawadzki
parent e6b2b9075a
commit aef00d4420
7 changed files with 123 additions and 17 deletions

View File

@ -2,6 +2,10 @@
## v23.01: (Upcoming Release)
### bdev
Both interleaved and separated metadata are now supported by the malloc bdev module.
### scheduler
Changing scheduler from dynamic back to static is no longer possible,

View File

@ -2790,10 +2790,12 @@ Construct @ref bdev_config_malloc
Name | Optional | Type | Description
----------------------- | -------- | ----------- | -----------
name | Optional | string | Bdev name to use
block_size | Required | number | Block size in bytes -must be multiple of 512
block_size | Required | number | Data block size in bytes - must be a multiple of 512
num_blocks | Required | number | Number of blocks
uuid | Optional | string | UUID of new bdev
optimal_io_boundary | Optional | number | Split on optimal IO boundary, in number of blocks, default 0
md_size | Optional | number | Metadata size for this bdev (0, 8, 16, 32, 64, or 128). Default is 0.
md_interleave | Optional | boolean | Metadata location, interleaved if true, and separated if false. Default is false.
#### Result

View File

@ -17,6 +17,7 @@
/* Per-device state for one malloc-backed bdev. */
struct malloc_disk {
/* Embedded bdev descriptor; create_malloc_disk() fills in its name,
 * blocklen, blockcnt, md_len and md_interleave fields. */
struct spdk_bdev disk;
/* Backing store for block data, allocated with spdk_zmalloc() in
 * create_malloc_disk() and released with spdk_free() in malloc_disk_free().
 * I/O addresses it as malloc_buf + byte offset. */
void *malloc_buf;
/* Backing store for separated metadata. Allocated only when md_size is
 * non-zero and md_interleave is false; otherwise it stays NULL because the
 * structure is calloc()'ed. Released with spdk_free() in malloc_disk_free(). */
void *malloc_md_buf;
/* List linkage -- presumably for the module's list of malloc disks; the list
 * head is not visible here (TODO confirm). */
TAILQ_ENTRY(malloc_disk) link;
};
@ -90,6 +91,7 @@ malloc_disk_free(struct malloc_disk *malloc_disk)
free(malloc_disk->disk.name);
spdk_free(malloc_disk->malloc_buf);
spdk_free(malloc_disk->malloc_md_buf);
free(malloc_disk);
}
@ -122,10 +124,12 @@ bdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes)
static void
bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
struct malloc_task *task,
struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
struct iovec *iov, int iovcnt, size_t len, uint64_t offset,
void *md_buf, size_t md_len, uint64_t md_offset)
{
int64_t res = 0;
void *src = mdisk->malloc_buf + offset;
void *src;
void *md_src;
int i;
if (bdev_malloc_check_iov_len(iov, iovcnt, len)) {
@ -134,11 +138,13 @@ bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
return;
}
task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
task->num_outstanding = 0;
SPDK_DEBUGLOG(bdev_malloc, "read %zu bytes from offset %#" PRIx64 ", iovcnt=%d\n",
len, offset, iovcnt);
task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
task->num_outstanding = 0;
src = mdisk->malloc_buf + offset;
for (i = 0; i < iovcnt; i++) {
task->num_outstanding++;
@ -153,15 +159,34 @@ bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
src += iov[i].iov_len;
len -= iov[i].iov_len;
}
if (md_buf == NULL) {
return;
}
SPDK_DEBUGLOG(bdev_malloc, "read metadata %zu bytes from offset%#" PRIx64 "\n",
md_len, md_offset);
md_src = mdisk->malloc_md_buf + md_offset;
task->num_outstanding++;
res = spdk_accel_submit_copy(ch, md_buf, md_src, md_len, 0, malloc_done, task);
if (res != 0) {
malloc_done(task, res);
}
}
static void
bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
struct malloc_task *task,
struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
struct iovec *iov, int iovcnt, size_t len, uint64_t offset,
void *md_buf, size_t md_len, uint64_t md_offset)
{
int64_t res = 0;
void *dst = mdisk->malloc_buf + offset;
void *dst;
void *md_dst;
int i;
if (bdev_malloc_check_iov_len(iov, iovcnt, len)) {
@ -173,6 +198,8 @@ bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
SPDK_DEBUGLOG(bdev_malloc, "wrote %zu bytes to offset %#" PRIx64 ", iovcnt=%d\n",
len, offset, iovcnt);
dst = mdisk->malloc_buf + offset;
task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
task->num_outstanding = 0;
@ -188,6 +215,22 @@ bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
dst += iov[i].iov_len;
}
if (md_buf == NULL) {
return;
}
SPDK_DEBUGLOG(bdev_malloc, "wrote metadata %zu bytes to offset %#" PRIx64 "\n",
md_len, md_offset);
md_dst = mdisk->malloc_md_buf + md_offset;
task->num_outstanding++;
res = spdk_accel_submit_copy(ch, md_dst, md_buf, md_len, 0, malloc_done, task);
if (res != 0) {
malloc_done(task, res);
}
}
static int
@ -208,6 +251,7 @@ static int
_bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bdev_io)
{
uint32_t block_size = bdev_io->bdev->blocklen;
uint32_t md_size = bdev_io->bdev->md_len;
switch (bdev_io->type) {
case SPDK_BDEV_IO_TYPE_READ:
@ -228,7 +272,10 @@ _bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bde
bdev_io->u.bdev.iovs,
bdev_io->u.bdev.iovcnt,
bdev_io->u.bdev.num_blocks * block_size,
bdev_io->u.bdev.offset_blocks * block_size);
bdev_io->u.bdev.offset_blocks * block_size,
bdev_io->u.bdev.md_buf,
bdev_io->u.bdev.num_blocks * md_size,
bdev_io->u.bdev.offset_blocks * md_size);
return 0;
case SPDK_BDEV_IO_TYPE_WRITE:
@ -238,7 +285,10 @@ _bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bde
bdev_io->u.bdev.iovs,
bdev_io->u.bdev.iovcnt,
bdev_io->u.bdev.num_blocks * block_size,
bdev_io->u.bdev.offset_blocks * block_size);
bdev_io->u.bdev.offset_blocks * block_size,
bdev_io->u.bdev.md_buf,
bdev_io->u.bdev.num_blocks * md_size,
bdev_io->u.bdev.offset_blocks * md_size);
return 0;
case SPDK_BDEV_IO_TYPE_RESET:
@ -359,7 +409,8 @@ static const struct spdk_bdev_fn_table malloc_fn_table = {
int
create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts)
{
struct malloc_disk *mdisk;
struct malloc_disk *mdisk;
uint32_t block_size;
int rc;
assert(opts != NULL);
@ -370,10 +421,29 @@ create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts)
}
if (opts->block_size % 512) {
SPDK_ERRLOG("block size must be 512 bytes aligned\n");
SPDK_ERRLOG("Data block size must be 512 bytes aligned\n");
return -EINVAL;
}
switch (opts->md_size) {
case 0:
case 8:
case 16:
case 32:
case 64:
case 128:
break;
default:
SPDK_ERRLOG("metadata size %u is not supported\n", opts->md_size);
return -EINVAL;
}
if (opts->md_interleave) {
block_size = opts->block_size + opts->md_size;
} else {
block_size = opts->block_size;
}
mdisk = calloc(1, sizeof(*mdisk));
if (!mdisk) {
SPDK_ERRLOG("mdisk calloc() failed\n");
@ -386,7 +456,7 @@ create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts)
* TODO: need to pass a hint so we know which socket to allocate
* from on multi-socket systems.
*/
mdisk->malloc_buf = spdk_zmalloc(opts->num_blocks * opts->block_size, 2 * 1024 * 1024, NULL,
mdisk->malloc_buf = spdk_zmalloc(opts->num_blocks * block_size, 2 * 1024 * 1024, NULL,
SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
if (!mdisk->malloc_buf) {
SPDK_ERRLOG("malloc_buf spdk_zmalloc() failed\n");
@ -394,6 +464,16 @@ create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts)
return -ENOMEM;
}
if (!opts->md_interleave && opts->md_size != 0) {
mdisk->malloc_md_buf = spdk_zmalloc(opts->num_blocks * opts->md_size, 2 * 1024 * 1024, NULL,
SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
if (!mdisk->malloc_md_buf) {
SPDK_ERRLOG("malloc_md_buf spdk_zmalloc() failed\n");
malloc_disk_free(mdisk);
return -ENOMEM;
}
}
if (opts->name) {
mdisk->disk.name = strdup(opts->name);
} else {
@ -408,8 +488,10 @@ create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts)
mdisk->disk.product_name = "Malloc disk";
mdisk->disk.write_cache = 1;
mdisk->disk.blocklen = opts->block_size;
mdisk->disk.blocklen = block_size;
mdisk->disk.blockcnt = opts->num_blocks;
mdisk->disk.md_len = opts->md_size;
mdisk->disk.md_interleave = opts->md_interleave;
if (opts->optimal_io_boundary) {
mdisk->disk.optimal_io_boundary = opts->optimal_io_boundary;
mdisk->disk.split_on_optimal_io_boundary = true;

View File

@ -19,6 +19,8 @@ struct malloc_bdev_opts {
uint64_t num_blocks;
uint32_t block_size;
uint32_t optimal_io_boundary;
uint32_t md_size;
bool md_interleave;
};
int create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts);

View File

@ -36,6 +36,8 @@ static const struct spdk_json_object_decoder rpc_construct_malloc_decoders[] = {
{"num_blocks", offsetof(struct malloc_bdev_opts, num_blocks), spdk_json_decode_uint64},
{"block_size", offsetof(struct malloc_bdev_opts, block_size), spdk_json_decode_uint32},
{"optimal_io_boundary", offsetof(struct malloc_bdev_opts, optimal_io_boundary), spdk_json_decode_uint32, true},
{"md_size", offsetof(struct malloc_bdev_opts, md_size), spdk_json_decode_uint32, true},
{"md_interleave", offsetof(struct malloc_bdev_opts, md_interleave), spdk_json_decode_bool, true},
};
static void

View File

@ -234,15 +234,18 @@ def bdev_ocf_set_seqcutoff(client, name, policy, threshold, promotion_count):
return client.call('bdev_ocf_set_seqcutoff', params)
def bdev_malloc_create(client, num_blocks, block_size, name=None, uuid=None, optimal_io_boundary=None):
def bdev_malloc_create(client, num_blocks, block_size, name=None, uuid=None, optimal_io_boundary=None,
md_size=None, md_interleave=None):
"""Construct a malloc block device.
Args:
num_blocks: size of block device in blocks
block_size: block size of device; must be a power of 2 and at least 512
block_size: Data block size of device; must be a power of 2 and at least 512
name: name of block device (optional)
uuid: UUID of block device (optional)
optimal_io_boundary: Split on optimal IO boundary, in number of blocks, default 0 (disabled, optional)
md_size: metadata size of device (0, 8, 16, 32, 64, or 128), default 0 (optional)
md_interleave: metadata location, interleaved if set, and separated if omitted (optional)
Returns:
Name of created block device.
@ -254,6 +257,11 @@ def bdev_malloc_create(client, num_blocks, block_size, name=None, uuid=None, opt
params['uuid'] = uuid
if optimal_io_boundary:
params['optimal_io_boundary'] = optimal_io_boundary
if md_size:
params['md_size'] = md_size
if md_interleave:
params['md_interleave'] = md_interleave
return client.call('bdev_malloc_create', params)

View File

@ -353,15 +353,21 @@ if __name__ == "__main__":
block_size=args.block_size,
name=args.name,
uuid=args.uuid,
optimal_io_boundary=args.optimal_io_boundary))
optimal_io_boundary=args.optimal_io_boundary,
md_size=args.md_size,
md_interleave=args.md_interleave))
p = subparsers.add_parser('bdev_malloc_create', help='Create a bdev with malloc backend')
p.add_argument('-b', '--name', help="Name of the bdev")
p.add_argument('-u', '--uuid', help="UUID of the bdev")
p.add_argument(
'total_size', help='Size of malloc bdev in MB (float > 0)', type=float)
p.add_argument('block_size', help='Block size for this bdev', type=int)
p.add_argument('block_size', help='Data block size for this bdev', type=int)
p.add_argument('-o', '--optimal-io-boundary', help="""Split on optimal IO boundary, in number of
blocks, default 0 (disabled)""", type=int)
p.add_argument('-m', '--md-size', type=int,
help='Metadata size for this bdev (0, 8, 16, 32, 64, or 128). Default is 0.')
p.add_argument('-i', '--md-interleave', action='store_true',
help='Metadata location, interleaved if set, and separated if omitted.')
p.set_defaults(func=bdev_malloc_create)
def bdev_malloc_delete(args):