bdev: Add a zero copy I/O path

Add a ZCOPY operation to obtain buffers that represent
data regions on the backing block device.

Change-Id: Ie941c16ee051d0009e3888b52b8f41773bba47b3
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/386166
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
This commit is contained in:
Ben Walker 2017-11-07 15:05:19 -07:00 committed by Jim Harris
parent b92c3d412d
commit 84850dacd7
4 changed files with 125 additions and 4 deletions

View File

@ -49,6 +49,9 @@ block except for metadata.
spdk_vbdev_register() has been deprecated. spdk_bdev_register() should be used
instead.
A mechanism for acquiring and releasing data buffers from bdev modules was
added. It is used to perform zero-copy operations.
### NVMe-oF Target
Support for per-device shared receive queues in the RDMA transport has been added.

View File

@ -110,6 +110,7 @@ enum spdk_bdev_io_type {
SPDK_BDEV_IO_TYPE_NVME_IO,
SPDK_BDEV_IO_TYPE_NVME_IO_MD,
SPDK_BDEV_IO_TYPE_WRITE_ZEROES,
SPDK_BDEV_IO_TYPE_ZCOPY,
SPDK_BDEV_NUM_IO_TYPES /* Keep last */
};
@ -785,6 +786,47 @@ int spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel
uint64_t offset_blocks, uint64_t num_blocks,
spdk_bdev_io_completion_cb cb, void *cb_arg);
/**
 * Submit a request to acquire a data buffer that represents the given
 * range of blocks. The data buffer is placed in the spdk_bdev_io structure
 * and can be obtained by calling spdk_bdev_io_get_iovec().
 *
 * The spdk_bdev_io passed to the completion callback is the handle that is
 * later handed to spdk_bdev_zcopy_end() to release the buffer.
 *
 * \param desc Block device descriptor. Must have been opened for writing.
 * \param ch I/O channel. Obtained by calling spdk_bdev_get_io_channel().
 * \param offset_blocks The offset, in blocks, from the start of the block device.
 * \param num_blocks The number of blocks.
 * \param populate Whether the data buffer should be populated with the
 * data at the given blocks. Populating the data buffer can
 * be skipped if the user writes new data to the entire buffer.
 * \param cb Called when the request is complete.
 * \param cb_arg Argument passed to cb.
 *
 * \return 0 on success. On success, the callback will always
 * be called (even if the request ultimately failed). Return
 * negated errno on failure, in which case the callback will not be called.
 * * -EBADF - desc was not opened for writing
 * * -EINVAL - offset_blocks and/or num_blocks do not describe a valid range
 * * -ENOTSUP - the bdev does not support SPDK_BDEV_IO_TYPE_ZCOPY
 * * -ENOMEM - an spdk_bdev_io could not be obtained for the request
 */
int spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
uint64_t offset_blocks, uint64_t num_blocks,
bool populate,
spdk_bdev_io_completion_cb cb, void *cb_arg);
/**
 * Submit a request to release a data buffer representing a range of blocks.
 *
 * The same spdk_bdev_io that was used for the start request is resubmitted;
 * no new I/O channel or descriptor is supplied here.
 *
 * \param bdev_io I/O request returned in the completion callback of spdk_bdev_zcopy_start().
 * \param commit Whether to commit the data in the buffers to the blocks before releasing.
 * The data does not need to be committed if it was not modified.
 * \param cb Called when the request is complete.
 * \param cb_arg Argument passed to cb.
 *
 * \return 0 on success. On success, the callback will always
 * be called (even if the request ultimately failed). Return
 * negated errno on failure, in which case the callback will not be called.
 * * -EINVAL - bdev_io is not an SPDK_BDEV_IO_TYPE_ZCOPY request
 */
int spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit,
spdk_bdev_io_completion_cb cb, void *cb_arg);
/**
* Submit a write zeroes request to the bdev on the given channel. This command
* ensures that all bytes in the specified range are set to 00h

View File

@ -453,6 +453,17 @@ struct spdk_bdev_io {
/** count of outstanding batched split I/Os */
uint32_t split_outstanding;
/** State flags for SPDK_BDEV_IO_TYPE_ZCOPY requests. */
struct {
/** Set by spdk_bdev_zcopy_start(): whether the buffer should be populated with the real data */
uint8_t populate : 1;
/** Set by spdk_bdev_zcopy_end() (cleared by start): whether the buffer should be committed back to disk */
uint8_t commit : 1;
/** True if this request is in the 'start' phase of zcopy. False if in 'end'. */
uint8_t start : 1;
} zcopy;
} bdev;
struct {
/** Channel reference held while messages for this reset are in progress. */

View File

@ -424,6 +424,11 @@ spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
struct iovec *iovs;
if (bdev_io->u.bdev.iovs == NULL) {
bdev_io->u.bdev.iovs = &bdev_io->iov;
bdev_io->u.bdev.iovcnt = 1;
}
iovs = bdev_io->u.bdev.iovs;
assert(iovs != NULL);
@ -436,6 +441,10 @@ spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
/*
 * Report whether the first element of an iovec array already has a data
 * buffer attached. A NULL array is treated as "no buffer".
 */
static bool
_is_buf_allocated(struct iovec *iovs)
{
	/* Short-circuit keeps the NULL array from being dereferenced. */
	return iovs != NULL && iovs[0].iov_base != NULL;
}
@ -585,7 +594,6 @@ spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, u
bool buf_allocated;
assert(cb != NULL);
assert(bdev_io->u.bdev.iovs != NULL);
alignment = spdk_bdev_get_buf_align(bdev_io->bdev);
buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
@ -2744,6 +2752,65 @@ spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
return 0;
}
/*
 * Begin a zero copy operation: build an SPDK_BDEV_IO_TYPE_ZCOPY request in
 * its 'start' phase for the given block range and submit it.
 *
 * Returns 0 once the request has been submitted (cb is then invoked on
 * completion), or a negated errno without invoking cb:
 *   -EBADF   desc was not opened for writing
 *   -EINVAL  the block range failed spdk_bdev_io_valid_blocks()
 *   -ENOTSUP the bdev does not support SPDK_BDEV_IO_TYPE_ZCOPY
 *   -ENOMEM  spdk_bdev_get_io() could not provide an spdk_bdev_io
 */
int
spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		      uint64_t offset_blocks, uint64_t num_blocks,
		      bool populate,
		      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
		return -ENOTSUP;
	}

	bdev_io = spdk_bdev_get_io(channel);
	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	/*
	 * The caller supplies no buffer for a zcopy start; the data buffer is
	 * attached to the request later. Explicitly mark the request as having
	 * no iovec so that spdk_bdev_io_set_buf() and _is_buf_allocated() see
	 * iovs == NULL rather than whatever this spdk_bdev_io held before.
	 * NOTE(review): assumes spdk_bdev_get_io()/spdk_bdev_io_init() do not
	 * already clear these fields - confirm.
	 */
	bdev_io->u.bdev.iovs = NULL;
	bdev_io->u.bdev.iovcnt = 0;
	bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0;
	bdev_io->u.bdev.zcopy.commit = 0;
	bdev_io->u.bdev.zcopy.start = 1;
	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);

	spdk_bdev_io_submit(bdev_io);

	return 0;
}
/*
 * Finish a zero copy operation: flip an existing ZCOPY request into its
 * 'end' phase, install the new completion callback and resubmit it.
 *
 * Returns 0 once the request has been submitted, or -EINVAL (without
 * invoking cb) if bdev_io is not an SPDK_BDEV_IO_TYPE_ZCOPY request.
 */
int
spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit,
		    spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) {
		return -EINVAL;
	}

	/* Re-arm the request for a second trip through the submit path. */
	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
	bdev_io->internal.cb = cb;
	bdev_io->internal.caller_ctx = cb_arg;

	/* Mark the 'end' phase and record whether the data must be committed. */
	bdev_io->u.bdev.zcopy.start = 0;
	bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0;

	spdk_bdev_io_submit(bdev_io);

	return 0;
}
int
spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
uint64_t offset, uint64_t len,
@ -4018,10 +4085,8 @@ spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *i
switch (bdev_io->type) {
case SPDK_BDEV_IO_TYPE_READ:
iovs = bdev_io->u.bdev.iovs;
iovcnt = bdev_io->u.bdev.iovcnt;
break;
case SPDK_BDEV_IO_TYPE_WRITE:
case SPDK_BDEV_IO_TYPE_ZCOPY:
iovs = bdev_io->u.bdev.iovs;
iovcnt = bdev_io->u.bdev.iovcnt;
break;