diff --git a/CHANGELOG.md b/CHANGELOG.md
index 16de4c7ba..e4f214fda 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -49,6 +49,9 @@ block except for metadata.
 spdk_vbdev_register() has been deprecated.  spdk_bdev_register() should be used
 instead.
 
+A mechanism for acquiring and releasing data buffers from bdev modules, used
+to perform zero copy operations, was added.
+
 ### NVMe-oF Target
 
 Support for per-device shared receive queues in the RDMA transport has been added.
diff --git a/include/spdk/bdev.h b/include/spdk/bdev.h
index d47fd332e..0e4126890 100644
--- a/include/spdk/bdev.h
+++ b/include/spdk/bdev.h
@@ -110,6 +110,7 @@ enum spdk_bdev_io_type {
 	SPDK_BDEV_IO_TYPE_NVME_IO,
 	SPDK_BDEV_IO_TYPE_NVME_IO_MD,
 	SPDK_BDEV_IO_TYPE_WRITE_ZEROES,
+	SPDK_BDEV_IO_TYPE_ZCOPY,
 	SPDK_BDEV_NUM_IO_TYPES /* Keep last */
 };
 
@@ -785,6 +786,47 @@ int spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel
 			    uint64_t offset_blocks, uint64_t num_blocks,
 			    spdk_bdev_io_completion_cb cb, void *cb_arg);
 
+/**
+ * Submit a request to acquire a data buffer that represents the given
+ * range of blocks. The data buffer is placed in the spdk_bdev_io structure
+ * and can be obtained by calling spdk_bdev_io_get_iovec().
+ *
+ * \param desc Block device descriptor
+ * \param ch I/O channel. Obtained by calling spdk_bdev_get_io_channel().
+ * \param offset_blocks The offset, in blocks, from the start of the block device.
+ * \param num_blocks The number of blocks.
+ * \param populate Whether the data buffer should be populated with the
+ *                 data at the given blocks. Populating the data buffer can
+ *                 be skipped if the user writes new data to the entire buffer.
+ * \param cb Called when the request is complete.
+ * \param cb_arg Argument passed to cb.
+ *
+ * \return 0 on success. On success, the callback will always
+ * be called (even if the request ultimately failed). Return
+ * negated errno on failure, in which case the callback will not be called.
+ */
+int spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+			  uint64_t offset_blocks, uint64_t num_blocks,
+			  bool populate,
+			  spdk_bdev_io_completion_cb cb, void *cb_arg);
+
+/**
+ * Submit a request to release a data buffer representing a range of blocks.
+ *
+ * \param bdev_io I/O request returned in the completion callback of spdk_bdev_zcopy_start().
+ *                The start request must have completed successfully.
+ * \param commit Whether to commit the data in the buffers to the blocks before releasing.
+ *               The data does not need to be committed if it was not modified.
+ * \param cb Called when the request is complete.
+ * \param cb_arg Argument passed to cb.
+ *
+ * \return 0 on success. On success, the callback will always
+ * be called (even if the request ultimately failed). Return
+ * negated errno on failure, in which case the callback will not be called.
+ */
+int spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit,
+			spdk_bdev_io_completion_cb cb, void *cb_arg);
+
 /**
  * Submit a write zeroes request to the bdev on the given channel. This command
  *  ensures that all bytes in the specified range are set to 00h
diff --git a/include/spdk/bdev_module.h b/include/spdk/bdev_module.h
index b37d9cae8..b8cfa5ef6 100644
--- a/include/spdk/bdev_module.h
+++ b/include/spdk/bdev_module.h
@@ -453,6 +453,17 @@ struct spdk_bdev_io {
 
 			/** count of outstanding batched split I/Os */
 			uint32_t split_outstanding;
+
+			struct {
+				/** Whether the buffer should be populated with the real data */
+				uint8_t populate : 1;
+
+				/** Whether the buffer should be committed back to disk */
+				uint8_t commit : 1;
+
+				/** True if this request is in the 'start' phase of zcopy. False if in 'end'. */
+				uint8_t start : 1;
+			} zcopy;
 		} bdev;
 		struct {
 			/** Channel reference held while messages for this reset are in progress. */
diff --git a/lib/bdev/bdev.c b/lib/bdev/bdev.c
index d7b61e9a1..08ef5b542 100644
--- a/lib/bdev/bdev.c
+++ b/lib/bdev/bdev.c
@@ -424,6 +424,11 @@ spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
 {
 	struct iovec *iovs;
 
+	if (bdev_io->u.bdev.iovs == NULL) {
+		bdev_io->u.bdev.iovs = &bdev_io->iov;
+		bdev_io->u.bdev.iovcnt = 1;
+	}
+
 	iovs = bdev_io->u.bdev.iovs;
 
 	assert(iovs != NULL);
@@ -436,6 +441,10 @@ spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
 static bool
 _is_buf_allocated(struct iovec *iovs)
 {
+	if (iovs == NULL) {
+		return false;
+	}
+
 	return iovs[0].iov_base != NULL;
 }
 
@@ -585,7 +594,6 @@ spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, u
 	bool buf_allocated;
 
 	assert(cb != NULL);
-	assert(bdev_io->u.bdev.iovs != NULL);
 
 	alignment = spdk_bdev_get_buf_align(bdev_io->bdev);
 	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
@@ -2744,6 +2752,88 @@ spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
 	return 0;
 }
 
+/*
+ * Begin a zero copy operation: take a bdev_io for the requested block range,
+ * mark it as the 'start' phase of a ZCOPY request and submit it. The
+ * caller-facing contract is documented in include/spdk/bdev.h.
+ */
+int
+spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+		      uint64_t offset_blocks, uint64_t num_blocks,
+		      bool populate,
+		      spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+	struct spdk_bdev *bdev = desc->bdev;
+	struct spdk_bdev_io *bdev_io;
+	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+	if (!desc->write) {
+		return -EBADF;
+	}
+
+	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
+		return -EINVAL;
+	}
+
+	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
+		return -ENOTSUP;
+	}
+
+	bdev_io = spdk_bdev_get_io(channel);
+	if (!bdev_io) {
+		return -ENOMEM;
+	}
+
+	bdev_io->internal.ch = channel;
+	bdev_io->internal.desc = desc;
+	bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY;
+	bdev_io->u.bdev.num_blocks = num_blocks;
+	bdev_io->u.bdev.offset_blocks = offset_blocks;
+	/*
+	 * No buffer is attached yet. spdk_bdev_io_set_buf() and
+	 * _is_buf_allocated() treat a NULL iovs as "no buffer", so clear the
+	 * iovec fields explicitly in case the bdev_io returned by
+	 * spdk_bdev_get_io() still carries values from an earlier use.
+	 */
+	bdev_io->u.bdev.iovs = NULL;
+	bdev_io->u.bdev.iovcnt = 0;
+	bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0;
+	bdev_io->u.bdev.zcopy.commit = 0;
+	bdev_io->u.bdev.zcopy.start = 1;
+	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+	spdk_bdev_io_submit(bdev_io);
+	return 0;
+}
+
+/*
+ * Finish a zero copy operation: switch the bdev_io obtained from
+ * spdk_bdev_zcopy_start() to the 'end' phase, install the new completion
+ * callback and resubmit it. See include/spdk/bdev.h for the contract.
+ */
+int
+spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit,
+		    spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+	if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) {
+		return -EINVAL;
+	}
+
+	if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS) {
+		/* The start phase failed or has not completed; nothing to release or commit. */
+		return -EINVAL;
+	}
+
+	bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0;
+	bdev_io->u.bdev.zcopy.start = 0;
+	bdev_io->internal.caller_ctx = cb_arg;
+	bdev_io->internal.cb = cb;
+	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
+
+	spdk_bdev_io_submit(bdev_io);
+	return 0;
+}
+
 int
 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
 		       uint64_t offset, uint64_t len,
@@ -4018,10 +4108,8 @@ spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *i
 
 	switch (bdev_io->type) {
 	case SPDK_BDEV_IO_TYPE_READ:
-		iovs = bdev_io->u.bdev.iovs;
-		iovcnt = bdev_io->u.bdev.iovcnt;
-		break;
 	case SPDK_BDEV_IO_TYPE_WRITE:
+	case SPDK_BDEV_IO_TYPE_ZCOPY:
 		iovs = bdev_io->u.bdev.iovs;
 		iovcnt = bdev_io->u.bdev.iovcnt;
 		break;