From b7bfa50468d241c1f6041ff69395ea740adbf01d Mon Sep 17 00:00:00 2001 From: Evgeniy Kochetov Date: Mon, 22 Aug 2022 16:59:37 +0300 Subject: [PATCH] blob: Use bdev copy command in CoW flow if supported Copy-on-write happens when cluster is written for the first time for thin provisioned volume. Currently it is implemented as two separate requests to underlying bdev: read of the whole cluster to bounce buffer and then write of this buffer to the new location on the same underlying bdev. This patch improves copy-on-write flow by utilizing copy command of underlying bdev if it is supported. In this case we have just one request to bdev and don't need the bounce buffer. Signed-off-by: Evgeniy Kochetov Change-Id: I92552e0f18f7a41820d589e7bb1e86160c69183f Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14351 Tested-by: SPDK CI Jenkins Community-CI: Mellanox Build Bot Reviewed-by: Jim Harris Reviewed-by: Shuhei Matsumoto Reviewed-by: Aleksey Marchuk --- include/spdk/blob.h | 4 +++ lib/blob/blobstore.c | 42 ++++++++++++++++++++++++++----- lib/blob/request.c | 16 ++++++++++++ lib/blob/request.h | 4 +++ module/blob/bdev/blob_bdev.c | 48 ++++++++++++++++++++++++++++-------- 5 files changed, 98 insertions(+), 16 deletions(-) diff --git a/include/spdk/blob.h b/include/spdk/blob.h index 629d58320..bf867850a 100644 --- a/include/spdk/blob.h +++ b/include/spdk/blob.h @@ -206,6 +206,10 @@ struct spdk_bs_dev { */ bool (*translate_lba)(struct spdk_bs_dev *dev, uint64_t lba, uint64_t *base_lba); + void (*copy)(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + uint64_t dst_lba, uint64_t src_lba, uint64_t lba_count, + struct spdk_bs_dev_cb_args *cb_args); + uint64_t blockcnt; uint32_t blocklen; /* In bytes */ }; diff --git a/lib/blob/blobstore.c b/lib/blob/blobstore.c index 1b28c9240..fcfafc7c9 100644 --- a/lib/blob/blobstore.c +++ b/lib/blob/blobstore.c @@ -2389,6 +2389,28 @@ blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) blob_write_copy_cpl, 
ctx); } +static bool +blob_can_copy(struct spdk_blob *blob, uint32_t cluster_start_page, uint64_t *base_lba) +{ + uint64_t lba = bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page); + + return (blob->bs->dev->copy != NULL) && + blob->back_bs_dev->translate_lba(blob->back_bs_dev, lba, base_lba); +} + +static void +blob_copy(struct spdk_blob_copy_cluster_ctx *ctx, spdk_bs_user_op_t *op, uint64_t src_lba) +{ + struct spdk_blob *blob = ctx->blob; + uint64_t lba_count = bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz); + + bs_sequence_copy_dev(ctx->seq, + bs_cluster_to_lba(blob->bs, ctx->new_cluster), + src_lba, + lba_count, + blob_write_copy_cpl, ctx); +} + static void bs_allocate_and_copy_cluster(struct spdk_blob *blob, struct spdk_io_channel *_ch, @@ -2400,6 +2422,8 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob, uint32_t cluster_start_page; uint32_t cluster_number; bool is_zeroes; + bool can_copy; + uint64_t copy_src_lba; int rc; ch = spdk_io_channel_get_ctx(_ch); @@ -2431,11 +2455,12 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob, ctx->page = cluster_start_page; ctx->new_cluster_page = ch->new_cluster_page; memset(ctx->new_cluster_page, 0, SPDK_BS_PAGE_SIZE); + can_copy = blob_can_copy(blob, cluster_start_page, &copy_src_lba); is_zeroes = blob->back_bs_dev->is_zeroes(blob->back_bs_dev, bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page), bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz)); - if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes) { + if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes && !can_copy) { ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); if (!ctx->buf) { @@ -2477,11 +2502,16 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob, TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes) { - /* Read cluster from backing device */ - 
bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf, - bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page), - bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz), - blob_write_copy, ctx); + if (can_copy) { + blob_copy(ctx, op, copy_src_lba); + } else { + /* Read cluster from backing device */ + bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf, + bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page), + bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz), + blob_write_copy, ctx); + } + } else { blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, ctx->new_extent_page, ctx->new_cluster_page, blob_insert_cluster_cpl, ctx); diff --git a/lib/blob/request.c b/lib/blob/request.c index 7943879b6..bd4d32825 100644 --- a/lib/blob/request.c +++ b/lib/blob/request.c @@ -238,6 +238,22 @@ bs_sequence_write_zeroes_dev(spdk_bs_sequence_t *seq, &set->cb_args); } +void +bs_sequence_copy_dev(spdk_bs_sequence_t *seq, uint64_t dst_lba, uint64_t src_lba, + uint64_t lba_count, spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(blob_rw, "Copying %" PRIu64 " blocks from LBA %" PRIu64 " to LBA %" PRIu64 "\n", + lba_count, src_lba, dst_lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + channel->dev->copy(channel->dev, channel->dev_channel, dst_lba, src_lba, lba_count, &set->cb_args); +} + void bs_sequence_finish(spdk_bs_sequence_t *seq, int bserrno) { diff --git a/lib/blob/request.h b/lib/blob/request.h index ebee8a1d7..54112824e 100644 --- a/lib/blob/request.h +++ b/lib/blob/request.h @@ -150,6 +150,10 @@ void bs_sequence_write_zeroes_dev(spdk_bs_sequence_t *seq, uint64_t lba, uint64_t lba_count, spdk_bs_sequence_cpl cb_fn, void *cb_arg); +void bs_sequence_copy_dev(spdk_bs_sequence_t *seq, + uint64_t dst_lba, uint64_t src_lba, uint64_t lba_count, + 
spdk_bs_sequence_cpl cb_fn, void *cb_arg); + void bs_sequence_finish(spdk_bs_sequence_t *seq, int bserrno); void bs_user_op_sequence_finish(void *cb_arg, int bserrno); diff --git a/module/blob/bdev/blob_bdev.c b/module/blob/bdev/blob_bdev.c index a4e52b92b..44e7b4829 100644 --- a/module/blob/bdev/blob_bdev.c +++ b/module/blob/bdev/blob_bdev.c @@ -29,6 +29,7 @@ struct blob_resubmit { void *payload; int iovcnt; uint64_t lba; + uint64_t src_lba; uint32_t lba_count; struct spdk_bs_dev_cb_args *cb_args; struct spdk_blob_ext_io_opts *ext_io_opts; @@ -64,8 +65,9 @@ bdev_blob_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *arg) static void bdev_blob_queue_io(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, - int iovcnt, uint64_t lba, uint32_t lba_count, enum spdk_bdev_io_type io_type, - struct spdk_bs_dev_cb_args *cb_args, struct spdk_blob_ext_io_opts *ext_io_opts) + int iovcnt, uint64_t lba, uint64_t src_lba, uint32_t lba_count, + enum spdk_bdev_io_type io_type, struct spdk_bs_dev_cb_args *cb_args, + struct spdk_blob_ext_io_opts *ext_io_opts) { int rc; struct spdk_bdev *bdev = __get_bdev(dev); @@ -85,6 +87,7 @@ bdev_blob_queue_io(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, voi ctx->payload = payload; ctx->iovcnt = iovcnt; ctx->lba = lba; + ctx->src_lba = src_lba; ctx->lba_count = lba_count; ctx->cb_args = cb_args; ctx->bdev_io_wait.bdev = bdev; @@ -110,7 +113,7 @@ bdev_blob_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *p rc = spdk_bdev_read_blocks(__get_desc(dev), channel, payload, lba, lba_count, bdev_blob_io_complete, cb_args); if (rc == -ENOMEM) { - bdev_blob_queue_io(dev, channel, payload, 0, lba, + bdev_blob_queue_io(dev, channel, payload, 0, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, NULL); } else if (rc != 0) { cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); @@ -126,7 +129,7 @@ bdev_blob_write(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void * rc = 
spdk_bdev_write_blocks(__get_desc(dev), channel, payload, lba, lba_count, bdev_blob_io_complete, cb_args); if (rc == -ENOMEM) { - bdev_blob_queue_io(dev, channel, payload, 0, lba, + bdev_blob_queue_io(dev, channel, payload, 0, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, NULL); } else if (rc != 0) { cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); @@ -143,7 +146,7 @@ bdev_blob_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, rc = spdk_bdev_readv_blocks(__get_desc(dev), channel, iov, iovcnt, lba, lba_count, bdev_blob_io_complete, cb_args); if (rc == -ENOMEM) { - bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, + bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, NULL); } else if (rc != 0) { cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); @@ -160,7 +163,7 @@ bdev_blob_writev(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, rc = spdk_bdev_writev_blocks(__get_desc(dev), channel, iov, iovcnt, lba, lba_count, bdev_blob_io_complete, cb_args); if (rc == -ENOMEM) { - bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, + bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, NULL); } else if (rc != 0) { cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); @@ -187,7 +190,7 @@ bdev_blob_readv_ext(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, rc = spdk_bdev_readv_blocks_ext(__get_desc(dev), channel, iov, iovcnt, lba, lba_count, bdev_blob_io_complete, cb_args, bdev_io_opts); if (rc == -ENOMEM) { - bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, + bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, io_opts); } else if (rc != 0) { cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); @@ -214,7 +217,7 @@ bdev_blob_writev_ext(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, rc = spdk_bdev_writev_blocks_ext(__get_desc(dev), channel, 
iov, iovcnt, lba, lba_count, bdev_blob_io_complete, cb_args, bdev_io_opts); if (rc == -ENOMEM) { - bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, + bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, io_opts); } else if (rc != 0) { cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); @@ -230,7 +233,7 @@ bdev_blob_write_zeroes(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, rc = spdk_bdev_write_zeroes_blocks(__get_desc(dev), channel, lba, lba_count, bdev_blob_io_complete, cb_args); if (rc == -ENOMEM) { - bdev_blob_queue_io(dev, channel, NULL, 0, lba, + bdev_blob_queue_io(dev, channel, NULL, 0, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_WRITE_ZEROES, cb_args, NULL); } else if (rc != 0) { cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); @@ -248,7 +251,7 @@ bdev_blob_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, uint64 rc = spdk_bdev_unmap_blocks(__get_desc(dev), channel, lba, lba_count, bdev_blob_io_complete, cb_args); if (rc == -ENOMEM) { - bdev_blob_queue_io(dev, channel, NULL, 0, lba, + bdev_blob_queue_io(dev, channel, NULL, 0, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_UNMAP, cb_args, NULL); } else if (rc != 0) { cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); @@ -263,6 +266,24 @@ bdev_blob_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, uint64 } } +static void +bdev_blob_copy(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + uint64_t dst_lba, uint64_t src_lba, uint64_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + int rc; + + rc = spdk_bdev_copy_blocks(__get_desc(dev), channel, + dst_lba, src_lba, lba_count, + bdev_blob_io_complete, cb_args); + if (rc == -ENOMEM) { + bdev_blob_queue_io(dev, channel, NULL, 0, dst_lba, src_lba, + lba_count, SPDK_BDEV_IO_TYPE_COPY, cb_args, NULL); + } else if (rc != 0) { + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); + } +} + static void bdev_blob_resubmit(void 
*arg) { @@ -295,6 +316,10 @@ bdev_blob_resubmit(void *arg) bdev_blob_write_zeroes(ctx->dev, ctx->channel, ctx->lba, ctx->lba_count, ctx->cb_args); break; + case SPDK_BDEV_IO_TYPE_COPY: + bdev_blob_copy(ctx->dev, ctx->channel, + ctx->lba, ctx->src_lba, ctx->lba_count, ctx->cb_args); + break; default: SPDK_ERRLOG("Unsupported io type %d\n", ctx->io_type); assert(false); @@ -390,6 +415,9 @@ blob_bdev_init(struct blob_bdev *b, struct spdk_bdev_desc *desc) b->bs_dev.writev_ext = bdev_blob_writev_ext; b->bs_dev.write_zeroes = bdev_blob_write_zeroes; b->bs_dev.unmap = bdev_blob_unmap; + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { + b->bs_dev.copy = bdev_blob_copy; + } b->bs_dev.get_base_bdev = bdev_blob_get_base_bdev; b->bs_dev.is_zeroes = bdev_blob_is_zeroes; b->bs_dev.translate_lba = bdev_blob_translate_lba;