blob: Use bdev copy command in CoW flow if supported

Copy-on-write happens when a cluster is written for the first time on a
thin-provisioned volume. Currently it is implemented as two separate
requests to the underlying bdev: a read of the whole cluster into a
bounce buffer, followed by a write of this buffer to the new location on
the same underlying bdev.

This patch improves copy-on-write flow by utilizing copy command of
underlying bdev if it is supported. In this case we have just one
request to bdev and don't need the bounce buffer.

Signed-off-by: Evgeniy Kochetov <evgeniik@nvidia.com>
Change-Id: I92552e0f18f7a41820d589e7bb1e86160c69183f
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14351
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
This commit is contained in:
Evgeniy Kochetov 2022-08-22 16:59:37 +03:00 committed by Tomasz Zawadzki
parent 9e843fdbd1
commit b7bfa50468
5 changed files with 98 additions and 16 deletions

View File

@ -206,6 +206,10 @@ struct spdk_bs_dev {
*/ */
bool (*translate_lba)(struct spdk_bs_dev *dev, uint64_t lba, uint64_t *base_lba); bool (*translate_lba)(struct spdk_bs_dev *dev, uint64_t lba, uint64_t *base_lba);
void (*copy)(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
uint64_t dst_lba, uint64_t src_lba, uint64_t lba_count,
struct spdk_bs_dev_cb_args *cb_args);
uint64_t blockcnt; uint64_t blockcnt;
uint32_t blocklen; /* In bytes */ uint32_t blocklen; /* In bytes */
}; };

View File

@ -2389,6 +2389,28 @@ blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
blob_write_copy_cpl, ctx); blob_write_copy_cpl, ctx);
} }
/*
 * Decide whether the CoW of one cluster can be done with a single bdev
 * copy command.  That requires the blobstore device to implement the
 * copy callback, and the backing device to be able to translate the
 * cluster's first page into an LBA on that same device (*base_lba is
 * filled in on success).
 */
static bool
blob_can_copy(struct spdk_blob *blob, uint32_t cluster_start_page, uint64_t *base_lba)
{
	uint64_t src_lba;

	if (blob->bs->dev->copy == NULL) {
		return false;
	}

	src_lba = bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page);

	return blob->back_bs_dev->translate_lba(blob->back_bs_dev, src_lba, base_lba);
}
/*
 * Clone one whole cluster with a single bdev copy command: copy from
 * src_lba on the backing device into the freshly allocated cluster.
 * Completion continues in blob_write_copy_cpl().
 *
 * NOTE(review): the block count is derived from the back_bs_dev block
 * size while the copy is submitted to blob->bs->dev — this assumes both
 * devices use the same blocklen; confirm.
 */
static void
blob_copy(struct spdk_blob_copy_cluster_ctx *ctx, spdk_bs_user_op_t *op, uint64_t src_lba)
{
	struct spdk_blob *blob = ctx->blob;
	uint64_t dst_lba = bs_cluster_to_lba(blob->bs, ctx->new_cluster);
	uint64_t num_blocks = bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz);

	bs_sequence_copy_dev(ctx->seq, dst_lba, src_lba, num_blocks,
			     blob_write_copy_cpl, ctx);
}
static void static void
bs_allocate_and_copy_cluster(struct spdk_blob *blob, bs_allocate_and_copy_cluster(struct spdk_blob *blob,
struct spdk_io_channel *_ch, struct spdk_io_channel *_ch,
@ -2400,6 +2422,8 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob,
uint32_t cluster_start_page; uint32_t cluster_start_page;
uint32_t cluster_number; uint32_t cluster_number;
bool is_zeroes; bool is_zeroes;
bool can_copy;
uint64_t copy_src_lba;
int rc; int rc;
ch = spdk_io_channel_get_ctx(_ch); ch = spdk_io_channel_get_ctx(_ch);
@ -2431,11 +2455,12 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob,
ctx->page = cluster_start_page; ctx->page = cluster_start_page;
ctx->new_cluster_page = ch->new_cluster_page; ctx->new_cluster_page = ch->new_cluster_page;
memset(ctx->new_cluster_page, 0, SPDK_BS_PAGE_SIZE); memset(ctx->new_cluster_page, 0, SPDK_BS_PAGE_SIZE);
can_copy = blob_can_copy(blob, cluster_start_page, &copy_src_lba);
is_zeroes = blob->back_bs_dev->is_zeroes(blob->back_bs_dev, is_zeroes = blob->back_bs_dev->is_zeroes(blob->back_bs_dev,
bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page), bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page),
bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz)); bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz));
if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes) { if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes && !can_copy) {
ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen, ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen,
NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
if (!ctx->buf) { if (!ctx->buf) {
@ -2477,11 +2502,16 @@ bs_allocate_and_copy_cluster(struct spdk_blob *blob,
TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link);
if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes) { if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes) {
if (can_copy) {
blob_copy(ctx, op, copy_src_lba);
} else {
/* Read cluster from backing device */ /* Read cluster from backing device */
bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf, bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf,
bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page), bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page),
bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz), bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz),
blob_write_copy, ctx); blob_write_copy, ctx);
}
} else { } else {
blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster,
ctx->new_extent_page, ctx->new_cluster_page, blob_insert_cluster_cpl, ctx); ctx->new_extent_page, ctx->new_cluster_page, blob_insert_cluster_cpl, ctx);

View File

@ -238,6 +238,22 @@ bs_sequence_write_zeroes_dev(spdk_bs_sequence_t *seq,
&set->cb_args); &set->cb_args);
} }
/*
 * Sequence step: submit a copy command on the blobstore device, moving
 * lba_count blocks from src_lba to dst_lba.  cb_fn/cb_arg are recorded
 * on the request set and invoked when the bdev completes the copy.
 */
void
bs_sequence_copy_dev(spdk_bs_sequence_t *seq, uint64_t dst_lba, uint64_t src_lba,
		     uint64_t lba_count, spdk_bs_sequence_cpl cb_fn, void *cb_arg)
{
	struct spdk_bs_request_set *req_set = (struct spdk_bs_request_set *)seq;
	struct spdk_bs_channel *ch = req_set->channel;

	SPDK_DEBUGLOG(blob_rw, "Copying %" PRIu64 " blocks from LBA %" PRIu64 " to LBA %" PRIu64 "\n",
		      lba_count, src_lba, dst_lba);

	req_set->u.sequence.cb_fn = cb_fn;
	req_set->u.sequence.cb_arg = cb_arg;

	ch->dev->copy(ch->dev, ch->dev_channel, dst_lba, src_lba, lba_count, &req_set->cb_args);
}
void void
bs_sequence_finish(spdk_bs_sequence_t *seq, int bserrno) bs_sequence_finish(spdk_bs_sequence_t *seq, int bserrno)
{ {

View File

@ -150,6 +150,10 @@ void bs_sequence_write_zeroes_dev(spdk_bs_sequence_t *seq,
uint64_t lba, uint64_t lba_count, uint64_t lba, uint64_t lba_count,
spdk_bs_sequence_cpl cb_fn, void *cb_arg); spdk_bs_sequence_cpl cb_fn, void *cb_arg);
/* Submit a copy command (dst_lba <- src_lba, lba_count blocks) on the
 * blobstore device as the next step of the sequence; cb_fn is invoked
 * with cb_arg on completion. */
void bs_sequence_copy_dev(spdk_bs_sequence_t *seq,
uint64_t dst_lba, uint64_t src_lba, uint64_t lba_count,
spdk_bs_sequence_cpl cb_fn, void *cb_arg);
void bs_sequence_finish(spdk_bs_sequence_t *seq, int bserrno); void bs_sequence_finish(spdk_bs_sequence_t *seq, int bserrno);
void bs_user_op_sequence_finish(void *cb_arg, int bserrno); void bs_user_op_sequence_finish(void *cb_arg, int bserrno);

View File

@ -29,6 +29,7 @@ struct blob_resubmit {
void *payload; void *payload;
int iovcnt; int iovcnt;
uint64_t lba; uint64_t lba;
uint64_t src_lba;
uint32_t lba_count; uint32_t lba_count;
struct spdk_bs_dev_cb_args *cb_args; struct spdk_bs_dev_cb_args *cb_args;
struct spdk_blob_ext_io_opts *ext_io_opts; struct spdk_blob_ext_io_opts *ext_io_opts;
@ -64,8 +65,9 @@ bdev_blob_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *arg)
static void static void
bdev_blob_queue_io(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, bdev_blob_queue_io(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload,
int iovcnt, uint64_t lba, uint32_t lba_count, enum spdk_bdev_io_type io_type, int iovcnt, uint64_t lba, uint64_t src_lba, uint32_t lba_count,
struct spdk_bs_dev_cb_args *cb_args, struct spdk_blob_ext_io_opts *ext_io_opts) enum spdk_bdev_io_type io_type, struct spdk_bs_dev_cb_args *cb_args,
struct spdk_blob_ext_io_opts *ext_io_opts)
{ {
int rc; int rc;
struct spdk_bdev *bdev = __get_bdev(dev); struct spdk_bdev *bdev = __get_bdev(dev);
@ -85,6 +87,7 @@ bdev_blob_queue_io(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, voi
ctx->payload = payload; ctx->payload = payload;
ctx->iovcnt = iovcnt; ctx->iovcnt = iovcnt;
ctx->lba = lba; ctx->lba = lba;
ctx->src_lba = src_lba;
ctx->lba_count = lba_count; ctx->lba_count = lba_count;
ctx->cb_args = cb_args; ctx->cb_args = cb_args;
ctx->bdev_io_wait.bdev = bdev; ctx->bdev_io_wait.bdev = bdev;
@ -110,7 +113,7 @@ bdev_blob_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *p
rc = spdk_bdev_read_blocks(__get_desc(dev), channel, payload, lba, rc = spdk_bdev_read_blocks(__get_desc(dev), channel, payload, lba,
lba_count, bdev_blob_io_complete, cb_args); lba_count, bdev_blob_io_complete, cb_args);
if (rc == -ENOMEM) { if (rc == -ENOMEM) {
bdev_blob_queue_io(dev, channel, payload, 0, lba, bdev_blob_queue_io(dev, channel, payload, 0, lba, 0,
lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, NULL); lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, NULL);
} else if (rc != 0) { } else if (rc != 0) {
cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
@ -126,7 +129,7 @@ bdev_blob_write(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *
rc = spdk_bdev_write_blocks(__get_desc(dev), channel, payload, lba, rc = spdk_bdev_write_blocks(__get_desc(dev), channel, payload, lba,
lba_count, bdev_blob_io_complete, cb_args); lba_count, bdev_blob_io_complete, cb_args);
if (rc == -ENOMEM) { if (rc == -ENOMEM) {
bdev_blob_queue_io(dev, channel, payload, 0, lba, bdev_blob_queue_io(dev, channel, payload, 0, lba, 0,
lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, NULL); lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, NULL);
} else if (rc != 0) { } else if (rc != 0) {
cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
@ -143,7 +146,7 @@ bdev_blob_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
rc = spdk_bdev_readv_blocks(__get_desc(dev), channel, iov, iovcnt, lba, rc = spdk_bdev_readv_blocks(__get_desc(dev), channel, iov, iovcnt, lba,
lba_count, bdev_blob_io_complete, cb_args); lba_count, bdev_blob_io_complete, cb_args);
if (rc == -ENOMEM) { if (rc == -ENOMEM) {
bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0,
lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, NULL); lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, NULL);
} else if (rc != 0) { } else if (rc != 0) {
cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
@ -160,7 +163,7 @@ bdev_blob_writev(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
rc = spdk_bdev_writev_blocks(__get_desc(dev), channel, iov, iovcnt, lba, rc = spdk_bdev_writev_blocks(__get_desc(dev), channel, iov, iovcnt, lba,
lba_count, bdev_blob_io_complete, cb_args); lba_count, bdev_blob_io_complete, cb_args);
if (rc == -ENOMEM) { if (rc == -ENOMEM) {
bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0,
lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, NULL); lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, NULL);
} else if (rc != 0) { } else if (rc != 0) {
cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
@ -187,7 +190,7 @@ bdev_blob_readv_ext(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
rc = spdk_bdev_readv_blocks_ext(__get_desc(dev), channel, iov, iovcnt, lba, lba_count, rc = spdk_bdev_readv_blocks_ext(__get_desc(dev), channel, iov, iovcnt, lba, lba_count,
bdev_blob_io_complete, cb_args, bdev_io_opts); bdev_blob_io_complete, cb_args, bdev_io_opts);
if (rc == -ENOMEM) { if (rc == -ENOMEM) {
bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args,
io_opts); io_opts);
} else if (rc != 0) { } else if (rc != 0) {
cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
@ -214,7 +217,7 @@ bdev_blob_writev_ext(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
rc = spdk_bdev_writev_blocks_ext(__get_desc(dev), channel, iov, iovcnt, lba, lba_count, rc = spdk_bdev_writev_blocks_ext(__get_desc(dev), channel, iov, iovcnt, lba, lba_count,
bdev_blob_io_complete, cb_args, bdev_io_opts); bdev_blob_io_complete, cb_args, bdev_io_opts);
if (rc == -ENOMEM) { if (rc == -ENOMEM) {
bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args,
io_opts); io_opts);
} else if (rc != 0) { } else if (rc != 0) {
cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
@ -230,7 +233,7 @@ bdev_blob_write_zeroes(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
rc = spdk_bdev_write_zeroes_blocks(__get_desc(dev), channel, lba, rc = spdk_bdev_write_zeroes_blocks(__get_desc(dev), channel, lba,
lba_count, bdev_blob_io_complete, cb_args); lba_count, bdev_blob_io_complete, cb_args);
if (rc == -ENOMEM) { if (rc == -ENOMEM) {
bdev_blob_queue_io(dev, channel, NULL, 0, lba, bdev_blob_queue_io(dev, channel, NULL, 0, lba, 0,
lba_count, SPDK_BDEV_IO_TYPE_WRITE_ZEROES, cb_args, NULL); lba_count, SPDK_BDEV_IO_TYPE_WRITE_ZEROES, cb_args, NULL);
} else if (rc != 0) { } else if (rc != 0) {
cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
@ -248,7 +251,7 @@ bdev_blob_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, uint64
rc = spdk_bdev_unmap_blocks(__get_desc(dev), channel, lba, lba_count, rc = spdk_bdev_unmap_blocks(__get_desc(dev), channel, lba, lba_count,
bdev_blob_io_complete, cb_args); bdev_blob_io_complete, cb_args);
if (rc == -ENOMEM) { if (rc == -ENOMEM) {
bdev_blob_queue_io(dev, channel, NULL, 0, lba, bdev_blob_queue_io(dev, channel, NULL, 0, lba, 0,
lba_count, SPDK_BDEV_IO_TYPE_UNMAP, cb_args, NULL); lba_count, SPDK_BDEV_IO_TYPE_UNMAP, cb_args, NULL);
} else if (rc != 0) { } else if (rc != 0) {
cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
@ -263,6 +266,24 @@ bdev_blob_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, uint64
} }
} }
/*
 * bs_dev copy callback: offload a block-range copy to the underlying
 * bdev.  On -ENOMEM the request is parked on the bdev wait queue and
 * resubmitted once I/O resources free up; any other failure is reported
 * directly to the caller through cb_args.
 */
static void
bdev_blob_copy(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
	       uint64_t dst_lba, uint64_t src_lba, uint64_t lba_count,
	       struct spdk_bs_dev_cb_args *cb_args)
{
	int rc = spdk_bdev_copy_blocks(__get_desc(dev), channel, dst_lba, src_lba, lba_count,
				       bdev_blob_io_complete, cb_args);

	if (rc == 0) {
		return;
	}

	if (rc == -ENOMEM) {
		/* Out of bdev I/O resources: queue for resubmission. */
		bdev_blob_queue_io(dev, channel, NULL, 0, dst_lba, src_lba,
				   lba_count, SPDK_BDEV_IO_TYPE_COPY, cb_args, NULL);
	} else {
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
	}
}
static void static void
bdev_blob_resubmit(void *arg) bdev_blob_resubmit(void *arg)
{ {
@ -295,6 +316,10 @@ bdev_blob_resubmit(void *arg)
bdev_blob_write_zeroes(ctx->dev, ctx->channel, bdev_blob_write_zeroes(ctx->dev, ctx->channel,
ctx->lba, ctx->lba_count, ctx->cb_args); ctx->lba, ctx->lba_count, ctx->cb_args);
break; break;
case SPDK_BDEV_IO_TYPE_COPY:
bdev_blob_copy(ctx->dev, ctx->channel,
ctx->lba, ctx->src_lba, ctx->lba_count, ctx->cb_args);
break;
default: default:
SPDK_ERRLOG("Unsupported io type %d\n", ctx->io_type); SPDK_ERRLOG("Unsupported io type %d\n", ctx->io_type);
assert(false); assert(false);
@ -390,6 +415,9 @@ blob_bdev_init(struct blob_bdev *b, struct spdk_bdev_desc *desc)
b->bs_dev.writev_ext = bdev_blob_writev_ext; b->bs_dev.writev_ext = bdev_blob_writev_ext;
b->bs_dev.write_zeroes = bdev_blob_write_zeroes; b->bs_dev.write_zeroes = bdev_blob_write_zeroes;
b->bs_dev.unmap = bdev_blob_unmap; b->bs_dev.unmap = bdev_blob_unmap;
if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) {
b->bs_dev.copy = bdev_blob_copy;
}
b->bs_dev.get_base_bdev = bdev_blob_get_base_bdev; b->bs_dev.get_base_bdev = bdev_blob_get_base_bdev;
b->bs_dev.is_zeroes = bdev_blob_is_zeroes; b->bs_dev.is_zeroes = bdev_blob_is_zeroes;
b->bs_dev.translate_lba = bdev_blob_translate_lba; b->bs_dev.translate_lba = bdev_blob_translate_lba;