/* Patch summary: raid5f now computes stripe parity with spdk_xor_gen() into a per-request parity buffer and submits the parity chunk together with the data chunks; the unit test captures parity writes and compares them with a reference XOR of the source data. */
diff --git a/module/bdev/raid/raid5f.c b/module/bdev/raid/raid5f.c index 8d89d8eb0..9a0f94063 100644 --- a/module/bdev/raid/raid5f.c +++ b/module/bdev/raid/raid5f.c @@ -11,6 +11,7 @@ #include "spdk/util.h" #include "spdk/likely.h" #include "spdk/log.h" +#include "spdk/xor.h" /* Maximum concurrent full stripe writes per io channel */ #define RAID5F_MAX_STRIPES 32 @@ -41,6 +42,9 @@ struct stripe_request { /* The stripe's parity chunk */ struct chunk *parity_chunk; + /* Buffer for stripe parity */ + void *parity_buf; + TAILQ_ENTRY(stripe_request) link; /* Array of chunks corresponding to base_bdevs */ @@ -56,11 +60,28 @@ struct raid5f_info { /* Number of stripes on this array */ uint64_t total_stripes; + + /* Alignment for buffer allocation */ + size_t buf_alignment; }; struct raid5f_io_channel { /* All available stripe requests on this channel */ TAILQ_HEAD(, stripe_request) free_stripe_requests; + + /* Array of iovec iterators for each data chunk */ + struct iov_iter { + struct iovec *iovs; + int iovcnt; + int index; /* position of the current iovec within iovs */ + size_t offset; /* bytes already consumed from the current iovec */ + } *chunk_iov_iters; + + /* Array of source buffer pointers for parity calculation */ + void **chunk_xor_buffers; + + /* Bounce buffers for parity calculation in case of unaligned source buffers */ + struct iovec *chunk_xor_bounce_buffers; }; #define __CHUNK_IN_RANGE(req, c) \ @@ -109,6 +130,87 @@ raid5f_stripe_request_release(struct stripe_request *stripe_req) TAILQ_INSERT_HEAD(&stripe_req->r5ch->free_stripe_requests, stripe_req, link); } +static int +raid5f_xor_stripe(struct stripe_request *stripe_req) +{ /* Computes parity for the stripe into stripe_req->parity_buf by XOR-ing its data chunks with spdk_xor_gen(). Returns 0 on success or the nonzero result of spdk_xor_gen(). */ + struct raid_bdev_io *raid_io = stripe_req->raid_io; + struct raid5f_io_channel *r5ch = stripe_req->r5ch; + struct raid_bdev *raid_bdev = raid_io->raid_bdev; + size_t remaining = raid_bdev->strip_size << raid_bdev->blocklen_shift; + uint8_t n_src = raid5f_stripe_data_chunks_num(raid_bdev); + void *dest = stripe_req->parity_buf; + size_t alignment_mask = spdk_xor_get_optimal_alignment() - 1; /* assumes the optimal alignment is a power of two - TODO confirm */ + struct chunk *chunk; + int ret; + 
uint8_t c; + + c = 0; + FOR_EACH_DATA_CHUNK(stripe_req, chunk) { + struct iov_iter *iov_iter = &r5ch->chunk_iov_iters[c]; + bool aligned = true; + int i; + + for (i = 0; i < chunk->iovcnt; i++) { + if (((uintptr_t)chunk->iovs[i].iov_base & alignment_mask) || + (chunk->iovs[i].iov_len & alignment_mask)) { + aligned = false; + break; + } + } + + if (aligned) { + iov_iter->iovs = chunk->iovs; + iov_iter->iovcnt = chunk->iovcnt; + } else { /* some iovec base or length is misaligned: copy the chunk into the per-channel bounce buffer and XOR from there */ + iov_iter->iovs = &r5ch->chunk_xor_bounce_buffers[c]; + iov_iter->iovcnt = 1; + spdk_iovcpy(chunk->iovs, chunk->iovcnt, iov_iter->iovs, iov_iter->iovcnt); + } + + iov_iter->index = 0; + iov_iter->offset = 0; + + c++; + } + + while (remaining > 0) { /* each pass covers the largest span that stays inside the current iovec of every source */ + size_t len = remaining; + uint8_t i; + + for (i = 0; i < n_src; i++) { + struct iov_iter *iov_iter = &r5ch->chunk_iov_iters[i]; + struct iovec *iov = &iov_iter->iovs[iov_iter->index]; + + len = spdk_min(len, iov->iov_len - iov_iter->offset); + r5ch->chunk_xor_buffers[i] = iov->iov_base + iov_iter->offset; + } + + assert(len > 0); + + ret = spdk_xor_gen(dest, r5ch->chunk_xor_buffers, n_src, len); + if (spdk_unlikely(ret)) { + SPDK_ERRLOG("stripe xor failed\n"); + return ret; + } + + for (i = 0; i < n_src; i++) { + struct iov_iter *iov_iter = &r5ch->chunk_iov_iters[i]; + struct iovec *iov = &iov_iter->iovs[iov_iter->index]; + + iov_iter->offset += len; + if (iov_iter->offset == iov->iov_len) { + iov_iter->offset = 0; + iov_iter->index++; + } + } + dest += len; /* NOTE(review): dest is void *, so this is GNU-style void pointer arithmetic */ + + remaining -= len; + } + + return 0; +} + static void raid5f_chunk_write_complete(struct chunk *chunk, enum spdk_bdev_io_status status) { @@ -165,7 +267,7 @@ raid5f_chunk_write(struct chunk *chunk) * these means there are no more to complete for the stripe request, we can * release the stripe request as well. 
*/ - uint64_t base_bdev_io_not_submitted = raid5f_stripe_data_chunks_num(raid_bdev) - + uint64_t base_bdev_io_not_submitted = raid_bdev->num_base_bdevs - /* counted against num_base_bdevs to match base_bdev_io_remaining */ raid_io->base_bdev_io_submitted; if (raid_bdev_io_complete_part(stripe_req->raid_io, base_bdev_io_not_submitted, @@ -238,6 +340,11 @@ raid5f_stripe_request_map_iovecs(struct stripe_request *stripe_req, } } + stripe_req->parity_chunk->iovs[0].iov_base = stripe_req->parity_buf; + stripe_req->parity_chunk->iovs[0].iov_len = raid_bdev->strip_size << + raid_bdev->blocklen_shift; + stripe_req->parity_chunk->iovcnt = 1; /* the parity chunk is a single iovec over parity_buf */ + return 0; } @@ -248,15 +355,7 @@ raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req) struct chunk *start = &stripe_req->chunks[raid_io->base_bdev_io_submitted]; struct chunk *chunk; - if (start >= stripe_req->parity_chunk) { - start++; - } - FOR_EACH_CHUNK_FROM(stripe_req, chunk, start) { - if (chunk == stripe_req->parity_chunk) { - continue; - } - if (spdk_unlikely(raid5f_chunk_write(chunk) != 0)) { break; } @@ -267,7 +366,10 @@ raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req) static void raid5f_submit_stripe_request(struct stripe_request *stripe_req) { - /* TODO: parity */ + if (spdk_unlikely(raid5f_xor_stripe(stripe_req) != 0)) { /* NOTE(review): the caller already removed stripe_req from free_stripe_requests and this path returns without raid5f_stripe_request_release() - confirm the request is not leaked */ + raid_bdev_io_complete(stripe_req->raid_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } raid5f_stripe_request_submit_chunks(stripe_req); } @@ -300,7 +402,7 @@ raid5f_submit_write_request(struct raid_bdev_io *raid_io, uint64_t stripe_index) TAILQ_REMOVE(&r5ch->free_stripe_requests, stripe_req, link); raid_io->module_private = stripe_req; - raid_io->base_bdev_io_remaining = raid5f_stripe_data_chunks_num(raid_bdev); + raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs; /* the parity chunk is submitted as well now, so count every base bdev */ raid5f_submit_stripe_request(stripe_req); @@ -397,6 +499,8 @@ raid5f_stripe_request_free(struct stripe_request *stripe_req) free(chunk->iovs); } + spdk_dma_free(stripe_req->parity_buf); + free(stripe_req); } @@ -421,24 +525,45 @@ raid5f_stripe_request_alloc(struct 
raid5f_io_channel *r5ch) chunk->iovcnt_max = 4; chunk->iovs = calloc(chunk->iovcnt_max, sizeof(chunk->iovs[0])); if (!chunk->iovs) { - raid5f_stripe_request_free(stripe_req); - return NULL; + goto err; } } + stripe_req->parity_buf = spdk_dma_malloc(raid_bdev->strip_size << raid_bdev->blocklen_shift, + r5f_info->buf_alignment, NULL); /* one strip of bytes, aligned to r5f_info->buf_alignment */ + if (!stripe_req->parity_buf) { + goto err; + } + return stripe_req; +err: + raid5f_stripe_request_free(stripe_req); + return NULL; } static void raid5f_ioch_destroy(void *io_device, void *ctx_buf) { struct raid5f_io_channel *r5ch = ctx_buf; + struct raid5f_info *r5f_info = io_device; + struct raid_bdev *raid_bdev = r5f_info->raid_bdev; struct stripe_request *stripe_req; + int i; while ((stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests))) { TAILQ_REMOVE(&r5ch->free_stripe_requests, stripe_req, link); raid5f_stripe_request_free(stripe_req); } + + if (r5ch->chunk_xor_bounce_buffers) { /* NOTE(review): raid5f_ioch_create() can call this before the xor arrays are allocated, which presumes ctx_buf starts zeroed - confirm */ + for (i = 0; i < raid5f_stripe_data_chunks_num(raid_bdev); i++) { + free(r5ch->chunk_xor_bounce_buffers[i].iov_base); + } + free(r5ch->chunk_xor_bounce_buffers); + } + + free(r5ch->chunk_xor_buffers); + free(r5ch->chunk_iov_iters); } static int @@ -446,6 +571,9 @@ raid5f_ioch_create(void *io_device, void *ctx_buf) { struct raid5f_io_channel *r5ch = ctx_buf; struct raid5f_info *r5f_info = io_device; + struct raid_bdev *raid_bdev = r5f_info->raid_bdev; + size_t chunk_len = raid_bdev->strip_size << raid_bdev->blocklen_shift; + int status = 0; int i; TAILQ_INIT(&r5ch->free_stripe_requests); @@ -455,15 +583,48 @@ raid5f_ioch_create(void *io_device, void *ctx_buf) stripe_req = raid5f_stripe_request_alloc(r5ch); if (!stripe_req) { - SPDK_ERRLOG("Failed to initialize io channel\n"); - raid5f_ioch_destroy(r5f_info, r5ch); - return -ENOMEM; + status = -ENOMEM; + goto out; } TAILQ_INSERT_HEAD(&r5ch->free_stripe_requests, stripe_req, link); } - return 0; + r5ch->chunk_iov_iters = calloc(raid5f_stripe_data_chunks_num(raid_bdev), + sizeof(r5ch->chunk_iov_iters[0])); 
+ if (!r5ch->chunk_iov_iters) { + status = -ENOMEM; + goto out; + } + + r5ch->chunk_xor_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev), + sizeof(r5ch->chunk_xor_buffers[0])); + if (!r5ch->chunk_xor_buffers) { + status = -ENOMEM; + goto out; + } + + r5ch->chunk_xor_bounce_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev), + sizeof(r5ch->chunk_xor_bounce_buffers[0])); + if (!r5ch->chunk_xor_bounce_buffers) { + status = -ENOMEM; + goto out; + } + + for (i = 0; i < raid5f_stripe_data_chunks_num(raid_bdev); i++) { + status = posix_memalign(&r5ch->chunk_xor_bounce_buffers[i].iov_base, + spdk_xor_get_optimal_alignment(), chunk_len); + if (status) { /* NOTE(review): posix_memalign() reports failure as a positive errno value, unlike the -ENOMEM used on the other failure paths */ + goto out; + } + r5ch->chunk_xor_bounce_buffers[i].iov_len = chunk_len; + } +out: + if (status) { + SPDK_ERRLOG("Failed to initialize io channel\n"); + raid5f_ioch_destroy(r5f_info, r5ch); + } + return status; } static int @@ -472,6 +633,7 @@ raid5f_start(struct raid_bdev *raid_bdev) uint64_t min_blockcnt = UINT64_MAX; struct raid_base_bdev_info *base_info; struct raid5f_info *r5f_info; + size_t alignment; r5f_info = calloc(1, sizeof(*r5f_info)); if (!r5f_info) { @@ -480,12 +642,15 @@ raid5f_start(struct raid_bdev *raid_bdev) } r5f_info->raid_bdev = raid_bdev; + alignment = spdk_xor_get_optimal_alignment(); RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt); + alignment = spdk_max(alignment, spdk_bdev_get_buf_align(base_info->bdev)); /* keep the largest of the XOR alignment and each base bdev buffer alignment */ } r5f_info->total_stripes = min_blockcnt / raid_bdev->strip_size; r5f_info->stripe_blocks = raid_bdev->strip_size * raid5f_stripe_data_chunks_num(raid_bdev); + r5f_info->buf_alignment = alignment; raid_bdev->bdev.blockcnt = r5f_info->stripe_blocks * r5f_info->total_stripes; raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size; diff --git a/test/unit/lib/bdev/raid/raid5f.c/raid5f_ut.c b/test/unit/lib/bdev/raid/raid5f.c/raid5f_ut.c index 94bbad905..54b1dcad0 100644 --- 
a/test/unit/lib/bdev/raid/raid5f.c/raid5f_ut.c +++ b/test/unit/lib/bdev/raid/raid5f.c/raid5f_ut.c @@ -14,6 +14,7 @@ DEFINE_STUB_V(raid_bdev_module_list_add, (struct raid_bdev_module *raid_module)); DEFINE_STUB_V(raid_bdev_queue_io_wait, (struct raid_bdev_io *raid_io, struct spdk_bdev *bdev, struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn)); +DEFINE_STUB(spdk_bdev_get_buf_align, size_t, (const struct spdk_bdev *bdev), 0); void raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status) @@ -209,6 +210,9 @@ struct raid_io_info { void *src_buf; void *dest_buf; size_t buf_size; + void *parity_buf; /* parity captured from writes to the parity chunk */ + void *reference_parity; /* expected parity computed from src_buf */ + size_t parity_buf_size; enum spdk_bdev_io_status status; bool failed; int remaining; @@ -341,21 +345,27 @@ spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, stripe_req = raid5f_chunk_stripe_req(chunk); test_raid_bdev_io = (struct test_raid_bdev_io *)spdk_bdev_io_from_ctx(stripe_req->raid_io); io_info = test_raid_bdev_io->io_info; - raid_bdev = io_info->r5f_info->raid_bdev; - SPDK_CU_ASSERT_FATAL(chunk != stripe_req->parity_chunk); + raid_bdev = io_info->r5f_info->raid_bdev; stripe_idx_off = offset_blocks / raid_bdev->strip_size - io_info->offset_blocks / io_info->r5f_info->stripe_blocks; - data_chunk_idx = chunk < stripe_req->parity_chunk ? chunk->index : chunk->index - 1; - dest_buf = test_raid_bdev_io->buf + - (stripe_idx_off * io_info->r5f_info->stripe_blocks + - data_chunk_idx * raid_bdev->strip_size) * - raid_bdev->bdev.blocklen; + if (chunk == stripe_req->parity_chunk) { + if (io_info->parity_buf == NULL) { /* no parity capture buffer for this I/O, so skip the copy */ + goto submit; + } + dest_buf = io_info->parity_buf + stripe_idx_off * raid_bdev->strip_size_kb * 1024; + } else { + data_chunk_idx = chunk < stripe_req->parity_chunk ? 
chunk->index : chunk->index - 1; + dest_buf = test_raid_bdev_io->buf + + (stripe_idx_off * io_info->r5f_info->stripe_blocks + + data_chunk_idx * raid_bdev->strip_size) * + raid_bdev->bdev.blocklen; + } memcpy(dest_buf, iov->iov_base, iov->iov_len); - +submit: submit_io(test_raid_bdev_io->io_info, desc, cb, cb_arg); return 0; @@ -382,6 +392,14 @@ spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, return 0; } +static void +xor_block(uint8_t *a, uint8_t *b, size_t size) +{ /* XORs size bytes of b into a, in place */ + while (size-- > 0) { + a[size] ^= b[size]; + } +} + static void test_raid5f_write_request(struct raid_io_info *io_info) { @@ -394,6 +412,11 @@ test_raid5f_write_request(struct raid_io_info *io_info) raid5f_submit_rw_request(raid_io); process_io_completions(io_info); + + if (io_info->status == SPDK_BDEV_IO_STATUS_SUCCESS && io_info->parity_buf) { /* captured parity must equal the reference parity */ + CU_ASSERT(memcmp(io_info->parity_buf, io_info->reference_parity, + io_info->parity_buf_size) == 0); + } } static void @@ -424,6 +447,8 @@ deinit_io_info(struct raid_io_info *io_info) { free(io_info->src_buf); free(io_info->dest_buf); + free(io_info->parity_buf); + free(io_info->reference_parity); } static void @@ -463,6 +488,35 @@ init_io_info(struct raid_io_info *io_info, struct raid5f_info *r5f_info, TAILQ_INIT(&io_info->bdev_io_queue); } +static void +io_info_setup_parity(struct raid_io_info *io_info) +{ /* Allocates zeroed parity_buf and reference_parity, one strip per whole stripe of the I/O, and fills reference_parity from src_buf */ + struct raid5f_info *r5f_info = io_info->r5f_info; + struct raid_bdev *raid_bdev = r5f_info->raid_bdev; + uint32_t blocklen = raid_bdev->bdev.blocklen; + uint64_t num_stripes = io_info->num_blocks / r5f_info->stripe_blocks; + size_t strip_len = raid_bdev->strip_size * blocklen; + void *src = io_info->src_buf; + void *dest; + unsigned i, j; + + io_info->parity_buf_size = num_stripes * strip_len; + io_info->parity_buf = calloc(1, io_info->parity_buf_size); + SPDK_CU_ASSERT_FATAL(io_info->parity_buf != NULL); + + io_info->reference_parity = calloc(1, io_info->parity_buf_size); + SPDK_CU_ASSERT_FATAL(io_info->reference_parity 
!= NULL); + + dest = io_info->reference_parity; + for (i = 0; i < num_stripes; i++) { + for (j = 0; j < raid5f_stripe_data_chunks_num(raid_bdev); j++) { + xor_block(dest, src, strip_len); /* reference_parity starts zeroed, so this accumulates the XOR of the data strips of stripe i */ + src += strip_len; + } + dest += strip_len; + } +} + static void test_raid5f_submit_rw_request(struct raid5f_info *r5f_info, struct raid_bdev_io_channel *raid_ch, enum spdk_bdev_io_type io_type, uint64_t stripe_index, uint64_t stripe_offset_blocks, @@ -478,6 +532,7 @@ test_raid5f_submit_rw_request(struct raid5f_info *r5f_info, struct raid_bdev_io_ test_raid5f_read_request(&io_info); break; case SPDK_BDEV_IO_TYPE_WRITE: + io_info_setup_parity(&io_info); /* build the expected parity before issuing the write */ test_raid5f_write_request(&io_info); break; default: