diff --git a/include/spdk/nvmf_transport.h b/include/spdk/nvmf_transport.h index 42ea9b9e1..7c01acd74 100644 --- a/include/spdk/nvmf_transport.h +++ b/include/spdk/nvmf_transport.h @@ -80,6 +80,12 @@ struct spdk_nvmf_dif_info { uint32_t orig_length; }; +struct spdk_nvmf_stripped_data { + uint32_t iovcnt; + struct iovec iov[NVMF_REQ_MAX_BUFFERS]; + void *buffers[NVMF_REQ_MAX_BUFFERS]; +}; + enum spdk_nvmf_zcopy_phase { NVMF_ZCOPY_PHASE_NONE, /* Request is not using ZCOPY */ NVMF_ZCOPY_PHASE_INIT, /* Requesting Buffers */ @@ -104,6 +110,7 @@ struct spdk_nvmf_request { uint32_t iovcnt; struct iovec iov[NVMF_REQ_MAX_BUFFERS]; void *buffers[NVMF_REQ_MAX_BUFFERS]; + struct spdk_nvmf_stripped_data *stripped_data; struct spdk_nvmf_dif_info dif; diff --git a/lib/nvmf/rdma.c b/lib/nvmf/rdma.c index d19e0da21..02eb86bd0 100644 --- a/lib/nvmf/rdma.c +++ b/lib/nvmf/rdma.c @@ -48,6 +48,7 @@ #include "spdk_internal/rdma.h" #include "nvmf_internal.h" +#include "transport.h" #include "spdk_internal/trace_defs.h" @@ -810,6 +811,8 @@ nvmf_rdma_resources_create(struct spdk_nvmf_rdma_resource_opts *opts) rdma_req->req.qpair = NULL; } rdma_req->req.cmd = NULL; + rdma_req->req.iovcnt = 0; + rdma_req->req.stripped_data = NULL; /* Set up memory to send responses */ rdma_req->req.rsp = &resources->cpls[i]; @@ -1484,19 +1487,29 @@ nvmf_rdma_fill_wr_sgl_with_dif(struct spdk_nvmf_rdma_poll_group *rgroup, struct spdk_dif_ctx *dif_ctx = &rdma_req->req.dif.dif_ctx; struct ibv_sge *sg_ele; struct iovec *iov; + struct iovec *rdma_iov; uint32_t lkey, remaining; uint32_t remaining_data_block, data_block_size, md_size; uint32_t sge_len; int rc; data_block_size = dif_ctx->block_size - dif_ctx->md_size; - md_size = dif_ctx->md_size; - remaining_data_block = data_block_size; + + if (spdk_likely(!rdma_req->req.stripped_data)) { + rdma_iov = rdma_req->req.iov; + remaining_data_block = data_block_size; + md_size = dif_ctx->md_size; + } else { + rdma_iov = rdma_req->req.stripped_data->iov; + total_length = 
total_length / dif_ctx->block_size * data_block_size; + remaining_data_block = total_length; + md_size = 0; + } wr->num_sge = 0; while (total_length && (num_extra_wrs || wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES)) { - iov = &rdma_req->req.iov[rdma_req->iovpos]; + iov = rdma_iov + rdma_req->iovpos; rc = spdk_rdma_get_translation(device->map, iov->iov_base, iov->iov_len, &mem_translation); if (spdk_unlikely(rc)) { return rc; @@ -1612,6 +1625,17 @@ nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport, assert(req->iovcnt <= rqpair->max_send_sge); + /* When dif_insert_or_strip is true and a controller-to-host I/O is larger than one block, + * try to get stripped buffers for DIF stripping; on failure req.iov is used instead. */ + if (spdk_unlikely(req->dif_enabled && (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) + && (req->dif.elba_length > req->dif.dif_ctx.block_size))) { + rc = nvmf_request_get_stripped_buffers(req, &rgroup->group, + &rtransport->transport, req->dif.orig_length); + if (rc != 0) { + SPDK_INFOLOG(rdma, "Get stripped buffers fail %d, fallback to req.iov.\n", rc); + } + } + rdma_req->iovpos = 0; if (spdk_unlikely(req->dif_enabled)) { @@ -1706,6 +1730,17 @@ nvmf_rdma_request_fill_iovs_multi_sgl(struct spdk_nvmf_rdma_transport *rtranspor return rc; } + /* When dif_insert_or_strip is true and a controller-to-host I/O is larger than one block, + * try to get stripped buffers for DIF stripping; on failure req.iov is used instead. */ + if (spdk_unlikely(req->dif_enabled && (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) + && (req->dif.elba_length > req->dif.dif_ctx.block_size))) { + rc = nvmf_request_get_stripped_buffers(req, &rgroup->group, + &rtransport->transport, req->dif.orig_length); + if (rc != 0) { + SPDK_INFOLOG(rdma, "Get stripped buffers fail %d, fallback to req.iov.\n", rc); + } + } + /* The first WR must always be the embedded data WR. This is how we unwind them later. 
*/ current_wr = &rdma_req->data.wr; assert(current_wr != NULL); @@ -1890,6 +1925,11 @@ _nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req, spdk_nvmf_request_free_buffers(&rdma_req->req, &rgroup->group, &rtransport->transport); } + if (rdma_req->req.stripped_data) { + nvmf_request_free_stripped_buffers(&rdma_req->req, + &rqpair->poller->group->group, + &rtransport->transport); + } nvmf_rdma_request_free_data(rdma_req, rtransport); rdma_req->req.length = 0; rdma_req->req.iovcnt = 0; @@ -2116,9 +2156,15 @@ nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, struct spdk_dif_error error_blk; num_blocks = SPDK_CEIL_DIV(rdma_req->req.dif.elba_length, rdma_req->req.dif.dif_ctx.block_size); - - rc = spdk_dif_verify(rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks, - &rdma_req->req.dif.dif_ctx, &error_blk); + if (!rdma_req->req.stripped_data) { + rc = spdk_dif_verify(rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks, + &rdma_req->req.dif.dif_ctx, &error_blk); + } else { + rc = spdk_dif_verify_copy(rdma_req->req.stripped_data->iov, + rdma_req->req.stripped_data->iovcnt, + rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks, + &rdma_req->req.dif.dif_ctx, &error_blk); + } if (rc) { struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl; diff --git a/lib/nvmf/tcp.c b/lib/nvmf/tcp.c index ffa1c8fcc..ff88feb04 100644 --- a/lib/nvmf/tcp.c +++ b/lib/nvmf/tcp.c @@ -1048,6 +1048,8 @@ nvmf_tcp_qpair_init_mem_resource(struct spdk_nvmf_tcp_qpair *tqpair) tcp_req->req.rsp = (union nvmf_c2h_msg *)&tcp_req->rsp; tcp_req->req.cmd = (union nvmf_h2c_msg *)&tcp_req->cmd; + tcp_req->req.stripped_data = NULL; + /* Initialize request state to FREE */ tcp_req->state = TCP_REQUEST_STATE_FREE; TAILQ_INSERT_TAIL(&tqpair->tcp_req_free_queue, tcp_req, state_link); diff --git a/lib/nvmf/transport.c b/lib/nvmf/transport.c index eb3f2cfc9..31f6ed3ca 100644 --- a/lib/nvmf/transport.c +++ b/lib/nvmf/transport.c @@ -690,7 +690,9 @@ spdk_nvmf_request_free_buffers(struct 
spdk_nvmf_request *req, req->data_from_pool = false; } -static inline int +typedef int (*set_buffer_callback)(struct spdk_nvmf_request *req, void *buf, + uint32_t length, uint32_t io_unit_size); +static int nvmf_request_set_buffer(struct spdk_nvmf_request *req, void *buf, uint32_t length, uint32_t io_unit_size) { @@ -708,9 +710,9 @@ static int nvmf_request_get_buffers(struct spdk_nvmf_request *req, struct spdk_nvmf_transport_poll_group *group, struct spdk_nvmf_transport *transport, - uint32_t length) + uint32_t length, uint32_t io_unit_size, + set_buffer_callback cb_func) { - uint32_t io_unit_size = transport->opts.io_unit_size; uint32_t num_buffers; uint32_t i = 0, j; void *buffer, *buffers[NVMF_REQ_MAX_BUFFERS]; @@ -730,7 +732,7 @@ nvmf_request_get_buffers(struct spdk_nvmf_request *req, STAILQ_REMOVE_HEAD(&group->buf_cache, link); assert(buffer != NULL); - length = nvmf_request_set_buffer(req, buffer, length, io_unit_size); + length = cb_func(req, buffer, length, io_unit_size); i++; } else { if (spdk_mempool_get_bulk(transport->data_buf_pool, buffers, @@ -738,7 +740,7 @@ nvmf_request_get_buffers(struct spdk_nvmf_request *req, return -ENOMEM; } for (j = 0; j < num_buffers - i; j++) { - length = nvmf_request_set_buffer(req, buffers[j], length, io_unit_size); + length = cb_func(req, buffers[j], length, io_unit_size); } i += num_buffers - i; } @@ -746,7 +748,6 @@ nvmf_request_get_buffers(struct spdk_nvmf_request *req, assert(length == 0); - req->data_from_pool = true; return 0; } @@ -759,11 +760,90 @@ spdk_nvmf_request_get_buffers(struct spdk_nvmf_request *req, int rc; req->iovcnt = 0; - - rc = nvmf_request_get_buffers(req, group, transport, length); - if (rc == -ENOMEM) { + rc = nvmf_request_get_buffers(req, group, transport, length, + transport->opts.io_unit_size, + nvmf_request_set_buffer); + if (!rc) { + req->data_from_pool = true; + } else if (rc == -ENOMEM) { spdk_nvmf_request_free_buffers(req, group, transport); + return rc; } return rc; } + +static int 
+nvmf_request_set_stripped_buffer(struct spdk_nvmf_request *req, void *buf, uint32_t length, + uint32_t io_unit_size) +{ + struct spdk_nvmf_stripped_data *data = req->stripped_data; + + data->buffers[data->iovcnt] = buf; + data->iov[data->iovcnt].iov_base = (void *)((uintptr_t)(buf + NVMF_DATA_BUFFER_MASK) & + ~NVMF_DATA_BUFFER_MASK); + data->iov[data->iovcnt].iov_len = spdk_min(length, io_unit_size); + length -= data->iov[data->iovcnt].iov_len; + data->iovcnt++; + + return length; +} + +void +nvmf_request_free_stripped_buffers(struct spdk_nvmf_request *req, + struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_stripped_data *data = req->stripped_data; + uint32_t i; + + for (i = 0; i < data->iovcnt; i++) { + if (group->buf_cache_count < group->buf_cache_size) { + STAILQ_INSERT_HEAD(&group->buf_cache, + (struct spdk_nvmf_transport_pg_cache_buf *)data->buffers[i], + link); + group->buf_cache_count++; + } else { + spdk_mempool_put(transport->data_buf_pool, data->buffers[i]); + } + } + free(data); + req->stripped_data = NULL; +} + +int +nvmf_request_get_stripped_buffers(struct spdk_nvmf_request *req, + struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_transport *transport, + uint32_t length) +{ + uint32_t block_size = req->dif.dif_ctx.block_size; + uint32_t data_block_size = block_size - req->dif.dif_ctx.md_size; + uint32_t io_unit_size = transport->opts.io_unit_size / block_size * data_block_size; + struct spdk_nvmf_stripped_data *data; + uint32_t i; + int rc; + + /* Data blocks must be block aligned */ + for (i = 0; i < req->iovcnt; i++) { + if (req->iov[i].iov_len % block_size) { + return -EINVAL; + } + } + + data = calloc(1, sizeof(*data)); + if (data == NULL) { + SPDK_ERRLOG("Unable to allocate memory for stripped_data.\n"); + return -ENOMEM; + } + req->stripped_data = data; + req->stripped_data->iovcnt = 0; + + rc = nvmf_request_get_buffers(req, group, transport, length, io_unit_size, + 
nvmf_request_set_stripped_buffer); + if (rc == -ENOMEM) { + nvmf_request_free_stripped_buffers(req, group, transport); + return rc; + } + return rc; +} diff --git a/lib/nvmf/transport.h b/lib/nvmf/transport.h index c9cac3d5a..979b231d4 100644 --- a/lib/nvmf/transport.h +++ b/lib/nvmf/transport.h @@ -78,4 +78,13 @@ int nvmf_transport_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, void nvmf_transport_qpair_abort_request(struct spdk_nvmf_qpair *qpair, struct spdk_nvmf_request *req); +void nvmf_request_free_stripped_buffers(struct spdk_nvmf_request *req, + struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_transport *transport); + +int nvmf_request_get_stripped_buffers(struct spdk_nvmf_request *req, + struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_transport *transport, + uint32_t length); + #endif /* SPDK_NVMF_TRANSPORT_H */ diff --git a/lib/nvmf/vfio_user.c b/lib/nvmf/vfio_user.c index 7eb5fc3bd..7e9f6b743 100644 --- a/lib/nvmf/vfio_user.c +++ b/lib/nvmf/vfio_user.c @@ -1405,6 +1405,7 @@ alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *s req->qpair = &sq->qpair; req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp; req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd; + req->stripped_data = NULL; TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link); } diff --git a/test/unit/lib/nvmf/rdma.c/rdma_ut.c b/test/unit/lib/nvmf/rdma.c/rdma_ut.c index e20788311..e2f68bac4 100644 --- a/test/unit/lib/nvmf/rdma.c/rdma_ut.c +++ b/test/unit/lib/nvmf/rdma.c/rdma_ut.c @@ -172,6 +172,10 @@ static void reset_nvmf_rdma_request(struct spdk_nvmf_rdma_request *rdma_req) rdma_req->data.wr.sg_list[i].lkey = 0; } rdma_req->req.iovcnt = 0; + if (rdma_req->req.stripped_data) { + free(rdma_req->req.stripped_data); + rdma_req->req.stripped_data = NULL; + } } static void @@ -943,6 +947,7 @@ test_spdk_nvmf_rdma_request_parse_sgl_with_md(void) 0, 0, 0, 0, 0); rdma_req.req.dif_enabled = true; rtransport.transport.opts.io_unit_size = data_bs * 8; + 
rdma_req.req.qpair->transport = &rtransport.transport; sgl->keyed.length = data_bs * 4; rc = nvmf_rdma_request_parse_sgl(&rtransport, &device, &rdma_req); @@ -953,16 +958,14 @@ test_spdk_nvmf_rdma_request_parse_sgl_with_md(void) CU_ASSERT(rdma_req.req.dif.orig_length == rdma_req.req.length); CU_ASSERT(rdma_req.req.dif.elba_length == (data_bs + md_size) * 4); CU_ASSERT((uint64_t)rdma_req.req.data == 0x2000); - CU_ASSERT(rdma_req.data.wr.num_sge == 4); + CU_ASSERT(rdma_req.data.wr.num_sge == 1); CU_ASSERT(rdma_req.data.wr.wr.rdma.rkey == 0xEEEE); CU_ASSERT(rdma_req.data.wr.wr.rdma.remote_addr == 0xFFFF); CU_ASSERT((uint64_t)rdma_req.req.buffers[0] == 0x2000); - for (i = 0; i < 4; ++i) { - CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000 + i * (data_bs + md_size)); - CU_ASSERT(rdma_req.data.wr.sg_list[i].length == data_bs); - CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == RDMA_UT_LKEY); - } + CU_ASSERT(rdma_req.data.wr.sg_list[0].addr == 0x2000); + CU_ASSERT(rdma_req.data.wr.sg_list[0].length == rdma_req.req.length); + CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == RDMA_UT_LKEY); /* Part 2: simple I/O, one SGL equal to io unit size, io_unit_size is not aligned with md_size, block size 512 */ @@ -1055,16 +1058,14 @@ test_spdk_nvmf_rdma_request_parse_sgl_with_md(void) CU_ASSERT(rdma_req.req.dif.orig_length == rdma_req.req.length); CU_ASSERT(rdma_req.req.dif.elba_length == (data_bs + md_size) * 4); CU_ASSERT((uint64_t)rdma_req.req.data == 0x2000); - CU_ASSERT(rdma_req.data.wr.num_sge == 4); + CU_ASSERT(rdma_req.data.wr.num_sge == 1); CU_ASSERT(rdma_req.data.wr.wr.rdma.rkey == 0xEEEE); CU_ASSERT(rdma_req.data.wr.wr.rdma.remote_addr == 0xFFFF); CU_ASSERT((uint64_t)rdma_req.req.buffers[0] == 0x2000); - for (i = 0; i < 4; ++i) { - CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000 + i * (data_bs + md_size)); - CU_ASSERT(rdma_req.data.wr.sg_list[i].length == data_bs); - CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == RDMA_UT_LKEY); - } + 
CU_ASSERT(rdma_req.data.wr.sg_list[0].addr == 0x2000); + CU_ASSERT(rdma_req.data.wr.sg_list[0].length == rdma_req.req.length); + CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == RDMA_UT_LKEY); /* Part 5: simple I/O, one SGL equal to 2x io unit size, io_unit_size is aligned with md_size, block size 512 */ @@ -1085,18 +1086,14 @@ test_spdk_nvmf_rdma_request_parse_sgl_with_md(void) CU_ASSERT(rdma_req.req.dif.orig_length == rdma_req.req.length); CU_ASSERT(rdma_req.req.dif.elba_length == (data_bs + md_size) * 4); CU_ASSERT((uint64_t)rdma_req.req.data == 0x2000); - CU_ASSERT(rdma_req.data.wr.num_sge == 4); + CU_ASSERT(rdma_req.data.wr.num_sge == 2); CU_ASSERT(rdma_req.data.wr.wr.rdma.rkey == 0xEEEE); CU_ASSERT(rdma_req.data.wr.wr.rdma.remote_addr == 0xFFFF); CU_ASSERT((uint64_t)rdma_req.req.buffers[0] == 0x2000); for (i = 0; i < 2; ++i) { - CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000 + i * (data_bs + md_size)); - CU_ASSERT(rdma_req.data.wr.sg_list[i].length == data_bs); - } - for (i = 0; i < 2; ++i) { - CU_ASSERT(rdma_req.data.wr.sg_list[i + 2].addr == 0x2000 + i * (data_bs + md_size)); - CU_ASSERT(rdma_req.data.wr.sg_list[i + 2].length == data_bs); + CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000); + CU_ASSERT(rdma_req.data.wr.sg_list[i].length == data_bs * 2); } /* Part 6: simple I/O, one SGL larger than the transport io unit size, io_unit_size is not aligned to md_size, @@ -1257,26 +1254,21 @@ test_spdk_nvmf_rdma_request_parse_sgl_with_md(void) CU_ASSERT(rdma_req.req.length == data_bs * 4 * 2); CU_ASSERT(rdma_req.req.dif.orig_length == rdma_req.req.length); CU_ASSERT(rdma_req.req.dif.elba_length == (data_bs + md_size) * 4 * 2); - CU_ASSERT(rdma_req.data.wr.num_sge == 4); - for (i = 0; i < 4; ++i) { - CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == (uintptr_t)((unsigned char *)aligned_buffer) + i * - (data_bs + md_size)); - CU_ASSERT(rdma_req.data.wr.sg_list[i].length == data_bs); - } + CU_ASSERT(rdma_req.data.wr.num_sge == 1); + 
CU_ASSERT(rdma_req.data.wr.sg_list[0].addr == (uintptr_t)(aligned_buffer)); + CU_ASSERT(rdma_req.data.wr.sg_list[0].length == data_bs * 4); CU_ASSERT(rdma_req.data.wr.wr.rdma.rkey == 0x44); CU_ASSERT(rdma_req.data.wr.wr.rdma.remote_addr == 0x4000); CU_ASSERT(rdma_req.data.wr.next == &data->wr); CU_ASSERT(data->wr.wr.rdma.rkey == 0x44); CU_ASSERT(data->wr.wr.rdma.remote_addr == 0x4000 + data_bs * 4); - CU_ASSERT(data->wr.num_sge == 4); - for (i = 0; i < 4; ++i) { - CU_ASSERT(data->wr.sg_list[i].addr == (uintptr_t)((unsigned char *)aligned_buffer) + i * - (data_bs + md_size)); - CU_ASSERT(data->wr.sg_list[i].length == data_bs); - } + CU_ASSERT(data->wr.num_sge == 1); + CU_ASSERT(data->wr.sg_list[0].addr == (uintptr_t)(aligned_buffer)); + CU_ASSERT(data->wr.sg_list[0].length == data_bs * 4); CU_ASSERT(data->wr.next == &rdma_req.rsp.wr); + reset_nvmf_rdma_request(&rdma_req); } static void