nvmf/rdma: Improve read performance in DIF strip mode

An rdma buffer for stripping DIF metadata is added. The CPU strips the DIF
metadata and copies the data into the rdma buffer, which improves rdma write
bandwidth. In a 4KB random read test, network bandwidth increases from
79 Gbps to 99 Gbps and IOPS increases from 2075K to 2637K.
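The core of the change: for controller-to-host (read) payloads, the CPU verifies the DIF metadata and compacts the data into metadata-free buffers before the RDMA WRITE is posted, instead of making the NIC gather around the interleaved metadata with one SGE per data block. Below is a minimal sketch of that verify-and-copy step, assuming an already initialized DIF context and using the spdk_dif_verify_copy() argument order shown in this commit; the helper name itself is illustrative, not part of the commit.

#include "spdk/dif.h"
#include "spdk/util.h"

/* Verify the DIF metadata carried in src_iov (data and metadata interleaved)
 * and copy only the data blocks into dst_iov (no metadata) in a single pass.
 * elba_length is the extended-LBA length of the payload held by src_iov. */
static int
strip_dif_before_rdma_write(struct iovec *dst_iov, int dst_iovcnt,
			    struct iovec *src_iov, int src_iovcnt,
			    uint32_t elba_length, const struct spdk_dif_ctx *dif_ctx)
{
	struct spdk_dif_error err_blk;
	uint32_t num_blocks = SPDK_CEIL_DIV(elba_length, dif_ctx->block_size);

	return spdk_dif_verify_copy(dst_iov, dst_iovcnt, src_iov, src_iovcnt,
				    num_blocks, dif_ctx, &err_blk);
}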

Fixes issue #2418

Signed-off-by: Chunsong Feng <fengchunsong@huawei.com>
Change-Id: If1c31256f0390f31d396812fa33cd650bf52b336
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/11861
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Chunsong Feng authored 2022-03-09 11:44:03 +00:00, committed by Tomasz Zawadzki
parent 0bd7ace836
commit 0db0c443df
7 changed files with 183 additions and 46 deletions


@@ -80,6 +80,12 @@ struct spdk_nvmf_dif_info {
uint32_t orig_length;
};
struct spdk_nvmf_stripped_data {
uint32_t iovcnt;
struct iovec iov[NVMF_REQ_MAX_BUFFERS];
void *buffers[NVMF_REQ_MAX_BUFFERS];
};
enum spdk_nvmf_zcopy_phase {
NVMF_ZCOPY_PHASE_NONE, /* Request is not using ZCOPY */
NVMF_ZCOPY_PHASE_INIT, /* Requesting Buffers */
@@ -104,6 +110,7 @@ struct spdk_nvmf_request {
uint32_t iovcnt;
struct iovec iov[NVMF_REQ_MAX_BUFFERS];
void *buffers[NVMF_REQ_MAX_BUFFERS];
struct spdk_nvmf_stripped_data *stripped_data;
struct spdk_nvmf_dif_info dif;


@@ -48,6 +48,7 @@
#include "spdk_internal/rdma.h"
#include "nvmf_internal.h"
#include "transport.h"
#include "spdk_internal/trace_defs.h"
@@ -810,6 +811,8 @@ nvmf_rdma_resources_create(struct spdk_nvmf_rdma_resource_opts *opts)
rdma_req->req.qpair = NULL;
}
rdma_req->req.cmd = NULL;
rdma_req->req.iovcnt = 0;
rdma_req->req.stripped_data = NULL;
/* Set up memory to send responses */
rdma_req->req.rsp = &resources->cpls[i];
@@ -1484,19 +1487,29 @@ nvmf_rdma_fill_wr_sgl_with_dif(struct spdk_nvmf_rdma_poll_group *rgroup,
struct spdk_dif_ctx *dif_ctx = &rdma_req->req.dif.dif_ctx;
struct ibv_sge *sg_ele;
struct iovec *iov;
struct iovec *rdma_iov;
uint32_t lkey, remaining;
uint32_t remaining_data_block, data_block_size, md_size;
uint32_t sge_len;
int rc;
data_block_size = dif_ctx->block_size - dif_ctx->md_size;
md_size = dif_ctx->md_size;
remaining_data_block = data_block_size;
if (spdk_likely(!rdma_req->req.stripped_data)) {
rdma_iov = rdma_req->req.iov;
remaining_data_block = data_block_size;
md_size = dif_ctx->md_size;
} else {
rdma_iov = rdma_req->req.stripped_data->iov;
total_length = total_length / dif_ctx->block_size * data_block_size;
remaining_data_block = total_length;
md_size = 0;
}
wr->num_sge = 0;
while (total_length && (num_extra_wrs || wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES)) {
iov = &rdma_req->req.iov[rdma_req->iovpos];
iov = rdma_iov + rdma_req->iovpos;
rc = spdk_rdma_get_translation(device->map, iov->iov_base, iov->iov_len, &mem_translation);
if (spdk_unlikely(rc)) {
return rc;
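For example, with 512-byte data blocks and 8-byte metadata, a four-block read that previously produced four 512-byte SGEs (one per data block, each skipping the trailing metadata in req.iov) is now posted from the stripped buffers as a single contiguous 2048-byte SGE: md_size is treated as 0 and remaining_data_block spans the whole stripped length, so nothing forces a split at block boundaries. The updated unit test assertions at the end of this commit check exactly this consolidation.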
@@ -1612,6 +1625,17 @@ nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport,
assert(req->iovcnt <= rqpair->max_send_sge);
/* When dif_insert_or_strip is true and the I/O data length is greater than one block,
* stripped buffers are obtained for DIF stripping. */
if (spdk_unlikely(req->dif_enabled && (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST)
&& (req->dif.elba_length > req->dif.dif_ctx.block_size))) {
rc = nvmf_request_get_stripped_buffers(req, &rgroup->group,
&rtransport->transport, req->dif.orig_length);
if (rc != 0) {
SPDK_INFOLOG(rdma, "Get stripped buffers fail %d, fallback to req.iov.\n", rc);
}
}
rdma_req->iovpos = 0;
if (spdk_unlikely(req->dif_enabled)) {
@@ -1706,6 +1730,17 @@ nvmf_rdma_request_fill_iovs_multi_sgl(struct spdk_nvmf_rdma_transpor
return rc;
}
/* When dif_insert_or_strip is true and the I/O data length is greater than one block,
* stripped buffers are obtained for DIF stripping. */
if (spdk_unlikely(req->dif_enabled && (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST)
&& (req->dif.elba_length > req->dif.dif_ctx.block_size))) {
rc = nvmf_request_get_stripped_buffers(req, &rgroup->group,
&rtransport->transport, req->dif.orig_length);
if (rc != 0) {
SPDK_INFOLOG(rdma, "Get stripped buffers fail %d, fallback to req.iov.\n", rc);
}
}
/* The first WR must always be the embedded data WR. This is how we unwind them later. */
current_wr = &rdma_req->data.wr;
assert(current_wr != NULL);
@@ -1890,6 +1925,11 @@ _nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req,
spdk_nvmf_request_free_buffers(&rdma_req->req, &rgroup->group, &rtransport->transport);
}
if (rdma_req->req.stripped_data) {
nvmf_request_free_stripped_buffers(&rdma_req->req,
&rqpair->poller->group->group,
&rtransport->transport);
}
nvmf_rdma_request_free_data(rdma_req, rtransport);
rdma_req->req.length = 0;
rdma_req->req.iovcnt = 0;
@@ -2116,9 +2156,15 @@ nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_dif_error error_blk;
num_blocks = SPDK_CEIL_DIV(rdma_req->req.dif.elba_length, rdma_req->req.dif.dif_ctx.block_size);
rc = spdk_dif_verify(rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks,
&rdma_req->req.dif.dif_ctx, &error_blk);
if (!rdma_req->req.stripped_data) {
rc = spdk_dif_verify(rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks,
&rdma_req->req.dif.dif_ctx, &error_blk);
} else {
rc = spdk_dif_verify_copy(rdma_req->req.stripped_data->iov,
rdma_req->req.stripped_data->iovcnt,
rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks,
&rdma_req->req.dif.dif_ctx, &error_blk);
}
if (rc) {
struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl;


@@ -1048,6 +1048,8 @@ nvmf_tcp_qpair_init_mem_resource(struct spdk_nvmf_tcp_qpair *tqpair)
tcp_req->req.rsp = (union nvmf_c2h_msg *)&tcp_req->rsp;
tcp_req->req.cmd = (union nvmf_h2c_msg *)&tcp_req->cmd;
tcp_req->req.stripped_data = NULL;
/* Initialize request state to FREE */
tcp_req->state = TCP_REQUEST_STATE_FREE;
TAILQ_INSERT_TAIL(&tqpair->tcp_req_free_queue, tcp_req, state_link);
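Only the rdma transport allocates and consumes stripped buffers in this commit; the tcp transport here and the vfio-user transport below simply initialize the new stripped_data pointer to NULL for each preallocated request.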


@@ -690,7 +690,9 @@ spdk_nvmf_request_free_buffers(struct spdk_nvmf_request *req,
req->data_from_pool = false;
}
static inline int
typedef int (*set_buffer_callback)(struct spdk_nvmf_request *req, void *buf,
uint32_t length, uint32_t io_unit_size);
static int
nvmf_request_set_buffer(struct spdk_nvmf_request *req, void *buf, uint32_t length,
uint32_t io_unit_size)
{
@@ -708,9 +710,9 @@ static int
nvmf_request_get_buffers(struct spdk_nvmf_request *req,
struct spdk_nvmf_transport_poll_group *group,
struct spdk_nvmf_transport *transport,
uint32_t length)
uint32_t length, uint32_t io_unit_size,
set_buffer_callback cb_func)
{
uint32_t io_unit_size = transport->opts.io_unit_size;
uint32_t num_buffers;
uint32_t i = 0, j;
void *buffer, *buffers[NVMF_REQ_MAX_BUFFERS];
@@ -730,7 +732,7 @@ nvmf_request_get_buffers(struct spdk_nvmf_request *req,
STAILQ_REMOVE_HEAD(&group->buf_cache, link);
assert(buffer != NULL);
length = nvmf_request_set_buffer(req, buffer, length, io_unit_size);
length = cb_func(req, buffer, length, io_unit_size);
i++;
} else {
if (spdk_mempool_get_bulk(transport->data_buf_pool, buffers,
@@ -738,7 +740,7 @@ nvmf_request_get_buffers(struct spdk_nvmf_request *req,
return -ENOMEM;
}
for (j = 0; j < num_buffers - i; j++) {
length = nvmf_request_set_buffer(req, buffers[j], length, io_unit_size);
length = cb_func(req, buffers[j], length, io_unit_size);
}
i += num_buffers - i;
}
@@ -746,7 +748,6 @@ nvmf_request_get_buffers(struct spdk_nvmf_request *req,
assert(length == 0);
req->data_from_pool = true;
return 0;
}
@@ -759,11 +760,90 @@ spdk_nvmf_request_get_buffers(struct spdk_nvmf_request *req,
int rc;
req->iovcnt = 0;
rc = nvmf_request_get_buffers(req, group, transport, length);
if (rc == -ENOMEM) {
rc = nvmf_request_get_buffers(req, group, transport, length,
transport->opts.io_unit_size,
nvmf_request_set_buffer);
if (!rc) {
req->data_from_pool = true;
} else if (rc == -ENOMEM) {
spdk_nvmf_request_free_buffers(req, group, transport);
return rc;
}
return rc;
}
static int
nvmf_request_set_stripped_buffer(struct spdk_nvmf_request *req, void *buf, uint32_t length,
uint32_t io_unit_size)
{
struct spdk_nvmf_stripped_data *data = req->stripped_data;
data->buffers[data->iovcnt] = buf;
data->iov[data->iovcnt].iov_base = (void *)((uintptr_t)(buf + NVMF_DATA_BUFFER_MASK) &
~NVMF_DATA_BUFFER_MASK);
data->iov[data->iovcnt].iov_len = spdk_min(length, io_unit_size);
length -= data->iov[data->iovcnt].iov_len;
data->iovcnt++;
return length;
}
void
nvmf_request_free_stripped_buffers(struct spdk_nvmf_request *req,
struct spdk_nvmf_transport_poll_group *group,
struct spdk_nvmf_transport *transport)
{
struct spdk_nvmf_stripped_data *data = req->stripped_data;
uint32_t i;
for (i = 0; i < data->iovcnt; i++) {
if (group->buf_cache_count < group->buf_cache_size) {
STAILQ_INSERT_HEAD(&group->buf_cache,
(struct spdk_nvmf_transport_pg_cache_buf *)data->buffers[i],
link);
group->buf_cache_count++;
} else {
spdk_mempool_put(transport->data_buf_pool, data->buffers[i]);
}
}
free(data);
req->stripped_data = NULL;
}
int
nvmf_request_get_stripped_buffers(struct spdk_nvmf_request *req,
struct spdk_nvmf_transport_poll_group *group,
struct spdk_nvmf_transport *transport,
uint32_t length)
{
uint32_t block_size = req->dif.dif_ctx.block_size;
uint32_t data_block_size = block_size - req->dif.dif_ctx.md_size;
uint32_t io_unit_size = transport->opts.io_unit_size / block_size * data_block_size;
struct spdk_nvmf_stripped_data *data;
uint32_t i;
int rc;
/* Each iov must hold a whole number of extended-LBA blocks */
for (i = 0; i < req->iovcnt; i++) {
if (req->iov[i].iov_len % block_size) {
return -EINVAL;
}
}
data = calloc(1, sizeof(*data));
if (data == NULL) {
SPDK_ERRLOG("Unable to allocate memory for stripped_data.\n");
return -ENOMEM;
}
req->stripped_data = data;
req->stripped_data->iovcnt = 0;
rc = nvmf_request_get_buffers(req, group, transport, length, io_unit_size,
nvmf_request_set_stripped_buffer);
if (rc == -ENOMEM) {
nvmf_request_free_stripped_buffers(req, group, transport);
return rc;
}
return rc;
}
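nvmf_request_get_stripped_buffers() reuses the common buffer allocator, but scales the transport io_unit_size from extended-LBA bytes down to data-only bytes and requests req->dif.orig_length bytes (the payload without metadata). A short worked example of that scaling, using hypothetical block geometry that is not taken from the commit:

#include <stdint.h>

/* Hypothetical geometry: 512-byte data blocks with 8 bytes of DIF metadata,
 * and a transport io_unit_size of 16 extended-LBA blocks. */
static uint32_t
stripped_io_unit_size_example(void)
{
	uint32_t block_size = 512 + 8;            /* extended-LBA block: 520 bytes */
	uint32_t data_block_size = 512;
	uint32_t io_unit_size = 16 * block_size;  /* 8320 bytes */

	/* Same scaling as above: 8320 / 520 * 512 = 8192 bytes per stripped buffer */
	return io_unit_size / block_size * data_block_size;
}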


@@ -78,4 +78,13 @@ int nvmf_transport_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
void nvmf_transport_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
struct spdk_nvmf_request *req);
void nvmf_request_free_stripped_buffers(struct spdk_nvmf_request *req,
struct spdk_nvmf_transport_poll_group *group,
struct spdk_nvmf_transport *transport);
int nvmf_request_get_stripped_buffers(struct spdk_nvmf_request *req,
struct spdk_nvmf_transport_poll_group *group,
struct spdk_nvmf_transport *transport,
uint32_t length);
#endif /* SPDK_NVMF_TRANSPORT_H */
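Taken together with the rdma changes above, the intended call order for a transport is: nvmf_request_get_stripped_buffers() while filling iovs for a controller-to-host transfer longer than one block (falling back to req.iov if allocation fails), spdk_dif_verify_copy() from req.iov into the stripped buffers once the backend read completes, and nvmf_request_free_stripped_buffers() when the request is freed.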


@@ -1405,6 +1405,7 @@ alloc_sq_reqs(struct nvmf_vfio_user_ctrlr *vu_ctrlr, struct nvmf_vfio_user_sq *s
req->qpair = &sq->qpair;
req->rsp = (union nvmf_c2h_msg *)&vu_req->rsp;
req->cmd = (union nvmf_h2c_msg *)&vu_req->cmd;
req->stripped_data = NULL;
TAILQ_INSERT_TAIL(&sq->free_reqs, vu_req, link);
}


@@ -172,6 +172,10 @@ static void reset_nvmf_rdma_request(struct spdk_nvmf_rdma_request *rdma_req)
rdma_req->data.wr.sg_list[i].lkey = 0;
}
rdma_req->req.iovcnt = 0;
if (rdma_req->req.stripped_data) {
free(rdma_req->req.stripped_data);
rdma_req->req.stripped_data = NULL;
}
}
static void
@@ -943,6 +947,7 @@ test_spdk_nvmf_rdma_request_parse_sgl_with_md(void)
0, 0, 0, 0, 0);
rdma_req.req.dif_enabled = true;
rtransport.transport.opts.io_unit_size = data_bs * 8;
rdma_req.req.qpair->transport = &rtransport.transport;
sgl->keyed.length = data_bs * 4;
rc = nvmf_rdma_request_parse_sgl(&rtransport, &device, &rdma_req);
@@ -953,16 +958,14 @@
CU_ASSERT(rdma_req.req.dif.orig_length == rdma_req.req.length);
CU_ASSERT(rdma_req.req.dif.elba_length == (data_bs + md_size) * 4);
CU_ASSERT((uint64_t)rdma_req.req.data == 0x2000);
CU_ASSERT(rdma_req.data.wr.num_sge == 4);
CU_ASSERT(rdma_req.data.wr.num_sge == 1);
CU_ASSERT(rdma_req.data.wr.wr.rdma.rkey == 0xEEEE);
CU_ASSERT(rdma_req.data.wr.wr.rdma.remote_addr == 0xFFFF);
CU_ASSERT((uint64_t)rdma_req.req.buffers[0] == 0x2000);
for (i = 0; i < 4; ++i) {
CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000 + i * (data_bs + md_size));
CU_ASSERT(rdma_req.data.wr.sg_list[i].length == data_bs);
CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == RDMA_UT_LKEY);
}
CU_ASSERT(rdma_req.data.wr.sg_list[0].addr == 0x2000);
CU_ASSERT(rdma_req.data.wr.sg_list[0].length == rdma_req.req.length);
CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == RDMA_UT_LKEY);
/* Part 2: simple I/O, one SGL equal to io unit size, io_unit_size is not aligned with md_size,
block size 512 */
@@ -1055,16 +1058,14 @@
CU_ASSERT(rdma_req.req.dif.orig_length == rdma_req.req.length);
CU_ASSERT(rdma_req.req.dif.elba_length == (data_bs + md_size) * 4);
CU_ASSERT((uint64_t)rdma_req.req.data == 0x2000);
CU_ASSERT(rdma_req.data.wr.num_sge == 4);
CU_ASSERT(rdma_req.data.wr.num_sge == 1);
CU_ASSERT(rdma_req.data.wr.wr.rdma.rkey == 0xEEEE);
CU_ASSERT(rdma_req.data.wr.wr.rdma.remote_addr == 0xFFFF);
CU_ASSERT((uint64_t)rdma_req.req.buffers[0] == 0x2000);
for (i = 0; i < 4; ++i) {
CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000 + i * (data_bs + md_size));
CU_ASSERT(rdma_req.data.wr.sg_list[i].length == data_bs);
CU_ASSERT(rdma_req.data.wr.sg_list[i].lkey == RDMA_UT_LKEY);
}
CU_ASSERT(rdma_req.data.wr.sg_list[0].addr == 0x2000);
CU_ASSERT(rdma_req.data.wr.sg_list[0].length == rdma_req.req.length);
CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == RDMA_UT_LKEY);
/* Part 5: simple I/O, one SGL equal to 2x io unit size, io_unit_size is aligned with md_size,
block size 512 */
@@ -1085,18 +1086,14 @@
CU_ASSERT(rdma_req.req.dif.orig_length == rdma_req.req.length);
CU_ASSERT(rdma_req.req.dif.elba_length == (data_bs + md_size) * 4);
CU_ASSERT((uint64_t)rdma_req.req.data == 0x2000);
CU_ASSERT(rdma_req.data.wr.num_sge == 4);
CU_ASSERT(rdma_req.data.wr.num_sge == 2);
CU_ASSERT(rdma_req.data.wr.wr.rdma.rkey == 0xEEEE);
CU_ASSERT(rdma_req.data.wr.wr.rdma.remote_addr == 0xFFFF);
CU_ASSERT((uint64_t)rdma_req.req.buffers[0] == 0x2000);
for (i = 0; i < 2; ++i) {
CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000 + i * (data_bs + md_size));
CU_ASSERT(rdma_req.data.wr.sg_list[i].length == data_bs);
}
for (i = 0; i < 2; ++i) {
CU_ASSERT(rdma_req.data.wr.sg_list[i + 2].addr == 0x2000 + i * (data_bs + md_size));
CU_ASSERT(rdma_req.data.wr.sg_list[i + 2].length == data_bs);
CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000);
CU_ASSERT(rdma_req.data.wr.sg_list[i].length == data_bs * 2);
}
/* Part 6: simple I/O, one SGL larger than the transport io unit size, io_unit_size is not aligned to md_size,
@@ -1257,26 +1254,21 @@
CU_ASSERT(rdma_req.req.length == data_bs * 4 * 2);
CU_ASSERT(rdma_req.req.dif.orig_length == rdma_req.req.length);
CU_ASSERT(rdma_req.req.dif.elba_length == (data_bs + md_size) * 4 * 2);
CU_ASSERT(rdma_req.data.wr.num_sge == 4);
for (i = 0; i < 4; ++i) {
CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == (uintptr_t)((unsigned char *)aligned_buffer) + i *
(data_bs + md_size));
CU_ASSERT(rdma_req.data.wr.sg_list[i].length == data_bs);
}
CU_ASSERT(rdma_req.data.wr.num_sge == 1);
CU_ASSERT(rdma_req.data.wr.sg_list[0].addr == (uintptr_t)(aligned_buffer));
CU_ASSERT(rdma_req.data.wr.sg_list[0].length == data_bs * 4);
CU_ASSERT(rdma_req.data.wr.wr.rdma.rkey == 0x44);
CU_ASSERT(rdma_req.data.wr.wr.rdma.remote_addr == 0x4000);
CU_ASSERT(rdma_req.data.wr.next == &data->wr);
CU_ASSERT(data->wr.wr.rdma.rkey == 0x44);
CU_ASSERT(data->wr.wr.rdma.remote_addr == 0x4000 + data_bs * 4);
CU_ASSERT(data->wr.num_sge == 4);
for (i = 0; i < 4; ++i) {
CU_ASSERT(data->wr.sg_list[i].addr == (uintptr_t)((unsigned char *)aligned_buffer) + i *
(data_bs + md_size));
CU_ASSERT(data->wr.sg_list[i].length == data_bs);
}
CU_ASSERT(data->wr.num_sge == 1);
CU_ASSERT(data->wr.sg_list[0].addr == (uintptr_t)(aligned_buffer));
CU_ASSERT(data->wr.sg_list[0].length == data_bs * 4);
CU_ASSERT(data->wr.next == &rdma_req.rsp.wr);
reset_nvmf_rdma_request(&rdma_req);
}
static void