From c818233b4171c955353a10aa7537e81a6615eb49 Mon Sep 17 00:00:00 2001
From: Ivan Betsis
Date: Wed, 12 Feb 2020 12:36:58 +0200
Subject: [PATCH] nvmf/rdma: Add transport option to disable WR batching

On x86, at low queue depths with multiple QPs/initiators, disabling
Work Request (WR) batching improves performance for random read IO
patterns. Add a 'no_wr_batching' RDMA transport option so batching can
be turned off.

Testing environment: x86, Intel(R) Xeon(R) Silver 4116 CPU @ 2.10GHz

Selected results (4K random read; bandwidth in MiB/s):

init_cores | QD | BS | Avg IOPS/BW without batch | Avg IOPS/BW with batch
 8 cores   |  4 | 4K | 1870087 / 7305            | 1594014 / 6226
 8 cores   |  8 | 4K | 1853573 / 7240            | 1576400 / 6157
 8 cores   | 16 | 4K | 1819643 / 7108            | 1569487 / 6130
 8 cores   | 32 | 4K | 1815467 / 7092            | 1569909 / 6132
16 cores   |  4 | 4K | 1908018 / 7453            | 1566843 / 6120
16 cores   |  8 | 4K | 1906081 / 7446            | 1562110 / 6102
16 cores   | 16 | 4K | 1880706 / 7346            | 1555060 / 6074
16 cores   | 32 | 4K | 1835878 / 7171            | 1548156 / 6046

Signed-off-by: Ivan Betsis
Signed-off-by: Evgeniy Kochetov
Signed-off-by: Alexey Marchuk
Signed-off-by: Sasha Kotchubievsky
Change-Id: Icdbbbdf83f137eb4f05bd2063268ee2a7d87335a
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/924
Reviewed-by: Jim Harris
Reviewed-by: Shuhei Matsumoto
Tested-by: SPDK CI Jenkins
Community-CI: Broadcom CI
---
 CHANGELOG.md                        |  5 +++++
 lib/nvmf/rdma.c                     | 32 ++++++++++++++++++++++++++++-
 test/unit/lib/nvmf/rdma.c/rdma_ut.c | 14 +++++++------
 3 files changed, 44 insertions(+), 7 deletions(-)
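Note: the policy this option toggles can be modeled outside of SPDK.
WRs are chained into a pending list and posted with a single doorbell
per poller iteration, unless no_wr_batching forces an immediate post
after each queue operation. The program below is a self-contained toy
sketch, not SPDK code; every type and helper in it is invented for
illustration.

#include <stdbool.h>
#include <stdio.h>

struct wr {
	int id;
	struct wr *next;
};

struct poller {
	struct wr *first;	/* head of the pending WR chain */
	struct wr *last;	/* tail, for O(1) append */
	bool no_wr_batching;
	int doorbells;		/* number of simulated post calls */
};

static void
poller_flush(struct poller *p)
{
	struct wr *w;

	if (p->first == NULL) {
		return;
	}
	/* One "doorbell" posts the whole chain, the way a single
	 * ibv_post_send() posts a linked list of WRs. */
	for (w = p->first; w != NULL; w = w->next) {
		printf("posting wr %d\n", w->id);
	}
	p->doorbells++;
	p->first = p->last = NULL;
}

static void
queue_wr(struct poller *p, struct wr *w)
{
	/* Append to the chain, mirroring the queue_recv_wrs pattern. */
	w->next = NULL;
	if (p->first == NULL) {
		p->first = w;
	} else {
		p->last->next = w;
	}
	p->last = w;
	/* The new option: skip batching and post immediately. */
	if (p->no_wr_batching) {
		poller_flush(p);
	}
}

int
main(void)
{
	struct wr wrs[4] = {{.id = 0}, {.id = 1}, {.id = 2}, {.id = 3}};
	struct poller batched = {0};
	struct poller unbatched = {0};
	int i;

	unbatched.no_wr_batching = true;

	for (i = 0; i < 4; i++) {
		queue_wr(&batched, &wrs[i]);
	}
	poller_flush(&batched);	/* end of poll: one doorbell for all four */

	for (i = 0; i < 4; i++) {
		queue_wr(&unbatched, &wrs[i]);	/* one doorbell per WR */
	}

	printf("doorbells: batched=%d unbatched=%d\n",
	       batched.doorbells, unbatched.doorbells);
	return 0;
}

Batching amortizes doorbells (one post call covers a whole chain), but
queued WRs wait until the poller flushes; at low queue depth that added
latency is what the numbers above show being recovered when batching is
disabled.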
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 88e4884ba..6054edfc4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -79,6 +79,11 @@ library now no longer exists. The contents of the bdev_rpc library have been
 moved to the bdev library. The app_rpc library now no longer exists.
 
+### nvmf
+
+Added a `no_wr_batching` option to the RDMA transport-specific options to
+disable Work Request batching.
+
 ### scsi
 
 Two new APIs have been added `spdk_scsi_dev_construct_ext` and
diff --git a/lib/nvmf/rdma.c b/lib/nvmf/rdma.c
index 858acd34f..51f2af4f6 100644
--- a/lib/nvmf/rdma.c
+++ b/lib/nvmf/rdma.c
@@ -476,6 +476,7 @@ struct spdk_nvmf_rdma_port {
 struct rdma_transport_opts {
 	uint32_t	max_srq_depth;
 	bool		no_srq;
+	bool		no_wr_batching;
 	int		acceptor_backlog;
 };
 
@@ -519,6 +520,14 @@ static bool
 nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
 			  struct spdk_nvmf_rdma_request *rdma_req);
 
+static void
+_poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport,
+		     struct spdk_nvmf_rdma_poller *rpoller);
+
+static void
+_poller_submit_recvs(struct spdk_nvmf_rdma_transport *rtransport,
+		     struct spdk_nvmf_rdma_poller *rpoller);
+
 static inline int
 nvmf_rdma_check_ibv_state(enum ibv_qp_state state)
 {
@@ -1038,6 +1047,8 @@ static void
 nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *first)
 {
 	struct ibv_recv_wr *last;
+	struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport,
+			struct spdk_nvmf_rdma_transport, transport);
 
 	last = first;
 	while (last->next != NULL) {
@@ -1054,6 +1065,10 @@ nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_r
 		rqpair->resources->recvs_to_post.last->next = first;
 		rqpair->resources->recvs_to_post.last = last;
 	}
+
+	if (rtransport->rdma_opts.no_wr_batching) {
+		_poller_submit_recvs(rtransport, rqpair->poller);
+	}
 }
 
 static int
@@ -1062,10 +1077,13 @@ request_transfer_in(struct spdk_nvmf_request *req)
 	struct spdk_nvmf_rdma_request	*rdma_req;
 	struct spdk_nvmf_qpair		*qpair;
 	struct spdk_nvmf_rdma_qpair	*rqpair;
+	struct spdk_nvmf_rdma_transport *rtransport;
 
 	qpair = req->qpair;
 	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
 	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
+	rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport,
+				      struct spdk_nvmf_rdma_transport, transport);
 
 	assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
 	assert(rdma_req != NULL);
@@ -1073,6 +1091,9 @@ request_transfer_in(struct spdk_nvmf_request *req)
 	if (spdk_rdma_qp_queue_send_wrs(rqpair->rdma_qp, &rdma_req->data.wr)) {
 		STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_send, rqpair, send_link);
 	}
+	if (rtransport->rdma_opts.no_wr_batching) {
+		_poller_submit_sends(rtransport, rqpair->poller);
+	}
 
 	rqpair->current_read_depth += rdma_req->num_outstanding_data_wr;
 	rqpair->current_send_depth += rdma_req->num_outstanding_data_wr;
@@ -1088,12 +1109,15 @@ request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
 	struct spdk_nvmf_rdma_qpair	*rqpair;
 	struct spdk_nvme_cpl		*rsp;
 	struct ibv_send_wr		*first = NULL;
+	struct spdk_nvmf_rdma_transport *rtransport;
 
 	*data_posted = 0;
 	qpair = req->qpair;
 	rsp = &req->rsp->nvme_cpl;
 	rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
 	rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
+	rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport,
+				      struct spdk_nvmf_rdma_transport, transport);
 
 	/* Advance our sq_head pointer */
 	if (qpair->sq_head == qpair->sq_head_max) {
@@ -1131,6 +1155,9 @@ request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
 	if (spdk_rdma_qp_queue_send_wrs(rqpair->rdma_qp, first)) {
 		STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_send, rqpair, send_link);
 	}
+	if (rtransport->rdma_opts.no_wr_batching) {
+		_poller_submit_sends(rtransport, rqpair->poller);
+	}
 
 	/* +1 for the rsp wr */
 	rqpair->current_send_depth += num_outstanding_data_wr + 1;
@@ -2235,6 +2262,7 @@ nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
 #define SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP false
 #define SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG 100
 #define SPDK_NVMF_RDMA_DEFAULT_ABORT_TIMEOUT_SEC 1
+#define SPDK_NVMF_RDMA_DEFAULT_NO_WR_BATCHING false
 
 static void
 nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts)
@@ -2308,6 +2336,7 @@ nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
 	rtransport->rdma_opts.max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH;
 	rtransport->rdma_opts.no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ;
 	rtransport->rdma_opts.acceptor_backlog = SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG;
+	rtransport->rdma_opts.no_wr_batching = SPDK_NVMF_RDMA_DEFAULT_NO_WR_BATCHING;
 	if (opts->transport_specific != NULL &&
 	    spdk_json_decode_object_relaxed(opts->transport_specific, rdma_transport_opts_decoder,
 					    SPDK_COUNTOF(rdma_transport_opts_decoder),
@@ -2322,7 +2351,7 @@ nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
 		     "  max_io_qpairs_per_ctrlr=%d, io_unit_size=%d,\n"
 		     "  in_capsule_data_size=%d, max_aq_depth=%d,\n"
 		     "  num_shared_buffers=%d, max_srq_depth=%d, no_srq=%d,"
-		     "  acceptor_backlog=%d, abort_timeout_sec=%d\n",
+		     "  acceptor_backlog=%d, no_wr_batching=%d, abort_timeout_sec=%d\n",
 		     opts->max_queue_depth,
 		     opts->max_io_size,
 		     opts->max_qpairs_per_ctrlr - 1,
@@ -2333,6 +2362,7 @@ nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
 		     rtransport->rdma_opts.max_srq_depth,
 		     rtransport->rdma_opts.no_srq,
 		     rtransport->rdma_opts.acceptor_backlog,
+		     rtransport->rdma_opts.no_wr_batching,
 		     opts->abort_timeout_sec);
 
 	/* I/O unit size cannot be larger than max I/O size */
diff --git a/test/unit/lib/nvmf/rdma.c/rdma_ut.c b/test/unit/lib/nvmf/rdma.c/rdma_ut.c
index b6eae8774..31cf8fd4f 100644
--- a/test/unit/lib/nvmf/rdma.c/rdma_ut.c
+++ b/test/unit/lib/nvmf/rdma.c/rdma_ut.c
@@ -577,7 +577,8 @@ static void
 qpair_reset(struct spdk_nvmf_rdma_qpair *rqpair,
 	    struct spdk_nvmf_rdma_poller *poller,
 	    struct spdk_nvmf_rdma_device *device,
-	    struct spdk_nvmf_rdma_resources *resources)
+	    struct spdk_nvmf_rdma_resources *resources,
+	    struct spdk_nvmf_transport *transport)
 {
 	memset(rqpair, 0, sizeof(*rqpair));
 	STAILQ_INIT(&rqpair->pending_rdma_write_queue);
@@ -591,6 +592,7 @@ qpair_reset(struct spdk_nvmf_rdma_qpair *rqpair,
 	rqpair->max_send_sge = SPDK_NVMF_MAX_SGL_ENTRIES;
 	rqpair->max_send_depth = 16;
 	rqpair->max_read_depth = 16;
+	rqpair->qpair.transport = transport;
 	resources->recvs_to_post.first = resources->recvs_to_post.last = NULL;
 }
 
@@ -622,7 +624,7 @@ test_spdk_nvmf_rdma_request_process(void)
 	group.group.buf_cache_size = 0;
 	group.group.buf_cache_count = 0;
 	poller_reset(&poller, &group);
-	qpair_reset(&rqpair, &poller, &device, &resources);
+	qpair_reset(&rqpair, &poller, &device, &resources, &rtransport.transport);
 
 	rtransport.transport.opts = g_rdma_ut_transport_opts;
 	rtransport.transport.data_buf_pool = spdk_mempool_create("test_data_pool", 16, 128, 0, 0);
@@ -661,7 +663,7 @@
 	free_recv(rdma_recv);
 	free_req(rdma_req);
 	poller_reset(&poller, &group);
-	qpair_reset(&rqpair, &poller, &device, &resources);
+	qpair_reset(&rqpair, &poller, &device, &resources, &rtransport.transport);
 
 	/* Test 2: single SGL WRITE request */
 	rdma_recv = create_recv(&rqpair, SPDK_NVME_OPC_WRITE);
@@ -695,7 +697,7 @@
 	free_recv(rdma_recv);
 	free_req(rdma_req);
 	poller_reset(&poller, &group);
-	qpair_reset(&rqpair, &poller, &device, &resources);
+	qpair_reset(&rqpair, &poller, &device, &resources, &rtransport.transport);
 
 	/* Test 3: WRITE+WRITE ibv_send batching */
 	{
@@ -754,7 +756,7 @@
 		free_recv(recv2);
 		free_req(req2);
 		poller_reset(&poller, &group);
-		qpair_reset(&rqpair, &poller, &device, &resources);
+		qpair_reset(&rqpair, &poller, &device, &resources, &rtransport.transport);
 	}
 
 	/* Test 4, invalid command, check xfer type */
@@ -783,7 +785,7 @@
 		free_recv(rdma_recv_inv);
 		free_req(rdma_req_inv);
 		poller_reset(&poller, &group);
-		qpair_reset(&rqpair, &poller, &device, &resources);
+		qpair_reset(&rqpair, &poller, &device, &resources, &rtransport.transport);
 	}
 
 	spdk_mempool_free(rtransport.transport.data_buf_pool);
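The hunks above initialize and log rdma_opts.no_wr_batching, but they do
not show the rdma_transport_opts_decoder entry that exposes the flag
through transport-specific JSON options. If it is wired up like the
neighboring max_srq_depth/no_srq entries, it would look roughly like the
sketch below. This is an assumption, not part of this diff, and it needs
SPDK's spdk/json.h plus the rdma_transport_opts struct from the patch.

/* Sketch only: a plausible decoder table including no_wr_batching,
 * modeled on the existing entries referenced in the hunks above. */
static const struct spdk_json_object_decoder rdma_transport_opts_decoder[] = {
	{
		"max_srq_depth", offsetof(struct rdma_transport_opts, max_srq_depth),
		spdk_json_decode_uint32, true
	},
	{
		"no_srq", offsetof(struct rdma_transport_opts, no_srq),
		spdk_json_decode_bool, true
	},
	{
		"no_wr_batching", offsetof(struct rdma_transport_opts, no_wr_batching),
		spdk_json_decode_bool, true
	},
	{
		"acceptor_backlog", offsetof(struct rdma_transport_opts, acceptor_backlog),
		spdk_json_decode_int32, true
	},
};

With such an entry in place, the option should be reachable as a
transport-specific boolean on transport creation, e.g. passing
"no_wr_batching": true alongside "trtype": "RDMA" in the
nvmf_create_transport RPC parameters.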