nvme_rdma: Support SRQ for I/O qpairs

Support SRQ in the RDMA transport of the NVMe-oF initiator.

Add a new spdk_nvme_transport_opts structure and add an rdma_srq_size
option to it.

For the user of the NVMe driver, provide two public APIs,
spdk_nvme_transport_get_opts() and spdk_nvme_transport_set_opts().

In the NVMe driver, the global instance of spdk_nvme_transport_opts,
g_spdk_nvme_transport_opts, is accessible throughout.
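
As a usage sketch (the helper name and the queue depth of 4096 below are
illustrative only, not recommended values), an application reads the current
options, modifies them, and writes them back before creating controllers:

#include "spdk/nvme.h"

static int
example_enable_rdma_srq(void)
{
	struct spdk_nvme_transport_opts opts;

	/* Fetch the current defaults; this also sets opts.opts_size. */
	spdk_nvme_transport_get_opts(&opts, sizeof(opts));

	/* A value of 0 (the default) leaves SRQ disabled. */
	opts.rdma_srq_size = 4096;

	return spdk_nvme_transport_set_opts(&opts, sizeof(opts));
}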

Due to an issue where asynchronous event handling caused conflicts between
the initiator and the target, the NVMe-oF RDMA initiator does not handle
the LAST_WQE_REACHED event. Hence, it may get a WC for an already
destroyed QP. To clarify this, add a comment in the source code.

The following is the result of a small performance evaluation using the
SPDK NVMe perf tool. Even for queue_depth=1, the overhead was less than 1%.
Eventually, we may be able to enable SRQ by default for the NVMe-oF
initiator.

1.1 randwrite, qd=1, srq=enabled
./build/examples/perf -q 1 -s 1024 -w randwrite -t 30 -c 0XF -o 4096 -r 'trtype:RDMA adrfam:IPv4 traddr:1.1.18.1 trsvcid:4420'
========================================================
                                                                                                              Latency(us)
Device Information                                                        :       IOPS      MiB/s    Average        min        max
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  0:  162411.97     634.42       6.14       5.42     284.07
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  1:  163095.87     637.09       6.12       5.41     423.95
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  2:  164725.30     643.46       6.06       5.32     165.60
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  3:  162548.57     634.96       6.14       5.39     227.24
========================================================
Total                                                                     :  652781.70    2549.93       6.12

1.2 randwrite, qd=1, srq=disabled
./build/examples/perf -q 1 -s 1024 -w randwrite -t 30 -c 0XF -o 4096 -r 'trtype:RDMA adrfam:IPv4 traddr:1.1.18.1 trsvcid:4420'
========================================================
                                                                                                              Latency(us)
Device Information                                                        :       IOPS      MiB/s    Average        min        max
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  0:  163398.03     638.27       6.11       5.33     240.76
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  1:  164632.47     643.10       6.06       5.29     125.22
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  2:  164694.40     643.34       6.06       5.31     408.43
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  3:  164007.13     640.65       6.08       5.33     170.10
========================================================
Total                                                                     :  656732.03    2565.36       6.08       5.29     408.43

2.1 randread, qd=1, srq=enabled
./build/examples/perf -q 1 -s 1024 -w randread -t 30 -c 0xF -o 4096 -r 'trtype:RDMA adrfam:IPv4 traddr:1.1.18.1 trsvcid:4420'
========================================================
                                                                                                              Latency(us)
Device Information                                                        :       IOPS      MiB/s    Average        min        max
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  0:  153514.40     599.67       6.50       5.97     277.22
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  1:  153567.57     599.87       6.50       5.95     408.06
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  2:  153590.33     599.96       6.50       5.88     134.74
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  3:  153357.40     599.05       6.51       5.97     229.03
========================================================
Total                                                                     :  614029.70    2398.55       6.50       5.88     408.06

2.2 randread, qd=1, srq=disabled
./build/examples/perf -q 1 -s 1024 -w randread -t 30 -c 0XF -o 4096 -r 'trtype:RDMA adrfam:IPv4 traddr:1.1.18.1 trsvcid:4420'
========================================================
                                                                                                              Latency(us)
Device Information                                                        :       IOPS      MiB/s    Average        min        max
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  0:  154452.40     603.33       6.46       5.94     233.15
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  1:  154711.67     604.34       6.45       5.91      25.55
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  2:  154717.70     604.37       6.45       5.88     130.92
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  3:  154713.77     604.35       6.45       5.91     128.19
========================================================
Total                                                                     :  618595.53    2416.39       6.45       5.88     233.15

3.1 randwrite, qd=32, srq=enabled
./build/examples/perf -q 32 -s 1024 -w randwrite -t 30 -c 0XF -o 4096 -r 'trtype:RDMA adrfam:IPv4 traddr:1.1.18.1 trsvcid:4420'
========================================================
                                                                                                              Latency(us)
Device Information                                                        :       IOPS      MiB/s    Average        min        max
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  0:  672608.17    2627.38      47.56      11.33     326.96
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  1:  672386.20    2626.51      47.58      11.03     221.88
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  2:  673343.70    2630.25      47.51       9.11     387.54
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  3:  672799.10    2628.12      47.55      10.48     552.80
========================================================
Total                                                                     : 2691137.17   10512.25      47.55       9.11     552.80

3.2 randwrite, qd=32, srq=disabled
./build/examples/perf -q 32 -s 1024 -w randwrite -t 30 -c 0XF -o 4096 -r 'trtype:RDMA adrfam:IPv4 traddr:1.1.18.1 trsvcid:4420'
========================================================
                                                                                                              Latency(us)
Device Information                                                        :       IOPS      MiB/s    Average        min        max
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  0:  672647.53    2627.53      47.56      11.13     389.95
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  1:  672756.50    2627.96      47.55       9.53     394.83
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  2:  672464.63    2626.81      47.57       9.48     528.07
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  3:  673250.73    2629.89      47.52       9.43     389.83
========================================================
Total                                                                     : 2691119.40   10512.19      47.55       9.43     528.07

4.1 randread, qd=32, srq=enabled
./build/examples/perf -q 32 -s 1024 -w randread -t 30 -c 0xF -o 4096 -r 'trtype:RDMA adrfam:IPv4 traddr:1.1.18.1 trsvcid:4420'
========================================================
                                                                                                              Latency(us)
Device Information                                                        :       IOPS      MiB/s    Average        min        max
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  0:  677286.30    2645.65      47.23      12.29     335.90
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  1:  677554.97    2646.70      47.22      20.39     196.21
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  2:  677086.07    2644.87      47.25      19.17     386.26
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  3:  677654.93    2647.09      47.21      18.92     181.05
========================================================
Total                                                                     : 2709582.27   10584.31      47.23      12.29     386.26

4.2 randread, qd=32, srq=disabled
./build/examples/perf -q 32 -s 1024 -w randread -t 30 -c 0XF -o 4096 -r 'trtype:RDMA adrfam:IPv4 traddr:1.1.18.1 trsvcid:4420'
========================================================
                                                                                                              Latency(us)
Device Information                                                        :       IOPS      MiB/s    Average        min        max
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  0:  677432.60    2646.22      47.22      13.05     435.91
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  1:  677450.43    2646.29      47.22      16.26     178.60
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  2:  677647.10    2647.06      47.21      17.82     177.83
RDMA (addr:1.1.18.1 subnqn:nqn.2016-06.io.spdk:cnode1) NSID 1 from core  3:  677047.33    2644.72      47.25      15.62     308.21
========================================================
Total                                                                     : 2709577.47   10584.29      47.23      13.05     435.91

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Signed-off-by: Denis Nagorny <denisn@nvidia.com>
Signed-off-by: Evgeniy Kochetov <evgeniik@nvidia.com>
Change-Id: I843a5eda14e872bf6e2010e9f63b8e46d5bba691
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14174
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>

@ -76,6 +76,15 @@ and disable CPU core locks in runtime.
Added --rpcs-allowed command line option. Users can specify a comma-separated list of RPC
names with this option to restrict allowed RPCs to only that list.
### nvme
NVMe transport options were newly introduced. The NVMe transport options are defined via
the `spdk_nvme_transport_opts` structure and configured via the `spdk_nvme_transport_get_opts`
and `spdk_nvme_transport_set_opts` functions.
Shared receive queue (SRQ) is now supported by the RDMA transport. It can be configured by
a new NVMe transport option, `rdma_srq_size`.
### rpc
Added spdk_rpc_set_allowlist to restrict allowed RPCs to the specified list.

@ -4098,6 +4098,44 @@ static void __attribute__((constructor)) _spdk_nvme_transport_register_##name(vo
spdk_nvme_transport_register(transport_ops); \
}
/**
* NVMe transport options.
*/
struct spdk_nvme_transport_opts {
/**
* Used by the RDMA transport.
*
* The queue depth of the shared RDMA receive queue.
*/
uint32_t rdma_srq_size;
/**
* The size of spdk_nvme_transport_opts as understood by the caller of this library,
* used for ABI compatibility. The library uses this field to know how many fields
* in this structure are valid, and it will populate any remaining fields with
* default values.
*/
size_t opts_size;
} __attribute__((packed));
SPDK_STATIC_ASSERT(sizeof(struct spdk_nvme_transport_opts) == 12, "Incorrect size");
/**
* Get the current NVMe transport options.
*
* \param[out] opts Will be filled with the current options for spdk_nvme_transport_set_opts().
* \param opts_size Must be set to sizeof(struct spdk_nvme_transport_opts).
*/
void spdk_nvme_transport_get_opts(struct spdk_nvme_transport_opts *opts, size_t opts_size);
/**
* Set the NVMe transport options.
*
* \param opts Pointer to the allocated spdk_nvme_transport_opts structure with new values.
* \param opts_size Must be set to sizeof(struct spdk_nvme_transport_opts).
*
* \return 0 on success, or negated errno on failure.
*/
int spdk_nvme_transport_set_opts(const struct spdk_nvme_transport_opts *opts, size_t opts_size);
#ifdef __cplusplus
}
#endif

@ -34,6 +34,8 @@
extern pid_t g_spdk_nvme_pid;
extern struct spdk_nvme_transport_opts g_spdk_nvme_transport_opts;
/*
* Some Intel devices support vendor-unique read latency log page even
* though the log page directory says otherwise.

@ -133,10 +133,15 @@ struct nvme_rdma_poller_stats {
};
struct nvme_rdma_poll_group;
struct nvme_rdma_rsps;
struct nvme_rdma_poller {
struct ibv_context *device;
struct ibv_cq *cq;
struct spdk_rdma_srq *srq;
struct nvme_rdma_rsps *rsps;
struct ibv_pd *pd;
struct spdk_rdma_mem_map *mr_map;
uint32_t refcnt;
int required_num_wc;
int current_num_wc;
@ -170,6 +175,7 @@ typedef int (*nvme_rdma_cm_event_cb)(struct nvme_rdma_qpair *rqpair, int ret);
struct nvme_rdma_rsp_opts {
uint16_t num_entries;
struct nvme_rdma_qpair *rqpair;
struct spdk_rdma_srq *srq;
struct spdk_rdma_mem_map *mr_map;
};
@ -193,6 +199,7 @@ struct nvme_rdma_qpair {
struct spdk_rdma_qp *rdma_qp;
struct rdma_cm_id *cm_id;
struct ibv_cq *cq;
struct spdk_rdma_srq *srq;
struct spdk_nvme_rdma_req *rdma_reqs;
@ -707,12 +714,16 @@ nvme_rdma_qpair_set_poller(struct spdk_nvme_qpair *qpair)
return -EINVAL;
}
if (nvme_rdma_resize_cq(rqpair, poller)) {
nvme_rdma_poll_group_put_poller(group, poller);
return -EPROTO;
if (!poller->srq) {
if (nvme_rdma_resize_cq(rqpair, poller)) {
nvme_rdma_poll_group_put_poller(group, poller);
return -EPROTO;
}
}
rqpair->cq = poller->cq;
rqpair->srq = poller->srq;
rqpair->rsps = poller->rsps;
rqpair->poller = poller;
return 0;
}
@ -758,7 +769,11 @@ nvme_rdma_qpair_init(struct nvme_rdma_qpair *rqpair)
attr.send_cq = rqpair->cq;
attr.recv_cq = rqpair->cq;
attr.cap.max_send_wr = rqpair->num_entries; /* SEND operations */
attr.cap.max_recv_wr = rqpair->num_entries; /* RECV operations */
if (rqpair->srq) {
attr.srq = rqpair->srq->srq;
} else {
attr.cap.max_recv_wr = rqpair->num_entries; /* RECV operations */
}
attr.cap.max_send_sge = spdk_min(NVME_RDMA_DEFAULT_TX_SGE, dev_attr.max_sge);
attr.cap.max_recv_sge = spdk_min(NVME_RDMA_DEFAULT_RX_SGE, dev_attr.max_sge);
@ -839,6 +854,20 @@ nvme_rdma_qpair_submit_recvs(struct nvme_rdma_qpair *rqpair)
return rc;
}
static inline int
nvme_rdma_poller_submit_recvs(struct nvme_rdma_poller *poller)
{
struct ibv_recv_wr *bad_recv_wr;
int rc;
rc = spdk_rdma_srq_flush_recv_wrs(poller->srq, &bad_recv_wr);
if (spdk_unlikely(rc)) {
nvme_rdma_reset_failed_recvs(poller->rsps, bad_recv_wr, rc);
}
return rc;
}
#define nvme_rdma_trace_ibv_sge(sg_list) \
if (sg_list) { \
SPDK_DEBUGLOG(nvme, "local addr %p length 0x%x lkey 0x%x\n", \
@ -916,7 +945,11 @@ nvme_rdma_create_rsps(struct nvme_rdma_rsp_opts *opts)
nvme_rdma_trace_ibv_sge(recv_wr->sg_list);
spdk_rdma_qp_queue_recv_wrs(opts->rqpair->rdma_qp, recv_wr);
if (opts->rqpair) {
spdk_rdma_qp_queue_recv_wrs(opts->rqpair->rdma_qp, recv_wr);
} else {
spdk_rdma_srq_queue_recv_wrs(opts->srq, recv_wr);
}
}
rsps->num_entries = opts->num_entries;
@ -1127,24 +1160,27 @@ nvme_rdma_connect_established(struct nvme_rdma_qpair *rqpair, int ret)
}
SPDK_DEBUGLOG(nvme, "RDMA requests created\n");
opts.num_entries = rqpair->num_entries;
opts.rqpair = rqpair;
opts.mr_map = rqpair->mr_map;
if (!rqpair->srq) {
opts.num_entries = rqpair->num_entries;
opts.rqpair = rqpair;
opts.srq = NULL;
opts.mr_map = rqpair->mr_map;
rqpair->rsps = nvme_rdma_create_rsps(&opts);
if (!rqpair->rsps) {
SPDK_ERRLOG("Unable to create rqpair RDMA responses\n");
return -1;
}
SPDK_DEBUGLOG(nvme, "RDMA responses created\n");
rqpair->rsps = nvme_rdma_create_rsps(&opts);
if (!rqpair->rsps) {
SPDK_ERRLOG("Unable to create rqpair RDMA responses\n");
return -1;
}
SPDK_DEBUGLOG(nvme, "RDMA responses created\n");
ret = nvme_rdma_qpair_submit_recvs(rqpair);
SPDK_DEBUGLOG(nvme, "rc =%d\n", ret);
if (ret) {
SPDK_ERRLOG("Unable to submit rqpair RDMA responses\n");
return -1;
ret = nvme_rdma_qpair_submit_recvs(rqpair);
SPDK_DEBUGLOG(nvme, "rc =%d\n", ret);
if (ret) {
SPDK_ERRLOG("Unable to submit rqpair RDMA responses\n");
return -1;
}
SPDK_DEBUGLOG(nvme, "RDMA responses submitted\n");
}
SPDK_DEBUGLOG(nvme, "RDMA responses submitted\n");
rqpair->state = NVME_RDMA_QPAIR_STATE_FABRIC_CONNECT_SEND;
@ -1863,6 +1899,8 @@ nvme_rdma_qpair_destroy(struct nvme_rdma_qpair *rqpair)
rqpair->poller = NULL;
rqpair->cq = NULL;
rqpair->srq = NULL;
rqpair->rsps = NULL;
} else if (rqpair->cq) {
ibv_destroy_cq(rqpair->cq);
rqpair->cq = NULL;
@ -1892,7 +1930,8 @@ nvme_rdma_qpair_disconnected(struct nvme_rdma_qpair *rqpair, int ret)
goto quiet;
}
if (rqpair->current_num_sends != 0 || rqpair->rsps->current_num_recvs != 0) {
if (rqpair->current_num_sends != 0 ||
(!rqpair->srq && rqpair->rsps->current_num_recvs != 0)) {
rqpair->state = NVME_RDMA_QPAIR_STATE_LINGERING;
rqpair->evt_timeout_ticks = (NVME_RDMA_DISCONNECTED_QPAIR_TIMEOUT_US * spdk_get_ticks_hz()) /
SPDK_SEC_TO_USEC + spdk_get_ticks();
@ -1913,7 +1952,8 @@ static int
nvme_rdma_qpair_wait_until_quiet(struct nvme_rdma_qpair *rqpair)
{
if (spdk_get_ticks() < rqpair->evt_timeout_ticks &&
(rqpair->current_num_sends != 0 || rqpair->rsps->current_num_recvs != 0)) {
(rqpair->current_num_sends != 0 ||
(!rqpair->srq && rqpair->rsps->current_num_recvs != 0))) {
return -EAGAIN;
}
@ -2372,7 +2412,11 @@ nvme_rdma_request_ready(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_re
recv_wr->next = NULL;
nvme_rdma_trace_ibv_sge(recv_wr->sg_list);
spdk_rdma_qp_queue_recv_wrs(rqpair->rdma_qp, recv_wr);
if (!rqpair->srq) {
spdk_rdma_qp_queue_recv_wrs(rqpair->rdma_qp, recv_wr);
} else {
spdk_rdma_srq_queue_recv_wrs(rqpair->srq, recv_wr);
}
}
#define MAX_COMPLETIONS_PER_POLL 128
@ -2431,29 +2475,45 @@ nvme_rdma_log_wc_status(struct nvme_rdma_qpair *rqpair, struct ibv_wc *wc)
}
static inline int
nvme_rdma_process_recv_completion(struct ibv_wc *wc, struct nvme_rdma_wr *rdma_wr)
nvme_rdma_process_recv_completion(struct nvme_rdma_poller *poller, struct ibv_wc *wc,
struct nvme_rdma_wr *rdma_wr)
{
struct nvme_rdma_qpair *rqpair;
struct spdk_nvme_rdma_req *rdma_req;
struct spdk_nvme_rdma_rsp *rdma_rsp;
rdma_rsp = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvme_rdma_rsp, rdma_wr);
rqpair = rdma_rsp->rqpair;
if (poller && poller->srq) {
rqpair = get_rdma_qpair_from_wc(poller->group, wc);
if (spdk_unlikely(!rqpair)) {
/* Since we do not handle the LAST_WQE_REACHED event, we do not know when
* a receive queue of a QP that is associated with an SRQ is flushed.
* We may get a WC for an already destroyed QP.
*
* However, for the SRQ this is not an error. Hence, just re-post the
* receive request to the SRQ to reuse it for other QPs, and return 0.
*/
spdk_rdma_srq_queue_recv_wrs(poller->srq, rdma_rsp->recv_wr);
return 0;
}
} else {
rqpair = rdma_rsp->rqpair;
}
assert(rqpair->rsps->current_num_recvs > 0);
rqpair->rsps->current_num_recvs--;
if (wc->status) {
nvme_rdma_log_wc_status(rqpair, wc);
nvme_rdma_fail_qpair(&rqpair->qpair, 0);
return -ENXIO;
goto err_wc;
}
SPDK_DEBUGLOG(nvme, "CQ recv completion\n");
if (wc->byte_len < sizeof(struct spdk_nvme_cpl)) {
SPDK_ERRLOG("recv length %u less than expected response size\n", wc->byte_len);
nvme_rdma_fail_qpair(&rqpair->qpair, 0);
return -ENXIO;
goto err_wc;
}
rdma_req = &rqpair->rdma_reqs[rdma_rsp->cpl.cid];
rdma_req->completion_flags |= NVME_RDMA_RECV_COMPLETED;
@ -2475,6 +2535,13 @@ nvme_rdma_process_recv_completion(struct ibv_wc *wc, struct nvme_rdma_wr *rdma_w
rqpair->num_completions++;
return 1;
err_wc:
nvme_rdma_fail_qpair(&rqpair->qpair, 0);
if (poller && poller->srq) {
spdk_rdma_srq_queue_recv_wrs(poller->srq, rdma_rsp->recv_wr);
}
return -ENXIO;
}
static inline int
@ -2505,6 +2572,9 @@ nvme_rdma_process_send_completion(struct nvme_rdma_poller *poller,
rqpair->current_num_sends--;
nvme_rdma_log_wc_status(rqpair, wc);
nvme_rdma_fail_qpair(&rqpair->qpair, 0);
if (rdma_req->rdma_rsp && poller && poller->srq) {
spdk_rdma_srq_queue_recv_wrs(poller->srq, rdma_req->rdma_rsp->recv_wr);
}
return -ENXIO;
}
@ -2561,7 +2631,7 @@ nvme_rdma_cq_process_completions(struct ibv_cq *cq, uint32_t batch_size,
rdma_wr = (struct nvme_rdma_wr *)wc[i].wr_id;
switch (rdma_wr->type) {
case RDMA_WR_TYPE_RECV:
_rc = nvme_rdma_process_recv_completion(&wc[i], rdma_wr);
_rc = nvme_rdma_process_recv_completion(poller, &wc[i], rdma_wr);
break;
case RDMA_WR_TYPE_SEND:
@ -2767,6 +2837,18 @@ nvme_rdma_poller_destroy(struct nvme_rdma_poller *poller)
if (poller->cq) {
ibv_destroy_cq(poller->cq);
}
if (poller->rsps) {
nvme_rdma_free_rsps(poller->rsps);
}
if (poller->srq) {
spdk_rdma_srq_destroy(poller->srq);
}
if (poller->mr_map) {
spdk_rdma_free_mem_map(&poller->mr_map);
}
if (poller->pd) {
spdk_rdma_put_pd(poller->pd);
}
free(poller);
}
@ -2774,6 +2856,11 @@ static struct nvme_rdma_poller *
nvme_rdma_poller_create(struct nvme_rdma_poll_group *group, struct ibv_context *ctx)
{
struct nvme_rdma_poller *poller;
struct ibv_device_attr dev_attr;
struct spdk_rdma_srq_init_attr srq_init_attr = {};
struct nvme_rdma_rsp_opts opts;
int num_cqe;
int rc;
poller = calloc(1, sizeof(*poller));
if (poller == NULL) {
@ -2783,7 +2870,68 @@ nvme_rdma_poller_create(struct nvme_rdma_poll_group *group, struct ibv_context *
poller->group = group;
poller->device = ctx;
poller->cq = ibv_create_cq(poller->device, DEFAULT_NVME_RDMA_CQ_SIZE, group, NULL, 0);
if (g_spdk_nvme_transport_opts.rdma_srq_size != 0) {
rc = ibv_query_device(ctx, &dev_attr);
if (rc) {
SPDK_ERRLOG("Unable to query RDMA device.\n");
goto fail;
}
poller->pd = spdk_rdma_get_pd(ctx);
if (poller->pd == NULL) {
SPDK_ERRLOG("Unable to get PD.\n");
goto fail;
}
poller->mr_map = spdk_rdma_create_mem_map(poller->pd, &g_nvme_hooks,
SPDK_RDMA_MEMORY_MAP_ROLE_INITIATOR);
if (poller->mr_map == NULL) {
SPDK_ERRLOG("Unable to create memory map.\n");
goto fail;
}
srq_init_attr.stats = &poller->stats.rdma_stats.recv;
srq_init_attr.pd = poller->pd;
srq_init_attr.srq_init_attr.attr.max_wr = spdk_min((uint32_t)dev_attr.max_srq_wr,
g_spdk_nvme_transport_opts.rdma_srq_size);
srq_init_attr.srq_init_attr.attr.max_sge = spdk_min(dev_attr.max_sge,
NVME_RDMA_DEFAULT_RX_SGE);
poller->srq = spdk_rdma_srq_create(&srq_init_attr);
if (poller->srq == NULL) {
SPDK_ERRLOG("Unable to create SRQ.\n");
goto fail;
}
opts.num_entries = g_spdk_nvme_transport_opts.rdma_srq_size;
opts.rqpair = NULL;
opts.srq = poller->srq;
opts.mr_map = poller->mr_map;
poller->rsps = nvme_rdma_create_rsps(&opts);
if (poller->rsps == NULL) {
SPDK_ERRLOG("Unable to create poller RDMA responses.\n");
goto fail;
}
rc = nvme_rdma_poller_submit_recvs(poller);
if (rc) {
SPDK_ERRLOG("Unable to submit poller RDMA responses.\n");
goto fail;
}
/*
* When using an SRQ, fix the size of the completion queue at startup.
* The initiator posts only send and recv WRs, hence the multiplier is 2.
* (The target also posts data WRs, hence its multiplier is 3.)
*/
num_cqe = g_spdk_nvme_transport_opts.rdma_srq_size * 2;
} else {
num_cqe = DEFAULT_NVME_RDMA_CQ_SIZE;
}
poller->cq = ibv_create_cq(poller->device, num_cqe, group, NULL, 0);
if (poller->cq == NULL) {
SPDK_ERRLOG("Unable to create CQ, errno %d.\n", errno);
@ -2792,7 +2940,7 @@ nvme_rdma_poller_create(struct nvme_rdma_poll_group *group, struct ibv_context *
STAILQ_INSERT_HEAD(&group->pollers, poller, link);
group->num_pollers++;
poller->current_num_wc = DEFAULT_NVME_RDMA_CQ_SIZE;
poller->current_num_wc = num_cqe;
poller->required_num_wc = 0;
return poller;
@ -2983,6 +3131,9 @@ nvme_rdma_poll_group_process_completions(struct spdk_nvme_transport_poll_group *
} while (poller_completions < completions_per_poller);
total_completions += poller_completions;
poller->stats.completions += rdma_completions;
if (poller->srq) {
nvme_rdma_poller_submit_recvs(poller);
}
}
STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) {
@ -2997,7 +3148,9 @@ nvme_rdma_poll_group_process_completions(struct spdk_nvme_transport_poll_group *
}
nvme_rdma_qpair_submit_sends(rqpair);
nvme_rdma_qpair_submit_recvs(rqpair);
if (!rqpair->srq) {
nvme_rdma_qpair_submit_recvs(rqpair);
}
if (rqpair->num_completions > 0) {
nvme_qpair_resubmit_requests(qpair, rqpair->num_completions);
}

@ -25,6 +25,10 @@ TAILQ_HEAD(nvme_transport_list, spdk_nvme_transport) g_spdk_nvme_transports =
struct spdk_nvme_transport g_spdk_transports[SPDK_MAX_NUM_OF_TRANSPORTS] = {};
int g_current_transport_index = 0;
struct spdk_nvme_transport_opts g_spdk_nvme_transport_opts = {
.rdma_srq_size = 0,
};
const struct spdk_nvme_transport *
nvme_get_first_transport(void)
{
@ -792,3 +796,59 @@ nvme_transport_get_trtype(const struct spdk_nvme_transport *transport)
{
return transport->ops.type;
}
void
spdk_nvme_transport_get_opts(struct spdk_nvme_transport_opts *opts, size_t opts_size)
{
if (opts == NULL) {
SPDK_ERRLOG("opts should not be NULL.\n");
return;
}
if (opts_size == 0) {
SPDK_ERRLOG("opts_size should not be zero.\n");
return;
}
opts->opts_size = opts_size;
#define SET_FIELD(field) \
if (offsetof(struct spdk_nvme_transport_opts, field) + sizeof(opts->field) <= opts_size) { \
opts->field = g_spdk_nvme_transport_opts.field; \
} \
SET_FIELD(rdma_srq_size);
/* Do not remove this statement. Always update it when adding a new field,
* and do not forget to add a SET_FIELD statement for the new field. */
SPDK_STATIC_ASSERT(sizeof(struct spdk_nvme_transport_opts) == 12, "Incorrect size");
#undef SET_FIELD
}
int
spdk_nvme_transport_set_opts(const struct spdk_nvme_transport_opts *opts, size_t opts_size)
{
if (opts == NULL) {
SPDK_ERRLOG("opts should not be NULL.\n");
return -EINVAL;
}
if (opts_size == 0) {
SPDK_ERRLOG("opts_size should not be zero.\n");
return -EINVAL;
}
#define SET_FIELD(field) \
if (offsetof(struct spdk_nvme_transport_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
g_spdk_nvme_transport_opts.field = opts->field; \
} \
SET_FIELD(rdma_srq_size);
g_spdk_nvme_transport_opts.opts_size = opts->opts_size;
#undef SET_FIELD
return 0;
}
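
A minimal sketch of the SET_FIELD guard arithmetic above, assuming the
current two-field layout (any future field would be appended after
opts_size, at offset 12 or beyond, and would therefore be skipped for
callers that report the old 12-byte size):

#include <stddef.h>
#include "spdk/assert.h"
#include "spdk/nvme.h"

/* rdma_srq_size occupies bytes [0, 4) of the packed structure, so it is
 * copied whenever opts_size >= 4; the packed v1 layout is 4 + 8 = 12 bytes. */
SPDK_STATIC_ASSERT(offsetof(struct spdk_nvme_transport_opts, rdma_srq_size) +
		   sizeof(uint32_t) == 4, "rdma_srq_size guard");
SPDK_STATIC_ASSERT(sizeof(struct spdk_nvme_transport_opts) == 12, "v1 size");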

@ -5,6 +5,8 @@
spdk_nvme_transport_register;
spdk_nvme_transport_available;
spdk_nvme_transport_available_by_name;
spdk_nvme_transport_get_opts;
spdk_nvme_transport_set_opts;
spdk_nvme_transport_id_parse;
spdk_nvme_transport_id_populate_trstring;
spdk_nvme_transport_id_parse_trtype;

@ -11,7 +11,9 @@
#define RDMA_UT_LKEY 123
#define RDMA_UT_RKEY 312
struct spdk_nvme_transport_opts g_spdk_nvme_transport_opts = {};
struct spdk_rdma_qp g_spdk_rdma_qp = {};
struct spdk_rdma_srq g_spdk_rdma_srq = {};
DEFINE_STUB(spdk_rdma_qp_create, struct spdk_rdma_qp *, (struct rdma_cm_id *cm_id,
struct spdk_rdma_qp_init_attr *qp_attr), &g_spdk_rdma_qp);
DEFINE_STUB(spdk_rdma_qp_accept, int, (struct spdk_rdma_qp *spdk_rdma_qp,
@ -24,7 +26,7 @@ DEFINE_STUB(spdk_rdma_qp_queue_send_wrs, bool, (struct spdk_rdma_qp *spdk_rdma_q
DEFINE_STUB(spdk_rdma_qp_flush_send_wrs, int, (struct spdk_rdma_qp *spdk_rdma_qp,
struct ibv_send_wr **bad_wr), 0);
DEFINE_STUB(spdk_rdma_srq_create, struct spdk_rdma_srq *,
(struct spdk_rdma_srq_init_attr *init_attr), NULL);
(struct spdk_rdma_srq_init_attr *init_attr), &g_spdk_rdma_srq);
DEFINE_STUB(spdk_rdma_srq_destroy, int, (struct spdk_rdma_srq *rdma_srq), 0);
DEFINE_STUB(spdk_rdma_srq_queue_recv_wrs, bool, (struct spdk_rdma_srq *rdma_srq,
struct ibv_recv_wr *first), true);