rdma: batch rdma sends.

By batching ibv send work requests and posting them once per poll, we
can reduce the number of MMIO (doorbell) writes that we do.
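
For illustration only (not part of the patch), a minimal sketch of the pattern using placeholder names rather than the SPDK ones: send wrs are chained onto a per-qpair pending list as requests are processed, and the whole chain is flushed with a single ibv_post_send() per poll, so the doorbell is rung once per batch instead of once per request.

#include <stddef.h>
#include <infiniband/verbs.h>

/* Illustrative per-qpair pending send list (mirrors the new spdk_nvmf_send_wr_list). */
struct pending_sends {
	struct ibv_send_wr *first;
	struct ibv_send_wr *last;
};

/* Queue a single wr, or a pre-linked chain of wrs, instead of posting it immediately. */
static void
queue_send_wrs(struct pending_sends *sends, struct ibv_send_wr *first)
{
	struct ibv_send_wr *last = first;

	while (last->next != NULL) {
		last = last->next;
	}

	if (sends->first == NULL) {
		sends->first = first;
	} else {
		sends->last->next = first;
	}
	sends->last = last;
}

/* Flush the accumulated chain with one ibv_post_send() (one MMIO doorbell write). */
static int
flush_send_wrs(struct ibv_qp *qp, struct pending_sends *sends)
{
	struct ibv_send_wr *bad_wr = NULL;
	int rc = 0;

	if (sends->first != NULL) {
		rc = ibv_post_send(qp, sends->first, &bad_wr);
		sends->first = NULL;
		sends->last = NULL;
	}
	return rc;
}
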

Change-Id: Ia5a07b0037365abfa8732629c34d34a9ed49ac70
Signed-off-by: Seth Howell <seth.howell@intel.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/449253
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Seth Howell 2019-03-26 11:23:34 -07:00 committed by Jim Harris
parent 4c03cae6f5
commit 9d63933b7f


@@ -286,6 +286,11 @@ struct spdk_nvmf_rdma_resource_opts {
bool shared;
};
struct spdk_nvmf_send_wr_list {
struct ibv_send_wr *first;
struct ibv_send_wr *last;
};
struct spdk_nvmf_rdma_resources {
/* Array of size "max_queue_depth" containing RDMA requests. */
struct spdk_nvmf_rdma_request *reqs;
@@ -356,6 +361,9 @@ struct spdk_nvmf_rdma_qpair {
/* The maximum number of SGEs per WR on the recv queue */
uint32_t max_recv_sge;
/* The list of pending send requests for a transfer */
struct spdk_nvmf_send_wr_list sends_to_post;
struct spdk_nvmf_rdma_resources *resources;
STAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_read_queue;
@@ -619,7 +627,8 @@ nvmf_rdma_request_free_data(struct spdk_nvmf_rdma_request *rdma_req,
while (next_data_wr) {
current_data_wr = next_data_wr;
send_wr = current_data_wr->wr.next;
if (send_wr != NULL && send_wr != &rdma_req->rsp.wr) {
if (send_wr != NULL && send_wr != &rdma_req->rsp.wr &&
send_wr->wr_id == current_data_wr->wr.wr_id) {
next_data_wr = SPDK_CONTAINEROF(send_wr, struct spdk_nvmf_rdma_request_data, wr);
} else {
next_data_wr = NULL;
@@ -979,6 +988,9 @@ spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair)
spdk_trace_record(TRACE_RDMA_QP_CREATE, 0, 0, (uintptr_t)rqpair->cm_id, 0);
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "New RDMA Connection: %p\n", qpair);
rqpair->sends_to_post.first = NULL;
rqpair->sends_to_post.last = NULL;
if (rqpair->poller->srq == NULL) {
rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
transport = &rtransport->transport;
@@ -1013,14 +1025,34 @@ error:
return -1;
}
/* Append the given send wr structure to the qpair's outstanding sends list. */
/* This function accepts either a single wr or the first wr in a linked list. */
static void
nvmf_rdma_qpair_queue_send_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *first)
{
struct ibv_send_wr *last;
last = first;
while (last->next != NULL) {
last = last->next;
}
if (rqpair->sends_to_post.first == NULL) {
rqpair->sends_to_post.first = first;
rqpair->sends_to_post.last = last;
} else {
rqpair->sends_to_post.last->next = first;
rqpair->sends_to_post.last = last;
}
}
static int
request_transfer_in(struct spdk_nvmf_request *req)
{
int rc;
struct spdk_nvmf_rdma_request *rdma_req;
struct spdk_nvmf_qpair *qpair;
struct spdk_nvmf_rdma_qpair *rqpair;
struct ibv_send_wr *bad_wr = NULL;
qpair = req->qpair;
rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
@@ -1029,13 +1061,7 @@ request_transfer_in(struct spdk_nvmf_request *req)
assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
assert(rdma_req != NULL);
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA READ POSTED. Request: %p Connection: %p\n", req, qpair);
rc = ibv_post_send(rqpair->cm_id->qp, &rdma_req->data.wr, &bad_wr);
if (rc) {
SPDK_ERRLOG("Unable to transfer data from host to target\n");
return -1;
}
nvmf_rdma_qpair_queue_send_wrs(rqpair, &rdma_req->data.wr);
rqpair->current_read_depth += rdma_req->num_outstanding_data_wr;
rqpair->current_send_depth += rdma_req->num_outstanding_data_wr;
return 0;
@@ -1051,7 +1077,7 @@ request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
struct spdk_nvmf_rdma_qpair *rqpair;
struct spdk_nvme_cpl *rsp;
struct ibv_recv_wr *bad_recv_wr = NULL;
struct ibv_send_wr *send_wr, *bad_send_wr = NULL;
struct ibv_send_wr *first = NULL;
*data_posted = 0;
qpair = req->qpair;
@@ -1089,24 +1115,15 @@ request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
* RDMA WRITEs to transfer data, plus an RDMA SEND
* containing the response.
*/
send_wr = &rdma_req->rsp.wr;
first = &rdma_req->rsp.wr;
if (rsp->status.sc == SPDK_NVME_SC_SUCCESS &&
req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA WRITE POSTED. Request: %p Connection: %p\n", req, qpair);
send_wr = &rdma_req->data.wr;
first = &rdma_req->data.wr;
*data_posted = 1;
num_outstanding_data_wr = rdma_req->num_outstanding_data_wr;
}
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA SEND POSTED. Request: %p Connection: %p\n", req, qpair);
/* Send the completion */
rc = ibv_post_send(rqpair->cm_id->qp, send_wr, &bad_send_wr);
if (rc) {
SPDK_ERRLOG("Unable to send response capsule\n");
return rc;
}
nvmf_rdma_qpair_queue_send_wrs(rqpair, first);
/* +1 for the rsp wr */
rqpair->current_send_depth += num_outstanding_data_wr + 1;
@@ -1797,6 +1814,7 @@ nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req,
rdma_req->req.length = 0;
rdma_req->req.iovcnt = 0;
rdma_req->req.data = NULL;
rdma_req->rsp.wr.next = NULL;
rdma_req->data.wr.next = NULL;
rqpair->qd--;
@@ -3190,6 +3208,88 @@ spdk_nvmf_rdma_req_is_completing(struct spdk_nvmf_rdma_request *rdma_req)
}
#endif
static void
_qp_reset_failed_sends(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *bad_wr, int rc)
{
struct spdk_nvmf_rdma_wr *bad_rdma_wr;
struct spdk_nvmf_rdma_request *prev_rdma_req = NULL, *cur_rdma_req = NULL;
SPDK_ERRLOG("Failed to post a send for the qpair %p with errno %d\n", rqpair, -rc);
for (; bad_wr != NULL; bad_wr = bad_wr->next) {
bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_wr->wr_id;
assert(rqpair->current_send_depth > 0);
rqpair->current_send_depth--;
switch (bad_rdma_wr->type) {
case RDMA_WR_TYPE_DATA:
cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr);
if (bad_wr->opcode == IBV_WR_RDMA_READ) {
assert(rqpair->current_read_depth > 0);
rqpair->current_read_depth--;
}
break;
case RDMA_WR_TYPE_SEND:
cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr);
break;
default:
SPDK_ERRLOG("Found a RECV in the list of pending SEND requests for qpair %p\n", rqpair);
prev_rdma_req = cur_rdma_req;
continue;
}
if (prev_rdma_req == cur_rdma_req) {
/* This request was already handled by an earlier wr, i.e. we were performing an nvme read. */
/* We only have to check against prev_rdma_req since each request's wrs are contiguous in this list. */
continue;
}
switch (cur_rdma_req->state) {
case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
cur_rdma_req->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
cur_rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
break;
case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST:
case RDMA_REQUEST_STATE_COMPLETING:
cur_rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
break;
default:
SPDK_ERRLOG("Found a request in a bad state %d when draining pending SEND requests for qpair %p\n",
cur_rdma_req->state, rqpair);
continue;
}
spdk_nvmf_rdma_request_process(rtransport, cur_rdma_req);
prev_rdma_req = cur_rdma_req;
}
if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) {
/* Disconnect the connection. */
spdk_nvmf_rdma_start_disconnect(rqpair);
}
}
static void
_poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_poller *rpoller)
{
struct spdk_nvmf_rdma_qpair *rqpair;
struct ibv_send_wr *bad_wr = NULL;
int rc;
TAILQ_FOREACH(rqpair, &rpoller->qpairs, link) {
if (rqpair->sends_to_post.first != NULL) {
rc = ibv_post_send(rqpair->cm_id->qp, rqpair->sends_to_post.first, &bad_wr);
/* bad wr always points to the first wr that failed. */
if (rc) {
_qp_reset_failed_sends(rtransport, rqpair, bad_wr, rc);
}
rqpair->sends_to_post.first = NULL;
rqpair->sends_to_post.last = NULL;
}
}
}
static int
spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_poller *rpoller)
@@ -3319,6 +3419,9 @@ spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
return -1;
}
/* submit outstanding work requests. */
_poller_submit_sends(rtransport, rpoller);
return count;
}
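
For context, a hedged sketch (not SPDK code) of the verbs failure semantics that _qp_reset_failed_sends relies on: when a batched ibv_post_send() fails, bad_wr is set to the first wr that could not be posted, and every wr chained after it via ->next was also left unposted, so the cleanup path walks that chain. The handle_unposted_wr callback below is hypothetical, standing in for the per-request accounting done in the patch.

#include <infiniband/verbs.h>

/* Post a chain of send wrs and, on failure, visit every wr that was not
 * handed to the HCA so its resources can be reclaimed. */
static int
post_chain_or_cleanup(struct ibv_qp *qp, struct ibv_send_wr *first,
		      void (*handle_unposted_wr)(struct ibv_send_wr *wr))
{
	struct ibv_send_wr *bad_wr = NULL;
	int rc;

	rc = ibv_post_send(qp, first, &bad_wr);
	if (rc != 0) {
		/* bad_wr points at the first wr that failed; it and everything
		 * linked after it were never posted. */
		for (struct ibv_send_wr *wr = bad_wr; wr != NULL; wr = wr->next) {
			handle_unposted_wr(wr);
		}
	}
	return rc;
}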