rdma: Unset IBV_SEND_SIGNALED flag for RDMA_WRITE operations

Unsetting this flag decreases the number of WRs retrieved during CQ polling and reduces the
overall processing time. Since an RDMA_WRITE operation is always paired with an RDMA_SEND (the response),
the number of outstanding WRs can be tracked by relying on the completed response WR.
Completed WRs of type RDMA_WR_TYPE_DATA are now always RDMA_READ operations.
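
For illustration only, here is a minimal standalone sketch of the signaling policy this change establishes (not part of the patch; the helper name and the direction enum are invented for the example): RDMA_WRITE data WRs are left unsignaled because the signaled RDMA_SEND of the response that follows them already confirms their completion, while RDMA_READ data WRs stay signaled so their completions can advance the request state machine.

#include <assert.h>
#include <infiniband/verbs.h>

/* Stand-in for SPDK's enum spdk_nvme_data_transfer; used only in this sketch. */
enum xfer_dir {
	XFER_CONTROLLER_TO_HOST,	/* data pushed to the host -> RDMA_WRITE */
	XFER_HOST_TO_CONTROLLER,	/* data pulled from the host -> RDMA_READ */
};

/* Hypothetical helper mirroring the policy introduced by the patch:
 * write data WRs generate no CQE, read data WRs do. */
static void
set_data_wr_signaling(struct ibv_send_wr *wr, enum xfer_dir dir)
{
	if (dir == XFER_CONTROLLER_TO_HOST) {
		wr->opcode = IBV_WR_RDMA_WRITE;
		wr->send_flags = 0;			/* unsignaled: no completion generated */
	} else if (dir == XFER_HOST_TO_CONTROLLER) {
		wr->opcode = IBV_WR_RDMA_READ;
		wr->send_flags = IBV_SEND_SIGNALED;	/* CQE expected when the read completes */
	} else {
		assert(0);				/* unexpected transfer direction */
	}
}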

The patch shows 2% better performance for read operations on an x86 machine. Performance was measured using the perf tool with the following parameters:
-q 16 -o 4096 -w read -t 300 -c 2
against an NVMe null device; each measurement was repeated 4 times.

avg IOPS (with patch): 865861.71
avg IOPS (master): 847958.77

avg latency (with patch): 18.46 [us]
avg latency (master): 18.85 [us]
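
For reference, the IOPS delta works out to (865861.71 - 847958.77) / 847958.77 ≈ 2.1%, consistent with the ~2% improvement quoted above.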

Change-Id: Ifd3329fbd0e45dd5f27213b36b9444308660fc8b
Signed-off-by: Alexey Marchuk <alexeymar@mellanox.com>
Signed-off-by: Sasha Kotchubievsky <sashakot@mellanox.com>
Signed-off-by: Evgenii Kochetov <evgeniik@mellanox.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/456469
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Seth Howell <seth.howell5141@gmail.com>
Reviewed-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Commit: 53777de855 (parent 1900c8c261)
Author: Alexey Marchuk, 2019-05-24 07:13:09 +00:00
Committed by: Ben Walker

@@ -2,7 +2,7 @@
  *   BSD LICENSE
  *
  *   Copyright (c) Intel Corporation. All rights reserved.
- *   Copyright (c) 2018 Mellanox Technologies LTD. All rights reserved.
+ *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -1487,12 +1487,13 @@ nvmf_request_alloc_wrs(struct spdk_nvmf_rdma_transport *rtransport,
 	for (i = 0; i < num_sgl_descriptors; i++) {
 		if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
 			current_data_wr->wr.opcode = IBV_WR_RDMA_WRITE;
+			current_data_wr->wr.send_flags = 0;
 		} else if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
 			current_data_wr->wr.opcode = IBV_WR_RDMA_READ;
+			current_data_wr->wr.send_flags = IBV_SEND_SIGNALED;
 		} else {
 			assert(false);
 		}
-		work_requests[i]->wr.send_flags = IBV_SEND_SIGNALED;
 		work_requests[i]->wr.sg_list = work_requests[i]->sgl;
 		work_requests[i]->wr.wr_id = rdma_req->data.wr.wr_id;
 		current_data_wr->wr.next = &work_requests[i]->wr;
@@ -1502,9 +1503,11 @@ nvmf_request_alloc_wrs(struct spdk_nvmf_rdma_transport *rtransport,
 	if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
 		current_data_wr->wr.opcode = IBV_WR_RDMA_WRITE;
 		current_data_wr->wr.next = &rdma_req->rsp.wr;
+		current_data_wr->wr.send_flags = 0;
 	} else if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
 		current_data_wr->wr.opcode = IBV_WR_RDMA_READ;
 		current_data_wr->wr.next = NULL;
+		current_data_wr->wr.send_flags = IBV_SEND_SIGNALED;
 	}
 	return 0;
 }
@@ -1755,9 +1758,11 @@ spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
 	if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
 		rdma_req->data.wr.opcode = IBV_WR_RDMA_WRITE;
 		rdma_req->data.wr.next = &rdma_req->rsp.wr;
+		rdma_req->data.wr.send_flags &= ~IBV_SEND_SIGNALED;
 	} else if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
 		rdma_req->data.wr.opcode = IBV_WR_RDMA_READ;
 		rdma_req->data.wr.next = NULL;
+		rdma_req->data.wr.send_flags |= IBV_SEND_SIGNALED;
 	}
 
 	/* set the number of outstanding data WRs for this request. */
@@ -3427,10 +3432,11 @@ spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
 			}
 
 			rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
-			rqpair->current_send_depth--;
+			/* +1 for the response wr */
+			rqpair->current_send_depth -= rdma_req->num_outstanding_data_wr + 1;
+			rdma_req->num_outstanding_data_wr = 0;
 
 			spdk_nvmf_rdma_request_process(rtransport, rdma_req);
-			assert(rdma_req->num_outstanding_data_wr == 0);
 			break;
 		case RDMA_WR_TYPE_RECV:
 			/* rdma_recv->qpair will be invalid if using an SRQ. In that case we have to get the qpair from the wc. */
@@ -3462,16 +3468,13 @@ spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
 			rqpair->current_send_depth--;
 			rdma_req->num_outstanding_data_wr--;
 			if (!wc[i].status) {
-				if (wc[i].opcode == IBV_WC_RDMA_READ) {
+				assert(wc[i].opcode == IBV_WC_RDMA_READ);
 				rqpair->current_read_depth--;
 				/* wait for all outstanding reads associated with the same rdma_req to complete before proceeding. */
 				if (rdma_req->num_outstanding_data_wr == 0) {
 					rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
 					spdk_nvmf_rdma_request_process(rtransport, rdma_req);
 				}
-				} else {
-					assert(wc[i].opcode == IBV_WC_RDMA_WRITE);
-				}
 			} else {
 				/* If the data transfer fails still force the queue into the error state,
 				 * if we were performing an RDMA_READ, we need to force the request into a