From 0480df054b0b74708b92f44b3ede37dd059d1615 Mon Sep 17 00:00:00 2001
From: Daniel Verkamp
Date: Fri, 15 Jul 2016 16:33:03 -0700
Subject: [PATCH] nvmf: generate RDMA SGLs on the fly

There is no need to allocate ibv_sge structures within the RDMA
request; we can just fill them out on the stack right before
submitting each request.

Change-Id: I438ff0be2f6d07ffa933255c92c4ec964aa1b235
Signed-off-by: Daniel Verkamp
---
 lib/nvmf/rdma.c | 125 ++++++++++++++++++++++++++----------------------
 1 file changed, 67 insertions(+), 58 deletions(-)

diff --git a/lib/nvmf/rdma.c b/lib/nvmf/rdma.c
index 64830211e..6586492f9 100644
--- a/lib/nvmf/rdma.c
+++ b/lib/nvmf/rdma.c
@@ -93,12 +93,7 @@ struct spdk_nvmf_rdma_request {
 	union nvmf_c2h_msg		rsp;
 	struct ibv_mr			*rsp_mr;
-	struct ibv_sge			send_sgl;
-	struct ibv_sge			recv_sgl[2];
-
 	struct ibv_mr			*bb_mr;
-	uint8_t				*bb;
-	uint32_t			bb_len;
 };
 
 struct spdk_nvmf_rdma {
@@ -207,11 +202,13 @@ free_rdma_req(struct spdk_nvmf_rdma_request *rdma_req)
 		SPDK_ERRLOG("Unable to de-register rsp_mr\n");
 	}
 
-	if (rdma_req->bb_mr && rdma_dereg_mr(rdma_req->bb_mr)) {
-		SPDK_ERRLOG("Unable to de-register bb_mr\n");
+	if (rdma_req->bb_mr) {
+		rte_free(rdma_req->bb_mr->addr);
+		if (rdma_dereg_mr(rdma_req->bb_mr)) {
+			SPDK_ERRLOG("Unable to de-register bb_mr\n");
+		}
 	}
 
-	rte_free(rdma_req->bb);
 	rte_free(rdma_req);
 }
@@ -242,6 +239,8 @@ alloc_rdma_req(struct spdk_nvmf_conn *conn)
 {
 	struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);
 	struct spdk_nvmf_rdma_request *rdma_req;
+	void *bb;
+	size_t bb_len;
 
 	rdma_req = rte_zmalloc("nvmf_rdma_req", sizeof(*rdma_req), 0);
 	if (!rdma_req) {
@@ -256,31 +255,21 @@ alloc_rdma_req(struct spdk_nvmf_conn *conn)
 		return NULL;
 	}
 
-	rdma_req->recv_sgl[0].addr = (uint64_t)&rdma_req->cmd;
-	rdma_req->recv_sgl[0].length = sizeof(rdma_req->cmd);
-	rdma_req->recv_sgl[0].lkey = rdma_req->cmd_mr->lkey;
-
-	rdma_req->bb_len = DEFAULT_BB_SIZE;
-	rdma_req->bb = rte_zmalloc("nvmf_bb", rdma_req->bb_len, 0);
-	if (!rdma_req->bb) {
-		SPDK_ERRLOG("Unable to get %u byte bounce buffer\n", rdma_req->bb_len);
+	bb_len = DEFAULT_BB_SIZE;
+	bb = rte_zmalloc("nvmf_bb", bb_len, 0);
+	if (!bb) {
+		SPDK_ERRLOG("Unable to get %zu byte bounce buffer\n", bb_len);
 		free_rdma_req(rdma_req);
 		return NULL;
 	}
 
-	rdma_req->bb_mr = rdma_reg_read(rdma_conn->cm_id,
-					(void *)rdma_req->bb,
-					rdma_req->bb_len);
+	rdma_req->bb_mr = rdma_reg_read(rdma_conn->cm_id, bb, bb_len);
 	if (rdma_req->bb_mr == NULL) {
 		SPDK_ERRLOG("Unable to register bb_mr\n");
+		rte_free(bb);
 		free_rdma_req(rdma_req);
 		return NULL;
 	}
 
-	/* initialize data buffer sgl */
-	rdma_req->recv_sgl[1].addr = (uint64_t)rdma_req->bb;
-	rdma_req->recv_sgl[1].length = rdma_req->bb_len;
-	rdma_req->recv_sgl[1].lkey = rdma_req->bb_mr->lkey;
-
 	rdma_req->rsp_mr = rdma_reg_msgs(rdma_conn->cm_id, &rdma_req->rsp, sizeof(rdma_req->rsp));
 	if (rdma_req->rsp_mr == NULL) {
 		SPDK_ERRLOG("Unable to register rsp_mr\n");
@@ -288,11 +277,6 @@ alloc_rdma_req(struct spdk_nvmf_conn *conn)
 		return NULL;
 	}
 
-	/* initialize send_sgl */
-	rdma_req->send_sgl.addr = (uint64_t)&rdma_req->rsp;
-	rdma_req->send_sgl.length = sizeof(rdma_req->rsp);
-	rdma_req->send_sgl.lkey = rdma_req->rsp_mr->lkey;
-
 	rdma_req->req.cmd = &rdma_req->cmd;
 	rdma_req->req.rsp = &rdma_req->rsp;
 	rdma_req->req.conn = conn;
@@ -350,34 +334,34 @@ static void
 nvmf_ibv_send_wr_init(struct ibv_send_wr *wr,
 		      struct spdk_nvmf_request *req,
 		      struct ibv_sge *sg_list,
-		      uint64_t wr_id,
 		      enum ibv_wr_opcode opcode,
 		      int send_flags)
 {
+	struct spdk_nvmf_rdma_request *rdma_req = get_rdma_req(req);
+
 	RTE_VERIFY(wr != NULL);
 	RTE_VERIFY(sg_list != NULL);
 
 	memset(wr, 0, sizeof(*wr));
-	wr->wr_id = wr_id;
+	wr->wr_id = (uint64_t)rdma_req;
 	wr->next = NULL;
 	wr->opcode = opcode;
 	wr->send_flags = send_flags;
 	wr->sg_list = sg_list;
 	wr->num_sge = 1;
+}
 
-	if (req != NULL) {
-		struct spdk_nvme_sgl_descriptor *sgl = &req->cmd->nvme_cmd.dptr.sgl1;
+static void
+nvmf_ibv_send_wr_set_rkey(struct ibv_send_wr *wr, struct spdk_nvmf_request *req)
+{
+	struct spdk_nvme_sgl_descriptor *sgl = &req->cmd->nvme_cmd.dptr.sgl1;
 
-		RTE_VERIFY(sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK);
+	RTE_VERIFY(sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK);
 
-		wr->wr.rdma.rkey = sgl->keyed.key;
-		wr->wr.rdma.remote_addr = sgl->address;
+	wr->wr.rdma.rkey = sgl->keyed.key;
+	wr->wr.rdma.remote_addr = sgl->address;
 
-		SPDK_TRACELOG(SPDK_TRACE_RDMA, "rkey %x remote_addr %p\n",
-			      wr->wr.rdma.rkey, (void *)wr->wr.rdma.remote_addr);
-	}
-
-	nvmf_trace_ibv_sge(wr->sg_list);
+	SPDK_TRACELOG(SPDK_TRACE_RDMA, "rkey %x remote_addr %p\n",
+		      wr->wr.rdma.rkey, (void *)wr->wr.rdma.remote_addr);
 }
 
 static int
@@ -387,13 +371,16 @@ nvmf_post_rdma_read(struct spdk_nvmf_request *req)
 	struct spdk_nvmf_conn *conn = req->conn;
 	struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);
 	struct spdk_nvmf_rdma_request *rdma_req = get_rdma_req(req);
+	struct ibv_sge sge;
 	int rc;
 
-	/* temporarily adjust SGE to only copy what the host is prepared to send. */
-	rdma_req->recv_sgl[1].length = req->length;
+	sge.addr = (uint64_t)rdma_req->bb_mr->addr;
+	sge.lkey = rdma_req->bb_mr->lkey;
+	sge.length = req->length;
+	nvmf_trace_ibv_sge(&sge);
 
-	nvmf_ibv_send_wr_init(&wr, req, &rdma_req->recv_sgl[1], (uint64_t)rdma_req,
-			      IBV_WR_RDMA_READ, IBV_SEND_SIGNALED);
+	nvmf_ibv_send_wr_init(&wr, req, &sge, IBV_WR_RDMA_READ, IBV_SEND_SIGNALED);
+	nvmf_ibv_send_wr_set_rkey(&wr, req);
 
 	spdk_trace_record(TRACE_RDMA_READ_START, 0, 0, (uint64_t)req, 0);
 	rc = ibv_post_send(rdma_conn->qp, &wr, &bad_wr);
@@ -410,13 +397,16 @@ nvmf_post_rdma_write(struct spdk_nvmf_conn *conn,
 	struct ibv_send_wr wr, *bad_wr = NULL;
 	struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);
 	struct spdk_nvmf_rdma_request *rdma_req = get_rdma_req(req);
+	struct ibv_sge sge;
 	int rc;
 
-	/* temporarily adjust SGE to only copy what the host is prepared to receive. */
-	rdma_req->recv_sgl[1].length = req->length;
+	sge.addr = (uint64_t)rdma_req->bb_mr->addr;
+	sge.lkey = rdma_req->bb_mr->lkey;
+	sge.length = req->length;
+	nvmf_trace_ibv_sge(&sge);
 
-	nvmf_ibv_send_wr_init(&wr, req, &rdma_req->recv_sgl[1], (uint64_t)rdma_req,
-			      IBV_WR_RDMA_WRITE, 0);
+	nvmf_ibv_send_wr_init(&wr, req, &sge, IBV_WR_RDMA_WRITE, 0);
+	nvmf_ibv_send_wr_set_rkey(&wr, req);
 
 	spdk_trace_record(TRACE_RDMA_WRITE_START, 0, 0, (uint64_t)req, 0);
 	rc = ibv_post_send(rdma_conn->qp, &wr, &bad_wr);
@@ -433,6 +423,7 @@ nvmf_post_rdma_recv(struct spdk_nvmf_conn *conn,
 	struct ibv_recv_wr wr, *bad_wr = NULL;
 	struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);
 	struct spdk_nvmf_rdma_request *rdma_req = get_rdma_req(req);
+	struct ibv_sge sg_list[2];
 	int rc;
 
 	/* Update Connection SQ Tracking, increment
@@ -442,13 +433,22 @@ nvmf_post_rdma_recv(struct spdk_nvmf_conn *conn,
 	conn->sq_head < (rdma_conn->queue_depth - 1) ?
 	(conn->sq_head++) : (conn->sq_head = 0);
 	SPDK_TRACELOG(SPDK_TRACE_RDMA, "sq_head %x, sq_depth %x\n",
 		      conn->sq_head, rdma_conn->queue_depth);
 
+	sg_list[0].addr = (uint64_t)&rdma_req->cmd;
+	sg_list[0].length = sizeof(rdma_req->cmd);
+	sg_list[0].lkey = rdma_req->cmd_mr->lkey;
+	nvmf_trace_ibv_sge(&sg_list[0]);
+
+	sg_list[1].addr = (uint64_t)rdma_req->bb_mr->addr;
+	sg_list[1].length = rdma_req->bb_mr->length;
+	sg_list[1].lkey = rdma_req->bb_mr->lkey;
+	nvmf_trace_ibv_sge(&sg_list[1]);
+
+	memset(&wr, 0, sizeof(wr));
 	wr.wr_id = (uintptr_t)rdma_req;
 	wr.next = NULL;
-	wr.sg_list = &rdma_req->recv_sgl[0];
+	wr.sg_list = sg_list;
 	wr.num_sge = 2;
 
-	nvmf_trace_ibv_sge(&rdma_req->recv_sgl[0]);
-
 	rc = ibv_post_recv(rdma_conn->qp, &wr, &bad_wr);
 	if (rc) {
 		SPDK_ERRLOG("Failure posting rdma recv, rc = 0x%x\n", rc);
@@ -463,13 +463,15 @@ nvmf_post_rdma_send(struct spdk_nvmf_conn *conn,
 	struct ibv_send_wr wr, *bad_wr = NULL;
 	struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);
 	struct spdk_nvmf_rdma_request *rdma_req = get_rdma_req(req);
+	struct ibv_sge sge;
 	int rc;
 
-	/* restore the SGL length that may have been modified */
-	rdma_req->recv_sgl[1].length = rdma_req->bb_len;
+	sge.addr = (uint64_t)&rdma_req->rsp;
+	sge.length = sizeof(rdma_req->rsp);
+	sge.lkey = rdma_req->rsp_mr->lkey;
+	nvmf_trace_ibv_sge(&sge);
 
-	nvmf_ibv_send_wr_init(&wr, NULL, &rdma_req->send_sgl, (uint64_t)rdma_req,
-			      IBV_WR_SEND, IBV_SEND_SIGNALED);
+	nvmf_ibv_send_wr_init(&wr, req, &sge, IBV_WR_SEND, IBV_SEND_SIGNALED);
 
 	spdk_trace_record(TRACE_NVMF_IO_COMPLETE, 0, 0, (uint64_t)req, 0);
 	rc = ibv_post_send(rdma_conn->qp, &wr, &bad_wr);
@@ -766,6 +768,7 @@ nvmf_recv(struct spdk_nvmf_rdma_request *rdma_req, struct ibv_wc *wc)
 {
 	int ret;
 	struct spdk_nvmf_request *req;
+	void *bb;
 
 	if (wc->byte_len < sizeof(struct spdk_nvmf_capsule_cmd)) {
 		SPDK_ERRLOG("recv length %u less than capsule header\n", wc->byte_len);
@@ -774,9 +777,10 @@ nvmf_recv(struct spdk_nvmf_rdma_request *rdma_req, struct ibv_wc *wc)
 
 	req = &rdma_req->req;
 
+	bb = rdma_req->bb_mr->addr;
 	ret = spdk_nvmf_request_prep_data(req,
-					  rdma_req->bb, wc->byte_len - sizeof(struct spdk_nvmf_capsule_cmd),
-					  rdma_req->bb, rdma_req->recv_sgl[1].length);
+					  bb, wc->byte_len - sizeof(struct spdk_nvmf_capsule_cmd),
+					  bb, rdma_req->bb_mr->length);
 	if (ret < 0) {
 		SPDK_ERRLOG("prep_data failed\n");
 		return spdk_nvmf_request_complete(req);
@@ -1092,6 +1096,11 @@ nvmf_check_rdma_completions(struct spdk_nvmf_conn *conn)
 		}
 
 		rdma_req = (struct spdk_nvmf_rdma_request *)wc.wr_id;
+		if (rdma_req == NULL) {
+			SPDK_ERRLOG("Got CQ completion for NULL rdma_req\n");
+			return -1;
+		}
+
 		req = &rdma_req->req;
 
 		switch (wc.opcode) {
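
The pattern this patch adopts -- describing the local buffer with a
stack-allocated ibv_sge filled out immediately before posting -- works
because ibv_post_send() consumes the work request and its sg_list
before returning, so the structures do not need to outlive the call.
The sketch below illustrates the same idea outside of SPDK; it assumes
a connected queue pair and an already-registered MR, and
post_rdma_read_sketch() and its parameters are illustrative names,
not SPDK or libibverbs APIs.

#include <stdint.h>
#include <string.h>
#include <infiniband/verbs.h>

/*
 * Minimal sketch of the on-the-fly SGL pattern: the ibv_sge lives on
 * the stack and is filled out just before ibv_post_send(), rather than
 * being pre-allocated inside a per-request structure.  This is safe
 * because ibv_post_send() copies the WR and its sg_list into the
 * provider's queue before returning.
 */
static int
post_rdma_read_sketch(struct ibv_qp *qp, struct ibv_mr *mr, uint32_t length,
		      uint64_t remote_addr, uint32_t rkey, uint64_t wr_id)
{
	struct ibv_sge sge;
	struct ibv_send_wr wr, *bad_wr = NULL;

	/* Describe the local (registered) buffer on the fly. */
	sge.addr = (uint64_t)mr->addr;
	sge.length = length;
	sge.lkey = mr->lkey;

	memset(&wr, 0, sizeof(wr));
	wr.wr_id = wr_id;		/* returned in the completion */
	wr.opcode = IBV_WR_RDMA_READ;
	wr.send_flags = IBV_SEND_SIGNALED;
	wr.sg_list = &sge;
	wr.num_sge = 1;
	wr.wr.rdma.remote_addr = remote_addr;
	wr.wr.rdma.rkey = rkey;

	return ibv_post_send(qp, &wr, &bad_wr);
}

Note that only the ibv_sge descriptors can be transient; the registered
buffer they point at (the bounce buffer behind bb_mr in this patch)
must remain valid until the corresponding completion is reaped.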