spdk/lib/nvmf/rdma.c
Ben Walker 8a701c3f8d nvmf: Use the inline SGL for keyed SGLs if the size is small enough
For small SGLs, even if they are keyed and not inline, use the
buffer we allocated for inline data.

Change-Id: I5051c43aabacb20a4247b2feaf2af801dba5f5a9
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
2016-07-26 15:24:01 -07:00


/*-
* BSD LICENSE
*
* Copyright (c) Intel Corporation.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <arpa/inet.h>
#include <fcntl.h>
#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>
#include <unistd.h>
#include <stdio.h>
#include <stdint.h>
#include <rte_config.h>
#include <rte_debug.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_lcore.h>
#include <rte_malloc.h>
#include "nvmf_internal.h"
#include "request.h"
#include "session.h"
#include "subsystem.h"
#include "transport.h"
#include "spdk/assert.h"
#include "spdk/log.h"
#include "spdk/nvmf_spec.h"
#include "spdk/trace.h"
#define ACCEPT_TIMEOUT (rte_get_timer_hz() >> 10) /* ~1ms */
/*
* RDMA Connection Resource Defaults
*/
#define NVMF_DEFAULT_TX_SGE 1
#define NVMF_DEFAULT_RX_SGE 2
struct spdk_nvmf_rdma_request {
struct spdk_nvmf_request req;
/* In Capsule data buffer */
void *buf;
};
struct spdk_nvmf_rdma_conn {
struct spdk_nvmf_conn conn;
struct rdma_cm_id *cm_id;
struct ibv_context *ctx;
struct ibv_comp_channel *comp_channel;
struct ibv_cq *cq;
struct ibv_qp *qp;
/* The maximum number of I/O outstanding on this connection at one time */
uint16_t queue_depth;
/* The maximum number of active RDMA READ and WRITE operations at one time */
uint16_t rw_depth;
/* The current number of I/O outstanding on this connection */
int num_outstanding_reqs;
/* Array of size "queue_depth" containing RDMA requests. */
struct spdk_nvmf_rdma_request *reqs;
/* Array of size "queue_depth" containing 64 byte capsules
* used for receive.
*/
union nvmf_h2c_msg *cmds;
struct ibv_mr *cmds_mr;
/* Array of size "queue_depth" containing 16 byte completions
* to be sent back to the user.
*/
union nvmf_c2h_msg *cpls;
struct ibv_mr *cpls_mr;
/* Array of size "queue_depth * InCapsuleDataSize" containing
* buffers to be used for in capsule data. TODO: Currently, all data
* is in capsule.
*/
void *bufs;
struct ibv_mr *bufs_mr;
TAILQ_ENTRY(spdk_nvmf_rdma_conn) link;
};
/* List of RDMA connections that have not yet received a CONNECT capsule */
static TAILQ_HEAD(, spdk_nvmf_rdma_conn) g_pending_conns = TAILQ_HEAD_INITIALIZER(g_pending_conns);
struct spdk_nvmf_rdma {
struct rte_timer acceptor_timer;
struct rdma_event_channel *acceptor_event_channel;
struct rdma_cm_id *acceptor_listen_id;
uint16_t max_queue_depth;
uint32_t max_io_size;
uint32_t in_capsule_data_size;
};
static struct spdk_nvmf_rdma g_rdma = { };
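/* The RDMA structures embed the generic NVMf structures, so the helpers
* below recover the containing RDMA object from a pointer to the embedded
* member by subtracting the member's offset (the "container_of" idiom).
* For example, given struct spdk_nvmf_rdma_conn *rdma_conn, passing
* &rdma_conn->conn to get_rdma_conn() returns the original rdma_conn.
*/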
static inline struct spdk_nvmf_rdma_conn *
get_rdma_conn(struct spdk_nvmf_conn *conn)
{
return (struct spdk_nvmf_rdma_conn *)((uintptr_t)conn - offsetof(struct spdk_nvmf_rdma_conn, conn));
}
static inline struct spdk_nvmf_rdma_request *
get_rdma_req(struct spdk_nvmf_request *req)
{
return (struct spdk_nvmf_rdma_request *)((uintptr_t)req - offsetof(struct spdk_nvmf_rdma_request,
req));
}
static int nvmf_post_rdma_recv(struct spdk_nvmf_request *req);
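/* Release all resources associated with a connection. Every resource is
* checked before it is freed, so this is safe to call on a partially
* constructed connection from the error paths in
* spdk_nvmf_rdma_conn_create() below.
*/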
static void
spdk_nvmf_rdma_conn_destroy(struct spdk_nvmf_rdma_conn *rdma_conn)
{
if (rdma_conn->cmds_mr) {
rdma_dereg_mr(rdma_conn->cmds_mr);
}
if (rdma_conn->cpls_mr) {
rdma_dereg_mr(rdma_conn->cpls_mr);
}
if (rdma_conn->bufs_mr) {
rdma_dereg_mr(rdma_conn->bufs_mr);
}
if (rdma_conn->cm_id) {
rdma_destroy_qp(rdma_conn->cm_id);
}
if (rdma_conn->cq) {
ibv_destroy_cq(rdma_conn->cq);
}
if (rdma_conn->comp_channel) {
ibv_destroy_comp_channel(rdma_conn->comp_channel);
}
if (rdma_conn->cm_id) {
rdma_destroy_id(rdma_conn->cm_id);
}
/* Free all memory */
rte_free(rdma_conn->cmds);
rte_free(rdma_conn->cpls);
rte_free(rdma_conn->bufs);
free(rdma_conn->reqs);
free(rdma_conn);
}
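/* Allocate and initialize all per-connection resources: the completion
* channel and queue, the queue pair, the command/completion/data arrays
* with their memory registrations, and one pre-posted RECV per queue slot
* so the host can send capsules as soon as the connection is accepted.
*/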
static struct spdk_nvmf_rdma_conn *
spdk_nvmf_rdma_conn_create(struct rdma_cm_id *id, uint16_t queue_depth, uint16_t rw_depth)
{
struct spdk_nvmf_rdma_conn *rdma_conn;
struct spdk_nvmf_conn *conn;
int rc, i;
struct ibv_qp_init_attr attr;
struct spdk_nvmf_rdma_request *rdma_req;
rdma_conn = calloc(1, sizeof(struct spdk_nvmf_rdma_conn));
if (rdma_conn == NULL) {
SPDK_ERRLOG("Could not allocate new connection.\n");
return NULL;
}
rdma_conn->queue_depth = queue_depth;
rdma_conn->rw_depth = rw_depth;
rdma_conn->ctx = id->verbs;
rdma_conn->cm_id = id;
rdma_conn->comp_channel = ibv_create_comp_channel(id->verbs);
if (!rdma_conn->comp_channel) {
SPDK_ERRLOG("create completion channel error!\n");
spdk_nvmf_rdma_conn_destroy(rdma_conn);
return NULL;
}
rc = fcntl(rdma_conn->comp_channel->fd, F_SETFL, O_NONBLOCK);
if (rc < 0) {
SPDK_ERRLOG("fcntl to set comp channel to non-blocking failed\n");
spdk_nvmf_rdma_conn_destroy(rdma_conn);
return NULL;
}
/*
* Size the CQ to handle completions for RECV, SEND, and either READ or WRITE.
*/
rdma_conn->cq = ibv_create_cq(id->verbs, (queue_depth * 3), rdma_conn, rdma_conn->comp_channel,
0);
if (!rdma_conn->cq) {
SPDK_ERRLOG("create cq error!\n");
spdk_nvmf_rdma_conn_destroy(rdma_conn);
return NULL;
}
memset(&attr, 0, sizeof(struct ibv_qp_init_attr));
attr.qp_type = IBV_QPT_RC;
attr.send_cq = rdma_conn->cq;
attr.recv_cq = rdma_conn->cq;
attr.cap.max_send_wr = rdma_conn->queue_depth * 2; /* SEND plus either READ or WRITE per request */
attr.cap.max_recv_wr = rdma_conn->queue_depth; /* RECV operations */
attr.cap.max_send_sge = NVMF_DEFAULT_TX_SGE;
attr.cap.max_recv_sge = NVMF_DEFAULT_RX_SGE;
rc = rdma_create_qp(rdma_conn->cm_id, NULL, &attr);
if (rc) {
SPDK_ERRLOG("rdma_create_qp failed\n");
spdk_nvmf_rdma_conn_destroy(rdma_conn);
return NULL;
}
rdma_conn->qp = rdma_conn->cm_id->qp;
conn = &rdma_conn->conn;
conn->transport = &spdk_nvmf_transport_rdma;
id->context = conn;
SPDK_TRACELOG(SPDK_TRACE_RDMA, "New RDMA Connection: %p\n", conn);
rdma_conn->reqs = calloc(rdma_conn->queue_depth, sizeof(*rdma_conn->reqs));
rdma_conn->cmds = rte_calloc("nvmf_rdma_cmd", rdma_conn->queue_depth,
sizeof(*rdma_conn->cmds), 0);
rdma_conn->cpls = rte_calloc("nvmf_rdma_cpl", rdma_conn->queue_depth,
sizeof(*rdma_conn->cpls), 0);
rdma_conn->bufs = rte_calloc("nvmf_rdma_buf", rdma_conn->queue_depth,
g_rdma.in_capsule_data_size, 0);
if (!rdma_conn->reqs || !rdma_conn->cmds || !rdma_conn->cpls || !rdma_conn->bufs) {
SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n");
spdk_nvmf_rdma_conn_destroy(rdma_conn);
return NULL;
}
rdma_conn->cmds_mr = rdma_reg_msgs(rdma_conn->cm_id, rdma_conn->cmds,
queue_depth * sizeof(*rdma_conn->cmds));
rdma_conn->cpls_mr = rdma_reg_msgs(rdma_conn->cm_id, rdma_conn->cpls,
queue_depth * sizeof(*rdma_conn->cpls));
rdma_conn->bufs_mr = rdma_reg_msgs(rdma_conn->cm_id, rdma_conn->bufs,
rdma_conn->queue_depth * g_rdma.in_capsule_data_size);
if (!rdma_conn->cmds_mr || !rdma_conn->cpls_mr || !rdma_conn->bufs_mr) {
SPDK_ERRLOG("Unable to register required memory for RDMA queue.\n");
spdk_nvmf_rdma_conn_destroy(rdma_conn);
return NULL;
}
for (i = 0; i < queue_depth; i++) {
rdma_req = &rdma_conn->reqs[i];
rdma_req->buf = (void *)((uintptr_t)rdma_conn->bufs + (i * g_rdma.in_capsule_data_size));
rdma_req->req.cmd = &rdma_conn->cmds[i];
rdma_req->req.rsp = &rdma_conn->cpls[i];
rdma_req->req.conn = &rdma_conn->conn;
if (nvmf_post_rdma_recv(&rdma_req->req)) {
SPDK_ERRLOG("Unable to post capsule for RDMA RECV\n");
spdk_nvmf_rdma_conn_destroy(rdma_conn);
return NULL;
}
}
return rdma_conn;
}
static void
nvmf_trace_ibv_sge(struct ibv_sge *sg_list)
{
if (sg_list) {
SPDK_TRACELOG(SPDK_TRACE_RDMA, "local addr %p length 0x%x lkey 0x%x\n",
(void *)sg_list->addr, sg_list->length, sg_list->lkey);
}
}
static void
nvmf_ibv_send_wr_init(struct ibv_send_wr *wr,
struct spdk_nvmf_request *req,
struct ibv_sge *sg_list,
enum ibv_wr_opcode opcode,
int send_flags)
{
struct spdk_nvmf_rdma_request *rdma_req = get_rdma_req(req);
RTE_VERIFY(wr != NULL);
RTE_VERIFY(sg_list != NULL);
memset(wr, 0, sizeof(*wr));
wr->wr_id = (uintptr_t)rdma_req;
wr->next = NULL;
wr->opcode = opcode;
wr->send_flags = send_flags;
wr->sg_list = sg_list;
wr->num_sge = 1;
}
static void
nvmf_ibv_send_wr_set_rkey(struct ibv_send_wr *wr, struct spdk_nvmf_request *req)
{
struct spdk_nvme_sgl_descriptor *sgl = &req->cmd->nvme_cmd.dptr.sgl1;
RTE_VERIFY(sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK);
wr->wr.rdma.rkey = sgl->keyed.key;
wr->wr.rdma.remote_addr = sgl->address;
SPDK_TRACELOG(SPDK_TRACE_RDMA, "rkey %x remote_addr %p\n",
wr->wr.rdma.rkey, (void *)wr->wr.rdma.remote_addr);
}
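/* Post an RDMA READ to pull host memory, described by the command's keyed
* SGL, into the request's local data buffer. The work request is signaled,
* so command execution resumes from the IBV_WC_RDMA_READ completion in
* spdk_nvmf_rdma_poll().
*/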
static int
nvmf_post_rdma_read(struct spdk_nvmf_request *req)
{
struct ibv_send_wr wr, *bad_wr = NULL;
struct spdk_nvmf_conn *conn = req->conn;
struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);
struct spdk_nvmf_rdma_request *rdma_req = get_rdma_req(req);
struct ibv_sge sge;
int rc;
SPDK_TRACELOG(SPDK_TRACE_RDMA, "RDMA READ POSTED. Request: %p Connection: %p\n", req, conn);
sge.addr = (uintptr_t)rdma_req->buf;
sge.lkey = rdma_conn->bufs_mr->lkey;
sge.length = req->length;
nvmf_trace_ibv_sge(&sge);
nvmf_ibv_send_wr_init(&wr, req, &sge, IBV_WR_RDMA_READ, IBV_SEND_SIGNALED);
nvmf_ibv_send_wr_set_rkey(&wr, req);
spdk_trace_record(TRACE_RDMA_READ_START, 0, 0, (uintptr_t)req, 0);
rc = ibv_post_send(rdma_conn->qp, &wr, &bad_wr);
if (rc) {
SPDK_ERRLOG("Failure posting rdma read send, rc = 0x%x\n", rc);
}
return rc;
}
static int
nvmf_post_rdma_write(struct spdk_nvmf_request *req)
{
struct ibv_send_wr wr, *bad_wr = NULL;
struct spdk_nvmf_conn *conn = req->conn;
struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);
struct spdk_nvmf_rdma_request *rdma_req = get_rdma_req(req);
struct ibv_sge sge;
int rc;
SPDK_TRACELOG(SPDK_TRACE_RDMA, "RDMA WRITE POSTED. Request: %p Connection: %p\n", req, conn);
sge.addr = (uintptr_t)rdma_req->buf;
sge.lkey = rdma_conn->bufs_mr->lkey;
sge.length = req->length;
nvmf_trace_ibv_sge(&sge);
nvmf_ibv_send_wr_init(&wr, req, &sge, IBV_WR_RDMA_WRITE, 0);
nvmf_ibv_send_wr_set_rkey(&wr, req);
spdk_trace_record(TRACE_RDMA_WRITE_START, 0, 0, (uintptr_t)req, 0);
rc = ibv_post_send(rdma_conn->qp, &wr, &bad_wr);
if (rc) {
SPDK_ERRLOG("Failure posting rdma write send, rc = 0x%x\n", rc);
}
return rc;
}
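/* Post a RECV with two SGEs, matching NVMF_DEFAULT_RX_SGE: one for the
* 64 byte command capsule and one for the per-request in capsule data
* buffer. An incoming send fills the command slot first and spills any
* in capsule data into the second SGE.
*/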
static int
nvmf_post_rdma_recv(struct spdk_nvmf_request *req)
{
struct ibv_recv_wr wr, *bad_wr = NULL;
struct spdk_nvmf_conn *conn = req->conn;
struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);
struct spdk_nvmf_rdma_request *rdma_req = get_rdma_req(req);
struct ibv_sge sg_list[2];
int rc;
SPDK_TRACELOG(SPDK_TRACE_RDMA, "RDMA RECV POSTED. Request: %p Connection: %p\n", req, conn);
sg_list[0].addr = (uintptr_t)req->cmd;
sg_list[0].length = sizeof(*req->cmd);
sg_list[0].lkey = rdma_conn->cmds_mr->lkey;
nvmf_trace_ibv_sge(&sg_list[0]);
sg_list[1].addr = (uintptr_t)rdma_req->buf;
sg_list[1].length = g_rdma.in_capsule_data_size;
sg_list[1].lkey = rdma_conn->bufs_mr->lkey;
nvmf_trace_ibv_sge(&sg_list[1]);
memset(&wr, 0, sizeof(wr));
wr.wr_id = (uintptr_t)rdma_req;
wr.next = NULL;
wr.sg_list = sg_list;
wr.num_sge = 2;
rc = ibv_post_recv(rdma_conn->qp, &wr, &bad_wr);
if (rc) {
SPDK_ERRLOG("Failure posting rdma recv, rc = 0x%x\n", rc);
}
return rc;
}
static int
nvmf_post_rdma_send(struct spdk_nvmf_request *req)
{
struct ibv_send_wr wr, *bad_wr = NULL;
struct spdk_nvmf_conn *conn = req->conn;
struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);
struct ibv_sge sge;
int rc;
SPDK_TRACELOG(SPDK_TRACE_RDMA, "RDMA SEND POSTED. Request: %p Connection: %p\n", req, conn);
sge.addr = (uintptr_t)req->rsp;
sge.length = sizeof(*req->rsp);
sge.lkey = rdma_conn->cpls_mr->lkey;
nvmf_trace_ibv_sge(&sge);
nvmf_ibv_send_wr_init(&wr, req, &sge, IBV_WR_SEND, IBV_SEND_SIGNALED);
spdk_trace_record(TRACE_NVMF_IO_COMPLETE, 0, 0, (uintptr_t)req, 0);
rc = ibv_post_send(rdma_conn->qp, &wr, &bad_wr);
if (rc) {
SPDK_ERRLOG("Failure posting rdma send for NVMf completion, rc = 0x%x\n", rc);
}
return rc;
}
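/* Complete a request: for a successful controller-to-host transfer, first
* post an RDMA WRITE pushing the data back to the host, then post a SEND
* carrying the 16 byte completion capsule. The WRITE is unsignaled and the
* SEND is signaled, so on this reliable connection a single completion
* covers both operations in order.
*/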
static int
spdk_nvmf_rdma_request_complete(struct spdk_nvmf_request *req)
{
struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
int ret;
/* Was the command successful? */
if (rsp->status.sc == SPDK_NVME_SC_SUCCESS &&
req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
/* Need to transfer data via RDMA Write */
ret = nvmf_post_rdma_write(req);
if (ret) {
SPDK_ERRLOG("Unable to post rdma write tx descriptor\n");
return -1;
}
}
ret = nvmf_post_rdma_send(req);
if (ret) {
SPDK_ERRLOG("Unable to send response capsule\n");
return -1;
}
return 0;
}
static int
spdk_nvmf_rdma_request_release(struct spdk_nvmf_request *req)
{
struct spdk_nvmf_conn *conn = req->conn;
struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);
if (nvmf_post_rdma_recv(req)) {
SPDK_ERRLOG("Unable to re-post rx descriptor\n");
return -1;
}
conn->sq_head++;
if (conn->sq_head == rdma_conn->queue_depth) {
conn->sq_head = 0;
}
return 0;
}
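/* Handle an RDMA_CM_EVENT_CONNECT_REQUEST. The negotiated depths are the
* minimum across the target's configured maximum, the local device limits
* (max_qp_wr for sends/receives, max_qp_rd_atom for READ/WRITE), and the
* values advertised by the host. As an illustrative example: a target
* maximum of 128, a local max_qp_rd_atom of 16, and a host initiator_depth
* of 32 yield rw_depth = min(128, 16, 32) = 16, and the temporary
* workaround below then clamps queue_depth to 16 as well.
*/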
static int
nvmf_rdma_connect(struct rdma_cm_event *event)
{
struct spdk_nvmf_rdma_conn *rdma_conn = NULL;
struct ibv_device_attr ibdev_attr;
struct rdma_conn_param *rdma_param = NULL;
struct rdma_conn_param ctrlr_event_data;
const struct spdk_nvmf_rdma_request_private_data *private_data = NULL;
struct spdk_nvmf_rdma_accept_private_data accept_data;
uint16_t sts = 0;
uint16_t queue_depth;
uint16_t rw_depth;
int rc;
/* Check to make sure we know about this rdma device */
if (event->id == NULL) {
SPDK_ERRLOG("connect request: missing cm_id\n");
goto err0;
}
if (event->id->verbs == NULL) {
SPDK_ERRLOG("connect request: missing cm_id ibv_context\n");
goto err0;
}
SPDK_TRACELOG(SPDK_TRACE_RDMA, "Connect Recv on fabric intf name %s, dev_name %s\n",
event->id->verbs->device->name, event->id->verbs->device->dev_name);
/* Figure out the supported queue depth. This is a multi-step process
* that takes into account hardware maximums, host provided values,
* and our target's internal memory limits */
SPDK_TRACELOG(SPDK_TRACE_RDMA, "Calculating Queue Depth\n");
/* Start with the maximum queue depth allowed by the target */
queue_depth = g_rdma.max_queue_depth;
rw_depth = g_rdma.max_queue_depth;
SPDK_TRACELOG(SPDK_TRACE_RDMA, "Target Max Queue Depth: %d\n", g_rdma.max_queue_depth);
/* Next check the local NIC's hardware limitations */
rc = ibv_query_device(event->id->verbs, &ibdev_attr);
if (rc) {
SPDK_ERRLOG("Failed to query RDMA device attributes\n");
sts = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
goto err1;
}
SPDK_TRACELOG(SPDK_TRACE_RDMA,
"Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n",
ibdev_attr.max_qp_wr, ibdev_attr.max_qp_rd_atom);
queue_depth = nvmf_min(queue_depth, ibdev_attr.max_qp_wr);
rw_depth = nvmf_min(rw_depth, ibdev_attr.max_qp_rd_atom);
/* Next check the remote NIC's hardware limitations */
rdma_param = &event->param.conn;
SPDK_TRACELOG(SPDK_TRACE_RDMA,
"Host NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n",
rdma_param->initiator_depth, rdma_param->responder_resources);
rw_depth = nvmf_min(rw_depth, rdma_param->initiator_depth);
/* Finally check for the host software requested values, which are
* optional. */
if (rdma_param->private_data != NULL &&
rdma_param->private_data_len >= sizeof(struct spdk_nvmf_rdma_request_private_data)) {
private_data = rdma_param->private_data;
SPDK_TRACELOG(SPDK_TRACE_RDMA, "Host Receive Queue Size: %d\n", private_data->hrqsize);
SPDK_TRACELOG(SPDK_TRACE_RDMA, "Host Send Queue Size: %d\n", private_data->hsqsize);
queue_depth = nvmf_min(queue_depth, private_data->hrqsize);
queue_depth = nvmf_min(queue_depth, private_data->hsqsize);
}
SPDK_TRACELOG(SPDK_TRACE_RDMA, "Final Negotiated Queue Depth: %d R/W Depth: %d\n",
queue_depth, rw_depth);
/* TEMPORARY: Limit the queue_depth to the rw_depth due to lack of queueing */
queue_depth = rw_depth;
/* Init the NVMf rdma transport connection */
rdma_conn = spdk_nvmf_rdma_conn_create(event->id, queue_depth, rw_depth);
if (rdma_conn == NULL) {
SPDK_ERRLOG("Error on nvmf connection creation\n");
goto err1;
}
/* Add this RDMA connection to the global list until a CONNECT capsule
* is received. */
TAILQ_INSERT_TAIL(&g_pending_conns, rdma_conn, link);
accept_data.recfmt = 0;
accept_data.crqsize = rdma_conn->queue_depth;
ctrlr_event_data = *rdma_param;
ctrlr_event_data.private_data = &accept_data;
ctrlr_event_data.private_data_len = sizeof(accept_data);
if (event->id->ps == RDMA_PS_TCP) {
ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */
ctrlr_event_data.initiator_depth = rdma_conn->queue_depth;
}
rc = rdma_accept(event->id, &ctrlr_event_data);
if (rc) {
SPDK_ERRLOG("Error on rdma_accept\n");
goto err1;
}
SPDK_TRACELOG(SPDK_TRACE_RDMA, "Sent back the accept\n");
return 0;
err1: {
struct spdk_nvmf_rdma_reject_private_data rej_data;
memset(&rej_data, 0, sizeof(rej_data));
rej_data.status.sc = sts;
rdma_reject(event->id, &rej_data, sizeof(rej_data));
if (rdma_conn) {
TAILQ_REMOVE(&g_pending_conns, rdma_conn, link);
}
free(rdma_conn);
}
err0:
return -1;
}
static void
spdk_nvmf_handle_disconnect(spdk_event_t event)
{
struct nvmf_session *session = spdk_event_get_arg1(event);
struct spdk_nvmf_conn *conn = spdk_event_get_arg2(event);
nvmf_disconnect(session, conn);
}
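/* Handle a disconnect event. The event must be acknowledged before the
* cm_id can be destroyed. Connections without an established session are
* torn down immediately; established connections are handed off to the
* lcore that owns the session's subsystem.
*/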
static int
nvmf_rdma_disconnect(struct rdma_cm_event *evt)
{
struct spdk_nvmf_conn *conn;
struct nvmf_session *session;
struct spdk_nvmf_rdma_conn *rdma_conn;
spdk_event_t event;
if (evt->id == NULL) {
SPDK_ERRLOG("disconnect request: missing cm_id\n");
return -1;
}
conn = evt->id->context;
if (conn == NULL) {
SPDK_ERRLOG("disconnect request: no active connection\n");
return -1;
}
/* ack the disconnect event before rdma_destroy_id */
rdma_ack_cm_event(evt);
rdma_conn = get_rdma_conn(conn);
session = conn->sess;
if (session == NULL) {
/* No session has been established yet. That means the conn
* must be in the pending connections list. Remove it. */
TAILQ_REMOVE(&g_pending_conns, rdma_conn, link);
spdk_nvmf_rdma_conn_destroy(rdma_conn);
return 0;
}
/* Pass an event to the core that owns this connection */
event = spdk_event_allocate(session->subsys->poller.lcore,
spdk_nvmf_handle_disconnect,
session, conn, NULL);
spdk_event_call(event);
return 0;
}
#ifdef DEBUG
static const char *CM_EVENT_STR[] = {
"RDMA_CM_EVENT_ADDR_RESOLVED",
"RDMA_CM_EVENT_ADDR_ERROR",
"RDMA_CM_EVENT_ROUTE_RESOLVED",
"RDMA_CM_EVENT_ROUTE_ERROR",
"RDMA_CM_EVENT_CONNECT_REQUEST",
"RDMA_CM_EVENT_CONNECT_RESPONSE",
"RDMA_CM_EVENT_CONNECT_ERROR",
"RDMA_CM_EVENT_UNREACHABLE",
"RDMA_CM_EVENT_REJECTED",
"RDMA_CM_EVENT_ESTABLISHED",
"RDMA_CM_EVENT_DISCONNECTED",
"RDMA_CM_EVENT_DEVICE_REMOVAL",
"RDMA_CM_EVENT_MULTICAST_JOIN",
"RDMA_CM_EVENT_MULTICAST_ERROR",
"RDMA_CM_EVENT_ADDR_CHANGE",
"RDMA_CM_EVENT_TIMEWAIT_EXIT"
};
#endif /* DEBUG */
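/* Examine the command's SGL descriptor and set up req->data and
* req->length. Keyed data blocks describing host memory reuse the
* request's in capsule data buffer as the local staging area when they
* fit; offset data blocks point directly into the in capsule data that
* arrived with the command. Returns 0 if the data is ready (or there is
* none), 1 if an RDMA READ must complete first, and -1 on error with the
* response status already set.
*/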
static int
spdk_nvmf_request_prep_data(struct spdk_nvmf_request *req)
{
struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
struct spdk_nvmf_rdma_request *rdma_req = get_rdma_req(req);
enum spdk_nvme_data_transfer xfer;
req->length = 0;
req->xfer = SPDK_NVME_DATA_NONE;
req->data = NULL;
if (cmd->opc == SPDK_NVME_OPC_FABRIC) {
xfer = spdk_nvme_opc_get_data_transfer(req->cmd->nvmf_cmd.fctype);
} else {
xfer = spdk_nvme_opc_get_data_transfer(cmd->opc);
}
if (xfer != SPDK_NVME_DATA_NONE) {
struct spdk_nvme_sgl_descriptor *sgl = (struct spdk_nvme_sgl_descriptor *)&cmd->dptr.sgl1;
if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK &&
(sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS ||
sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) {
if (sgl->keyed.length > g_rdma.max_io_size) {
SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n",
sgl->keyed.length, g_rdma.max_io_size);
rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
return -1;
}
if (sgl->keyed.length > g_rdma.in_capsule_data_size) {
/* TODO: Get a large buffer from the central pool. */
SPDK_ERRLOG("SGL length 0x%x exceeds in capsule data buffer size 0x%x\n",
sgl->keyed.length, g_rdma.in_capsule_data_size);
rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
return -1;
} else {
/* Use the in capsule data buffer, even though this isn't in capsule data */
req->data = rdma_req->buf;
req->length = sgl->keyed.length;
}
} else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK &&
sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) {
uint64_t offset = sgl->address;
uint32_t max_len = g_rdma.in_capsule_data_size;
SPDK_TRACELOG(SPDK_TRACE_NVMF, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n",
offset, sgl->unkeyed.length);
if (offset > max_len) {
SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n",
offset, max_len);
rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET;
return -1;
}
max_len -= (uint32_t)offset;
if (sgl->unkeyed.length > max_len) {
SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n",
sgl->unkeyed.length, max_len);
rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
return -1;
}
req->data = rdma_req->buf + offset;
req->length = sgl->unkeyed.length;
} else {
SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n",
sgl->generic.type, sgl->generic.subtype);
rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID;
return -1;
}
if (req->length == 0) {
xfer = SPDK_NVME_DATA_NONE;
req->data = NULL;
}
req->xfer = xfer;
/*
* For any I/O that requires data to be
* pulled into the local buffer before processing by
* the backend NVMe device
*/
if (xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK) {
SPDK_TRACELOG(SPDK_TRACE_NVMF, "Initiating Host to Controller data transfer\n");
/* Wait for transfer to complete before executing command. */
return 1;
}
}
}
if (xfer == SPDK_NVME_DATA_NONE) {
SPDK_TRACELOG(SPDK_TRACE_NVMF, "No data to transfer\n");
assert(req->data == NULL);
assert(req->length == 0);
} else {
assert(req->data != NULL);
assert(req->length != 0);
SPDK_TRACELOG(SPDK_TRACE_NVMF, "%s data ready\n",
xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER ? "Host to Controller" :
"Controller to Host");
}
return 0;
}
static int spdk_nvmf_rdma_poll(struct spdk_nvmf_conn *conn);
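/* Periodic acceptor callback, driven by the rte_timer armed in
* spdk_nvmf_rdma_acceptor_start(). It first polls connections that have
* not yet received a CONNECT capsule, then drains the non-blocking rdmacm
* event channel of new connect and disconnect events.
*/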
static void
nvmf_rdma_accept(struct rte_timer *timer, void *arg)
{
struct rdma_cm_event *event;
int rc;
struct spdk_nvmf_rdma_conn *rdma_conn, *tmp;
if (g_rdma.acceptor_event_channel == NULL) {
return;
}
/* Process pending connections for incoming capsules. The only capsule
* this should ever find is a CONNECT request. */
TAILQ_FOREACH_SAFE(rdma_conn, &g_pending_conns, link, tmp) {
rc = spdk_nvmf_rdma_poll(&rdma_conn->conn);
if (rc < 0) {
TAILQ_REMOVE(&g_pending_conns, rdma_conn, link);
spdk_nvmf_rdma_conn_destroy(rdma_conn);
} else if (rc > 0) {
/* At least one request was processed which is assumed to be
* a CONNECT. Remove this connection from our list. */
TAILQ_REMOVE(&g_pending_conns, rdma_conn, link);
}
}
while (1) {
rc = rdma_get_cm_event(g_rdma.acceptor_event_channel, &event);
if (rc == 0) {
SPDK_TRACELOG(SPDK_TRACE_RDMA, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]);
switch (event->event) {
case RDMA_CM_EVENT_CONNECT_REQUEST:
rc = nvmf_rdma_connect(event);
if (rc < 0) {
SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc);
break;
}
break;
case RDMA_CM_EVENT_ESTABLISHED:
break;
case RDMA_CM_EVENT_ADDR_CHANGE:
case RDMA_CM_EVENT_DISCONNECTED:
case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_TIMEWAIT_EXIT:
rc = nvmf_rdma_disconnect(event);
if (rc < 0) {
SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc);
break;
}
continue;
default:
SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event);
break;
}
rdma_ack_cm_event(event);
} else {
if (errno != EAGAIN && errno != EWOULDBLOCK) {
SPDK_ERRLOG("Acceptor Event Error: %s\n", strerror(errno));
}
break;
}
}
}
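/* Create the rdmacm event channel and listening id, bind to the configured
* port, and arm the periodic acceptor timer on the current lcore.
*/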
static int
spdk_nvmf_rdma_acceptor_start(void)
{
struct sockaddr_in addr;
uint16_t sin_port;
int rc;
memset(&addr, 0, sizeof(addr));
addr.sin_family = AF_INET;
addr.sin_port = g_nvmf_tgt.sin_port;
/* create an event channel with rdmacm to receive
connection oriented requests and notifications */
g_rdma.acceptor_event_channel = rdma_create_event_channel();
if (g_rdma.acceptor_event_channel == NULL) {
SPDK_ERRLOG("rdma_create_event_channel() failed\n");
return -1;
}
rc = fcntl(g_rdma.acceptor_event_channel->fd, F_SETFL, O_NONBLOCK);
if (rc < 0) {
SPDK_ERRLOG("fcntl to set fd to non-blocking failed\n");
goto create_id_error;
}
rc = rdma_create_id(g_rdma.acceptor_event_channel, &g_rdma.acceptor_listen_id, NULL, RDMA_PS_TCP);
if (rc < 0) {
SPDK_ERRLOG("rdma_create_id() failed\n");
goto create_id_error;
}
rc = rdma_bind_addr(g_rdma.acceptor_listen_id, (struct sockaddr *)&addr);
if (rc < 0) {
SPDK_ERRLOG("rdma_bind_addr() failed\n");
goto listen_error;
}
rc = rdma_listen(g_rdma.acceptor_listen_id, 10); /* 10 = backlog */
if (rc < 0) {
SPDK_ERRLOG("rdma_listen() failed\n");
goto listen_error;
}
sin_port = ntohs(rdma_get_src_port(g_rdma.acceptor_listen_id));
SPDK_NOTICELOG("*** NVMf Target Listening on port %d ***\n", sin_port);
rte_timer_init(&g_rdma.acceptor_timer);
rte_timer_reset(&g_rdma.acceptor_timer, ACCEPT_TIMEOUT, PERIODICAL,
rte_lcore_id(), nvmf_rdma_accept, NULL);
return (rc);
listen_error:
rdma_destroy_id(g_rdma.acceptor_listen_id);
create_id_error:
rdma_destroy_event_channel(g_rdma.acceptor_event_channel);
return -1;
}
static void
spdk_nvmf_rdma_acceptor_stop(void)
{
SPDK_TRACELOG(SPDK_TRACE_RDMA, "nvmf_acceptor_stop: shutdown\n");
rte_timer_stop_sync(&g_rdma.acceptor_timer);
}
/*
Initialize with RDMA transport. Query OFED for device list.
*/
static int
spdk_nvmf_rdma_init(uint16_t max_queue_depth, uint32_t max_io_size,
uint32_t in_capsule_data_size)
{
struct ibv_device **dev_list;
struct ibv_context *ibdev_ctx = NULL;
struct ibv_device_attr ibdev_attr;
int num_of_rdma_devices;
int num_devices_found = 0;
int i, ret;
SPDK_NOTICELOG("*** RDMA Transport Init ***\n");
dev_list = ibv_get_device_list(&num_of_rdma_devices);
if (!dev_list) {
SPDK_ERRLOG(" No RDMA verbs devices found\n");
return -1;
}
SPDK_TRACELOG(SPDK_TRACE_RDMA, "%d RDMA verbs device(s) discovered\n", num_of_rdma_devices);
/* Look through the list of devices for one we support */
for (i = 0; i < num_of_rdma_devices; i++) {
SPDK_TRACELOG(SPDK_TRACE_RDMA, " RDMA Device %d:\n", i);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Node type: %d\n", (int)dev_list[i]->node_type);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Transport type: %d\n", (int)dev_list[i]->transport_type);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Name: %s\n", dev_list[i]->name);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Device Name: %s\n", dev_list[i]->dev_name);
ibdev_ctx = ibv_open_device(dev_list[i]);
if (!ibdev_ctx) {
SPDK_ERRLOG(" No rdma context returned for device %d\n", i);
continue;
}
ret = ibv_query_device(ibdev_ctx, &ibdev_attr);
if (ret) {
SPDK_ERRLOG(" Failed on query for device %d\n", i);
ibv_close_device(ibdev_ctx);
continue;
}
/* display device specific attributes */
SPDK_TRACELOG(SPDK_TRACE_RDMA, " RDMA Device Attributes:\n");
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Max MR Size: 0x%llx\n", (long long int)ibdev_attr.max_mr_size);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Page Size Cap: 0x%llx\n",
(long long int)ibdev_attr.page_size_cap);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Max QPs: 0x%x\n", (int)ibdev_attr.max_qp);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Max QP WRs: 0x%x\n", (int)ibdev_attr.max_qp_wr);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Max SGE: 0x%x\n", (int)ibdev_attr.max_sge);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Max CQs: 0x%x\n", (int)ibdev_attr.max_cq);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Max CQE per CQ: 0x%x\n", (int)ibdev_attr.max_cqe);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Max MR: 0x%x\n", (int)ibdev_attr.max_mr);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Max PD: 0x%x\n", (int)ibdev_attr.max_pd);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Max QP RD Atom: 0x%x\n", (int)ibdev_attr.max_qp_rd_atom);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Max QP Init RD Atom: 0x%x\n",
(int)ibdev_attr.max_qp_init_rd_atom);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Max Res RD Atom: 0x%x\n", (int)ibdev_attr.max_res_rd_atom);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Max EE: 0x%x\n", (int)ibdev_attr.max_ee);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Max SRQ: 0x%x\n", (int)ibdev_attr.max_srq);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Max SRQ WR: 0x%x\n", (int)ibdev_attr.max_srq_wr);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Max SRQ SGE: 0x%x\n", (int)ibdev_attr.max_srq_sge);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Max PKeys: 0x%x\n", (int)ibdev_attr.max_pkeys);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " Phys Port Cnt: %d\n", (int)ibdev_attr.phys_port_cnt);
num_devices_found++;
}
ibv_free_device_list(dev_list);
SPDK_TRACELOG(SPDK_TRACE_RDMA, " %d Fabric Intf(s) active\n", num_devices_found);
g_rdma.max_queue_depth = max_queue_depth;
g_rdma.max_io_size = max_io_size;
g_rdma.in_capsule_data_size = in_capsule_data_size;
return num_devices_found;
}
static int
spdk_nvmf_rdma_fini(void)
{
/* Nothing to do */
return 0;
}
static void
spdk_nvmf_rdma_close_conn(struct spdk_nvmf_conn *conn)
{
struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);
spdk_nvmf_rdma_conn_destroy(rdma_conn);
}
/* Returns the number of times that spdk_nvmf_request_exec was called,
* or -1 on error.
*/
static int
spdk_nvmf_rdma_poll(struct spdk_nvmf_conn *conn)
{
struct ibv_wc wc;
struct spdk_nvmf_rdma_conn *rdma_conn = get_rdma_conn(conn);
struct spdk_nvmf_rdma_request *rdma_req;
struct spdk_nvmf_request *req;
int rc, count;
count = 0;
while (true) {
rc = ibv_poll_cq(rdma_conn->cq, 1, &wc);
if (rc == 0) {
/* No completions at this time */
break;
}
if (rc < 0) {
SPDK_ERRLOG("Poll CQ error!(%d): %s\n",
errno, strerror(errno));
return -1;
}
/* OK, process the single successful cq event */
if (wc.status) {
SPDK_TRACELOG(SPDK_TRACE_RDMA, "CQ completion error status %d (%s), exiting handler\n",
wc.status, ibv_wc_status_str(wc.status));
return -1;
}
rdma_req = (struct spdk_nvmf_rdma_request *)wc.wr_id;
if (rdma_req == NULL) {
SPDK_ERRLOG("Got CQ completion for NULL rdma_req\n");
return -1;
}
req = &rdma_req->req;
switch (wc.opcode) {
case IBV_WC_SEND:
assert(rdma_conn->num_outstanding_reqs > 0);
rdma_conn->num_outstanding_reqs--;
SPDK_TRACELOG(SPDK_TRACE_RDMA,
"RDMA SEND Complete. Request: %p Connection: %p Outstanding I/O: %d\n",
req, conn, rdma_conn->num_outstanding_reqs);
if (spdk_nvmf_rdma_request_release(req)) {
return -1;
}
break;
case IBV_WC_RDMA_WRITE:
/*
* Will get this event only if we set IBV_SEND_SIGNALED
* flag in rdma_write, to trace rdma write latency
*/
SPDK_TRACELOG(SPDK_TRACE_RDMA, "RDMA WRITE Complete. Request: %p Connection: %p\n",
req, conn);
spdk_trace_record(TRACE_RDMA_WRITE_COMPLETE, 0, 0, (uint64_t)req, 0);
break;
case IBV_WC_RDMA_READ:
SPDK_TRACELOG(SPDK_TRACE_RDMA, "RDMA READ Complete. Request: %p Connection: %p\n",
req, conn);
spdk_trace_record(TRACE_RDMA_READ_COMPLETE, 0, 0, (uint64_t)req, 0);
rc = spdk_nvmf_request_exec(req);
if (rc) {
SPDK_ERRLOG("request_exec error %d after RDMA Read completion\n", rc);
return -1;
}
count++;
break;
case IBV_WC_RECV:
if (wc.byte_len < sizeof(struct spdk_nvmf_capsule_cmd)) {
SPDK_ERRLOG("recv length %u less than capsule header\n", wc.byte_len);
return -1;
}
rdma_conn->num_outstanding_reqs++;
SPDK_TRACELOG(SPDK_TRACE_RDMA,
"RDMA RECV Complete. Request: %p Connection: %p Outstanding I/O: %d\n",
req, conn, rdma_conn->num_outstanding_reqs);
spdk_trace_record(TRACE_NVMF_IO_START, 0, 0, (uint64_t)req, 0);
memset(req->rsp, 0, sizeof(*req->rsp));
rc = spdk_nvmf_request_prep_data(req);
if (rc < 0) {
SPDK_ERRLOG("prep_data failed\n");
return spdk_nvmf_request_complete(req);
} else if (rc == 0) {
/* Data is immediately available */
rc = spdk_nvmf_request_exec(req);
if (rc < 0) {
SPDK_ERRLOG("Command execution failed\n");
return -1;
}
count++;
} else {
/* Start transfer of data from host to target */
rc = nvmf_post_rdma_read(req);
if (rc) {
SPDK_ERRLOG("Unable to transfer data from host to target\n");
return -1;
}
}
break;
default:
SPDK_ERRLOG("Poll cq opcode type unknown!!!!! completion\n");
return -1;
}
}
return count;
}
static void
nvmf_rdma_discover(struct spdk_nvmf_listen_addr *listen_addr,
struct spdk_nvmf_discovery_log_page_entry *entry)
{
entry->trtype = SPDK_NVMF_TRTYPE_RDMA;
entry->adrfam = SPDK_NVMF_ADRFAM_IPV4;
entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_SPECIFIED;
snprintf(entry->trsvcid, sizeof(entry->trsvcid), "%s", listen_addr->trsvc);
snprintf(entry->traddr, sizeof(entry->traddr), "%s", listen_addr->traddr);
entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED;
entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE;
entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM;
}
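/* Function table wiring this transport into the generic NVMf target layer. */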
const struct spdk_nvmf_transport spdk_nvmf_transport_rdma = {
.name = "rdma",
.transport_init = spdk_nvmf_rdma_init,
.transport_fini = spdk_nvmf_rdma_fini,
.transport_start = spdk_nvmf_rdma_acceptor_start,
.transport_stop = spdk_nvmf_rdma_acceptor_stop,
.req_complete = spdk_nvmf_rdma_request_complete,
.req_release = spdk_nvmf_rdma_request_release,
.conn_fini = spdk_nvmf_rdma_close_conn,
.conn_poll = spdk_nvmf_rdma_poll,
.listen_addr_discover = nvmf_rdma_discover,
};
SPDK_LOG_REGISTER_TRACE_FLAG("rdma", SPDK_TRACE_RDMA)