nvmf: SGL support for the NVMe-oF RDMA driver

Add an io_unit_size target option (IOUnitSize in the [Nvmf] config section),
build each request's I/O vector from up to SPDK_NVMF_MAX_SGL_ENTRIES data-pool
buffers instead of a single max_io_size buffer, and switch the bdev read and
write paths to spdk_bdev_readv_blocks()/spdk_bdev_writev_blocks() so that
vector is used directly.

Change-Id: I447754c69de432b5a65dc8c1d9ae690926e88c51
Signed-off-by: John Meneghini <johnm@netapp.com>
Signed-off-by: Srikanth kaligotla <kalis@netapp.com>
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-on: https://review.gerrithub.io/410302
Tested-by: SPDK Automated Test System <sys_sgsw@intel.com>
Reviewed-by: Daniel Verkamp <daniel.verkamp@intel.com>
Authored by Srikanth kaligotla on 2018-05-02 10:01:41 -04:00; committed by Jim Harris
parent a94accabff
commit 8580daa1ac
8 changed files with 139 additions and 24 deletions

View File

@@ -67,6 +67,7 @@ struct spdk_nvmf_tgt_opts {
uint32_t in_capsule_data_size;
uint32_t max_io_size;
uint32_t max_subsystems;
uint32_t io_unit_size;
};
/**
* Initialize the default value of opts.
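
As a usage sketch (illustrative only, not part of this change; the helper name and the 8192-byte value are made up), an application embedding the target could opt into a smaller I/O unit like this, subject to the validation added to spdk_nvmf_tgt_create() further down:

#include "spdk/nvmf.h"

/* Hypothetical helper for illustration. */
struct spdk_nvmf_tgt *
create_tgt_with_8k_units(void)
{
	struct spdk_nvmf_tgt_opts opts;

	spdk_nvmf_tgt_opts_init(&opts);
	opts.io_unit_size = 8192;	/* example value; must evenly divide opts.max_io_size */

	return spdk_nvmf_tgt_create(&opts);	/* NULL if the size check below fails */
}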

View File

@@ -72,6 +72,7 @@ spdk_nvmf_read_config_file_params(struct spdk_conf_section *sp,
int max_queues_per_sess;
int in_capsule_data_size;
int max_io_size;
int io_unit_size;
int acceptor_poll_rate;
max_queue_depth = spdk_conf_section_get_intval(sp, "MaxQueueDepth");
@@ -94,6 +95,11 @@ spdk_nvmf_read_config_file_params(struct spdk_conf_section *sp,
opts->max_io_size = max_io_size;
}
io_unit_size = spdk_conf_section_get_intval(sp, "IOUnitSize");
if (io_unit_size >= 0) {
opts->io_unit_size = io_unit_size;
}
acceptor_poll_rate = spdk_conf_section_get_intval(sp, "AcceptorPollRate");
if (acceptor_poll_rate >= 0) {
g_spdk_nvmf_tgt_conf.acceptor_poll_rate = acceptor_poll_rate;

View File

@@ -180,8 +180,8 @@ nvmf_bdev_ctrlr_read_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
}
spdk_trace_record(TRACE_NVMF_LIB_READ_START, 0, 0, (uint64_t)req, 0);
if (spdk_unlikely(spdk_bdev_read_blocks(desc, ch, req->data, start_lba, num_blocks,
nvmf_bdev_ctrlr_complete_cmd, req))) {
if (spdk_unlikely(spdk_bdev_readv_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks,
nvmf_bdev_ctrlr_complete_cmd, req))) {
rsp->status.sct = SPDK_NVME_SCT_GENERIC;
rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
@@ -219,7 +219,7 @@ nvmf_bdev_ctrlr_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
}
spdk_trace_record(TRACE_NVMF_LIB_WRITE_START, 0, 0, (uint64_t)req, 0);
if (spdk_unlikely(spdk_bdev_write_blocks(desc, ch, req->data, start_lba, num_blocks,
if (spdk_unlikely(spdk_bdev_writev_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks,
nvmf_bdev_ctrlr_complete_cmd, req))) {
rsp->status.sct = SPDK_NVME_SCT_GENERIC;
rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;

View File

@@ -51,6 +51,7 @@ SPDK_LOG_REGISTER_COMPONENT("nvmf", SPDK_LOG_NVMF)
#define SPDK_NVMF_DEFAULT_IN_CAPSULE_DATA_SIZE 4096
#define SPDK_NVMF_DEFAULT_MAX_IO_SIZE 131072
#define SPDK_NVMF_DEFAULT_MAX_SUBSYSTEMS 1024
#define SPDK_NVMF_DEFAULT_IO_UNIT_SIZE 131072
void
spdk_nvmf_tgt_opts_init(struct spdk_nvmf_tgt_opts *opts)
@@ -60,6 +61,7 @@ spdk_nvmf_tgt_opts_init(struct spdk_nvmf_tgt_opts *opts)
opts->in_capsule_data_size = SPDK_NVMF_DEFAULT_IN_CAPSULE_DATA_SIZE;
opts->max_io_size = SPDK_NVMF_DEFAULT_MAX_IO_SIZE;
opts->max_subsystems = SPDK_NVMF_DEFAULT_MAX_SUBSYSTEMS;
opts->io_unit_size = SPDK_NVMF_DEFAULT_IO_UNIT_SIZE;
}
static int
@@ -165,6 +167,14 @@ spdk_nvmf_tgt_create(struct spdk_nvmf_tgt_opts *opts)
tgt->opts = *opts;
}
if ((tgt->opts.max_io_size % tgt->opts.io_unit_size != 0) ||
(tgt->opts.max_io_size / tgt->opts.io_unit_size > SPDK_NVMF_MAX_SGL_ENTRIES)) {
SPDK_ERRLOG("Unsupported IO size, MaxIO:%d, UnitIO:%d\n", tgt->opts.max_io_size,
tgt->opts.io_unit_size);
free(tgt);
return NULL;
}
tgt->discovery_genctr = 0;
tgt->discovery_log_page = NULL;
tgt->discovery_log_page_size = 0;
@@ -187,6 +197,7 @@ spdk_nvmf_tgt_create(struct spdk_nvmf_tgt_opts *opts)
SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Max In Capsule Data: %d bytes\n",
tgt->opts.in_capsule_data_size);
SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Max I/O Size: %d bytes\n", tgt->opts.max_io_size);
SPDK_DEBUGLOG(SPDK_LOG_NVMF, "I/O Unit Size: %d bytes\n", tgt->opts.io_unit_size);
return tgt;
}
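
To make the new constraint concrete, a hedged sketch of the same check with example values; the real validation is the block above:

#include <assert.h>
#include <stdint.h>

/* Illustration only: io_unit_size must divide max_io_size, and the quotient is
 * the number of SGL entries a maximum-size I/O needs. */
static void
check_io_unit_size(uint32_t max_io_size, uint32_t io_unit_size)
{
	assert(max_io_size % io_unit_size == 0);
	assert(max_io_size / io_unit_size <= 16);	/* SPDK_NVMF_MAX_SGL_ENTRIES */
}

/* check_io_unit_size(131072, 131072): one SGL entry (the defaults)
 * check_io_unit_size(131072, 8192):   16 entries, exactly the limit */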

View File

@@ -43,6 +43,8 @@
#include "spdk/queue.h"
#include "spdk/util.h"
#define SPDK_NVMF_MAX_SGL_ENTRIES 16
enum spdk_nvmf_subsystem_state {
SPDK_NVMF_SUBSYSTEM_INACTIVE = 0,
SPDK_NVMF_SUBSYSTEM_ACTIVATING,
@@ -138,6 +140,8 @@ struct spdk_nvmf_request {
void *data;
union nvmf_h2c_msg *cmd;
union nvmf_c2h_msg *rsp;
struct iovec iov[SPDK_NVMF_MAX_SGL_ENTRIES];
uint32_t iovcnt;
TAILQ_ENTRY(spdk_nvmf_request) link;
};
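
A rough sketch, assuming the struct above is in scope, of how a consumer might walk the new vector; for pool-backed requests the iov_len values add up to req->length, which is what spdk_nvmf_rdma_request_fill_iovs() below arranges:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical helper, illustration only. */
static size_t
nvmf_request_iov_bytes(const struct spdk_nvmf_request *req)
{
	size_t total = 0;
	uint32_t i;

	for (i = 0; i < req->iovcnt; i++) {
		total += req->iov[i].iov_len;
	}
	return total;	/* equals req->length when the buffers came from the pool */
}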

View File

@@ -55,6 +55,7 @@
*/
#define NVMF_DEFAULT_TX_SGE 1
#define NVMF_DEFAULT_RX_SGE 2
#define NVMF_DEFAULT_DATA_SGE 16
/* The RDMA completion queue size */
#define NVMF_RDMA_CQ_SIZE 4096
@@ -129,7 +130,7 @@ struct spdk_nvmf_rdma_recv {
struct spdk_nvmf_rdma_request {
struct spdk_nvmf_request req;
void *data_from_pool;
bool data_from_pool;
enum spdk_nvmf_rdma_request_state state;
@@ -142,7 +143,8 @@ struct spdk_nvmf_rdma_request {
struct {
struct ibv_send_wr wr;
struct ibv_sge sgl[NVMF_DEFAULT_TX_SGE];
struct ibv_sge sgl[SPDK_NVMF_MAX_SGL_ENTRIES];
void *buffers[SPDK_NVMF_MAX_SGL_ENTRIES];
} data;
TAILQ_ENTRY(spdk_nvmf_rdma_request) link;
@@ -259,6 +261,7 @@ struct spdk_nvmf_rdma_transport {
uint16_t max_queue_depth;
uint32_t max_io_size;
uint32_t io_unit_size;
uint32_t in_capsule_data_size;
TAILQ_HEAD(, spdk_nvmf_rdma_device) devices;
@@ -345,7 +348,7 @@ spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair)
attr.recv_cq = rqpair->poller->cq;
attr.cap.max_send_wr = rqpair->max_queue_depth * 2; /* SEND, READ, and WRITE operations */
attr.cap.max_recv_wr = rqpair->max_queue_depth; /* RECV operations */
attr.cap.max_send_sge = NVMF_DEFAULT_TX_SGE;
attr.cap.max_send_sge = SPDK_NVMF_MAX_SGL_ENTRIES;
attr.cap.max_recv_sge = NVMF_DEFAULT_RX_SGE;
rc = rdma_create_qp(rqpair->cm_id, NULL, &attr);
@@ -840,6 +843,55 @@ spdk_nvmf_rdma_request_get_xfer(struct spdk_nvmf_rdma_request *rdma_req)
return xfer;
}
static int
spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_device *device,
struct spdk_nvmf_rdma_request *rdma_req)
{
void *buf = NULL;
uint32_t length = rdma_req->req.length;
uint32_t i = 0;
rdma_req->req.iovcnt = 0;
while (length) {
buf = spdk_mempool_get(rtransport->data_buf_pool);
if (!buf) {
goto nomem;
}
rdma_req->req.iov[i].iov_base = (void *)((uintptr_t)(buf + NVMF_DATA_BUFFER_MASK) &
~NVMF_DATA_BUFFER_MASK);
rdma_req->req.iov[i].iov_len = spdk_min(length, rtransport->io_unit_size);
rdma_req->req.iovcnt++;
rdma_req->data.buffers[i] = buf;
rdma_req->data.wr.sg_list[i].addr = (uintptr_t)(rdma_req->req.iov[i].iov_base);
rdma_req->data.wr.sg_list[i].length = rdma_req->req.iov[i].iov_len;
rdma_req->data.wr.sg_list[i].lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map,
(uint64_t)buf))->lkey;
length -= rdma_req->req.iov[i].iov_len;
i++;
}
rdma_req->data_from_pool = true;
return 0;
nomem:
while (i) {
i--;
/* Return the original (unaligned) pool pointer, not the aligned iov_base */
spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data.buffers[i]);
rdma_req->data.buffers[i] = NULL;
rdma_req->req.iov[i].iov_base = NULL;
rdma_req->req.iov[i].iov_len = 0;
rdma_req->data.wr.sg_list[i].addr = 0;
rdma_req->data.wr.sg_list[i].length = 0;
rdma_req->data.wr.sg_list[i].lkey = 0;
}
rdma_req->req.iovcnt = 0;
return -ENOMEM;
}
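
As an aside, a minimal sketch of the round-up idiom used in the function above; the mask parameter stands in for NVMF_DATA_BUFFER_MASK and the function name is made up:

#include <stdint.h>

/* Round a raw pool buffer up to the next aligned address; mask is alignment - 1. */
static inline void *
align_up(void *buf, uintptr_t mask)
{
	return (void *)(((uintptr_t)buf + mask) & ~mask);
}

The unaligned pointer returned by spdk_mempool_get() is kept in data.buffers[i] so that exactly that pointer goes back to the pool when the request completes.
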
static int
spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
struct spdk_nvmf_rdma_device *device,
@@ -863,26 +915,25 @@ spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
return -1;
}
/* fill request length and populate iovs */
rdma_req->req.length = sgl->keyed.length;
rdma_req->data_from_pool = spdk_mempool_get(rtransport->data_buf_pool);
if (!rdma_req->data_from_pool) {
if (spdk_nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req) < 0) {
/* No available buffers. Queue this request up. */
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req);
return 0;
}
/* AIO backend requires block size aligned data buffers,
* 4KiB aligned data buffer should work for most devices.
*/
rdma_req->req.data = (void *)((uintptr_t)(rdma_req->data_from_pool + NVMF_DATA_BUFFER_MASK)
& ~NVMF_DATA_BUFFER_MASK);
rdma_req->data.sgl[0].addr = (uintptr_t)rdma_req->req.data;
rdma_req->data.sgl[0].length = sgl->keyed.length;
rdma_req->data.sgl[0].lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map,
(uint64_t)rdma_req->req.data))->lkey;
/* backward compatible */
rdma_req->req.data = rdma_req->req.iov[0].iov_base;
/* rdma wr specifics */
rdma_req->data.wr.num_sge = rdma_req->req.iovcnt;
rdma_req->data.wr.wr.rdma.rkey = sgl->keyed.key;
rdma_req->data.wr.wr.rdma.remote_addr = sgl->address;
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took buffer from central pool\n", rdma_req);
SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req,
rdma_req->req.iovcnt);
return 0;
} else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK &&
@@ -909,8 +960,13 @@ spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
}
rdma_req->req.data = rdma_req->recv->buf + offset;
rdma_req->data_from_pool = NULL;
rdma_req->data_from_pool = false;
rdma_req->req.length = sgl->unkeyed.length;
rdma_req->req.iov[0].iov_base = rdma_req->req.data;
rdma_req->req.iov[0].iov_len = rdma_req->req.length;
rdma_req->req.iovcnt = 1;
return 0;
}
@@ -998,7 +1054,7 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
/* If data is transferring from host to controller and the data didn't
* arrive using in capsule data, we need to do a transfer from the host.
*/
if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && rdma_req->data_from_pool != NULL) {
if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && rdma_req->data_from_pool) {
rdma_req->state = RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER;
TAILQ_INSERT_TAIL(&rqpair->pending_rdma_rw_queue, rdma_req, link);
break;
@@ -1068,11 +1124,16 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
rqpair->cur_queue_depth--;
if (rdma_req->data_from_pool) {
/* Put the buffer back in the pool */
spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data_from_pool);
rdma_req->data_from_pool = NULL;
/* Put the buffers back in the pool */
for (uint32_t i = 0; i < rdma_req->req.iovcnt; i++) {
spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data.buffers[i]);
rdma_req->req.iov[i].iov_base = NULL;
rdma_req->data.buffers[i] = NULL;
}
rdma_req->data_from_pool = false;
}
rdma_req->req.length = 0;
rdma_req->req.iovcnt = 0;
rdma_req->req.data = NULL;
rdma_req->state = RDMA_REQUEST_STATE_FREE;
TAILQ_INSERT_TAIL(&rqpair->free_queue, rdma_req, link);
@@ -1098,6 +1159,7 @@ spdk_nvmf_rdma_create(struct spdk_nvmf_tgt *tgt)
struct ibv_context **contexts;
uint32_t i;
int flag;
uint32_t sge_count;
rtransport = calloc(1, sizeof(*rtransport));
if (!rtransport) {
@@ -1115,8 +1177,21 @@ spdk_nvmf_rdma_create(struct spdk_nvmf_tgt *tgt)
rtransport->max_queue_depth = tgt->opts.max_queue_depth;
rtransport->max_io_size = tgt->opts.max_io_size;
rtransport->io_unit_size = tgt->opts.io_unit_size;
rtransport->in_capsule_data_size = tgt->opts.in_capsule_data_size;
/* I/O unit size cannot be larger than max I/O size */
if (rtransport->io_unit_size > rtransport->max_io_size) {
rtransport->io_unit_size = rtransport->max_io_size;
}
sge_count = rtransport->max_io_size / rtransport->io_unit_size;
if (sge_count > SPDK_NVMF_MAX_SGL_ENTRIES) {
SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", rtransport->io_unit_size);
free(rtransport);
return NULL;
}
rtransport->event_channel = rdma_create_event_channel();
if (rtransport->event_channel == NULL) {
SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno));
@@ -1134,7 +1209,7 @@ spdk_nvmf_rdma_create(struct spdk_nvmf_tgt *tgt)
rtransport->data_buf_pool = spdk_mempool_create("spdk_nvmf_rdma",
rtransport->max_queue_depth * 4, /* The 4 is arbitrarily chosen. Needs to be configurable. */
rtransport->max_io_size + NVMF_DATA_BUFFER_ALIGNMENT,
rtransport->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT,
SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
SPDK_ENV_SOCKET_ID_ANY);
if (!rtransport->data_buf_pool) {
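
A rough, hypothetical footprint comparison (all numbers are assumptions for illustration): each pool element now only needs to cover one I/O unit rather than a whole maximum-size I/O.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical helper, rough numbers only: approximate data pool footprint,
 * using the same "queue depth * 4" element count as the call above. */
static size_t
rdma_data_pool_bytes(uint32_t max_queue_depth, uint32_t element_size, uint32_t alignment)
{
	return (size_t)max_queue_depth * 4 * (element_size + alignment);
}

/* Assuming max_queue_depth = 128 and a 4096-byte alignment pad (example values):
 *   before: rdma_data_pool_bytes(128, 131072, 4096) is about 66 MiB (max_io_size elements)
 *   after:  rdma_data_pool_bytes(128, 8192, 4096)   is about  6 MiB (io_unit_size elements) */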

View File

@@ -4,3 +4,4 @@
[Nvmf]
MaxQueuesPerSession 4
IOUnitSize 8192

View File

@@ -121,6 +121,15 @@ spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
return 0;
}
int
spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
struct iovec *iov, int iovcnt,
uint64_t offset_blocks, uint64_t num_blocks,
spdk_bdev_io_completion_cb cb, void *cb_arg)
{
return 0;
}
int
spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf,
uint64_t offset_blocks, uint64_t num_blocks,
@@ -129,6 +138,14 @@ spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, v
return 0;
}
int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
struct iovec *iov, int iovcnt,
uint64_t offset_blocks, uint64_t num_blocks,
spdk_bdev_io_completion_cb cb, void *cb_arg)
{
return 0;
}
int
spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
uint64_t offset_blocks, uint64_t num_blocks,