diff --git a/include/spdk/nvmf.h b/include/spdk/nvmf.h
index 66462b403..81187aa26 100644
--- a/include/spdk/nvmf.h
+++ b/include/spdk/nvmf.h
@@ -67,6 +67,7 @@ struct spdk_nvmf_tgt_opts {
 	uint32_t in_capsule_data_size;
 	uint32_t max_io_size;
 	uint32_t max_subsystems;
+	uint32_t io_unit_size;
 };
 
 /**
  * Initialize the default value of opts.
diff --git a/lib/event/subsystems/nvmf/conf.c b/lib/event/subsystems/nvmf/conf.c
index ca53e0a5c..8f6dea431 100644
--- a/lib/event/subsystems/nvmf/conf.c
+++ b/lib/event/subsystems/nvmf/conf.c
@@ -72,6 +72,7 @@ spdk_nvmf_read_config_file_params(struct spdk_conf_section *sp,
 	int max_queues_per_sess;
 	int in_capsule_data_size;
 	int max_io_size;
+	int io_unit_size;
 	int acceptor_poll_rate;
 
 	max_queue_depth = spdk_conf_section_get_intval(sp, "MaxQueueDepth");
@@ -94,6 +95,11 @@ spdk_nvmf_read_config_file_params(struct spdk_conf_section *sp,
 		opts->max_io_size = max_io_size;
 	}
 
+	io_unit_size = spdk_conf_section_get_intval(sp, "IOUnitSize");
+	if (io_unit_size >= 0) {
+		opts->io_unit_size = io_unit_size;
+	}
+
 	acceptor_poll_rate = spdk_conf_section_get_intval(sp, "AcceptorPollRate");
 	if (acceptor_poll_rate >= 0) {
 		g_spdk_nvmf_tgt_conf.acceptor_poll_rate = acceptor_poll_rate;
diff --git a/lib/nvmf/ctrlr_bdev.c b/lib/nvmf/ctrlr_bdev.c
index 93c7423dc..3db87487e 100644
--- a/lib/nvmf/ctrlr_bdev.c
+++ b/lib/nvmf/ctrlr_bdev.c
@@ -180,8 +180,8 @@ nvmf_bdev_ctrlr_read_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
 	}
 
 	spdk_trace_record(TRACE_NVMF_LIB_READ_START, 0, 0, (uint64_t)req, 0);
-	if (spdk_unlikely(spdk_bdev_read_blocks(desc, ch, req->data, start_lba, num_blocks,
-						nvmf_bdev_ctrlr_complete_cmd, req))) {
+	if (spdk_unlikely(spdk_bdev_readv_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks,
+						 nvmf_bdev_ctrlr_complete_cmd, req))) {
 		rsp->status.sct = SPDK_NVME_SCT_GENERIC;
 		rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
 		return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
@@ -219,7 +219,7 @@ nvmf_bdev_ctrlr_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
 	}
 
 	spdk_trace_record(TRACE_NVMF_LIB_WRITE_START, 0, 0, (uint64_t)req, 0);
-	if (spdk_unlikely(spdk_bdev_write_blocks(desc, ch, req->data, start_lba, num_blocks,
+	if (spdk_unlikely(spdk_bdev_writev_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks,
 						  nvmf_bdev_ctrlr_complete_cmd, req))) {
 		rsp->status.sct = SPDK_NVME_SCT_GENERIC;
 		rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
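For orientation, a minimal sketch of how a target application would opt into the new option, assuming only the public `spdk_nvmf_tgt_opts_init()`/`spdk_nvmf_tgt_create()` flow visible in this patch (the helper name and the 8192 value are illustrative; 8192 mirrors the test config later in this patch):

```c
#include "spdk/nvmf.h"

/* Sketch: start from the defaults, then shrink the I/O unit so the
 * default 128 KiB max I/O is carried by sixteen pooled 8 KiB buffers.
 * io_unit_size must divide max_io_size evenly and may not require
 * more than SPDK_NVMF_MAX_SGL_ENTRIES (16) buffers per request,
 * or spdk_nvmf_tgt_create() returns NULL. */
static struct spdk_nvmf_tgt *
create_tgt_with_small_io_unit(void)
{
	struct spdk_nvmf_tgt_opts opts;

	spdk_nvmf_tgt_opts_init(&opts);
	opts.io_unit_size = 8192;

	return spdk_nvmf_tgt_create(&opts);
}
```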
diff --git a/lib/nvmf/nvmf.c b/lib/nvmf/nvmf.c
index eb04f5c5e..27a60e36d 100644
--- a/lib/nvmf/nvmf.c
+++ b/lib/nvmf/nvmf.c
@@ -51,6 +51,7 @@ SPDK_LOG_REGISTER_COMPONENT("nvmf", SPDK_LOG_NVMF)
 #define SPDK_NVMF_DEFAULT_IN_CAPSULE_DATA_SIZE 4096
 #define SPDK_NVMF_DEFAULT_MAX_IO_SIZE 131072
 #define SPDK_NVMF_DEFAULT_MAX_SUBSYSTEMS 1024
+#define SPDK_NVMF_DEFAULT_IO_UNIT_SIZE 131072
 
 void
 spdk_nvmf_tgt_opts_init(struct spdk_nvmf_tgt_opts *opts)
@@ -60,6 +61,7 @@ spdk_nvmf_tgt_opts_init(struct spdk_nvmf_tgt_opts *opts)
 	opts->in_capsule_data_size = SPDK_NVMF_DEFAULT_IN_CAPSULE_DATA_SIZE;
 	opts->max_io_size = SPDK_NVMF_DEFAULT_MAX_IO_SIZE;
 	opts->max_subsystems = SPDK_NVMF_DEFAULT_MAX_SUBSYSTEMS;
+	opts->io_unit_size = SPDK_NVMF_DEFAULT_IO_UNIT_SIZE;
 }
 
 static int
@@ -165,6 +167,15 @@ spdk_nvmf_tgt_create(struct spdk_nvmf_tgt_opts *opts)
 		tgt->opts = *opts;
 	}
 
+	if ((tgt->opts.io_unit_size == 0) ||
+	    (tgt->opts.max_io_size % tgt->opts.io_unit_size != 0) ||
+	    (tgt->opts.max_io_size / tgt->opts.io_unit_size > SPDK_NVMF_MAX_SGL_ENTRIES)) {
+		SPDK_ERRLOG("Unsupported IO size, MaxIO:%d, UnitIO:%d\n", tgt->opts.max_io_size,
+			    tgt->opts.io_unit_size);
+		free(tgt);
+		return NULL;
+	}
+
 	tgt->discovery_genctr = 0;
 	tgt->discovery_log_page = NULL;
 	tgt->discovery_log_page_size = 0;
@@ -187,6 +198,7 @@ spdk_nvmf_tgt_create(struct spdk_nvmf_tgt_opts *opts)
 	SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Max In Capsule Data: %d bytes\n",
 		      tgt->opts.in_capsule_data_size);
 	SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Max I/O Size: %d bytes\n", tgt->opts.max_io_size);
+	SPDK_DEBUGLOG(SPDK_LOG_NVMF, "I/O Unit Size: %d bytes\n", tgt->opts.io_unit_size);
 
 	return tgt;
 }
diff --git a/lib/nvmf/nvmf_internal.h b/lib/nvmf/nvmf_internal.h
index a0601b9ee..8958449fd 100644
--- a/lib/nvmf/nvmf_internal.h
+++ b/lib/nvmf/nvmf_internal.h
@@ -43,6 +43,8 @@
 #include "spdk/queue.h"
 #include "spdk/util.h"
 
+#define SPDK_NVMF_MAX_SGL_ENTRIES	16
+
 enum spdk_nvmf_subsystem_state {
 	SPDK_NVMF_SUBSYSTEM_INACTIVE = 0,
 	SPDK_NVMF_SUBSYSTEM_ACTIVATING,
@@ -138,6 +140,8 @@ struct spdk_nvmf_request {
 	void				*data;
 	union nvmf_h2c_msg		*cmd;
 	union nvmf_c2h_msg		*rsp;
+	struct iovec			iov[SPDK_NVMF_MAX_SGL_ENTRIES];
+	uint32_t			iovcnt;
 	TAILQ_ENTRY(spdk_nvmf_request)	link;
 };
 
diff --git a/lib/nvmf/rdma.c b/lib/nvmf/rdma.c
index 4ab392610..9b39cb1fe 100644
--- a/lib/nvmf/rdma.c
+++ b/lib/nvmf/rdma.c
@@ -55,6 +55,7 @@
  */
#define NVMF_DEFAULT_TX_SGE		1
 #define NVMF_DEFAULT_RX_SGE		2
+#define NVMF_DEFAULT_DATA_SGE		16
 
 /* The RDMA completion queue size */
 #define NVMF_RDMA_CQ_SIZE	4096
@@ -129,7 +130,7 @@ struct spdk_nvmf_rdma_recv {
 
 struct spdk_nvmf_rdma_request {
 	struct spdk_nvmf_request		req;
-	void					*data_from_pool;
+	bool					data_from_pool;
 
 	enum spdk_nvmf_rdma_request_state	state;
@@ -142,7 +143,8 @@ struct spdk_nvmf_rdma_request {
 
 	struct {
 		struct ibv_send_wr		wr;
-		struct ibv_sge			sgl[NVMF_DEFAULT_TX_SGE];
+		struct ibv_sge			sgl[SPDK_NVMF_MAX_SGL_ENTRIES];
+		void				*buffers[SPDK_NVMF_MAX_SGL_ENTRIES];
 	} data;
 
 	TAILQ_ENTRY(spdk_nvmf_rdma_request)	link;
@@ -259,6 +261,7 @@ struct spdk_nvmf_rdma_transport {
 
 	uint16_t				max_queue_depth;
 	uint32_t				max_io_size;
+	uint32_t				io_unit_size;
 	uint32_t				in_capsule_data_size;
 
 	TAILQ_HEAD(, spdk_nvmf_rdma_device)	devices;
@@ -345,7 +348,7 @@ spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair)
 	attr.recv_cq		= rqpair->poller->cq;
 	attr.cap.max_send_wr	= rqpair->max_queue_depth * 2; /* SEND, READ, and WRITE operations */
 	attr.cap.max_recv_wr	= rqpair->max_queue_depth; /* RECV operations */
-	attr.cap.max_send_sge	= NVMF_DEFAULT_TX_SGE;
+	attr.cap.max_send_sge	= SPDK_NVMF_MAX_SGL_ENTRIES;
 	attr.cap.max_recv_sge	= NVMF_DEFAULT_RX_SGE;
 
 	rc = rdma_create_qp(rqpair->cm_id, NULL, &attr);
@@ -840,6 +843,55 @@ spdk_nvmf_rdma_request_get_xfer(struct spdk_nvmf_rdma_request *rdma_req)
 	return xfer;
 }
 
+static int
+spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport,
+				 struct spdk_nvmf_rdma_device *device,
+				 struct spdk_nvmf_rdma_request *rdma_req)
+{
+	void		*buf = NULL;
+	uint32_t	length = rdma_req->req.length;
+	uint32_t	i = 0;
+
+	rdma_req->req.iovcnt = 0;
+	while (length) {
+		buf = spdk_mempool_get(rtransport->data_buf_pool);
+		if (!buf) {
+			goto nomem;
+		}
+
+		rdma_req->req.iov[i].iov_base = (void *)((uintptr_t)(buf + NVMF_DATA_BUFFER_MASK) &
+						~NVMF_DATA_BUFFER_MASK);
+		rdma_req->req.iov[i].iov_len = spdk_min(length, rtransport->io_unit_size);
+		rdma_req->req.iovcnt++;
+		rdma_req->data.buffers[i] = buf;
+		rdma_req->data.wr.sg_list[i].addr = (uintptr_t)(rdma_req->req.iov[i].iov_base);
+		rdma_req->data.wr.sg_list[i].length = rdma_req->req.iov[i].iov_len;
+		rdma_req->data.wr.sg_list[i].lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map,
+						     (uint64_t)buf))->lkey;
+
+		length -= rdma_req->req.iov[i].iov_len;
+		i++;
+	}
+
+	rdma_req->data_from_pool = true;
+
+	return 0;
+
+nomem:
+	while (i) {
+		i--;
+		spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data.buffers[i]);
+		rdma_req->req.iov[i].iov_base = NULL;
+		rdma_req->req.iov[i].iov_len = 0;
+
+		rdma_req->data.wr.sg_list[i].addr = 0;
+		rdma_req->data.wr.sg_list[i].length = 0;
+		rdma_req->data.wr.sg_list[i].lkey = 0;
+	}
+	rdma_req->req.iovcnt = 0;
+	return -ENOMEM;
+}
+
 static int
 spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
 				 struct spdk_nvmf_rdma_device *device,
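The pointer round-up in `spdk_nvmf_rdma_request_fill_iovs()` is why each pool element reserves `NVMF_DATA_BUFFER_ALIGNMENT` bytes of slack (see the mempool change below): `iov_base` is the raw pool pointer bumped to the next aligned boundary. A self-contained sketch of the same round-up idiom, assuming the conventional mask = alignment - 1 relationship and SPDK's 4 KiB data-buffer alignment:

```c
#include <stdint.h>
#include <stdio.h>

/* Assumed to mirror rdma.c: 4 KiB alignment, mask = alignment - 1. */
#define DATA_BUFFER_ALIGNMENT	4096u
#define DATA_BUFFER_MASK	(DATA_BUFFER_ALIGNMENT - 1)

int
main(void)
{
	uintptr_t raw = 0x1234567;	/* arbitrary unaligned pool address */
	uintptr_t aligned = (raw + DATA_BUFFER_MASK) & ~(uintptr_t)DATA_BUFFER_MASK;

	/* Prints 0x1234567 -> 0x1235000: rounded up to the next 4 KiB
	 * boundary, at most DATA_BUFFER_MASK bytes past the raw pointer. */
	printf("%#lx -> %#lx\n", (unsigned long)raw, (unsigned long)aligned);
	return 0;
}
```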
@@ -863,26 +915,25 @@ spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
 			return -1;
 		}
 
+		/* fill request length and populate iovs */
 		rdma_req->req.length = sgl->keyed.length;
-		rdma_req->data_from_pool = spdk_mempool_get(rtransport->data_buf_pool);
-		if (!rdma_req->data_from_pool) {
+
+		if (spdk_nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req) < 0) {
 			/* No available buffers. Queue this request up. */
 			SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req);
 			return 0;
 		}
-		/* AIO backend requires block size aligned data buffers,
-		 * 4KiB aligned data buffer should work for most devices.
-		 */
-		rdma_req->req.data = (void *)((uintptr_t)(rdma_req->data_from_pool + NVMF_DATA_BUFFER_MASK)
-					      & ~NVMF_DATA_BUFFER_MASK);
-		rdma_req->data.sgl[0].addr = (uintptr_t)rdma_req->req.data;
-		rdma_req->data.sgl[0].length = sgl->keyed.length;
-		rdma_req->data.sgl[0].lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map,
-					      (uint64_t)rdma_req->req.data))->lkey;
+
+		/* backward compatible */
+		rdma_req->req.data = rdma_req->req.iov[0].iov_base;
+
+		/* rdma wr specifics */
+		rdma_req->data.wr.num_sge = rdma_req->req.iovcnt;
 		rdma_req->data.wr.wr.rdma.rkey = sgl->keyed.key;
 		rdma_req->data.wr.wr.rdma.remote_addr = sgl->address;
 
-		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took buffer from central pool\n", rdma_req);
+		SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req,
+			      rdma_req->req.iovcnt);
 
 		return 0;
 	} else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK &&
@@ -909,8 +960,13 @@ spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
 		}
 
 		rdma_req->req.data = rdma_req->recv->buf + offset;
-		rdma_req->data_from_pool = NULL;
+		rdma_req->data_from_pool = false;
 		rdma_req->req.length = sgl->unkeyed.length;
+
+		rdma_req->req.iov[0].iov_base = rdma_req->req.data;
+		rdma_req->req.iov[0].iov_len = rdma_req->req.length;
+		rdma_req->req.iovcnt = 1;
+
 		return 0;
 	}
 
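A worked example for the keyed-SGL path (the numbers are mine, not from the patch): with IOUnitSize 8192, a 20480-byte keyed SGL is filled as three iovecs of 8192, 8192, and 4096 bytes, since each loop iteration takes spdk_min(length, io_unit_size). data.wr.num_sge then becomes 3, and the single RDMA work request scatters across three pooled buffers. The in-capsule path stays contiguous, so it is expressed as a single iovec covering the capsule buffer.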
@@ -998,7 +1054,7 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
 		/* If data is transferring from host to controller and the data didn't
 		 * arrive using in capsule data, we need to do a transfer from the host.
 		 */
-		if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && rdma_req->data_from_pool != NULL) {
+		if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && rdma_req->data_from_pool) {
 			rdma_req->state = RDMA_REQUEST_STATE_TRANSFER_PENDING_HOST_TO_CONTROLLER;
 			TAILQ_INSERT_TAIL(&rqpair->pending_rdma_rw_queue, rdma_req, link);
 			break;
@@ -1068,11 +1124,16 @@ spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
 		rqpair->cur_queue_depth--;
 
 		if (rdma_req->data_from_pool) {
-			/* Put the buffer back in the pool */
-			spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data_from_pool);
-			rdma_req->data_from_pool = NULL;
+			/* Put the buffer/s back in the pool */
+			for (uint32_t i = 0; i < rdma_req->req.iovcnt; i++) {
+				spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data.buffers[i]);
+				rdma_req->req.iov[i].iov_base = NULL;
+				rdma_req->data.buffers[i] = NULL;
+			}
+			rdma_req->data_from_pool = false;
 		}
 		rdma_req->req.length = 0;
+		rdma_req->req.iovcnt = 0;
 		rdma_req->req.data = NULL;
 		rdma_req->state = RDMA_REQUEST_STATE_FREE;
 		TAILQ_INSERT_TAIL(&rqpair->free_queue, rdma_req, link);
@@ -1098,6 +1159,7 @@ spdk_nvmf_rdma_create(struct spdk_nvmf_tgt *tgt)
 	struct ibv_context	**contexts;
 	uint32_t		i;
 	int			flag;
+	uint32_t		sge_count;
 
 	rtransport = calloc(1, sizeof(*rtransport));
 	if (!rtransport) {
@@ -1115,8 +1177,21 @@ spdk_nvmf_rdma_create(struct spdk_nvmf_tgt *tgt)
 
 	rtransport->max_queue_depth = tgt->opts.max_queue_depth;
 	rtransport->max_io_size = tgt->opts.max_io_size;
+	rtransport->io_unit_size = tgt->opts.io_unit_size;
 	rtransport->in_capsule_data_size = tgt->opts.in_capsule_data_size;
 
+	/* I/O unit size cannot be larger than max I/O size */
+	if (rtransport->io_unit_size > rtransport->max_io_size) {
+		rtransport->io_unit_size = rtransport->max_io_size;
+	}
+
+	sge_count = rtransport->max_io_size / rtransport->io_unit_size;
+	if (sge_count > SPDK_NVMF_MAX_SGL_ENTRIES) {
+		SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", rtransport->io_unit_size);
+		free(rtransport);
+		return NULL;
+	}
+
 	rtransport->event_channel = rdma_create_event_channel();
 	if (rtransport->event_channel == NULL) {
 		SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno));
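To make the limit concrete: with the default 131072-byte max_io_size, the test config's IOUnitSize of 8192 gives sge_count = 131072 / 8192 = 16, exactly SPDK_NVMF_MAX_SGL_ENTRIES. Combined with the divisibility check in spdk_nvmf_tgt_create(), 8192 is therefore the smallest I/O unit accepted at the default max I/O size. The clamp above also means an oversized IOUnitSize is silently reduced to max_io_size, which degenerates to a single SGE per request.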
@@ -1134,7 +1209,7 @@ spdk_nvmf_rdma_create(struct spdk_nvmf_tgt *tgt)
 
 	rtransport->data_buf_pool = spdk_mempool_create("spdk_nvmf_rdma",
 				    rtransport->max_queue_depth * 4, /* The 4 is arbitrarily chosen. Needs to be configurable. */
-				    rtransport->max_io_size + NVMF_DATA_BUFFER_ALIGNMENT,
+				    rtransport->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT,
 				    SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
 				    SPDK_ENV_SOCKET_ID_ANY);
 
 	if (!rtransport->data_buf_pool) {
diff --git a/test/nvmf/nvmf.conf b/test/nvmf/nvmf.conf
index ac6a5d9e2..abe62e8dc 100644
--- a/test/nvmf/nvmf.conf
+++ b/test/nvmf/nvmf.conf
@@ -4,3 +4,4 @@
 
 [Nvmf]
   MaxQueuesPerSession 4
+  IOUnitSize 8192
diff --git a/test/unit/lib/nvmf/ctrlr_bdev.c/ctrlr_bdev_ut.c b/test/unit/lib/nvmf/ctrlr_bdev.c/ctrlr_bdev_ut.c
index b2dd759a9..51740fd4e 100644
--- a/test/unit/lib/nvmf/ctrlr_bdev.c/ctrlr_bdev_ut.c
+++ b/test/unit/lib/nvmf/ctrlr_bdev.c/ctrlr_bdev_ut.c
@@ -121,6 +121,15 @@ spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
 	return 0;
 }
 
+int
+spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+			struct iovec *iov, int iovcnt,
+			uint64_t offset_blocks, uint64_t num_blocks,
+			spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+	return 0;
+}
+
 int
 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf,
 		      uint64_t offset_blocks, uint64_t num_blocks,
@@ -129,6 +138,15 @@ spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, v
 	return 0;
 }
 
+int
+spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+		       struct iovec *iov, int iovcnt,
+		       uint64_t offset_blocks, uint64_t num_blocks,
+		       spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+	return 0;
+}
+
 int
 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
 			      uint64_t offset_blocks, uint64_t num_blocks,
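These stubs keep the unit test linking now that ctrlr_bdev.c issues vectored I/O. For intuition about what a real consumer of req->iov and req->iovcnt does, a toy gather loop in plain C (the function name is mine; no SPDK dependencies):

```c
#include <stddef.h>
#include <string.h>
#include <sys/uio.h>

/* Toy example: walk an iovec array the way a writev-style backend
 * walks req->iov/req->iovcnt, copying into one contiguous buffer. */
static size_t
iov_gather(const struct iovec *iov, int iovcnt, void *dst, size_t dst_len)
{
	size_t off = 0;

	for (int i = 0; i < iovcnt; i++) {
		if (off + iov[i].iov_len > dst_len) {
			break;	/* destination full; stop at a whole-element boundary */
		}
		memcpy((char *)dst + off, iov[i].iov_base, iov[i].iov_len);
		off += iov[i].iov_len;
	}

	return off;	/* bytes gathered */
}
```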