nvmf/tcp: Use the success optimization by default

By now (5.1 is released), the Linux kernel initiator supports the
success optimization, and the only version that doesn't support it
(5.0) has been EOL-ed. As such, let's enable it by default in SPDK.

Doing so provides a notable performance improvement: running perf with a
queue depth of 64, randread, two threads and a block size of 512 bytes
for 60s ("-q 64 -w randread -o 512 -c 0x5000 -t 60") over the VMA socket
acceleration library and a null backing store, we got 730K IOPS with the
success optimization vs. 550K without it.

                            IOPS    MiB/s   Avg. lat. (us)   Min (us)     Max (us)
without optimization   549274.10   268.20           232.99      93.23   3256354.96
with optimization      728117.57   355.53           175.76      85.93     14632.16
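
As a sanity check, the average latencies above are consistent with
Little's law for a queue depth of 64 on each of the two cores:

    64 * 2 / 549274 IO/s ~= 233 us   (without the optimization)
    64 * 2 / 728118 IO/s ~= 176 us   (with the optimization)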

To allow for interoperability with older kernel initiators, we added a
config knob through which the success optimization can be enabled or
disabled.
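
For example, to turn the optimization off for interop testing against a
pre-5.1 kernel initiator, the new RPC parameter can be passed when the
TCP transport is created. A minimal sketch using SPDK's Python JSON-RPC
client (the import path, client construction and socket path here are
assumptions, not part of this change):

    # assumes SPDK's scripts/ directory is on PYTHONPATH
    from rpc.client import JSONRPCClient

    # connect to the application's RPC socket (default path assumed)
    client = JSONRPCClient('/var/tmp/spdk.sock')

    # create the TCP transport with the C2H success optimization disabled;
    # 'c2h_success' is the knob added by this patch
    client.call('nvmf_create_transport', {
        'trtype': 'TCP',
        'c2h_success': False,
    })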

Change-Id: Ia4c79f607f82c3563523ae3e07a67eac95b56dbb
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/457644
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Ziye Yang <ziye.yang@intel.com>
Reviewed-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com>
Or Gerlitz 2019-06-11 15:07:28 +00:00 committed by Darek Stojaczyk
parent 2224554eec
commit 6629202cbd
8 changed files with 52 additions and 22 deletions

View File

@@ -8,6 +8,9 @@ A new file API `spdk_posix_file_load` was added to load file content into a data
### NVMe-oF Target
The C2H success optimization, under which a command capsule response is not sent
for reads, is now turned on by default. A config knob was added to allow enabling
or disabling it.
Shared receive queue can now be disabled even for NICs that support it using the
`nvmf_create_transport` RPC method parameter `no_srq`. The actual use of a shared
receive queue is predicated on hardware support when this flag is not used.

View File

@@ -134,6 +134,9 @@
# Set the number of shared buffers to be cached per poll group
#BufCacheSize 32
# Set whether to use the C2H Success optimization, only used for TCP transport.
# C2HSuccess true
[Nvme]
# NVMe Device Whitelist
# Users may specify which NVMe devices to claim by their transport id.

View File

@@ -74,6 +74,7 @@ struct spdk_nvmf_transport_opts {
uint32_t buf_cache_size;
uint32_t max_srq_depth;
bool no_srq;
bool c2h_success;
};
/**

View File

@@ -483,6 +483,7 @@ spdk_nvmf_parse_transport(struct spdk_nvmf_parse_transport_ctx *ctx)
struct spdk_nvmf_transport_opts opts = { 0 };
enum spdk_nvme_transport_type trtype;
struct spdk_nvmf_transport *transport;
bool bval;
int val;
type = spdk_conf_section_get_val(ctx->sp, "Type");
@@ -552,20 +553,31 @@ spdk_nvmf_parse_transport(struct spdk_nvmf_parse_transport_ctx *ctx)
opts.max_srq_depth = val;
} else {
SPDK_ERRLOG("MaxSRQDepth is relevant only for RDMA transport '%s'\n", type);
ctx->cb_fn(-1);
free(ctx);
return;
goto error_out;
}
}
bval = spdk_conf_section_get_boolval(ctx->sp, "C2HSuccess", true);
if (trtype == SPDK_NVME_TRANSPORT_TCP) {
opts.c2h_success = bval;
} else if (spdk_conf_section_get_val(ctx->sp, "C2HSuccess") != NULL) {
/* Only reject C2HSuccess when it was explicitly set for a non-TCP transport. */
SPDK_ERRLOG("C2HSuccess is relevant only for TCP transport '%s'\n", type);
goto error_out;
}
transport = spdk_nvmf_transport_create(trtype, &opts);
if (transport) {
spdk_nvmf_tgt_add_transport(g_spdk_nvmf_tgt, transport, spdk_nvmf_tgt_add_transport_done, ctx);
} else {
ctx->cb_fn(-1);
free(ctx);
return;
goto error_out;
}
return;
error_out:
ctx->cb_fn(-1);
free(ctx);
return;
}
static int

View File

@@ -1458,6 +1458,10 @@ static const struct spdk_json_object_decoder nvmf_rpc_create_transport_decoder[]
"no_srq", offsetof(struct nvmf_rpc_create_transport_ctx, opts.no_srq),
spdk_json_decode_bool, true
},
{
"c2h_success", offsetof(struct nvmf_rpc_create_transport_ctx, opts.c2h_success),
spdk_json_decode_bool, true
},
};
static void
@@ -1594,6 +1598,8 @@ dump_nvmf_transport(struct spdk_json_write_ctx *w, struct spdk_nvmf_transport *t
if (type == SPDK_NVME_TRANSPORT_RDMA) {
spdk_json_write_named_uint32(w, "max_srq_depth", opts->max_srq_depth);
spdk_json_write_named_bool(w, "no_srq", opts->no_srq);
} else if (type == SPDK_NVME_TRANSPORT_TCP) {
spdk_json_write_named_bool(w, "c2h_success", opts->c2h_success);
}
spdk_json_write_object_end(w);
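
With the TCP branch above, the new flag is also reported back when the
transport configuration is dumped. A sketch of checking it through the
existing get_nvmf_transports RPC (reusing the client object from the
earlier sketch; the commented output shape is illustrative):

    # 'client' is the JSONRPCClient from the earlier sketch
    for transport in client.call('get_nvmf_transports'):
        if transport.get('trtype') == 'TCP':
            # with this patch the TCP entry also carries the new flag,
            # e.g. {'trtype': 'TCP', ..., 'c2h_success': True}
            print(transport.get('c2h_success'))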

View File

@@ -55,9 +55,6 @@
#define NVMF_TCP_PDU_MAX_C2H_DATA_SIZE 131072
#define NVMF_TCP_QPAIR_MAX_C2H_PDU_NUM 64 /* Maximum c2h_data pdu number for each tqpair */
/* This is used to support the Linux kernel NVMe-oF initiator */
#define LINUX_KERNEL_SUPPORT_NOT_SENDING_RESP_FOR_C2H 0
/* spdk nvmf related structure */
enum spdk_nvmf_tcp_req_state {
@@ -535,14 +532,15 @@ spdk_nvmf_tcp_create(struct spdk_nvmf_transport_opts *opts)
" Transport opts: max_ioq_depth=%d, max_io_size=%d,\n"
" max_qpairs_per_ctrlr=%d, io_unit_size=%d,\n"
" in_capsule_data_size=%d, max_aq_depth=%d\n"
" num_shared_buffers=%d\n",
" num_shared_buffers=%d, c2h_success=%d\n",
opts->max_queue_depth,
opts->max_io_size,
opts->max_qpairs_per_ctrlr,
opts->io_unit_size,
opts->in_capsule_data_size,
opts->max_aq_depth,
opts->num_shared_buffers);
opts->num_shared_buffers,
opts->c2h_success);
/* I/O unit size cannot be larger than max I/O size */
if (opts->io_unit_size > opts->max_io_size) {
@@ -1460,11 +1458,11 @@ spdk_nvmf_tcp_pdu_c2h_data_complete(void *cb_arg)
assert(tcp_req->c2h_data_pdu_num > 0);
tcp_req->c2h_data_pdu_num--;
if (!tcp_req->c2h_data_pdu_num) {
#if LINUX_KERNEL_SUPPORT_NOT_SENDING_RESP_FOR_C2H
nvmf_tcp_request_free(tcp_req);
#else
spdk_nvmf_tcp_send_capsule_resp_pdu(tcp_req, tqpair);
#endif
if (tqpair->qpair.transport->opts.c2h_success) {
nvmf_tcp_request_free(tcp_req);
} else {
spdk_nvmf_tcp_send_capsule_resp_pdu(tcp_req, tqpair);
}
}
tqpair->c2h_data_pdu_cnt--;
@@ -2233,10 +2231,9 @@ spdk_nvmf_tcp_send_c2h_data(struct spdk_nvmf_tcp_qpair *tqpair,
if (iov_index == (tcp_req->req.iovcnt - 1) && (tcp_req->c2h_data_offset == tcp_req->req.length)) {
SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Last pdu for tcp_req=%p on tqpair=%p\n", tcp_req, tqpair);
c2h_data->common.flags |= SPDK_NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
/* The linux kernel does not support this yet */
#if LINUX_KERNEL_SUPPORT_NOT_SENDING_RESP_FOR_C2H
c2h_data->common.flags |= SPDK_NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
#endif
if (tqpair->qpair.transport->opts.c2h_success) {
c2h_data->common.flags |= SPDK_NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
}
TAILQ_REMOVE(&tqpair->queued_c2h_data_tcp_req, tcp_req, link);
}
@@ -2748,6 +2745,7 @@ spdk_nvmf_tcp_qpair_set_sq_size(struct spdk_nvmf_qpair *qpair)
#define SPDK_NVMF_TCP_DEFAULT_IO_UNIT_SIZE 131072
#define SPDK_NVMF_TCP_DEFAULT_NUM_SHARED_BUFFERS 511
#define SPDK_NVMF_TCP_DEFAULT_BUFFER_CACHE_SIZE 32
#define SPDK_NVMF_TCP_DEFAULT_SUCCESS_OPTIMIZATION true
static void
spdk_nvmf_tcp_opts_init(struct spdk_nvmf_transport_opts *opts)
@@ -2760,6 +2758,7 @@ spdk_nvmf_tcp_opts_init(struct spdk_nvmf_transport_opts *opts)
opts->max_aq_depth = SPDK_NVMF_TCP_DEFAULT_AQ_DEPTH;
opts->num_shared_buffers = SPDK_NVMF_TCP_DEFAULT_NUM_SHARED_BUFFERS;
opts->buf_cache_size = SPDK_NVMF_TCP_DEFAULT_BUFFER_CACHE_SIZE;
opts->c2h_success = SPDK_NVMF_TCP_DEFAULT_SUCCESS_OPTIMIZATION;
}
const struct spdk_nvmf_transport_ops spdk_nvmf_transport_tcp = {

View File

@@ -1410,7 +1410,8 @@ Format: 'user:u1 secret:s1 muser:mu1 msecret:ms1,user:u2 secret:s2 muser:mu2 mse
num_shared_buffers=args.num_shared_buffers,
buf_cache_size=args.buf_cache_size,
max_srq_depth=args.max_srq_depth,
no_srq=args.no_srq)
no_srq=args.no_srq,
c2h_success=args.c2h_success)
p = subparsers.add_parser('nvmf_create_transport', help='Create NVMf transport')
p.add_argument('-t', '--trtype', help='Transport type (ex. RDMA)', type=str, required=True)
@@ -1424,6 +1425,7 @@ Format: 'user:u1 secret:s1 muser:mu1 msecret:ms1,user:u2 secret:s2 muser:mu2 mse
p.add_argument('-b', '--buf-cache-size', help='The number of shared buffers to reserve for each poll group', type=int)
p.add_argument('-s', '--max-srq-depth', help='Max number of outstanding I/O per SRQ. Relevant only for RDMA transport', type=int)
p.add_argument('-r', '--no-srq', action='store_true', help='Disable per-thread shared receive queue. Relevant only for RDMA transport')
p.add_argument('-o', '--c2h-success', help='Enable C2H success optimization. Relevant only for TCP transport', type=bool)
p.set_defaults(func=nvmf_create_transport)
def get_nvmf_transports(args):

View File

@@ -46,7 +46,8 @@ def nvmf_create_transport(client,
num_shared_buffers=None,
buf_cache_size=None,
max_srq_depth=None,
no_srq=False):
no_srq=False,
c2h_success=True):
"""NVMf Transport Create options.
Args:
@@ -61,6 +62,7 @@
buf_cache_size: The number of shared buffers to reserve for each poll group (optional)
max_srq_depth: Max number of outstanding I/O per shared receive queue - RDMA specific (optional)
no_srq: Boolean flag to disable SRQ even for devices that support it - RDMA specific (optional)
c2h_success: Boolean flag to enable/disable the C2H success optimization - TCP specific (optional)
Returns:
True or False
@@ -88,6 +90,8 @@
params['max_srq_depth'] = max_srq_depth
if no_srq:
params['no_srq'] = no_srq
if c2h_success:
params['c2h_success'] = c2h_success
return client.call('nvmf_create_transport', params)
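
A note on the helper above: because the parameter is only forwarded when
it is truthy, the default True reaches the target, while an explicit
c2h_success=False is dropped and the target keeps its own default (also
True after this patch). A minimal usage sketch, assuming the client
object from the earlier example and that SPDK's scripts/ directory is
importable:

    import rpc

    # create the TCP transport via the new helper; with the defaults above
    # this forwards c2h_success=True. To actually disable the optimization,
    # call client.call('nvmf_create_transport', ...) directly as shown earlier.
    rpc.nvmf.nvmf_create_transport(client, trtype='TCP')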