From ffd751fc0bdf744c96070f2cea46976fda896a0c Mon Sep 17 00:00:00 2001
From: Konrad Sztyber
Date: Thu, 9 Dec 2021 16:43:54 +0100
Subject: [PATCH] nvmf/tcp: zero-copy support

This patch adds support for using zero-copy operations to execute IO
requests in the TCP transport. Of course, they're only used if the
underlying bdev supports them. Additionally, only requests with no
in-capsule data can be executed using this mechanism.

Added several new states to accommodate the differences in the way
zero-copy requests are handled. Also, these flows vary depending on the
type of request (read or write). This stems from zero-copy semantics:
to perform a write, we need to wait for zcopy_end completion, while for
reads, zcopy_end can only be submitted once we have sent all of the
requested data to the host.

Signed-off-by: Konrad Sztyber
Change-Id: Ie02b494c24bc1acc98557cb4b02e867abf9064e4
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/10796
Reviewed-by: Jim Harris
Reviewed-by: Aleksey Marchuk
Reviewed-by: Ben Walker
Tested-by: SPDK CI Jenkins
Community-CI: Broadcom CI
---
 lib/nvmf/tcp.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 106 insertions(+), 5 deletions(-)

diff --git a/lib/nvmf/tcp.c b/lib/nvmf/tcp.c
index d2ea4cd4f..e9e8b9368 100644
--- a/lib/nvmf/tcp.c
+++ b/lib/nvmf/tcp.c
@@ -71,6 +71,12 @@ enum spdk_nvmf_tcp_req_state {
 	/* The request is queued until a data buffer is available. */
 	TCP_REQUEST_STATE_NEED_BUFFER,
 
+	/* The request is waiting for zcopy_start to finish */
+	TCP_REQUEST_STATE_AWAITING_ZCOPY_START,
+
+	/* The request has received a zero-copy buffer */
+	TCP_REQUEST_STATE_ZCOPY_START_COMPLETED,
+
 	/* The request is currently transferring data from the host to the controller. */
 	TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
 
@@ -83,6 +89,9 @@ enum spdk_nvmf_tcp_req_state {
 	/* The request is currently executing at the block device */
 	TCP_REQUEST_STATE_EXECUTING,
 
+	/* The request is waiting for zcopy buffers to be committed */
+	TCP_REQUEST_STATE_AWAITING_ZCOPY_COMMIT,
+
 	/* The request finished executing at the block device */
 	TCP_REQUEST_STATE_EXECUTED,
 
@@ -92,6 +101,9 @@ enum spdk_nvmf_tcp_req_state {
 	/* The request is currently transferring final pdus from the controller to the host. */
 	TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,
 
+	/* The request is waiting for zcopy buffers to be released (without committing) */
+	TCP_REQUEST_STATE_AWAITING_ZCOPY_RELEASE,
+
 	/* The request completed and can be marked free. */
 	TCP_REQUEST_STATE_COMPLETED,
 
@@ -1426,8 +1438,14 @@ nvmf_tcp_capsule_cmd_hdr_handle(struct spdk_nvmf_tcp_transport *ttransport,
 
 	tcp_req = nvmf_tcp_req_get(tqpair);
 	if (!tcp_req) {
-		/* Directly return and make the allocation retry again */
-		if (tqpair->state_cntr[TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST] > 0) {
+		/* Directly return and make the allocation retry again. This can happen if we're
+		 * using asynchronous writes to send the response to the host or when releasing
+		 * zero-copy buffers after a response has been sent. In both cases, the host might
+		 * receive the response before we've finished processing the request and is free to
+		 * send another one.
+		 */
+		if (tqpair->state_cntr[TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST] > 0 ||
+		    tqpair->state_cntr[TCP_REQUEST_STATE_AWAITING_ZCOPY_RELEASE] > 0) {
 			return;
 		}
 
@@ -1457,6 +1475,9 @@ nvmf_tcp_capsule_cmd_payload_handle(struct spdk_nvmf_tcp_transport *ttransport,
 		tcp_req = pdu->req;
 		assert(tcp_req != NULL);
 
+		/* Zero-copy requests don't support ICD */
+		assert(!spdk_nvmf_request_using_zcopy(&tcp_req->req));
+
 		if (capsule_cmd->common.pdo > SPDK_NVME_TCP_PDU_PDO_MAX_OFFSET) {
 			SPDK_ERRLOG("Expected ICReq capsule_cmd pdu offset <= %d, got %c\n",
 				    SPDK_NVME_TCP_PDU_PDO_MAX_OFFSET, capsule_cmd->common.pdo);
@@ -2224,6 +2245,12 @@ nvmf_tcp_req_parse_sgl(struct spdk_nvmf_tcp_req *tcp_req,
 			req->dif.elba_length = length;
 		}
 
+		if (nvmf_ctrlr_use_zcopy(req)) {
+			SPDK_DEBUGLOG(nvmf_tcp, "Using zero-copy to execute request %p\n", tcp_req);
+			req->data_from_pool = false;
+			return 0;
+		}
+
 		if (spdk_nvmf_request_get_buffers(req, group, transport, length)) {
 			/* No available buffers. Queue this request up. */
 			SPDK_DEBUGLOG(nvmf_tcp, "No available large data buffers. Queueing request %p\n",
@@ -2608,6 +2635,19 @@ nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport,
 			break;
 		}
 
+		/* Get a zcopy buffer if the request can be serviced through zcopy */
+		if (spdk_nvmf_request_using_zcopy(&tcp_req->req)) {
+			if (spdk_unlikely(tcp_req->req.dif_enabled)) {
+				assert(tcp_req->req.dif.elba_length >= tcp_req->req.length);
+				tcp_req->req.length = tcp_req->req.dif.elba_length;
+			}
+
+			STAILQ_REMOVE(&group->pending_buf_queue, &tcp_req->req, spdk_nvmf_request, buf_link);
+			nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_AWAITING_ZCOPY_START);
+			spdk_nvmf_request_zcopy_start(&tcp_req->req);
+			break;
+		}
+
 		if (!tcp_req->req.data) {
 			SPDK_DEBUGLOG(nvmf_tcp, "No buffer allocated for tcp_req(%p) on tqpair(%p\n)",
 				      tcp_req, tqpair);
@@ -2640,6 +2680,24 @@ nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport,
 
 		nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE);
 		break;
+	case TCP_REQUEST_STATE_AWAITING_ZCOPY_START:
+		/* Some external code must kick a request into TCP_REQUEST_STATE_ZCOPY_START_COMPLETED
+		 * to escape this state. */
+		break;
+	case TCP_REQUEST_STATE_ZCOPY_START_COMPLETED:
+		if (spdk_unlikely(spdk_nvme_cpl_is_error(&tcp_req->req.rsp->nvme_cpl))) {
+			SPDK_DEBUGLOG(nvmf_tcp, "Zero-copy start failed for tcp_req(%p) on tqpair=%p\n",
+				      tcp_req, tqpair);
+			nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_COMPLETE);
+			break;
+		}
+
+		if (tcp_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
+			SPDK_DEBUGLOG(nvmf_tcp, "Sending R2T for tcp_req(%p) on tqpair=%p\n", tcp_req, tqpair);
+			nvmf_tcp_send_r2t_pdu(tqpair, tcp_req);
+		} else {
+			nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_EXECUTED);
+		}
+		break;
 	case TCP_REQUEST_STATE_AWAITING_R2T_ACK:
 		spdk_trace_record(TRACE_TCP_REQUEST_STATE_AWAIT_R2T_ACK, 0, 0, (uintptr_t)tcp_req, tqpair);
 
@@ -2661,14 +2719,26 @@ nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport,
 			tcp_req->req.length = tcp_req->req.dif.elba_length;
 		}
 
-		nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_EXECUTING);
-		spdk_nvmf_request_exec(&tcp_req->req);
+		if (!spdk_nvmf_request_using_zcopy(&tcp_req->req)) {
+			nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_EXECUTING);
+			spdk_nvmf_request_exec(&tcp_req->req);
+		} else {
+			/* For zero-copy, only requests with data coming from host to the
+			 * controller can end up here. */
+			assert(tcp_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
+			nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_AWAITING_ZCOPY_COMMIT);
+			spdk_nvmf_request_zcopy_end(&tcp_req->req, true);
+		}
 		break;
 	case TCP_REQUEST_STATE_EXECUTING:
 		spdk_trace_record(TRACE_TCP_REQUEST_STATE_EXECUTING, 0, 0, (uintptr_t)tcp_req, tqpair);
 		/* Some external code must kick a request into TCP_REQUEST_STATE_EXECUTED
 		 * to escape this state. */
 		break;
+	case TCP_REQUEST_STATE_AWAITING_ZCOPY_COMMIT:
+		/* Some external code must kick a request into TCP_REQUEST_STATE_EXECUTED
+		 * to escape this state. */
+		break;
 	case TCP_REQUEST_STATE_EXECUTED:
 		spdk_trace_record(TRACE_TCP_REQUEST_STATE_EXECUTED, 0, 0, (uintptr_t)tcp_req, tqpair);
 
@@ -2690,6 +2760,10 @@ nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport,
 		/* Some external code must kick a request into TCP_REQUEST_STATE_COMPLETED
 		 * to escape this state. */
 		break;
+	case TCP_REQUEST_STATE_AWAITING_ZCOPY_RELEASE:
+		/* Some external code must kick a request into TCP_REQUEST_STATE_COMPLETED
+		 * to escape this state. */
+		break;
 	case TCP_REQUEST_STATE_COMPLETED:
 		spdk_trace_record(TRACE_TCP_REQUEST_STATE_COMPLETED, 0, 0, (uintptr_t)tcp_req, tqpair);
 		if (tcp_req->req.data_from_pool) {
@@ -2701,6 +2775,15 @@ nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport,
 			assert(tgroup->control_msg_list);
 			SPDK_DEBUGLOG(nvmf_tcp, "Put buf to control msg list\n");
 			nvmf_tcp_control_msg_put(tgroup->control_msg_list, tcp_req->req.data);
+		} else if (tcp_req->req.zcopy_bdev_io != NULL) {
+			/* If the request has an unreleased zcopy bdev_io, it's either a
+			 * read or a failed write */
+			assert(spdk_nvmf_request_using_zcopy(&tcp_req->req));
+			assert(tcp_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST ||
+			       spdk_nvme_cpl_is_error(&tcp_req->req.rsp->nvme_cpl));
+			nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_AWAITING_ZCOPY_RELEASE);
+			spdk_nvmf_request_zcopy_end(&tcp_req->req, false);
+			break;
 		}
 		tcp_req->req.length = 0;
 		tcp_req->req.iovcnt = 0;
@@ -2821,7 +2904,22 @@ nvmf_tcp_req_complete(struct spdk_nvmf_request *req)
 	ttransport = SPDK_CONTAINEROF(req->qpair->transport, struct spdk_nvmf_tcp_transport, transport);
 	tcp_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_tcp_req, req);
 
-	nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_EXECUTED);
+	switch (tcp_req->state) {
+	case TCP_REQUEST_STATE_EXECUTING:
+	case TCP_REQUEST_STATE_AWAITING_ZCOPY_COMMIT:
+		nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_EXECUTED);
+		break;
+	case TCP_REQUEST_STATE_AWAITING_ZCOPY_START:
+		nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_ZCOPY_START_COMPLETED);
+		break;
+	case TCP_REQUEST_STATE_AWAITING_ZCOPY_RELEASE:
+		nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_COMPLETED);
+		break;
+	default:
+		assert(0 && "Unexpected request state");
+		break;
+	}
+
 	nvmf_tcp_req_process(ttransport, tcp_req);
 
 	return 0;
@@ -2960,6 +3058,8 @@ _nvmf_tcp_qpair_abort_request(void *ctx)
 
 	switch (tcp_req_to_abort->state) {
 	case TCP_REQUEST_STATE_EXECUTING:
+	case TCP_REQUEST_STATE_AWAITING_ZCOPY_START:
+	case TCP_REQUEST_STATE_AWAITING_ZCOPY_COMMIT:
		rc = nvmf_ctrlr_abort_request(req);
 		if (rc == SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS) {
 			return SPDK_POLLER_BUSY;
@@ -2975,6 +3075,7 @@ _nvmf_tcp_qpair_abort_request(void *ctx)
 		break;
 
 	case TCP_REQUEST_STATE_AWAITING_R2T_ACK:
+	case TCP_REQUEST_STATE_ZCOPY_START_COMPLETED:
 		nvmf_tcp_req_set_abort_status(req, tcp_req_to_abort);
 		break;
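
For readers following the new flows, below is a small, self-contained model of the
request state machine the diff above builds, showing the read/write asymmetry end to
end. It is only a sketch: the state names mirror the TCP_REQUEST_STATE_* values from
the patch, but next_state(), walk() and main() are invented for illustration, error
paths are collapsed into the happy path, and intermediate states such as
TCP_REQUEST_STATE_NEW, TCP_REQUEST_STATE_READY_TO_COMPLETE and
TCP_REQUEST_STATE_AWAITING_R2T_ACK are omitted. It compiles as a plain C program and
prints both paths.

#include <stdbool.h>
#include <stdio.h>

/* Simplified subset of enum spdk_nvmf_tcp_req_state from the patch */
enum tcp_req_state {
	STATE_NEED_BUFFER,
	STATE_AWAITING_ZCOPY_START,
	STATE_ZCOPY_START_COMPLETED,
	STATE_TRANSFERRING_HOST_TO_CONTROLLER,
	STATE_READY_TO_EXECUTE,
	STATE_AWAITING_ZCOPY_COMMIT,
	STATE_EXECUTED,
	STATE_TRANSFERRING_CONTROLLER_TO_HOST,
	STATE_AWAITING_ZCOPY_RELEASE,
	STATE_COMPLETED,
};

static const char *
state_name(enum tcp_req_state state)
{
	switch (state) {
	case STATE_NEED_BUFFER:			return "NEED_BUFFER";
	case STATE_AWAITING_ZCOPY_START:	return "AWAITING_ZCOPY_START";
	case STATE_ZCOPY_START_COMPLETED:	return "ZCOPY_START_COMPLETED";
	case STATE_TRANSFERRING_HOST_TO_CONTROLLER: return "TRANSFERRING_HOST_TO_CONTROLLER";
	case STATE_READY_TO_EXECUTE:		return "READY_TO_EXECUTE";
	case STATE_AWAITING_ZCOPY_COMMIT:	return "AWAITING_ZCOPY_COMMIT";
	case STATE_EXECUTED:			return "EXECUTED";
	case STATE_TRANSFERRING_CONTROLLER_TO_HOST: return "TRANSFERRING_CONTROLLER_TO_HOST";
	case STATE_AWAITING_ZCOPY_RELEASE:	return "AWAITING_ZCOPY_RELEASE";
	case STATE_COMPLETED:			return "COMPLETED";
	}

	return "?";
}

/* Happy-path transition function for a zero-copy request; is_write selects the
 * host-to-controller (write) or controller-to-host (read) flow. */
static enum tcp_req_state
next_state(enum tcp_req_state state, bool is_write)
{
	switch (state) {
	case STATE_NEED_BUFFER:
		/* zcopy_start replaces allocating a buffer from the transport's pool */
		return STATE_AWAITING_ZCOPY_START;
	case STATE_AWAITING_ZCOPY_START:
		/* nvmf_tcp_req_complete() moves the request here once zcopy_start finishes */
		return STATE_ZCOPY_START_COMPLETED;
	case STATE_ZCOPY_START_COMPLETED:
		/* A write still needs its data, so an R2T is sent; a read's zero-copy
		 * buffers already hold valid data, so it counts as executed. */
		return is_write ? STATE_TRANSFERRING_HOST_TO_CONTROLLER : STATE_EXECUTED;
	case STATE_TRANSFERRING_HOST_TO_CONTROLLER:
		return STATE_READY_TO_EXECUTE;
	case STATE_READY_TO_EXECUTE:
		/* zcopy_end(commit=true) takes the place of spdk_nvmf_request_exec() */
		return STATE_AWAITING_ZCOPY_COMMIT;
	case STATE_AWAITING_ZCOPY_COMMIT:
		return STATE_EXECUTED;
	case STATE_EXECUTED:
		return STATE_TRANSFERRING_CONTROLLER_TO_HOST;
	case STATE_TRANSFERRING_CONTROLLER_TO_HOST:
		/* After the data has been sent, a read still owns its zcopy bdev_io and
		 * must release it (zcopy_end with commit=false); a committed write has
		 * already given its buffers back. */
		return is_write ? STATE_COMPLETED : STATE_AWAITING_ZCOPY_RELEASE;
	case STATE_AWAITING_ZCOPY_RELEASE:
	default:
		return STATE_COMPLETED;
	}
}

static void
walk(const char *label, bool is_write)
{
	enum tcp_req_state state = STATE_NEED_BUFFER;

	printf("%s: %s", label, state_name(state));
	while (state != STATE_COMPLETED) {
		state = next_state(state, is_write);
		printf(" -> %s", state_name(state));
	}
	printf("\n");
}

int
main(void)
{
	walk("zcopy write", true);
	walk("zcopy read ", false);
	return 0;
}

Note the asymmetry the commit message describes: a write commits its buffers
(zcopy_end with commit=true) as its execution step, before the response goes out,
while a read keeps its zcopy bdev_io until all of the data has been sent to the host
and only then releases it, which is also why a new TCP request may need to be retried
while another one is still in AWAITING_ZCOPY_RELEASE.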