From 8e8a5f7c280ce958bd2a8cb5b34071363fbb08fb Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 21 Nov 2019 17:21:29 +0200 Subject: [PATCH] nvme/tcp: Use writev_async for sending data on sockets Amortize the writev syscall cost by using the writev_async socket API. This allows the socket layer to batch writes into one system call and also apply further optimizations such as posix's MSG_ZEROCOPY when they are available. As part of doing so we remove the error return in the socket layer writev_async implementation for sockets that don't have a poll group. Doing so eliminates the send queue processing. Signed-off-by: Or Gerlitz Change-Id: I5432ae322afaff7b96c22269fc06b75f9ae60b81 Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/475420 Reviewed-by: Ben Walker Reviewed-by: Jim Harris Reviewed-by: Shuhei Matsumoto Tested-by: SPDK CI Jenkins --- lib/nvme/nvme_tcp.c | 90 +++++++++------------------------------ module/sock/posix/posix.c | 5 --- 2 files changed, 21 insertions(+), 74 deletions(-) diff --git a/lib/nvme/nvme_tcp.c b/lib/nvme/nvme_tcp.c index 083080bc0..4d4eef000 100644 --- a/lib/nvme/nvme_tcp.c +++ b/lib/nvme/nvme_tcp.c @@ -290,79 +290,21 @@ nvme_tcp_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) return 0; } -static int -nvme_tcp_qpair_process_send_queue(struct nvme_tcp_qpair *tqpair) +static void +_pdu_write_done(void *cb_arg, int err) { - const int array_size = 32; - struct iovec iovs[array_size]; - int iovcnt = 0; - int bytes = 0; - uint32_t mapped_length; - struct nvme_tcp_pdu *pdu; - int pdu_length; - TAILQ_HEAD(, nvme_tcp_pdu) completed_pdus_list; + struct nvme_tcp_pdu *pdu = cb_arg; + struct nvme_tcp_qpair *tqpair = pdu->qpair; - pdu = TAILQ_FIRST(&tqpair->send_queue); + TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq); - if (pdu == NULL) { - return 0; + if (err != 0) { + nvme_tcp_ctrlr_disconnect_qpair(tqpair->qpair.ctrlr, &tqpair->qpair); + return; } - /* - * Build up a list of iovecs for the first few PDUs in the - * tqpair 's send_queue. - */ - while (pdu != NULL && ((array_size - iovcnt) >= (2 + (int)pdu->data_iovcnt))) { - iovcnt += nvme_tcp_build_iovs(&iovs[iovcnt], array_size - iovcnt, - pdu, tqpair->host_hdgst_enable, - tqpair->host_ddgst_enable, &mapped_length); - pdu = TAILQ_NEXT(pdu, tailq); - } - - bytes = spdk_sock_writev(tqpair->sock, iovs, iovcnt); - SPDK_DEBUGLOG(SPDK_LOG_NVME, "bytes=%d are out\n", bytes); - if (bytes == -1) { - if (errno == EWOULDBLOCK || errno == EAGAIN) { - return 1; - } else { - SPDK_ERRLOG("spdk_sock_writev() failed, errno %d: %s\n", - errno, spdk_strerror(errno)); - return -errno; - } - } - - pdu = TAILQ_FIRST(&tqpair->send_queue); - - /* - * Free any PDUs that were fully written. If a PDU was only - * partially written, update its writev_offset so that next - * time only the unwritten portion will be sent to writev(). - */ - TAILQ_INIT(&completed_pdus_list); - while (bytes > 0) { - pdu_length = pdu->hdr->common.plen - pdu->writev_offset; - assert(pdu_length > 0); - if (bytes >= pdu_length) { - bytes -= pdu_length; - TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq); - TAILQ_INSERT_TAIL(&completed_pdus_list, pdu, tailq); - pdu = TAILQ_FIRST(&tqpair->send_queue); - - } else { - pdu->writev_offset += bytes; - bytes = 0; - } - } - - while (!TAILQ_EMPTY(&completed_pdus_list)) { - pdu = TAILQ_FIRST(&completed_pdus_list); - TAILQ_REMOVE(&completed_pdus_list, pdu, tailq); - assert(pdu->cb_fn != NULL); - pdu->cb_fn(pdu->cb_arg); - } - - return TAILQ_EMPTY(&tqpair->send_queue) ? 0 : 1; - + assert(pdu->cb_fn != NULL); + pdu->cb_fn(pdu->cb_arg); } static int @@ -374,6 +316,7 @@ nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair, int enable_digest; int hlen; uint32_t crc32c; + uint32_t mapped_length = 0; hlen = pdu->hdr->common.hlen; enable_digest = 1; @@ -397,7 +340,16 @@ nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair, pdu->cb_fn = cb_fn; pdu->cb_arg = cb_arg; + + pdu->sock_req.iovcnt = nvme_tcp_build_iovs(pdu->iov, NVME_TCP_MAX_SGL_DESCRIPTORS, pdu, + tqpair->host_hdgst_enable, tqpair->host_ddgst_enable, + &mapped_length); + pdu->qpair = tqpair; + pdu->sock_req.cb_fn = _pdu_write_done; + pdu->sock_req.cb_arg = pdu; TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq); + spdk_sock_writev_async(tqpair->sock, &pdu->sock_req); + return 0; } @@ -1437,7 +1389,7 @@ nvme_tcp_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_c uint32_t reaped; int rc; - rc = nvme_tcp_qpair_process_send_queue(tqpair); + rc = spdk_sock_flush(tqpair->sock); if (rc < 0) { return rc; } diff --git a/module/sock/posix/posix.c b/module/sock/posix/posix.c index 5366b48d5..c8846c93c 100644 --- a/module/sock/posix/posix.c +++ b/module/sock/posix/posix.c @@ -596,11 +596,6 @@ spdk_posix_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *r spdk_sock_request_queue(sock, req); - if (sock->group_impl == NULL) { - spdk_sock_request_put(sock, req, -ENOTSUP); - return; - } - /* If there are a sufficient number queued, just flush them out immediately. */ if (sock->queued_iovcnt >= IOV_BATCH_SIZE) { rc = _sock_flush(sock);