nvme/tcp: Use writev_async for sending data on sockets

Amortize the writev syscall cost by using the writev_async socket API.

This allows the socket layer to batch queued writes into a single
system call and to apply further optimizations, such as MSG_ZEROCOPY
in the posix sock module, when they are available. As part of doing
so, we remove the error return from the socket layer's writev_async
implementation for sockets that don't have a poll group.

Doing so eliminates the transport's explicit send-queue processing
(nvme_tcp_qpair_process_send_queue() is removed).

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Change-Id: I5432ae322afaff7b96c22269fc06b75f9ae60b81
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/475420
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Author:    Or Gerlitz <ogerlitz@mellanox.com>
Date:      2019-11-21 17:21:29 +02:00
Committer: Tomasz Zawadzki
Commit:    8e8a5f7c28 (parent: e61b0904a8)

2 changed files with 21 additions and 74 deletions
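
For context, the pattern the patch moves to looks roughly like the sketch
below: the sender embeds a struct spdk_sock_request in its PDU, lays the
iovec array out directly after it (the layout the socket layer walks via
SPDK_SOCK_REQUEST_IOV()), fills in iovcnt and the completion callback, and
hands the request to spdk_sock_writev_async(). This is a minimal sketch
under those assumptions, not code from the patch; my_pdu, my_pdu_send, and
my_pdu_write_done are hypothetical names.

/* Minimal sketch (hypothetical names): queue an async vectored write.
 * The PDU is assumed to be zero-initialized by its allocator, so the
 * socket layer's internal request fields start out clean. */
#include <assert.h>
#include "spdk/sock.h"

#define MY_MAX_IOVS 32

struct my_pdu {
        void (*cb_fn)(void *cb_arg);
        void *cb_arg;
        struct spdk_sock_request sock_req;      /* iov[] must follow immediately */
        struct iovec iov[MY_MAX_IOVS];
};

static void
my_pdu_write_done(void *cb_arg, int err)
{
        /* Called by the socket layer once the whole request has been
         * written, or with a negative err if the write failed. */
        struct my_pdu *pdu = cb_arg;

        if (err == 0 && pdu->cb_fn != NULL) {
                pdu->cb_fn(pdu->cb_arg);
        }
}

static void
my_pdu_send(struct spdk_sock *sock, struct my_pdu *pdu,
            const struct iovec *iovs, int iovcnt)
{
        int i;

        assert(iovcnt <= MY_MAX_IOVS);
        for (i = 0; i < iovcnt; i++) {
                pdu->iov[i] = iovs[i];  /* copied into the PDU-owned iov[] */
        }
        pdu->sock_req.iovcnt = iovcnt;
        pdu->sock_req.cb_fn = my_pdu_write_done;
        pdu->sock_req.cb_arg = pdu;

        /* Queues the request; the socket layer coalesces queued requests
         * into a single writev()/sendmsg() when it flushes. */
        spdk_sock_writev_async(sock, &pdu->sock_req);
}

The queued bytes actually hit the wire either when enough iovecs pile up
(the IOV_BATCH_SIZE check in the posix hunk below) or when the socket's
owner flushes it explicitly.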

--- a/lib/nvme/nvme_tcp.c
+++ b/lib/nvme/nvme_tcp.c

@@ -290,79 +290,21 @@ nvme_tcp_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
 	return 0;
 }
 
-static int
-nvme_tcp_qpair_process_send_queue(struct nvme_tcp_qpair *tqpair)
-{
-	const int array_size = 32;
-	struct iovec iovs[array_size];
-	int iovcnt = 0;
-	int bytes = 0;
-	uint32_t mapped_length;
-	struct nvme_tcp_pdu *pdu;
-	int pdu_length;
-	TAILQ_HEAD(, nvme_tcp_pdu) completed_pdus_list;
-
-	pdu = TAILQ_FIRST(&tqpair->send_queue);
-	if (pdu == NULL) {
-		return 0;
-	}
-
-	/*
-	 * Build up a list of iovecs for the first few PDUs in the
-	 * tqpair's send_queue.
-	 */
-	while (pdu != NULL && ((array_size - iovcnt) >= (2 + (int)pdu->data_iovcnt))) {
-		iovcnt += nvme_tcp_build_iovs(&iovs[iovcnt], array_size - iovcnt,
-					      pdu, tqpair->host_hdgst_enable,
-					      tqpair->host_ddgst_enable, &mapped_length);
-		pdu = TAILQ_NEXT(pdu, tailq);
-	}
-
-	bytes = spdk_sock_writev(tqpair->sock, iovs, iovcnt);
-	SPDK_DEBUGLOG(SPDK_LOG_NVME, "bytes=%d are out\n", bytes);
-	if (bytes == -1) {
-		if (errno == EWOULDBLOCK || errno == EAGAIN) {
-			return 1;
-		} else {
-			SPDK_ERRLOG("spdk_sock_writev() failed, errno %d: %s\n",
-				    errno, spdk_strerror(errno));
-			return -errno;
-		}
-	}
-
-	pdu = TAILQ_FIRST(&tqpair->send_queue);
-
-	/*
-	 * Free any PDUs that were fully written. If a PDU was only
-	 * partially written, update its writev_offset so that next
-	 * time only the unwritten portion will be sent to writev().
-	 */
-	TAILQ_INIT(&completed_pdus_list);
-	while (bytes > 0) {
-		pdu_length = pdu->hdr->common.plen - pdu->writev_offset;
-		assert(pdu_length > 0);
-		if (bytes >= pdu_length) {
-			bytes -= pdu_length;
-			TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq);
-			TAILQ_INSERT_TAIL(&completed_pdus_list, pdu, tailq);
-			pdu = TAILQ_FIRST(&tqpair->send_queue);
-		} else {
-			pdu->writev_offset += bytes;
-			bytes = 0;
-		}
-	}
-
-	while (!TAILQ_EMPTY(&completed_pdus_list)) {
-		pdu = TAILQ_FIRST(&completed_pdus_list);
-		TAILQ_REMOVE(&completed_pdus_list, pdu, tailq);
-		assert(pdu->cb_fn != NULL);
-		pdu->cb_fn(pdu->cb_arg);
-	}
-
-	return TAILQ_EMPTY(&tqpair->send_queue) ? 0 : 1;
-}
+static void
+_pdu_write_done(void *cb_arg, int err)
+{
+	struct nvme_tcp_pdu *pdu = cb_arg;
+	struct nvme_tcp_qpair *tqpair = pdu->qpair;
+
+	TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq);
+
+	if (err != 0) {
+		nvme_tcp_ctrlr_disconnect_qpair(tqpair->qpair.ctrlr, &tqpair->qpair);
+		return;
+	}
+
+	assert(pdu->cb_fn != NULL);
+	pdu->cb_fn(pdu->cb_arg);
+}
 
 static int
@@ -374,6 +316,7 @@ nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair,
 	int enable_digest;
 	int hlen;
 	uint32_t crc32c;
+	uint32_t mapped_length = 0;
 
 	hlen = pdu->hdr->common.hlen;
 	enable_digest = 1;
@@ -397,7 +340,16 @@ nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair,
 	pdu->cb_fn = cb_fn;
 	pdu->cb_arg = cb_arg;
+
+	pdu->sock_req.iovcnt = nvme_tcp_build_iovs(pdu->iov, NVME_TCP_MAX_SGL_DESCRIPTORS, pdu,
+			       tqpair->host_hdgst_enable, tqpair->host_ddgst_enable,
+			       &mapped_length);
+	pdu->qpair = tqpair;
+	pdu->sock_req.cb_fn = _pdu_write_done;
+	pdu->sock_req.cb_arg = pdu;
 	TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq);
+	spdk_sock_writev_async(tqpair->sock, &pdu->sock_req);
+
 	return 0;
 }
@@ -1437,7 +1389,7 @@ nvme_tcp_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_c
 	uint32_t reaped;
 	int rc;
 
-	rc = nvme_tcp_qpair_process_send_queue(tqpair);
+	rc = spdk_sock_flush(tqpair->sock);
 	if (rc < 0) {
 		return rc;
 	}
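
The one-line change above is what keeps group-less sockets working once
writev_async no longer rejects them: the transport's completion poller now
pushes the queued requests out with a single flush per poll. A minimal
sketch of that polling shape, with poll_once as a hypothetical stand-in
for nvme_tcp_qpair_process_completions():

/* Hypothetical poller: flush queued async writes, then reap completions. */
#include "spdk/sock.h"

static int
poll_once(struct spdk_sock *sock)
{
        int rc;

        rc = spdk_sock_flush(sock);     /* one syscall for all queued requests */
        if (rc < 0) {
                return rc;              /* surface socket errors to the caller */
        }

        /* ...read and process incoming PDUs, invoking completions... */
        return 0;
}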

--- a/module/sock/posix/posix.c
+++ b/module/sock/posix/posix.c

@@ -596,11 +596,6 @@ spdk_posix_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *r
 	spdk_sock_request_queue(sock, req);
 
-	if (sock->group_impl == NULL) {
-		spdk_sock_request_put(sock, req, -ENOTSUP);
-		return;
-	}
-
 	/* If there are a sufficient number queued, just flush them out immediately. */
 	if (sock->queued_iovcnt >= IOV_BATCH_SIZE) {
 		rc = _sock_flush(sock);