nvme/tcp: Use writev_async for sending data on sockets
Amortize the writev() syscall cost by using the writev_async socket API. This lets the socket layer batch writes into a single system call and apply further optimizations, such as MSG_ZEROCOPY in the posix socket module, when they are available.

As part of this change, remove the error return from the socket layer's writev_async implementation for sockets that do not belong to a poll group; this lets the nvme/tcp transport drop its own send-queue processing.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Change-Id: I5432ae322afaff7b96c22269fc06b75f9ae60b81
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/475420
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
commit 8e8a5f7c28
parent e61b0904a8
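For context before the diff: the new send path describes each PDU as iovecs, attaches a completion callback to the PDU's embedded sock request, and hands the request to the socket layer; the callback finishes (or fails) the PDU later. The sketch below condenses that flow using the types touched in this change; the helper name tcp_send_pdu_async is illustrative only, and the real error handling lives in _pdu_write_done in the diff.

/* Condensed sketch of the asynchronous send flow introduced in this commit.
 * "tcp_send_pdu_async" is an illustrative name, not part of the change. */
static void
tcp_send_pdu_async(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu)
{
        uint32_t mapped_length = 0;

        /* Describe the PDU header, optional digests and payload as iovecs. */
        pdu->sock_req.iovcnt = nvme_tcp_build_iovs(pdu->iov, NVME_TCP_MAX_SGL_DESCRIPTORS, pdu,
                                                   tqpair->host_hdgst_enable,
                                                   tqpair->host_ddgst_enable, &mapped_length);
        pdu->qpair = tqpair;
        pdu->sock_req.cb_fn = _pdu_write_done;  /* runs once the data is written or the write fails */
        pdu->sock_req.cb_arg = pdu;
        TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq);

        /* Queue the request; the sock layer batches queued requests into one
         * writev() when polled or when spdk_sock_flush() is called. */
        spdk_sock_writev_async(tqpair->sock, &pdu->sock_req);
}

The PDU stays on tqpair->send_queue until _pdu_write_done() removes it, so the transport keeps ownership of the iovecs until the write completes.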
@@ -290,81 +290,23 @@ nvme_tcp_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
 	return 0;
 }
 
-static int
-nvme_tcp_qpair_process_send_queue(struct nvme_tcp_qpair *tqpair)
-{
-	const int array_size = 32;
-	struct iovec iovs[array_size];
-	int iovcnt = 0;
-	int bytes = 0;
-	uint32_t mapped_length;
-	struct nvme_tcp_pdu *pdu;
-	int pdu_length;
-	TAILQ_HEAD(, nvme_tcp_pdu) completed_pdus_list;
-
-	pdu = TAILQ_FIRST(&tqpair->send_queue);
-
-	if (pdu == NULL) {
-		return 0;
-	}
-
-	/*
-	 * Build up a list of iovecs for the first few PDUs in the
-	 * tqpair's send_queue.
-	 */
-	while (pdu != NULL && ((array_size - iovcnt) >= (2 + (int)pdu->data_iovcnt))) {
-		iovcnt += nvme_tcp_build_iovs(&iovs[iovcnt], array_size - iovcnt,
-					      pdu, tqpair->host_hdgst_enable,
-					      tqpair->host_ddgst_enable, &mapped_length);
-		pdu = TAILQ_NEXT(pdu, tailq);
-	}
-
-	bytes = spdk_sock_writev(tqpair->sock, iovs, iovcnt);
-	SPDK_DEBUGLOG(SPDK_LOG_NVME, "bytes=%d are out\n", bytes);
-	if (bytes == -1) {
-		if (errno == EWOULDBLOCK || errno == EAGAIN) {
-			return 1;
-		} else {
-			SPDK_ERRLOG("spdk_sock_writev() failed, errno %d: %s\n",
-				    errno, spdk_strerror(errno));
-			return -errno;
-		}
-	}
-
-	pdu = TAILQ_FIRST(&tqpair->send_queue);
-
-	/*
-	 * Free any PDUs that were fully written. If a PDU was only
-	 * partially written, update its writev_offset so that next
-	 * time only the unwritten portion will be sent to writev().
-	 */
-	TAILQ_INIT(&completed_pdus_list);
-	while (bytes > 0) {
-		pdu_length = pdu->hdr->common.plen - pdu->writev_offset;
-		assert(pdu_length > 0);
-		if (bytes >= pdu_length) {
-			bytes -= pdu_length;
-			TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq);
-			TAILQ_INSERT_TAIL(&completed_pdus_list, pdu, tailq);
-			pdu = TAILQ_FIRST(&tqpair->send_queue);
-
-		} else {
-			pdu->writev_offset += bytes;
-			bytes = 0;
-		}
-	}
-
-	while (!TAILQ_EMPTY(&completed_pdus_list)) {
-		pdu = TAILQ_FIRST(&completed_pdus_list);
-		TAILQ_REMOVE(&completed_pdus_list, pdu, tailq);
-		assert(pdu->cb_fn != NULL);
-		pdu->cb_fn(pdu->cb_arg);
-	}
-
-	return TAILQ_EMPTY(&tqpair->send_queue) ? 0 : 1;
-
-}
+static void
+_pdu_write_done(void *cb_arg, int err)
+{
+	struct nvme_tcp_pdu *pdu = cb_arg;
+	struct nvme_tcp_qpair *tqpair = pdu->qpair;
+
+	TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq);
+
+	if (err != 0) {
+		nvme_tcp_ctrlr_disconnect_qpair(tqpair->qpair.ctrlr, &tqpair->qpair);
+		return;
+	}
+
+	assert(pdu->cb_fn != NULL);
+	pdu->cb_fn(pdu->cb_arg);
+}
 
 static int
 nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair,
 			 struct nvme_tcp_pdu *pdu,
@@ -374,6 +316,7 @@ nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair,
 	int enable_digest;
 	int hlen;
 	uint32_t crc32c;
+	uint32_t mapped_length = 0;
 
 	hlen = pdu->hdr->common.hlen;
 	enable_digest = 1;
@@ -397,7 +340,16 @@ nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair,
 
 	pdu->cb_fn = cb_fn;
 	pdu->cb_arg = cb_arg;
 
+	pdu->sock_req.iovcnt = nvme_tcp_build_iovs(pdu->iov, NVME_TCP_MAX_SGL_DESCRIPTORS, pdu,
+						   tqpair->host_hdgst_enable, tqpair->host_ddgst_enable,
+						   &mapped_length);
+	pdu->qpair = tqpair;
+	pdu->sock_req.cb_fn = _pdu_write_done;
+	pdu->sock_req.cb_arg = pdu;
 	TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq);
+	spdk_sock_writev_async(tqpair->sock, &pdu->sock_req);
 
 	return 0;
 }
 
@@ -1437,7 +1389,7 @@ nvme_tcp_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_c
 	uint32_t reaped;
 	int rc;
 
-	rc = nvme_tcp_qpair_process_send_queue(tqpair);
+	rc = spdk_sock_flush(tqpair->sock);
 	if (rc < 0) {
 		return rc;
 	}
@@ -596,11 +596,6 @@ spdk_posix_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *r
 
 	spdk_sock_request_queue(sock, req);
 
-	if (sock->group_impl == NULL) {
-		spdk_sock_request_put(sock, req, -ENOTSUP);
-		return;
-	}
-
 	/* If there are a sufficient number queued, just flush them out immediately. */
 	if (sock->queued_iovcnt >= IOV_BATCH_SIZE) {
 		rc = _sock_flush(sock);
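With the group_impl check removed, a socket that is not part of a poll group simply keeps its writev_async requests queued until the caller drains them; the nvme/tcp hunk above does exactly that from its completion-processing path via spdk_sock_flush(). A minimal sketch of that caller-side responsibility follows; the function name tcp_qpair_flush_sends is illustrative only.

/* Illustrative poll step for a qpair whose socket is not in a poll group:
 * requests queued by spdk_sock_writev_async() are only sent out on flush. */
static int
tcp_qpair_flush_sends(struct nvme_tcp_qpair *tqpair)
{
        int rc;

        rc = spdk_sock_flush(tqpair->sock);
        if (rc < 0) {
                return rc;      /* socket error; the caller tears down the qpair */
        }

        return 0;
}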