nvmf/tcp: Use a big buffer for PDU receiving.

Purpose: Reduce the number of recv/readv system calls.
Method: Use a big receive buffer to perform the reads.
Though this introduces an additional buffer copy,
we expect the overhead of that copy to be
smaller than the overhead of frequent recv/readv system calls.
The design is a trade-off between the two.

Signed-off-by: Ziye Yang <ziye.yang@intel.com>
Change-Id: I9286fd9cec0b512cea8e3f2c335c5bf862b98573
Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/464842
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: Broadcom SPDK FC-NVMe CI <spdk-ci.pdl@broadcom.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
This commit is contained in:
Ziye Yang 2019-08-12 19:44:22 +08:00 committed by Jim Harris
parent ea5ad0b286
commit d50736776c

View File

@ -55,6 +55,7 @@
#define NVMF_TCP_PDU_MAX_C2H_DATA_SIZE 131072
#define NVMF_TCP_QPAIR_MAX_C2H_PDU_NUM 64 /* Maximum number of c2h_data PDUs for each tqpair */
#define SPDK_NVMF_TCP_DEFAULT_MAX_SOCK_PRIORITY 6
/* Multiplier applied when sizing each tqpair's PDU receive buffer. */
#define SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR 4
/* spdk nvmf related structure */
enum spdk_nvmf_tcp_req_state {
@ -198,6 +199,13 @@ struct spdk_nvmf_tcp_req {
TAILQ_ENTRY(spdk_nvmf_tcp_req) state_link;
};
/*
 * Staging buffer for bytes received from a socket: it is filled by one read
 * and then copied out piecewise to the caller's destination.
 */
struct nvme_tcp_pdu_recv_buf {
/* Backing storage for the received bytes. */
char *buf;
/* Offset into buf of the next byte that has not been consumed yet. */
uint32_t off;
/* Number of bytes requested from the socket on each refill. */
uint32_t size;
/* Number of received bytes in buf that have not been consumed yet. */
uint32_t remain_size;
};
struct spdk_nvmf_tcp_qpair {
struct spdk_nvmf_qpair qpair;
struct spdk_nvmf_tcp_poll_group *group;
@ -209,6 +217,7 @@ struct spdk_nvmf_tcp_qpair {
enum nvme_tcp_qpair_state state;
struct nvme_tcp_pdu pdu_in_progress;
struct nvme_tcp_pdu_recv_buf pdu_recv_buf;
TAILQ_HEAD(, nvme_tcp_pdu) send_queue;
TAILQ_HEAD(, nvme_tcp_pdu) free_queue;
@ -500,6 +509,7 @@ spdk_nvmf_tcp_qpair_destroy(struct spdk_nvmf_tcp_qpair *tqpair)
free(tqpair->reqs);
spdk_free(tqpair->buf);
spdk_free(tqpair->bufs);
free(tqpair->pdu_recv_buf.buf);
free(tqpair);
SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Leave\n");
}
@ -982,6 +992,14 @@ spdk_nvmf_tcp_qpair_init_mem_resource(struct spdk_nvmf_tcp_qpair *tqpair, uint16
TAILQ_INSERT_TAIL(&tqpair->free_queue, &tqpair->pdu[i], tailq);
}
tqpair->pdu_recv_buf.size = (in_capsule_data_size + sizeof(struct spdk_nvme_tcp_cmd) + 2 *
SPDK_NVME_TCP_DIGEST_LEN) * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR;
tqpair->pdu_recv_buf.buf = calloc(1, tqpair->pdu_recv_buf.size);
if (!tqpair->pdu_recv_buf.buf) {
SPDK_ERRLOG("Unable to allocate the pdu recv buf on tqpair=%p with size=%d\n", tqpair,
tqpair->pdu_recv_buf.size);
return -1;
}
} else {
tqpair->reqs = calloc(size, sizeof(*tqpair->reqs));
if (!tqpair->reqs) {
@ -1707,7 +1725,13 @@ spdk_nvmf_tcp_icreq_handle(struct spdk_nvmf_tcp_transport *ttransport,
SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "maxr2t =%u\n", (ic_req->maxr2t + 1u));
tqpair->host_hdgst_enable = ic_req->dgst.bits.hdgst_enable ? true : false;
if (!tqpair->host_hdgst_enable) {
tqpair->pdu_recv_buf.size -= SPDK_NVME_TCP_DIGEST_LEN * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR;
}
tqpair->host_ddgst_enable = ic_req->dgst.bits.ddgst_enable ? true : false;
if (!tqpair->host_ddgst_enable) {
tqpair->pdu_recv_buf.size -= SPDK_NVME_TCP_DIGEST_LEN * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR;
}
tqpair->cpda = spdk_min(ic_req->hpda, SPDK_NVME_TCP_CPDA_MAX);
SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "cpda of tqpair=(%p) is : %u\n", tqpair, tqpair->cpda);
@ -1902,7 +1926,65 @@ nvmf_tcp_pdu_payload_insert_dif(struct nvme_tcp_pdu *pdu, uint32_t read_offset,
return rc;
}
#define MAX_NVME_TCP_PDU_LOOP_COUNT 32
/*
 * Refill a drained PDU receive buffer with a single read from the socket.
 *
 * The buffer must be empty on entry (off == 0 and remain_size == 0).
 *
 * Returns the value produced by nvme_tcp_read_data(): a positive value is
 * recorded as the number of buffered bytes, zero leaves the buffer untouched,
 * and a negative value is handed back to the caller after a debug log.
 */
static int
nvme_tcp_recv_buf_read(struct spdk_sock *sock, struct nvme_tcp_pdu_recv_buf *pdu_recv_buf)
{
	int nbytes;

	assert(pdu_recv_buf->off == 0);
	assert(pdu_recv_buf->remain_size == 0);

	nbytes = nvme_tcp_read_data(sock, pdu_recv_buf->size, pdu_recv_buf->buf);
	if (nbytes > 0) {
		/* Everything just read is still waiting to be consumed. */
		pdu_recv_buf->remain_size = nbytes;
		spdk_trace_record(TRACE_TCP_READ_FROM_SOCKET_DONE, 0, nbytes, 0, 0);
		return nbytes;
	}

	if (nbytes < 0) {
		SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "will disconnect sock=%p\n", sock);
	}

	return nbytes;
}
/*
 * Copy buffered bytes out of the PDU receive buffer.
 *
 * pdu_recv_buf: buffer to consume from; must hold at least one unread byte.
 * expected_size: maximum number of bytes the caller wants.
 * dst: destination that receives the copied bytes.
 *
 * Returns the number of bytes actually copied, which is the smaller of
 * expected_size and the number of unread bytes. When the last unread byte is
 * consumed the read offset is rewound to the start of the buffer.
 */
static uint32_t
nvme_tcp_read_data_from_pdu_recv_buf(struct nvme_tcp_pdu_recv_buf *pdu_recv_buf,
				     uint32_t expected_size,
				     char *dst)
{
	uint32_t size;

	assert(pdu_recv_buf->remain_size > 0);
	size = spdk_min(expected_size, pdu_recv_buf->remain_size);

	/*
	 * buf is already a char *, so ordinary pointer arithmetic yields the byte
	 * offset; casting to void * first would rely on a compiler extension.
	 */
	memcpy(dst, pdu_recv_buf->buf + pdu_recv_buf->off, size);
	pdu_recv_buf->off += size;
	pdu_recv_buf->remain_size -= size;

	/* Buffer fully drained: rewind the read offset to the start. */
	if (spdk_unlikely(!pdu_recv_buf->remain_size)) {
		pdu_recv_buf->off = 0;
	}

	return size;
}
/*
 * Fill a PDU's payload iovecs from bytes already held in the receive buffer.
 *
 * The iovec list is built with nvme_tcp_build_payload_iovs() and each entry
 * is filled in order until either every entry has been visited or the
 * receive buffer has no unread bytes left.
 *
 * pdu_recv_buf must hold at least one unread byte on entry.
 * Returns the total number of bytes copied into the iovecs.
 */
static int
nvme_tcp_read_payload_data_from_pdu_recv_buf(struct nvme_tcp_pdu_recv_buf *pdu_recv_buf,
		struct nvme_tcp_pdu *pdu)
{
	struct iovec iov[NVME_TCP_MAX_SGL_DESCRIPTORS + 1];
	uint32_t copied = 0;
	int num_iovs;
	int idx;

	assert(pdu_recv_buf->remain_size > 0);

	num_iovs = nvme_tcp_build_payload_iovs(iov, NVME_TCP_MAX_SGL_DESCRIPTORS + 1, pdu,
					       pdu->ddgst_enable, NULL);
	assert(num_iovs >= 0);

	/* Stop early once the receive buffer has been drained. */
	for (idx = 0; idx < num_iovs && pdu_recv_buf->remain_size != 0; idx++) {
		copied += nvme_tcp_read_data_from_pdu_recv_buf(pdu_recv_buf, iov[idx].iov_len,
				iov[idx].iov_base);
	}

	return copied;
}
static int
spdk_nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
@ -1910,7 +1992,7 @@ spdk_nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
int rc = 0;
struct nvme_tcp_pdu *pdu;
enum nvme_tcp_pdu_recv_state prev_state;
uint32_t data_len, current_pdu_num = 0;
uint32_t data_len;
/* The loop here is to allow for several back-to-back state changes. */
do {
@ -1922,19 +2004,19 @@ spdk_nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
/* Wait for the common header */
case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY:
case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH:
rc = nvme_tcp_read_data(tqpair->sock,
sizeof(struct spdk_nvme_tcp_common_pdu_hdr) - pdu->ch_valid_bytes,
(void *)&pdu->hdr->common + pdu->ch_valid_bytes);
if (rc < 0) {
SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "will disconnect tqpair=%p\n", tqpair);
return NVME_TCP_PDU_FATAL;
} else if (rc > 0) {
pdu->ch_valid_bytes += rc;
spdk_trace_record(TRACE_TCP_READ_FROM_SOCKET_DONE, 0, rc, 0, 0);
if (spdk_likely(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY)) {
spdk_nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH);
if (!tqpair->pdu_recv_buf.remain_size) {
rc = nvme_tcp_recv_buf_read(tqpair->sock, &tqpair->pdu_recv_buf);
if (rc <= 0) {
return rc;
}
}
rc = nvme_tcp_read_data_from_pdu_recv_buf(&tqpair->pdu_recv_buf,
sizeof(struct spdk_nvme_tcp_common_pdu_hdr) - pdu->ch_valid_bytes,
(void *)&pdu->hdr->common + pdu->ch_valid_bytes);
pdu->ch_valid_bytes += rc;
if (spdk_likely(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY)) {
spdk_nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH);
}
if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) {
return NVME_TCP_PDU_IN_PROGRESS;
@ -1945,25 +2027,23 @@ spdk_nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
break;
/* Wait for the pdu specific header */
case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH:
rc = nvme_tcp_read_data(tqpair->sock,
pdu->psh_len - pdu->psh_valid_bytes,
(void *)&pdu->hdr->raw + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) + pdu->psh_valid_bytes);
if (rc < 0) {
return NVME_TCP_PDU_FATAL;
} else if (rc > 0) {
spdk_trace_record(TRACE_TCP_READ_FROM_SOCKET_DONE,
0, rc, 0, 0);
pdu->psh_valid_bytes += rc;
if (!tqpair->pdu_recv_buf.remain_size) {
rc = nvme_tcp_recv_buf_read(tqpair->sock, &tqpair->pdu_recv_buf);
if (rc <= 0) {
return rc;
}
}
rc = nvme_tcp_read_data_from_pdu_recv_buf(&tqpair->pdu_recv_buf,
pdu->psh_len - pdu->psh_valid_bytes,
(void *)&pdu->hdr->raw + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) + pdu->psh_valid_bytes);
pdu->psh_valid_bytes += rc;
if (pdu->psh_valid_bytes < pdu->psh_len) {
return NVME_TCP_PDU_IN_PROGRESS;
}
/* All header(ch, psh, head digist) of this PDU has now been read from the socket. */
spdk_nvmf_tcp_pdu_psh_handle(tqpair);
if (tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY) {
current_pdu_num++;
}
break;
case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD:
/* check whether the data is valid, if not we just return */
@ -1979,11 +2059,18 @@ spdk_nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
pdu->ddgst_enable = true;
}
rc = nvme_tcp_read_payload_data(tqpair->sock, pdu);
if (rc < 0) {
return NVME_TCP_PDU_IN_PROGRESS;
if (tqpair->pdu_recv_buf.remain_size) {
rc = nvme_tcp_read_payload_data_from_pdu_recv_buf(&tqpair->pdu_recv_buf, pdu);
pdu->readv_offset += rc;
}
if (pdu->readv_offset < data_len) {
rc = nvme_tcp_read_payload_data(tqpair->sock, pdu);
if (rc < 0) {
return NVME_TCP_PDU_IN_PROGRESS;
}
pdu->readv_offset += rc;
}
pdu->readv_offset += rc;
if (spdk_unlikely(pdu->dif_ctx != NULL)) {
rc = nvmf_tcp_pdu_payload_insert_dif(pdu, pdu->readv_offset - rc, rc);
@ -1998,7 +2085,6 @@ spdk_nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
/* All of this PDU has now been read from the socket. */
spdk_nvmf_tcp_pdu_payload_handle(tqpair);
current_pdu_num++;
break;
case NVME_TCP_PDU_RECV_STATE_ERROR:
/* Check whether the connection is closed. Each time, we only read 1 byte every time */
@ -2012,7 +2098,7 @@ spdk_nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
SPDK_ERRLOG("code should not come to here");
break;
}
} while ((tqpair->recv_state != prev_state) && (current_pdu_num < MAX_NVME_TCP_PDU_LOOP_COUNT));
} while (tqpair->recv_state != prev_state);
return rc;
}