/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 */

#include "spdk/stdinc.h"
#include "spdk/config.h"

#include <linux/errqueue.h>
#include <sys/epoll.h>
#include <liburing.h>

#include "spdk/barrier.h"
#include "spdk/env.h"
#include "spdk/log.h"
#include "spdk/pipe.h"
#include "spdk/sock.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk_internal/sock.h"
#include "spdk_internal/assert.h"
#include "../sock_kernel.h"

#define MAX_TMPBUF 1024
#define PORTNUMLEN 32
#define SPDK_SOCK_GROUP_QUEUE_DEPTH 4096
#define SPDK_SOCK_CMG_INFO_SIZE (sizeof(struct cmsghdr) + sizeof(struct sock_extended_err))

enum spdk_sock_task_type {
	SPDK_SOCK_TASK_POLLIN = 0,
	SPDK_SOCK_TASK_RECV,
	SPDK_SOCK_TASK_WRITE,
	SPDK_SOCK_TASK_CANCEL,
};

#if defined(SO_ZEROCOPY) && defined(MSG_ZEROCOPY)
#define SPDK_ZEROCOPY
#endif

enum spdk_uring_sock_task_status {
	SPDK_URING_SOCK_TASK_NOT_IN_USE = 0,
	SPDK_URING_SOCK_TASK_IN_PROCESS,
};

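/* Per-socket state for one asynchronous io_uring operation (poll-in, errqueue
 * recv, write, or cancel). One instance of each type is embedded in
 * struct spdk_uring_sock below. */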
struct spdk_uring_task {
	enum spdk_uring_sock_task_status status;
	enum spdk_sock_task_type type;
	struct spdk_uring_sock *sock;
	struct msghdr msg;
	struct iovec iovs[IOV_BATCH_SIZE];
	int iov_cnt;
	struct spdk_sock_request *last_req;
	bool is_zcopy;
	STAILQ_ENTRY(spdk_uring_task) link;
};

struct spdk_uring_sock {
	struct spdk_sock base;
	int fd;
	uint32_t sendmsg_idx;
	struct spdk_uring_sock_group_impl *group;
	struct spdk_uring_task write_task;
	struct spdk_uring_task recv_task;
	struct spdk_uring_task pollin_task;
	struct spdk_uring_task cancel_task;
	struct spdk_pipe *recv_pipe;
	void *recv_buf;
	int recv_buf_sz;
	bool zcopy;
	bool pending_recv;
	int zcopy_send_flags;
	int connection_status;
	int placement_id;
	uint8_t buf[SPDK_SOCK_CMG_INFO_SIZE];
	TAILQ_ENTRY(spdk_uring_sock) link;
};

TAILQ_HEAD(pending_recv_list, spdk_uring_sock);

struct spdk_uring_sock_group_impl {
	struct spdk_sock_group_impl base;
	struct io_uring uring;
	uint32_t io_inflight;
	uint32_t io_queued;
	uint32_t io_avail;
	struct pending_recv_list pending_recv;
};

static struct spdk_sock_impl_opts g_spdk_uring_sock_impl_opts = {
	.recv_buf_size = MIN_SO_RCVBUF_SIZE,
	.send_buf_size = MIN_SO_SNDBUF_SIZE,
	.enable_recv_pipe = true,
	.enable_quickack = false,
	.enable_placement_id = PLACEMENT_NONE,
	.enable_zerocopy_send_server = false,
	.enable_zerocopy_send_client = false,
	.zerocopy_threshold = 0
};

static struct spdk_sock_map g_map = {
	.entries = STAILQ_HEAD_INITIALIZER(g_map.entries),
	.mtx = PTHREAD_MUTEX_INITIALIZER
};

__attribute((destructor)) static void
uring_sock_map_cleanup(void)
{
	spdk_sock_map_cleanup(&g_map);
}

#define SPDK_URING_SOCK_REQUEST_IOV(req) ((struct iovec *)((uint8_t *)req + sizeof(struct spdk_sock_request)))

#define __uring_sock(sock) (struct spdk_uring_sock *)sock
#define __uring_group_impl(group) (struct spdk_uring_sock_group_impl *)group

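/* Report the local (saddr/sport) and peer (caddr/cport) addresses of a
 * connected socket via getsockname()/getpeername(). AF_UNIX sockets return
 * success with no address; unsupported families fail. */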
static int
uring_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport,
		   char *caddr, int clen, uint16_t *cport)
{
	struct spdk_uring_sock *sock = __uring_sock(_sock);
	struct sockaddr_storage sa;
	socklen_t salen;
	int rc;

	assert(sock != NULL);

	memset(&sa, 0, sizeof sa);
	salen = sizeof sa;
	rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
	if (rc != 0) {
		SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
		return -1;
	}

	switch (sa.ss_family) {
	case AF_UNIX:
		/* Acceptable connection types that don't have IPs */
		return 0;
	case AF_INET:
	case AF_INET6:
		/* Code below will get IP addresses */
		break;
	default:
		/* Unsupported socket family */
		return -1;
	}

	rc = get_addr_str((struct sockaddr *)&sa, saddr, slen);
	if (rc != 0) {
		SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno);
		return -1;
	}

	if (sport) {
		if (sa.ss_family == AF_INET) {
			*sport = ntohs(((struct sockaddr_in *) &sa)->sin_port);
		} else if (sa.ss_family == AF_INET6) {
			*sport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port);
		}
	}

	memset(&sa, 0, sizeof sa);
	salen = sizeof sa;
	rc = getpeername(sock->fd, (struct sockaddr *) &sa, &salen);
	if (rc != 0) {
		SPDK_ERRLOG("getpeername() failed (errno=%d)\n", errno);
		return -1;
	}

	rc = get_addr_str((struct sockaddr *)&sa, caddr, clen);
	if (rc != 0) {
		SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno);
		return -1;
	}

	if (cport) {
		if (sa.ss_family == AF_INET) {
			*cport = ntohs(((struct sockaddr_in *) &sa)->sin_port);
		} else if (sa.ss_family == AF_INET6) {
			*cport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port);
		}
	}

	return 0;
}

enum uring_sock_create_type {
	SPDK_SOCK_CREATE_LISTEN,
	SPDK_SOCK_CREATE_CONNECT,
};

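/* (Re)size the socket's receive pipe to sz bytes, copying any data still
 * buffered in the old pipe into the new one. A size of 0 frees the pipe and
 * disables pipe-based receives for this socket. */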
static int
uring_sock_alloc_pipe(struct spdk_uring_sock *sock, int sz)
{
	uint8_t *new_buf;
	struct spdk_pipe *new_pipe;
	struct iovec siov[2];
	struct iovec diov[2];
	int sbytes;
	ssize_t bytes;

	if (sock->recv_buf_sz == sz) {
		return 0;
	}

	/* If the new size is 0, just free the pipe */
	if (sz == 0) {
		spdk_pipe_destroy(sock->recv_pipe);
		free(sock->recv_buf);
		sock->recv_pipe = NULL;
		sock->recv_buf = NULL;
		return 0;
	} else if (sz < MIN_SOCK_PIPE_SIZE) {
		SPDK_ERRLOG("The size of the pipe must be larger than %d\n", MIN_SOCK_PIPE_SIZE);
		return -1;
	}

	/* Round up to next 64 byte multiple */
	new_buf = calloc(SPDK_ALIGN_CEIL(sz + 1, 64), sizeof(uint8_t));
	if (!new_buf) {
		SPDK_ERRLOG("socket recv buf allocation failed\n");
		return -ENOMEM;
	}

	new_pipe = spdk_pipe_create(new_buf, sz + 1);
	if (new_pipe == NULL) {
		SPDK_ERRLOG("socket pipe allocation failed\n");
		free(new_buf);
		return -ENOMEM;
	}

	if (sock->recv_pipe != NULL) {
		/* Pull all of the data out of the old pipe */
		sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov);
		if (sbytes > sz) {
			/* Too much data to fit into the new pipe size */
			spdk_pipe_destroy(new_pipe);
			free(new_buf);
			return -EINVAL;
		}

		sbytes = spdk_pipe_writer_get_buffer(new_pipe, sz, diov);
		assert(sbytes == sz);

		bytes = spdk_iovcpy(siov, 2, diov, 2);
		spdk_pipe_writer_advance(new_pipe, bytes);

		spdk_pipe_destroy(sock->recv_pipe);
		free(sock->recv_buf);
	}

	sock->recv_buf_sz = sz;
	sock->recv_buf = new_buf;
	sock->recv_pipe = new_pipe;

	return 0;
}

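/* Set the kernel receive buffer size (clamped to at least MIN_SO_RCVBUF_SIZE)
 * and, when the recv pipe is enabled in the impl opts, resize the user-space
 * receive pipe to match the requested size. */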
static int
uring_sock_set_recvbuf(struct spdk_sock *_sock, int sz)
{
	struct spdk_uring_sock *sock = __uring_sock(_sock);
	int rc;

	assert(sock != NULL);

	if (g_spdk_uring_sock_impl_opts.enable_recv_pipe) {
		rc = uring_sock_alloc_pipe(sock, sz);
		if (rc) {
			SPDK_ERRLOG("unable to allocate sufficient recvbuf with sz=%d on sock=%p\n", sz, _sock);
			return rc;
		}
	}

	if (sz < MIN_SO_RCVBUF_SIZE) {
		sz = MIN_SO_RCVBUF_SIZE;
	}

	rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz));
	if (rc < 0) {
		return rc;
	}

	return 0;
}

static int
uring_sock_set_sendbuf(struct spdk_sock *_sock, int sz)
{
	struct spdk_uring_sock *sock = __uring_sock(_sock);
	int rc;

	assert(sock != NULL);

	if (sz < MIN_SO_SNDBUF_SIZE) {
		sz = MIN_SO_SNDBUF_SIZE;
	}

	rc = setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz));
	if (rc < 0) {
		return rc;
	}

	return 0;
}

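/* Allocate and initialize a spdk_uring_sock wrapper around an existing fd.
 * On Linux this optionally enables TCP_QUICKACK, records the placement id,
 * and, when requested and supported, turns on MSG_ZEROCOPY sends. */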
static struct spdk_uring_sock *
uring_sock_alloc(int fd, bool enable_zero_copy)
{
	struct spdk_uring_sock *sock;
#if defined(__linux__)
	int flag;
	int rc;
#endif

	sock = calloc(1, sizeof(*sock));
	if (sock == NULL) {
		SPDK_ERRLOG("sock allocation failed\n");
		return NULL;
	}

	sock->fd = fd;

#if defined(__linux__)
	flag = 1;

	if (g_spdk_uring_sock_impl_opts.enable_quickack) {
		rc = setsockopt(sock->fd, IPPROTO_TCP, TCP_QUICKACK, &flag, sizeof(flag));
		if (rc != 0) {
			SPDK_ERRLOG("Failed to set TCP_QUICKACK\n");
		}
	}

	spdk_sock_get_placement_id(sock->fd, g_spdk_uring_sock_impl_opts.enable_placement_id,
				   &sock->placement_id);

#ifdef SPDK_ZEROCOPY
	/* Try to turn on zero copy sends */
	flag = 1;

	if (enable_zero_copy) {
		rc = setsockopt(sock->fd, SOL_SOCKET, SO_ZEROCOPY, &flag, sizeof(flag));
		if (rc == 0) {
			sock->zcopy = true;
			sock->zcopy_send_flags = MSG_ZEROCOPY;
		}
	}
#endif
#endif

	return sock;
}

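/* Common helper for listen/connect: resolve ip/port with getaddrinfo(), walk
 * the results creating a non-blocking TCP socket with the configured buffer
 * sizes and options, then either bind()+listen() or connect() depending on
 * the requested create type. */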
static struct spdk_sock *
uring_sock_create(const char *ip, int port,
		  enum uring_sock_create_type type,
		  struct spdk_sock_opts *opts)
{
	struct spdk_uring_sock *sock;
	char buf[MAX_TMPBUF];
	char portnum[PORTNUMLEN];
	char *p;
	struct addrinfo hints, *res, *res0;
	int fd, flag;
	int val = 1;
	int rc;
	bool enable_zcopy_impl_opts = false;
	bool enable_zcopy_user_opts = true;

	assert(opts != NULL);

	if (ip == NULL) {
		return NULL;
	}
	if (ip[0] == '[') {
		snprintf(buf, sizeof(buf), "%s", ip + 1);
		p = strchr(buf, ']');
		if (p != NULL) {
			*p = '\0';
		}
		ip = (const char *) &buf[0];
	}

	snprintf(portnum, sizeof portnum, "%d", port);
	memset(&hints, 0, sizeof hints);
	hints.ai_family = PF_UNSPEC;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_flags = AI_NUMERICSERV;
	hints.ai_flags |= AI_PASSIVE;
	hints.ai_flags |= AI_NUMERICHOST;
	rc = getaddrinfo(ip, portnum, &hints, &res0);
	if (rc != 0) {
		SPDK_ERRLOG("getaddrinfo() failed (rc=%d)\n", rc);
		return NULL;
	}

	/* try listen */
	fd = -1;
	for (res = res0; res != NULL; res = res->ai_next) {
retry:
		fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
		if (fd < 0) {
			/* error */
			continue;
		}

		val = g_spdk_uring_sock_impl_opts.recv_buf_size;
		rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof val);
		if (rc) {
			/* Not fatal */
		}

		val = g_spdk_uring_sock_impl_opts.send_buf_size;
		rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof val);
		if (rc) {
			/* Not fatal */
		}

		rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val);
		if (rc != 0) {
			close(fd);
			fd = -1;
			/* error */
			continue;
		}
		rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof val);
		if (rc != 0) {
			close(fd);
			fd = -1;
			/* error */
			continue;
		}

		if (opts->ack_timeout) {
#if defined(__linux__)
			val = opts->ack_timeout;
			rc = setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &val, sizeof val);
			if (rc != 0) {
				close(fd);
				fd = -1;
				/* error */
				continue;
			}
#else
			SPDK_WARNLOG("TCP_USER_TIMEOUT is not supported.\n");
#endif
		}

#if defined(SO_PRIORITY)
		if (opts != NULL && opts->priority) {
			rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opts->priority, sizeof val);
			if (rc != 0) {
				close(fd);
				fd = -1;
				/* error */
				continue;
			}
		}
#endif
		if (res->ai_family == AF_INET6) {
			rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, sizeof val);
			if (rc != 0) {
				close(fd);
				fd = -1;
				/* error */
				continue;
			}
		}

		if (type == SPDK_SOCK_CREATE_LISTEN) {
			rc = bind(fd, res->ai_addr, res->ai_addrlen);
			if (rc != 0) {
				SPDK_ERRLOG("bind() failed at port %d, errno = %d\n", port, errno);
				switch (errno) {
				case EINTR:
					/* interrupted? */
					close(fd);
					goto retry;
				case EADDRNOTAVAIL:
					SPDK_ERRLOG("IP address %s not available. "
						    "Verify IP address in config file "
						    "and make sure setup script is "
						    "run before starting spdk app.\n", ip);
				/* FALLTHROUGH */
				default:
					/* try next family */
					close(fd);
					fd = -1;
					continue;
				}
			}
			/* bind OK */
			rc = listen(fd, 512);
			if (rc != 0) {
				SPDK_ERRLOG("listen() failed, errno = %d\n", errno);
				close(fd);
				fd = -1;
				break;
			}
			enable_zcopy_impl_opts = g_spdk_uring_sock_impl_opts.enable_zerocopy_send_server;
		} else if (type == SPDK_SOCK_CREATE_CONNECT) {
			rc = connect(fd, res->ai_addr, res->ai_addrlen);
			if (rc != 0) {
				SPDK_ERRLOG("connect() failed, errno = %d\n", errno);
				/* try next family */
				close(fd);
				fd = -1;
				continue;
			}

			enable_zcopy_impl_opts = g_spdk_uring_sock_impl_opts.enable_zerocopy_send_client;
		}

		flag = fcntl(fd, F_GETFL);
		if (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0) {
			SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno);
			close(fd);
			fd = -1;
			break;
		}
		break;
	}
	freeaddrinfo(res0);

	if (fd < 0) {
		return NULL;
	}

	enable_zcopy_user_opts = opts->zcopy && !sock_is_loopback(fd);
	sock = uring_sock_alloc(fd, enable_zcopy_user_opts && enable_zcopy_impl_opts);
	if (sock == NULL) {
		SPDK_ERRLOG("sock allocation failed\n");
		close(fd);
		return NULL;
	}

	return &sock->base;
}

static struct spdk_sock *
uring_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts)
{
	return uring_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts);
}

static struct spdk_sock *
uring_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts)
{
	return uring_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts);
}

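/* Accept a pending connection on a listening socket, make the new fd
 * non-blocking, re-apply SO_PRIORITY (which is not inherited), and wrap it in
 * a new spdk_uring_sock that inherits the listener's zerocopy setting. */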
static struct spdk_sock *
uring_sock_accept(struct spdk_sock *_sock)
{
	struct spdk_uring_sock *sock = __uring_sock(_sock);
	struct sockaddr_storage sa;
	socklen_t salen;
	int rc, fd;
	struct spdk_uring_sock *new_sock;
	int flag;

	memset(&sa, 0, sizeof(sa));
	salen = sizeof(sa);

	assert(sock != NULL);

	rc = accept(sock->fd, (struct sockaddr *)&sa, &salen);

	if (rc == -1) {
		return NULL;
	}

	fd = rc;

	flag = fcntl(fd, F_GETFL);
	if ((!(flag & O_NONBLOCK)) && (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0)) {
		SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno);
		close(fd);
		return NULL;
	}

#if defined(SO_PRIORITY)
	/* The priority is not inherited, so call this function again */
	if (sock->base.opts.priority) {
		rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &sock->base.opts.priority, sizeof(int));
		if (rc != 0) {
			close(fd);
			return NULL;
		}
	}
#endif

	new_sock = uring_sock_alloc(fd, sock->zcopy);
	if (new_sock == NULL) {
		close(fd);
		return NULL;
	}

	return &new_sock->base;
}

static int
uring_sock_close(struct spdk_sock *_sock)
{
	struct spdk_uring_sock *sock = __uring_sock(_sock);

	assert(TAILQ_EMPTY(&_sock->pending_reqs));
	assert(sock->group == NULL);

	/* If the socket fails to close, the best choice is to
	 * leak the fd but continue to free the rest of the sock
	 * memory. */
	close(sock->fd);

	spdk_pipe_destroy(sock->recv_pipe);
	free(sock->recv_buf);
	free(sock);

	return 0;
}

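/* Copy previously buffered data from the socket's receive pipe into the
 * caller's iovecs. When the pipe is drained, drop the socket from the group's
 * level-triggered pending_recv list. */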
static ssize_t
uring_sock_recv_from_pipe(struct spdk_uring_sock *sock, struct iovec *diov, int diovcnt)
{
	struct iovec siov[2];
	int sbytes;
	ssize_t bytes;
	struct spdk_uring_sock_group_impl *group;

	sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov);
	if (sbytes < 0) {
		errno = EINVAL;
		return -1;
	} else if (sbytes == 0) {
		errno = EAGAIN;
		return -1;
	}

	bytes = spdk_iovcpy(siov, 2, diov, diovcnt);

	if (bytes == 0) {
		/* The only way this happens is if diov is 0 length */
		errno = EINVAL;
		return -1;
	}

	spdk_pipe_reader_advance(sock->recv_pipe, bytes);

	/* If we drained the pipe, take it off the level-triggered list */
	if (sock->base.group_impl && spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) {
		group = __uring_group_impl(sock->base.group_impl);
		TAILQ_REMOVE(&group->pending_recv, sock, link);
		sock->pending_recv = false;
	}

	return bytes;
}

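/* Fill the receive pipe with whatever readv() can return right now and, if
 * data arrived, put the socket on the group's pending_recv list so the poll
 * loop will hand the buffered data to the application. */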
static inline ssize_t
uring_sock_read(struct spdk_uring_sock *sock)
{
	struct iovec iov[2];
	int bytes;
	struct spdk_uring_sock_group_impl *group;

	bytes = spdk_pipe_writer_get_buffer(sock->recv_pipe, sock->recv_buf_sz, iov);

	if (bytes > 0) {
		bytes = readv(sock->fd, iov, 2);
		if (bytes > 0) {
			spdk_pipe_writer_advance(sock->recv_pipe, bytes);
			if (sock->base.group_impl) {
				group = __uring_group_impl(sock->base.group_impl);
				TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
				sock->pending_recv = true;
			}
		}
	}

	return bytes;
}

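/* Scatter-gather receive. Without a recv pipe this is a plain readv(); with
 * one, large reads bypass the pipe and go straight into the caller's buffers
 * while small reads are staged through the pipe to reduce syscall count. */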
static ssize_t
uring_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt)
{
	struct spdk_uring_sock *sock = __uring_sock(_sock);
	int rc, i;
	size_t len;

	if (sock->recv_pipe == NULL) {
		return readv(sock->fd, iov, iovcnt);
	}

	len = 0;
	for (i = 0; i < iovcnt; i++) {
		len += iov[i].iov_len;
	}

	if (spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) {
		/* If the user is receiving a sufficiently large amount of data,
		 * receive directly to their buffers. */
		if (len >= MIN_SOCK_PIPE_SIZE) {
			return readv(sock->fd, iov, iovcnt);
		}

		/* Otherwise, do a big read into our pipe */
		rc = uring_sock_read(sock);
		if (rc <= 0) {
			return rc;
		}
	}

	return uring_sock_recv_from_pipe(sock, iov, iovcnt);
}

static ssize_t
uring_sock_recv(struct spdk_sock *sock, void *buf, size_t len)
{
	struct iovec iov[1];

	iov[0].iov_base = buf;
	iov[0].iov_len = len;

	return uring_sock_readv(sock, iov, 1);
}

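/* Synchronous writev. Returns -1 with errno set to EAGAIN while the socket's
 * asynchronous io_uring write task is still in flight, so a direct write
 * cannot interleave with data already queued for submission. */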
static ssize_t
uring_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt)
{
	struct spdk_uring_sock *sock = __uring_sock(_sock);

	if (sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) {
		errno = EAGAIN;
		return -1;
	}

	return writev(sock->fd, iov, iovcnt);
}

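/* Account for rc bytes just written (or submitted with MSG_ZEROCOPY) against
 * the queued request list. Requests written in full are moved to the pending
 * list; non-zerocopy requests at the head of that list are completed
 * immediately, while zerocopy requests stash the sendmsg index in
 * req->internal.offset so the errqueue completion can match them later. */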
static int
sock_complete_reqs(struct spdk_sock *_sock, ssize_t rc, bool is_zcopy)
{
	struct spdk_uring_sock *sock = __uring_sock(_sock);
	struct spdk_sock_request *req;
	int i, retval;
	unsigned int offset;
	size_t len;

	if (is_zcopy) {
		/* Handling overflow case, because we use psock->sendmsg_idx - 1 for the
		 * req->internal.offset, so sendmsg_idx should not be zero */
		if (spdk_unlikely(sock->sendmsg_idx == UINT32_MAX)) {
			sock->sendmsg_idx = 1;
		} else {
			sock->sendmsg_idx++;
		}
	}

	/* Consume the requests that were actually written */
	req = TAILQ_FIRST(&_sock->queued_reqs);
	while (req) {
		offset = req->internal.offset;

		/* req->internal.is_zcopy is true when the whole req or part of it is sent with zerocopy */
		req->internal.is_zcopy = is_zcopy;

		for (i = 0; i < req->iovcnt; i++) {
			/* Advance by the offset first */
			if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) {
				offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len;
				continue;
			}

			/* Calculate the remaining length of this element */
			len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset;

			if (len > (size_t)rc) {
				/* This element was partially sent. */
				req->internal.offset += rc;
				return 0;
			}

			offset = 0;
			req->internal.offset += len;
			rc -= len;
		}

		/* Handled a full request. */
		spdk_sock_request_pend(_sock, req);

		if (!req->internal.is_zcopy && req == TAILQ_FIRST(&_sock->pending_reqs)) {
			retval = spdk_sock_request_put(_sock, req, 0);
			if (retval) {
				return retval;
			}
		} else {
			/* Re-use the offset field to hold the sendmsg call index. The
			 * index is 0 based, so subtract one here because we've already
			 * incremented above. */
			req->internal.offset = sock->sendmsg_idx - 1;
		}

		if (rc == 0) {
			break;
		}

		req = TAILQ_FIRST(&_sock->queued_reqs);
	}

	return 0;
}

#ifdef SPDK_ZEROCOPY
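/* Process a MSG_ERRQUEUE completion for MSG_ZEROCOPY sends: parse the
 * sock_extended_err control message and release every pending request whose
 * stored sendmsg index falls in the completed [ee_info, ee_data] range. */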
static int
_sock_check_zcopy(struct spdk_sock *_sock, int status)
{
	struct spdk_uring_sock *sock = __uring_sock(_sock);
	ssize_t rc;
	struct sock_extended_err *serr;
	struct cmsghdr *cm;
	uint32_t idx;
	struct spdk_sock_request *req, *treq;
	bool found;

	assert(sock->zcopy == true);
	if (spdk_unlikely(status < 0)) {
		if (!TAILQ_EMPTY(&_sock->pending_reqs)) {
			SPDK_ERRLOG("Attempting to receive from ERRQUEUE yielded error, but pending list still has orphaned entries, status = %d\n",
				    status);
		} else {
			SPDK_WARNLOG("Recvmsg yielded an error!\n");
		}
		return 0;
	}

	cm = CMSG_FIRSTHDR(&sock->recv_task.msg);
	if (!((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) ||
	      (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR))) {
		SPDK_WARNLOG("Unexpected cmsg level or type!\n");
		return 0;
	}

	serr = (struct sock_extended_err *)CMSG_DATA(cm);
	if (serr->ee_errno != 0 || serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
		SPDK_WARNLOG("Unexpected extended error origin\n");
		return 0;
	}

	/* Most of the time, the pending_reqs array is in the exact
	 * order we need such that all of the requests to complete are
	 * in order, in the front. It is guaranteed that all requests
	 * belonging to the same sendmsg call are sequential, so once
	 * we encounter one match we can stop looping as soon as a
	 * non-match is found.
	 */
	for (idx = serr->ee_info; idx <= serr->ee_data; idx++) {
		found = false;
		TAILQ_FOREACH_SAFE(req, &_sock->pending_reqs, internal.link, treq) {
			if (!req->internal.is_zcopy) {
				/* This wasn't a zcopy request. It was just waiting in line to complete */
				rc = spdk_sock_request_put(_sock, req, 0);
				if (rc < 0) {
					return rc;
				}
			} else if (req->internal.offset == idx) {
				found = true;
				rc = spdk_sock_request_put(_sock, req, 0);
				if (rc < 0) {
					return rc;
				}
			} else if (found) {
				break;
			}
		}
	}

	return 0;
}

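/* Queue an io_uring recvmsg SQE against MSG_ERRQUEUE so zerocopy send
 * completions can be harvested asynchronously; no-op if an errqueue recv is
 * already outstanding for this socket. */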
static void
_sock_prep_recv(struct spdk_sock *_sock)
{
	struct spdk_uring_sock *sock = __uring_sock(_sock);
	struct spdk_uring_task *task = &sock->recv_task;
	struct io_uring_sqe *sqe;

	if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) {
		return;
	}

	assert(sock->group != NULL);
	sock->group->io_queued++;

	sqe = io_uring_get_sqe(&sock->group->uring);
	io_uring_prep_recvmsg(sqe, sock->fd, &task->msg, MSG_ERRQUEUE);
	io_uring_sqe_set_data(sqe, task);
	task->status = SPDK_URING_SOCK_TASK_IN_PROCESS;
}

#endif

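/* Prepare the socket's asynchronous write task: choose the sendmsg flags
 * (MSG_DONTWAIT, plus the zerocopy flags when enabled for this socket) and
 * gather queued requests into the task's iovec array. Returns early if a
 * previous flush is still in flight or nothing is queued. */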
static void
|
|
|
|
_sock_flush(struct spdk_sock *_sock)
|
|
|
|
{
|
|
|
|
struct spdk_uring_sock *sock = __uring_sock(_sock);
|
|
|
|
struct spdk_uring_task *task = &sock->write_task;
|
|
|
|
uint32_t iovcnt;
|
|
|
|
struct io_uring_sqe *sqe;
|
sock: introduce dynamic zerocopy according to data size
MSG_ZEROCOPY is not always effective as mentioned in
https://www.kernel.org/doc/html/v4.15/networking/msg_zerocopy.html.
Currently in spdk, once we enable sendmsg zerocopy, then all data
transferred through _sock_flush are sent with zerocopy, and vice
versa. Here dynamic zerocopy is introduced to allow data sent with
MSG_ZEROCOPY or not according to its size, which can be enabled by
setting "enable_dynamic_zerocopy" as true.
Test with 16 P4610 NVMe SSD, 2 initiators, target's and initiators'
configurations are the same as spdk report:
https://ci.spdk.io/download/performance-reports/SPDK_tcp_perf_report_2104.pdf
For posix socket, rw_percent=0(randwrite), it has 1.9%~8.3% performance boost
tested with target 1~40 cpu cores and qdepth=128,256,512. And it has no obvious
influence when read percentage is greater than 50%.
For uring socket, rw_percent=0(randwrite), it has 1.8%~7.9% performance boost
tested with target 1~40 cpu cores and qdepth=128,256,512. And it still has
1%~7% improvement when read percentage is greater than 50%.
The following is part of the detailed data.
posix:
qdepth=128
rw_percent 0 | 30
cpu origin thisPatch opt | origin thisPatch opt
1 286.5 298.5 4.19% 307 304.15 -0.93%
4 1042.5 1107 6.19% 1135.5 1136 0.04%
8 1952.5 2058 5.40% 2170.5 2170.5 0.00%
12 2658.5 2879 8.29% 3042 3046 0.13%
16 3247.5 3460.5 6.56% 3793.5 3775 -0.49%
24 4232.5 4459.5 5.36% 4614.5 4756.5 3.08%
32 4810 5095 5.93% 4488 4845 7.95%
40 5306.5 5435 2.42% 4427.5 4902 10.72%
qdepth=512
rw_percent 0 | 30
cpu origin thisPatch opt | origin thisPatch opt
1 275 287 4.36% 294.4 295.45 0.36%
4 979 1041 6.33% 1073 1083.5 0.98%
8 1822.5 1914.5 5.05% 2030.5 2018.5 -0.59%
12 2441 2598.5 6.45% 2808.5 2779.5 -1.03%
16 2920.5 3109.5 6.47% 3455 3411.5 -1.26%
24 3709 3972.5 7.10% 4483.5 4502.5 0.42%
32 4225.5 4532.5 7.27% 4463.5 4733 6.04%
40 4790.5 4884.5 1.96% 4427 4904.5 10.79%
uring:
qdepth=128
rw_percent 0 | 30
cpu origin thisPatch opt | origin thisPatch opt
1 270.5 287.5 6.28% 295.75 304.75 3.04%
4 1018.5 1089.5 6.97% 1119.5 1156.5 3.31%
8 1907 2055 7.76% 2127 2211.5 3.97%
12 2614 2801 7.15% 2982.5 3061.5 2.65%
16 3169.5 3420 7.90% 3654.5 3781.5 3.48%
24 4109.5 4414 7.41% 4691.5 4750.5 1.26%
32 4752.5 4908 3.27% 4494 4825.5 7.38%
40 5233.5 5327 1.79% 4374.5 4891 11.81%
qdepth=512
rw_percent 0 | 30
cpu origin thisPatch opt | origin thisPatch opt
1 259.95 276 6.17% 286.65 294.8 2.84%
4 955 1021 6.91% 1070.5 1100 2.76%
8 1772 1903.5 7.42% 1992.5 2077.5 4.27%
12 2380.5 2543.5 6.85% 2752.5 2860 3.91%
16 2920.5 3099 6.11% 3391.5 3540 4.38%
24 3697 3912 5.82% 4401 4637 5.36%
32 4256.5 4454.5 4.65% 4516 4777 5.78%
40 4707 4968.5 5.56% 4400.5 4933 12.10%
Signed-off-by: Richael Zhuang <richael.zhuang@arm.com>
Change-Id: I730dcf89ed2bf3efe91586421a89045fc11c81f0
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12210
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2021-12-07 09:37:02 +00:00
|
|
|
int flags;
|
2019-10-14 23:58:51 +00:00
|
|
|
|
|
|
|
if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2021-12-07 09:37:02 +00:00
|
|
|
#ifdef SPDK_ZEROCOPY
|
|
|
|
if (sock->zcopy) {
|
|
|
|
flags = MSG_DONTWAIT | sock->zcopy_send_flags;
|
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
flags = MSG_DONTWAIT;
|
|
|
|
}
|
|
|
|
|
|
|
|
iovcnt = spdk_sock_prep_reqs(&sock->base, task->iovs, task->iov_cnt, &task->last_req, &flags);
|
2019-10-14 23:58:51 +00:00
|
|
|
if (!iovcnt) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
task->iov_cnt = iovcnt;
|
|
|
|
assert(sock->group != NULL);
|
|
|
|
task->msg.msg_iov = task->iovs;
|
|
|
|
task->msg.msg_iovlen = task->iov_cnt;
|
2021-12-07 09:37:02 +00:00
|
|
|
#ifdef SPDK_ZEROCOPY
|
|
|
|
task->is_zcopy = (flags & MSG_ZEROCOPY) ? true : false;
|
|
|
|
#endif
|
2019-10-14 23:58:51 +00:00
|
|
|
sock->group->io_queued++;
|
|
|
|
|
|
|
|
sqe = io_uring_get_sqe(&sock->group->uring);
|
2020-02-19 18:58:29 +00:00
|
|
|
io_uring_prep_sendmsg(sqe, sock->fd, &sock->write_task.msg, flags);
|
2019-10-14 23:58:51 +00:00
|
|
|
io_uring_sqe_set_data(sqe, task);
|
|
|
|
task->status = SPDK_URING_SOCK_TASK_IN_PROCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
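/* Arm a POLLIN | POLLERR poll SQE for this socket, unless a poll task is
 * already in flight or received data is already pending (and zero-copy
 * notifications are not in use). */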
static void
|
|
|
|
_sock_prep_pollin(struct spdk_sock *_sock)
|
|
|
|
{
|
|
|
|
struct spdk_uring_sock *sock = __uring_sock(_sock);
|
|
|
|
struct spdk_uring_task *task = &sock->pollin_task;
|
|
|
|
struct io_uring_sqe *sqe;
|
|
|
|
|
2020-03-18 00:27:50 +00:00
|
|
|
/* Do not prepare pollin event */
|
2020-02-19 18:58:29 +00:00
|
|
|
if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS || (sock->pending_recv && !sock->zcopy)) {
|
2019-10-14 23:58:51 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(sock->group != NULL);
|
|
|
|
sock->group->io_queued++;
|
|
|
|
|
|
|
|
sqe = io_uring_get_sqe(&sock->group->uring);
|
2020-02-19 18:58:29 +00:00
|
|
|
io_uring_prep_poll_add(sqe, sock->fd, POLLIN | POLLERR);
|
2019-10-14 23:58:51 +00:00
|
|
|
io_uring_sqe_set_data(sqe, task);
|
|
|
|
task->status = SPDK_URING_SOCK_TASK_IN_PROCESS;
|
|
|
|
}
|
|
|
|
|
2020-04-27 20:40:10 +00:00
|
|
|
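/* Queue an asynchronous cancel SQE targeting the io_uring request identified
 * by user_data. The completion is reaped later as a SPDK_SOCK_TASK_CANCEL
 * task. */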
static void
|
|
|
|
_sock_prep_cancel_task(struct spdk_sock *_sock, void *user_data)
|
|
|
|
{
|
|
|
|
struct spdk_uring_sock *sock = __uring_sock(_sock);
|
|
|
|
struct spdk_uring_task *task = &sock->cancel_task;
|
|
|
|
struct io_uring_sqe *sqe;
|
|
|
|
|
|
|
|
if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(sock->group != NULL);
|
|
|
|
sock->group->io_queued++;
|
|
|
|
|
|
|
|
sqe = io_uring_get_sqe(&sock->group->uring);
|
|
|
|
io_uring_prep_cancel(sqe, user_data, 0);
|
|
|
|
io_uring_sqe_set_data(sqe, task);
|
|
|
|
task->status = SPDK_URING_SOCK_TASK_IN_PROCESS;
|
|
|
|
}
|
|
|
|
|
2019-10-14 23:58:51 +00:00
|
|
|
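/* Reap up to 'max' completions from the group's ring and dispatch them by
 * task type, then (when 'socks' is provided) collect up to 'max_read_events'
 * sockets with pending receive data and rotate the pending_recv list. */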
static int
|
2020-05-10 20:07:08 +00:00
|
|
|
sock_uring_group_reap(struct spdk_uring_sock_group_impl *group, int max, int max_read_events,
|
|
|
|
struct spdk_sock **socks)
|
2019-10-14 23:58:51 +00:00
|
|
|
{
|
2020-05-20 12:29:29 +00:00
|
|
|
int i, count, ret;
|
|
|
|
struct io_uring_cqe *cqe;
|
2020-03-18 00:27:50 +00:00
|
|
|
struct spdk_uring_sock *sock, *tmp;
|
2019-10-14 23:58:51 +00:00
|
|
|
struct spdk_uring_task *task;
|
|
|
|
int status;
|
|
|
|
|
2020-05-20 12:29:29 +00:00
|
|
|
for (i = 0; i < max; i++) {
|
|
|
|
ret = io_uring_peek_cqe(&group->uring, &cqe);
|
|
|
|
if (ret != 0) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cqe == NULL) {
|
|
|
|
break;
|
|
|
|
}
|
2019-10-14 23:58:51 +00:00
|
|
|
|
2020-05-20 12:29:29 +00:00
|
|
|
task = (struct spdk_uring_task *)cqe->user_data;
|
2019-10-14 23:58:51 +00:00
|
|
|
assert(task != NULL);
|
|
|
|
sock = task->sock;
|
|
|
|
assert(sock != NULL);
|
|
|
|
assert(sock->group != NULL);
|
2020-03-18 00:27:50 +00:00
|
|
|
assert(sock->group == group);
|
2019-10-14 23:58:51 +00:00
|
|
|
sock->group->io_inflight--;
|
|
|
|
sock->group->io_avail++;
|
2020-05-20 12:29:29 +00:00
|
|
|
status = cqe->res;
|
|
|
|
io_uring_cqe_seen(&group->uring, cqe);
|
2019-10-14 23:58:51 +00:00
|
|
|
|
|
|
|
task->status = SPDK_URING_SOCK_TASK_NOT_IN_USE;
|
|
|
|
|
|
|
|
if (spdk_unlikely(status <= 0)) {
|
2020-02-19 18:58:29 +00:00
|
|
|
if (status == -EAGAIN || status == -EWOULDBLOCK || (status == -ENOBUFS && sock->zcopy)) {
|
2019-10-14 23:58:51 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (task->type) {
|
|
|
|
case SPDK_SOCK_TASK_POLLIN:
|
2021-08-20 06:49:40 +00:00
|
|
|
#ifdef SPDK_ZEROCOPY
|
2020-02-19 18:58:29 +00:00
|
|
|
if ((status & POLLERR) == POLLERR) {
|
|
|
|
_sock_prep_recv(&sock->base);
|
|
|
|
}
|
2021-08-20 06:49:40 +00:00
|
|
|
#endif
|
2019-10-14 23:58:51 +00:00
|
|
|
if ((status & POLLIN) == POLLIN) {
|
2021-07-21 19:24:48 +00:00
|
|
|
if (sock->base.cb_fn != NULL &&
|
|
|
|
sock->pending_recv == false) {
|
2020-03-18 00:27:50 +00:00
|
|
|
sock->pending_recv = true;
|
uring: fix bug when inserting sock into pending_recv list
There are I/O errors when running an NVMe over TCP fio test with the
uring socket. The bug is easy to reproduce with the following
configuration:
target on 1 core, 16 NVMe SSDs, 2 initiators each connecting to 8 NVMe
namespaces, each running fio with numjobs=3.
If in each round we insert the sock at the head of the pending_recv
list and then take max_events socks from the head of the list to
process, there is a possibility that some socks are never processed.
There is already a strategy that cycles the pending_recv list so that
we do not poll things in the same order each time. For a list A B C D E F
with max_events of 3, that strategy rearranges the list to D E F A B C.
But the strategy is defeated when sockets are inserted with
TAILQ_INSERT_HEAD(&group->pending_recv, sock...).
Using TAILQ_INSERT_TAIL(&group->pending_recv, sock...) fixes it.
Signed-off-by: Richael Zhuang <richael.zhuang@arm.com>
Change-Id: I8429b8eee29a9f9f820ad291d1b65ce2c2be22ea
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/11154
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Community-CI: Mellanox Build Bot
2022-01-19 06:23:26 +00:00
|
|
|
TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
|
2019-10-14 23:58:51 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case SPDK_SOCK_TASK_WRITE:
|
|
|
|
task->last_req = NULL;
|
|
|
|
task->iov_cnt = 0;
|
2021-12-07 09:37:02 +00:00
|
|
|
task->is_zcopy = false;
|
2020-05-18 21:28:33 +00:00
|
|
|
if (spdk_unlikely(status < 0)) {
|
|
|
|
sock->connection_status = status;
|
|
|
|
spdk_sock_abort_requests(&sock->base);
|
|
|
|
} else {
|
2021-12-07 09:37:02 +00:00
|
|
|
sock_complete_reqs(&sock->base, status, task->is_zcopy);
|
2019-10-14 23:58:51 +00:00
|
|
|
}
|
|
|
|
|
2020-04-27 20:40:10 +00:00
|
|
|
break;
|
2020-02-19 18:58:29 +00:00
|
|
|
#ifdef SPDK_ZEROCOPY
|
|
|
|
case SPDK_SOCK_TASK_RECV:
|
|
|
|
if (spdk_unlikely(status == -ECANCELED)) {
|
|
|
|
sock->connection_status = status;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
_sock_check_zcopy(&sock->base, status);
|
|
|
|
break;
|
|
|
|
#endif
|
2020-04-27 20:40:10 +00:00
|
|
|
case SPDK_SOCK_TASK_CANCEL:
|
2020-05-18 21:28:33 +00:00
|
|
|
/* Do nothing */
|
2019-10-14 23:58:51 +00:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
SPDK_UNREACHABLE();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-05-18 21:28:33 +00:00
|
|
|
if (!socks) {
|
|
|
|
return 0;
|
|
|
|
}
|
2020-03-18 00:27:50 +00:00
|
|
|
count = 0;
|
|
|
|
TAILQ_FOREACH_SAFE(sock, &group->pending_recv, link, tmp) {
|
|
|
|
if (count == max_read_events) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2021-07-06 11:56:18 +00:00
|
|
|
if (spdk_unlikely(sock->base.cb_fn == NULL) ||
|
|
|
|
(sock->recv_pipe == NULL || spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0)) {
|
sock: Fix the "sock remove assert bug" in spdk_sock_group_remove_sock
The statement that causes this issue is:
assert(group_impl->num_removed_socks < MAX_EVENTS_PER_POLL);
The call trace is:
The previous solution was:
commit e71e81b6311772681a3f8bcc279bc7253c7c1d9b
With that solution, the sock is always added to the removed_socks list
even when it is not under the polling context of
sock_group_impl_poll_count, so the removed_socks array can overflow if
sock_group_impl_poll_count is never called. Using a larger array is not
an option either, because that is just a workaround that hides the bug.
Our current solution is:
1 Remove the code in the sock layer, i.e., roll back commit
e71e81b6311772681a3f8bcc279bc7253c7c1d9b. That patch was not the right
fix. The sock->cb_fn NULL pointer case is caused by the cb_fn of a
write operation (when spdk_sock_group_remove_sock is called inside
that cb_fn). It is not caused by the epoll-related cache issue
described in that commit, but by the following situation:
(1) The socket's cb_fn is set to NULL because the socket removes itself
with spdk_sock_group_remove_sock inside a callback from a write
operation.
(2) The socket is already in the pending_recv list. This is not caused
by the epoll event issue, e.g., socket A changing socket B's cb_fn.
In any case, a socket A should never remove a socket B from a polling
group; if it really must, it should use spdk_thread_send_msg so the
removal happens in the next round.
2 Add a check in each of the posix and uring implementation modules:
if sock->cb_fn is NULL, do not return the socket to the active socks
list. This is enough to address the issue.
Signed-off-by: Ziye Yang <ziye.yang@intel.com>
Change-Id: I79187f2f1301c819c46a5c3bdd84372f75534f2f
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/6472
Reviewed-by: Xiaodong Liu <xiaodong.liu@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2021-02-19 12:02:07 +00:00
|
|
|
sock->pending_recv = false;
|
|
|
|
TAILQ_REMOVE(&group->pending_recv, sock, link);
|
2021-07-06 11:56:18 +00:00
|
|
|
if (spdk_unlikely(sock->base.cb_fn == NULL)) {
|
|
|
|
/* If the socket's cb_fn is NULL, do not add it to socks array */
|
|
|
|
continue;
|
|
|
|
}
|
sock: Fix the "sock remove assert bug" in spdk_sock_group_remove_sock
The statement causes this issue is:
assert(group_impl->num_removed_socks < MAX_EVENTS_PER_POLL);
The call trace is:
The previous solution is:
commitid with: e71e81b6311772681a3f8bcc279bc7253c7c1d9b
But with this solution, it will always add the sock
into the removed_socks list even if it is not under polling
context by sock_group_impl_poll_count. So it will exceed the size of
removed_socks array if sock_group_impl_poll_count function will not be
called. And we should not use a large array, because it is just a workaround,
it just hides the bug.
So our current solution is:
1 Remove the code in sock layer, i.e., rollback the commit
e71e81b6311772681a3f8bcc279bc7253c7c1d9b. This patch is
not the right fix. The sock->cb_fn's NULL pointer case is
caused by the cb_fn of write operation (if the
spdk_sock_group_remove_sock is inside the cb_fn). And it is not
caused by the epoll related cache issue described in commit
"e7181.." commit, but caused by the following situation:
(1)The socket's cb_fn is set to NULL which is caused by
spdk_sock_group_remove_sock by the socket itself
inside a call back function from a write operation.
(2) And the socket is already in the pending_recv list. It is
not caused by the epoll event issue, e.g., socket A changes Socket B's
cb_fn. By the way, A socket A should never remove a socket B from a polling group.
If it really does it, it should use spdk_thread_sendmsg to make sure
it happens in the next round.
2 Add the code check in each posix, uring implementation module.
If sock->cb_fn is NULL, we will not return the socket to the active socks list.
And this is enough to address the issue.
Signed-off-by: Ziye Yang <ziye.yang@intel.com>
Change-Id: I79187f2f1301c819c46a5c3bdd84372f75534f2f
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/6472
Reviewed-by: Xiaodong Liu <xiaodong.liu@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com>
Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2021-02-19 12:02:07 +00:00
|
|
|
}
|
|
|
|
|
2020-03-18 00:27:50 +00:00
|
|
|
socks[count++] = &sock->base;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-07-06 11:56:18 +00:00
|
|
|
/* Cycle the pending_recv list so that each time we poll things aren't
|
|
|
|
* in the same order. Say we have 6 sockets in the list, named as follows:
|
|
|
|
* A B C D E F
|
|
|
|
* And all 6 sockets had the poll events, but max_events is only 3. That means
|
|
|
|
* sock currently points at D. We want to rearrange the list to the following:
|
|
|
|
* D E F A B C
|
|
|
|
*
|
|
|
|
* The variables below are named according to this example to make it easier to
|
|
|
|
* follow the swaps.
|
|
|
|
*/
|
|
|
|
if (sock != NULL) {
|
|
|
|
struct spdk_uring_sock *ua, *uc, *ud, *uf;
|
|
|
|
|
|
|
|
/* Capture pointers to the elements we need */
|
|
|
|
ud = sock;
|
uring: Fix socket rotation and ordering issue in the pending_recv list.
When rotating a socket in the list, we did not check whether the uc
pointer is NULL, which causes a coredump when the uc pointer is used.
When a new socket (with a pollin event) is added to the end of the
pending_recv list, its handling is delayed, which can cause timeouts,
especially while the socket is still in its connection phase.
So the purpose of this patch is:
1 Revise the rotation logic to handle two cases: (1) the sock is at the
beginning of the list; (2) the sock is at the end of the list. The goal
is to avoid a NULL pointer access and to handle the exceptional case
efficiently.
2 When there is a new pollin event on a socket, add that socket at the
beginning of the list to avoid starving newly added sockets.
The max poll event count from the upper layer is 32, so if we always
put a new socket at the end of the list, starvation can occur when
there are many socket connection events: the existing socks are always
handled first and the newly arrived socket (with a pending pollin
event) is always handled late. The socket's connection initialization
then takes a relatively long time and the upper-layer connection built
on it times out, e.g.:
ctrlr.c: 185:nvmf_ctrlr_keep_alive_poll: *NOTICE*: Disconnecting host nqn.2014-08.org.nvmexpress:
uuid:af56cce7-2008-408c-a8a0-9c710857febf from subsystem nqn.2019-02.io.spdk:cnode0 due to
keep alive timeout.
[2021-08-25 20:13:42.201139] ctrlr.c: 579:_nvmf_ctrlr_add_io_qpair:
*ERROR*: Unknown controller ID 0x1
Fixes #2097
Signed-off-by: Ziye Yang <ziye.yang@intel.com>
Change-Id: I171b83ffd800539e86660c7607538e120fbe1a91
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/9223
Reviewed-by: John Kariuki <John.K.Kariuki@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com>
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2021-08-19 15:46:04 +00:00
|
|
|
|
2021-07-06 11:56:18 +00:00
|
|
|
ua = TAILQ_FIRST(&group->pending_recv);
|
2021-08-19 15:46:04 +00:00
|
|
|
if (ua == ud) {
|
|
|
|
goto end;
|
|
|
|
}
|
|
|
|
|
2021-07-06 11:56:18 +00:00
|
|
|
uf = TAILQ_LAST(&group->pending_recv, pending_recv_list);
|
2021-08-19 15:46:04 +00:00
|
|
|
if (uf == ud) {
|
|
|
|
TAILQ_REMOVE(&group->pending_recv, ud, link);
|
|
|
|
TAILQ_INSERT_HEAD(&group->pending_recv, ud, link);
|
|
|
|
goto end;
|
|
|
|
}
|
|
|
|
|
|
|
|
uc = TAILQ_PREV(ud, pending_recv_list, link);
|
|
|
|
assert(uc != NULL);
|
2021-07-06 11:56:18 +00:00
|
|
|
|
|
|
|
/* Break the link between C and D */
|
|
|
|
uc->link.tqe_next = NULL;
|
|
|
|
|
|
|
|
/* Connect F to A */
|
|
|
|
uf->link.tqe_next = ua;
|
|
|
|
ua->link.tqe_prev = &uf->link.tqe_next;
|
|
|
|
|
|
|
|
/* Fix up the list first/last pointers */
|
|
|
|
group->pending_recv.tqh_first = ud;
|
|
|
|
group->pending_recv.tqh_last = &uc->link.tqe_next;
|
|
|
|
|
|
|
|
/* D is in front of the list, make tqe prev pointer point to the head of list */
|
|
|
|
ud->link.tqe_prev = &group->pending_recv.tqh_first;
|
2020-03-18 00:27:50 +00:00
|
|
|
}
|
|
|
|
|
2021-08-19 15:46:04 +00:00
|
|
|
end:
|
2019-10-14 23:58:51 +00:00
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
|
|
|
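/* Flush path used when the socket is not attached to a polling group: gather
 * the queued requests into an iovec and send them with a direct sendmsg()
 * call instead of queueing an io_uring write task. */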
static int
|
|
|
|
_sock_flush_client(struct spdk_sock *_sock)
|
|
|
|
{
|
|
|
|
struct spdk_uring_sock *sock = __uring_sock(_sock);
|
|
|
|
struct msghdr msg = {};
|
|
|
|
struct iovec iovs[IOV_BATCH_SIZE];
|
|
|
|
int iovcnt;
|
|
|
|
ssize_t rc;
|
2020-02-19 18:58:29 +00:00
|
|
|
int flags = sock->zcopy_send_flags;
|
2022-03-01 08:32:57 +00:00
|
|
|
int retval;
|
2021-12-07 09:37:02 +00:00
|
|
|
bool is_zcopy = false;
|
2019-10-14 23:58:51 +00:00
|
|
|
|
|
|
|
/* Can't flush from within a callback or we end up with recursive calls */
|
|
|
|
if (_sock->cb_cnt > 0) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Gather an iov */
|
2021-12-07 09:37:02 +00:00
|
|
|
iovcnt = spdk_sock_prep_reqs(_sock, iovs, 0, NULL, &flags);
|
2019-10-14 23:58:51 +00:00
|
|
|
if (iovcnt == 0) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Perform the vectored write */
|
|
|
|
msg.msg_iov = iovs;
|
|
|
|
msg.msg_iovlen = iovcnt;
|
2020-02-19 18:58:29 +00:00
|
|
|
rc = sendmsg(sock->fd, &msg, flags);
|
2019-10-14 23:58:51 +00:00
|
|
|
if (rc <= 0) {
|
|
|
|
if (errno == EAGAIN || errno == EWOULDBLOCK) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2021-12-07 09:37:02 +00:00
|
|
|
#ifdef SPDK_ZEROCOPY
|
|
|
|
is_zcopy = flags & MSG_ZEROCOPY;
|
|
|
|
#endif
|
|
|
|
retval = sock_complete_reqs(_sock, rc, is_zcopy);
|
2022-03-01 08:32:57 +00:00
|
|
|
if (retval < 0) {
|
|
|
|
/* if the socket is closed, return to avoid heap-use-after-free error */
|
|
|
|
return retval;
|
|
|
|
}
|
2019-10-14 23:58:51 +00:00
|
|
|
|
2020-02-19 18:58:29 +00:00
|
|
|
#ifdef SPDK_ZEROCOPY
|
|
|
|
if (sock->zcopy && !TAILQ_EMPTY(&_sock->pending_reqs)) {
|
|
|
|
_sock_check_zcopy(_sock, 0);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2019-10-14 23:58:51 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
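/* Queue an asynchronous writev request. Grouped sockets are flushed by the
 * group poller; ungrouped sockets are flushed via _sock_flush_client() once
 * enough iovecs have accumulated. */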
static void
|
2020-05-10 20:07:08 +00:00
|
|
|
uring_sock_writev_async(struct spdk_sock *_sock, struct spdk_sock_request *req)
|
2019-10-14 23:58:51 +00:00
|
|
|
{
|
|
|
|
struct spdk_uring_sock *sock = __uring_sock(_sock);
|
|
|
|
int rc;
|
|
|
|
|
2020-05-18 21:28:33 +00:00
|
|
|
if (spdk_unlikely(sock->connection_status)) {
|
|
|
|
req->cb_fn(req->cb_arg, sock->connection_status);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2019-10-14 23:58:51 +00:00
|
|
|
spdk_sock_request_queue(_sock, req);
|
|
|
|
|
|
|
|
if (!sock->group) {
|
|
|
|
if (_sock->queued_iovcnt >= IOV_BATCH_SIZE) {
|
|
|
|
rc = _sock_flush_client(_sock);
|
|
|
|
if (rc) {
|
|
|
|
spdk_sock_abort_requests(_sock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-03-18 07:44:53 +00:00
|
|
|
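/* Asynchronous readv is not supported by the uring implementation; complete
 * the request immediately with -ENOTSUP. */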
static void
|
|
|
|
uring_sock_readv_async(struct spdk_sock *sock, struct spdk_sock_request *req)
|
|
|
|
{
|
|
|
|
req->cb_fn(req->cb_arg, -ENOTSUP);
|
|
|
|
}
|
|
|
|
|
2019-10-14 23:58:51 +00:00
|
|
|
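/* Set SO_RCVLOWAT so the kernel does not report the socket as readable until
 * at least 'nbytes' bytes are available. */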
static int
|
2020-05-10 20:07:08 +00:00
|
|
|
uring_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes)
|
2019-10-14 23:58:51 +00:00
|
|
|
{
|
|
|
|
struct spdk_uring_sock *sock = __uring_sock(_sock);
|
|
|
|
int val;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
assert(sock != NULL);
|
|
|
|
|
|
|
|
val = nbytes;
|
|
|
|
rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVLOWAT, &val, sizeof val);
|
|
|
|
if (rc != 0) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
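/* The next two helpers query the bound address family with getsockname() to
 * report whether the socket is IPv6 or IPv4. */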
static bool
|
2020-05-10 20:07:08 +00:00
|
|
|
uring_sock_is_ipv6(struct spdk_sock *_sock)
|
2019-10-14 23:58:51 +00:00
|
|
|
{
|
|
|
|
struct spdk_uring_sock *sock = __uring_sock(_sock);
|
|
|
|
struct sockaddr_storage sa;
|
|
|
|
socklen_t salen;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
assert(sock != NULL);
|
|
|
|
|
|
|
|
memset(&sa, 0, sizeof sa);
|
|
|
|
salen = sizeof sa;
|
|
|
|
rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
|
|
|
|
if (rc != 0) {
|
|
|
|
SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (sa.ss_family == AF_INET6);
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool
|
2020-05-10 20:07:08 +00:00
|
|
|
uring_sock_is_ipv4(struct spdk_sock *_sock)
|
2019-10-14 23:58:51 +00:00
|
|
|
{
|
|
|
|
struct spdk_uring_sock *sock = __uring_sock(_sock);
|
|
|
|
struct sockaddr_storage sa;
|
|
|
|
socklen_t salen;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
assert(sock != NULL);
|
|
|
|
|
|
|
|
memset(&sa, 0, sizeof sa);
|
|
|
|
salen = sizeof sa;
|
|
|
|
rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
|
|
|
|
if (rc != 0) {
|
|
|
|
SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (sa.ss_family == AF_INET);
|
|
|
|
}
|
|
|
|
|
|
|
|
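/* Probe the connection with a one-byte MSG_PEEK recv(): a return of 0 means
 * the peer closed the connection, EAGAIN/EWOULDBLOCK means connected with no
 * data pending, and any other error means the socket is not connected. */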
static bool
|
2020-05-10 20:07:08 +00:00
|
|
|
uring_sock_is_connected(struct spdk_sock *_sock)
|
2019-10-14 23:58:51 +00:00
|
|
|
{
|
|
|
|
struct spdk_uring_sock *sock = __uring_sock(_sock);
|
|
|
|
uint8_t byte;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
rc = recv(sock->fd, &byte, 1, MSG_PEEK);
|
|
|
|
if (rc == 0) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (rc < 0) {
|
|
|
|
if (errno == EAGAIN || errno == EWOULDBLOCK) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2021-04-01 20:22:28 +00:00
|
|
|
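/* Look up the polling group associated with this socket's placement_id via
 * spdk_sock_map_lookup(), passing along the caller-supplied hint. Returns
 * NULL when no placement_id is assigned. */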
static struct spdk_sock_group_impl *
|
2021-11-17 13:19:58 +00:00
|
|
|
uring_sock_group_impl_get_optimal(struct spdk_sock *_sock, struct spdk_sock_group_impl *hint)
|
2019-10-14 23:58:51 +00:00
|
|
|
{
|
2021-03-30 20:01:23 +00:00
|
|
|
struct spdk_uring_sock *sock = __uring_sock(_sock);
|
2021-04-01 20:22:28 +00:00
|
|
|
struct spdk_sock_group_impl *group;
|
2020-12-15 07:05:44 +00:00
|
|
|
|
2021-04-01 20:07:17 +00:00
|
|
|
if (sock->placement_id != -1) {
|
2021-11-17 13:19:58 +00:00
|
|
|
spdk_sock_map_lookup(&g_map, sock->placement_id, &group, hint);
|
2021-04-01 20:07:17 +00:00
|
|
|
return group;
|
|
|
|
}
|
2019-10-14 23:58:51 +00:00
|
|
|
|
2021-04-01 20:07:17 +00:00
|
|
|
return NULL;
|
2019-10-14 23:58:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
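/* Allocate a polling group backed by its own io_uring instance of
 * SPDK_SOCK_GROUP_QUEUE_DEPTH entries, and register it in the placement map
 * when PLACEMENT_CPU is enabled. */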
static struct spdk_sock_group_impl *
|
2020-05-10 20:07:08 +00:00
|
|
|
uring_sock_group_impl_create(void)
|
2019-10-14 23:58:51 +00:00
|
|
|
{
|
|
|
|
struct spdk_uring_sock_group_impl *group_impl;
|
|
|
|
|
|
|
|
group_impl = calloc(1, sizeof(*group_impl));
|
|
|
|
if (group_impl == NULL) {
|
|
|
|
SPDK_ERRLOG("group_impl allocation failed\n");
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
group_impl->io_avail = SPDK_SOCK_GROUP_QUEUE_DEPTH;
|
|
|
|
|
|
|
|
if (io_uring_queue_init(SPDK_SOCK_GROUP_QUEUE_DEPTH, &group_impl->uring, 0) < 0) {
|
|
|
|
SPDK_ERRLOG("uring I/O context setup failure\n");
|
|
|
|
free(group_impl);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2020-03-18 00:27:50 +00:00
|
|
|
TAILQ_INIT(&group_impl->pending_recv);
|
|
|
|
|
2021-04-01 20:07:17 +00:00
|
|
|
if (g_spdk_uring_sock_impl_opts.enable_placement_id == PLACEMENT_CPU) {
|
2021-04-01 20:22:28 +00:00
|
|
|
spdk_sock_map_insert(&g_map, spdk_env_get_current_core(), &group_impl->base);
|
2021-04-01 20:07:17 +00:00
|
|
|
}
|
|
|
|
|
2019-10-14 23:58:51 +00:00
|
|
|
return &group_impl->base;
|
|
|
|
}
|
|
|
|
|
|
|
|
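/* Attach a socket to this polling group: initialize its write, pollin, recv
 * and cancel tasks, mark it pending_recv if data is still buffered in its
 * receive pipe, and record the placement_id mapping. */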
static int
|
2020-05-10 20:07:08 +00:00
|
|
|
uring_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group,
|
|
|
|
struct spdk_sock *_sock)
|
2019-10-14 23:58:51 +00:00
|
|
|
{
|
|
|
|
struct spdk_uring_sock *sock = __uring_sock(_sock);
|
|
|
|
struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group);
|
2021-04-01 20:07:17 +00:00
|
|
|
int rc;
|
2019-10-14 23:58:51 +00:00
|
|
|
|
|
|
|
sock->group = group;
|
|
|
|
sock->write_task.sock = sock;
|
|
|
|
sock->write_task.type = SPDK_SOCK_TASK_WRITE;
|
|
|
|
|
|
|
|
sock->pollin_task.sock = sock;
|
|
|
|
sock->pollin_task.type = SPDK_SOCK_TASK_POLLIN;
|
|
|
|
|
2020-02-19 18:58:29 +00:00
|
|
|
sock->recv_task.sock = sock;
|
|
|
|
sock->recv_task.type = SPDK_SOCK_TASK_RECV;
|
|
|
|
sock->recv_task.msg.msg_control = sock->buf;
|
|
|
|
sock->recv_task.msg.msg_controllen = sizeof(sock->buf);
|
|
|
|
|
2020-04-27 20:40:10 +00:00
|
|
|
sock->cancel_task.sock = sock;
|
|
|
|
sock->cancel_task.type = SPDK_SOCK_TASK_CANCEL;
|
|
|
|
|
2020-04-27 20:58:39 +00:00
|
|
|
/* switched from another polling group due to scheduling */
|
|
|
|
if (spdk_unlikely(sock->recv_pipe != NULL &&
|
|
|
|
(spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0))) {
|
|
|
|
assert(sock->pending_recv == false);
|
|
|
|
sock->pending_recv = true;
|
|
|
|
TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
|
|
|
|
}
|
|
|
|
|
2021-04-01 20:07:17 +00:00
|
|
|
if (sock->placement_id != -1) {
|
2021-04-01 20:22:28 +00:00
|
|
|
rc = spdk_sock_map_insert(&g_map, sock->placement_id, &group->base);
|
2021-04-01 20:07:17 +00:00
|
|
|
if (rc != 0) {
|
|
|
|
SPDK_ERRLOG("Failed to insert sock group into map: %d", rc);
|
|
|
|
/* Do not treat this as an error. The system will continue running. */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-10-14 23:58:51 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
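/* Group poller: flush and re-arm POLLIN for every healthy socket, submit the
 * batched SQEs with a single io_uring_submit() call, then reap completions
 * and pending receives into 'socks'. */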
static int
|
2020-05-10 20:07:08 +00:00
|
|
|
uring_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events,
|
|
|
|
struct spdk_sock **socks)
|
2019-10-14 23:58:51 +00:00
|
|
|
{
|
|
|
|
struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group);
|
|
|
|
int count, ret;
|
|
|
|
int to_complete, to_submit;
|
|
|
|
struct spdk_sock *_sock, *tmp;
|
2020-05-18 21:28:33 +00:00
|
|
|
struct spdk_uring_sock *sock;
|
2019-10-14 23:58:51 +00:00
|
|
|
|
2020-06-24 23:18:07 +00:00
|
|
|
if (spdk_likely(socks)) {
|
|
|
|
TAILQ_FOREACH_SAFE(_sock, &group->base.socks, link, tmp) {
|
|
|
|
sock = __uring_sock(_sock);
|
|
|
|
if (spdk_unlikely(sock->connection_status)) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
_sock_flush(_sock);
|
|
|
|
_sock_prep_pollin(_sock);
|
2020-05-18 21:28:33 +00:00
|
|
|
}
|
2019-10-14 23:58:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
to_submit = group->io_queued;
|
|
|
|
|
|
|
|
/* Network I/O cannot be opened with O_DIRECT, so there is no need to call spdk_io_uring_enter here */
|
|
|
|
if (to_submit > 0) {
|
|
|
|
/* If there are I/O to submit, use io_uring_submit here.
|
|
|
|
* It will automatically call io_uring_enter appropriately. */
|
|
|
|
ret = io_uring_submit(&group->uring);
|
|
|
|
if (ret < 0) {
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
group->io_queued = 0;
|
|
|
|
group->io_inflight += to_submit;
|
|
|
|
group->io_avail -= to_submit;
|
|
|
|
}
|
|
|
|
|
|
|
|
count = 0;
|
|
|
|
to_complete = group->io_inflight;
|
2020-11-11 10:43:15 +00:00
|
|
|
if (to_complete > 0 || !TAILQ_EMPTY(&group->pending_recv)) {
|
2020-05-10 20:07:08 +00:00
|
|
|
count = sock_uring_group_reap(group, to_complete, max_events, socks);
|
2019-10-14 23:58:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
2020-04-27 21:27:29 +00:00
|
|
|
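/* Detach a socket from its polling group. Any in-flight write, pollin or recv
 * task is cancelled, and the group is polled until both the task and its
 * cancel request complete before the socket's group pointer is cleared. */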
static int
|
2020-05-10 20:07:08 +00:00
|
|
|
uring_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group,
|
|
|
|
struct spdk_sock *_sock)
|
2020-04-27 21:27:29 +00:00
|
|
|
{
|
|
|
|
struct spdk_uring_sock *sock = __uring_sock(_sock);
|
|
|
|
struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group);
|
|
|
|
|
|
|
|
if (sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) {
|
2020-05-18 21:28:33 +00:00
|
|
|
_sock_prep_cancel_task(_sock, &sock->write_task);
|
|
|
|
/* Since spdk_sock_group_remove_sock is not an asynchronous interface, we
|
|
|
|
* can simply poll in a loop here until the outstanding tasks complete. */
|
|
|
|
while ((sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) ||
|
|
|
|
(sock->cancel_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE)) {
|
|
|
|
uring_sock_group_impl_poll(_group, 32, NULL);
|
|
|
|
}
|
2020-04-27 21:27:29 +00:00
|
|
|
}
|
|
|
|
|
2021-08-19 13:18:04 +00:00
|
|
|
if (sock->pollin_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) {
|
|
|
|
_sock_prep_cancel_task(_sock, &sock->pollin_task);
|
2020-02-19 18:58:29 +00:00
|
|
|
/* Since spdk_sock_group_remove_sock is not an asynchronous interface, we
|
|
|
|
* currently can use a while loop here. */
|
2021-08-19 13:18:04 +00:00
|
|
|
while ((sock->pollin_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) ||
|
2020-02-19 18:58:29 +00:00
|
|
|
(sock->cancel_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE)) {
|
|
|
|
uring_sock_group_impl_poll(_group, 32, NULL);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-08-19 13:18:04 +00:00
|
|
|
if (sock->recv_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) {
|
|
|
|
_sock_prep_cancel_task(_sock, &sock->recv_task);
|
2020-05-18 21:28:33 +00:00
|
|
|
/* Since spdk_sock_group_remove_sock is not asynchronous interface, so
|
|
|
|
* currently can use a while loop here. */
|
2021-08-19 13:18:04 +00:00
|
|
|
while ((sock->recv_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) ||
|
2020-05-18 21:28:33 +00:00
|
|
|
(sock->cancel_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE)) {
|
|
|
|
uring_sock_group_impl_poll(_group, 32, NULL);
|
|
|
|
}
|
2020-04-27 20:40:10 +00:00
|
|
|
}
|
2021-08-19 13:18:04 +00:00
|
|
|
|
|
|
|
/* Make sure the cancelling the tasks above didn't cause sending new requests */
|
|
|
|
assert(sock->write_task.status == SPDK_URING_SOCK_TASK_NOT_IN_USE);
|
|
|
|
assert(sock->pollin_task.status == SPDK_URING_SOCK_TASK_NOT_IN_USE);
|
|
|
|
assert(sock->recv_task.status == SPDK_URING_SOCK_TASK_NOT_IN_USE);
|
|
|
|
|
2020-05-18 21:28:33 +00:00
|
|
|
if (sock->pending_recv) {
|
|
|
|
TAILQ_REMOVE(&group->pending_recv, sock, link);
|
|
|
|
sock->pending_recv = false;
|
2020-04-27 21:27:29 +00:00
|
|
|
}
|
2020-05-18 21:28:33 +00:00
|
|
|
assert(sock->pending_recv == false);
|
2020-04-27 21:27:29 +00:00
|
|
|
|
2021-04-01 20:07:17 +00:00
|
|
|
if (sock->placement_id != -1) {
|
|
|
|
spdk_sock_map_release(&g_map, sock->placement_id);
|
|
|
|
}
|
|
|
|
|
2020-04-27 20:40:10 +00:00
|
|
|
sock->group = NULL;
|
2020-04-27 21:27:29 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-10-14 23:58:51 +00:00
|
|
|
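/* Closing the group reaps all remaining in-flight I/O before tearing down
 * the io_uring instance and dropping any CPU placement map reference. */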
static int
uring_sock_group_impl_close(struct spdk_sock_group_impl *_group)
{
	struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group);

	/* Try to reap all the active I/O. */
	while (group->io_inflight) {
		uring_sock_group_impl_poll(_group, 32, NULL);
	}

	assert(group->io_inflight == 0);
	assert(group->io_avail == SPDK_SOCK_GROUP_QUEUE_DEPTH);

	io_uring_queue_exit(&group->uring);

	if (g_spdk_uring_sock_impl_opts.enable_placement_id == PLACEMENT_CPU) {
		spdk_sock_map_release(&g_map, spdk_env_get_current_core());
	}

	free(group);
	return 0;
}

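/* Copy the module-wide socket options into the caller's buffer; only fields
 * that fit within the caller-supplied length are filled in. */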
static int
uring_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len)
{
	if (!opts || !len) {
		errno = EINVAL;
		return -1;
	}

	memset(opts, 0, *len);

#define FIELD_OK(field) \
	offsetof(struct spdk_sock_impl_opts, field) + sizeof(opts->field) <= *len

#define GET_FIELD(field) \
	if (FIELD_OK(field)) { \
		opts->field = g_spdk_uring_sock_impl_opts.field; \
	}

	GET_FIELD(recv_buf_size);
	GET_FIELD(send_buf_size);
	GET_FIELD(enable_recv_pipe);
	GET_FIELD(enable_quickack);
	GET_FIELD(enable_placement_id);
	GET_FIELD(enable_zerocopy_send_server);
	GET_FIELD(enable_zerocopy_send_client);
	GET_FIELD(zerocopy_threshold);

#undef GET_FIELD
#undef FIELD_OK

	*len = spdk_min(*len, sizeof(g_spdk_uring_sock_impl_opts));
	return 0;
}

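/* Apply the caller's socket options to the module-wide defaults; fields
 * beyond the supplied length keep their current values. */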
static int
uring_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len)
{
	if (!opts) {
		errno = EINVAL;
		return -1;
	}

#define FIELD_OK(field) \
	offsetof(struct spdk_sock_impl_opts, field) + sizeof(opts->field) <= len

#define SET_FIELD(field) \
	if (FIELD_OK(field)) { \
		g_spdk_uring_sock_impl_opts.field = opts->field; \
	}

	SET_FIELD(recv_buf_size);
	SET_FIELD(send_buf_size);
	SET_FIELD(enable_recv_pipe);
	SET_FIELD(enable_quickack);
	SET_FIELD(enable_placement_id);
	SET_FIELD(enable_zerocopy_send_server);
	SET_FIELD(enable_zerocopy_send_client);
	SET_FIELD(zerocopy_threshold);

#undef SET_FIELD
#undef FIELD_OK

	return 0;
}

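/* Sockets that belong to a poll group are flushed from the group's poll
 * loop, so an explicit flush is only performed for group-less sockets. */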
static int
uring_sock_flush(struct spdk_sock *_sock)
{
	struct spdk_uring_sock *sock = __uring_sock(_sock);

	if (!sock->group) {
		return _sock_flush_client(_sock);
	}

	return 0;
}

static struct spdk_net_impl g_uring_net_impl = {
	.name = "uring",
	.getaddr = uring_sock_getaddr,
	.connect = uring_sock_connect,
	.listen = uring_sock_listen,
	.accept = uring_sock_accept,
	.close = uring_sock_close,
	.recv = uring_sock_recv,
	.readv = uring_sock_readv,
	.readv_async = uring_sock_readv_async,
	.writev = uring_sock_writev,
	.writev_async = uring_sock_writev_async,
	.flush = uring_sock_flush,
	.set_recvlowat = uring_sock_set_recvlowat,
	.set_recvbuf = uring_sock_set_recvbuf,
	.set_sendbuf = uring_sock_set_sendbuf,
	.is_ipv6 = uring_sock_is_ipv6,
	.is_ipv4 = uring_sock_is_ipv4,
	.is_connected = uring_sock_is_connected,
	.group_impl_get_optimal = uring_sock_group_impl_get_optimal,
	.group_impl_create = uring_sock_group_impl_create,
	.group_impl_add_sock = uring_sock_group_impl_add_sock,
	.group_impl_remove_sock = uring_sock_group_impl_remove_sock,
	.group_impl_poll = uring_sock_group_impl_poll,
	.group_impl_close = uring_sock_group_impl_close,
	.get_opts = uring_sock_impl_get_opts,
	.set_opts = uring_sock_impl_set_opts,
};

SPDK_NET_IMPL_REGISTER(uring, &g_uring_net_impl, DEFAULT_SOCK_PRIORITY + 1);