/* Spdk/lib/sock/sock.c (935 lines, 20 KiB, C) */

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (C) 2016 Intel Corporation. All rights reserved.
* Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
* Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*/
#include "spdk/stdinc.h"
#include "spdk/sock.h"
#include "spdk_internal/sock.h"
#include "spdk/log.h"
#include "spdk/env.h"
#include "spdk/util.h"
/* Default values applied by spdk_sock_get_default_opts(). */
#define SPDK_SOCK_DEFAULT_PRIORITY 0
#define SPDK_SOCK_DEFAULT_ZCOPY true
#define SPDK_SOCK_DEFAULT_ACK_TIMEOUT 0

/*
 * Evaluates to true when `field` of struct spdk_sock_opts lies entirely within
 * the first (opts)->opts_size bytes of the structure, i.e. when the structure
 * version described by opts_size actually contains that field.
 *
 * `opts` is parenthesized so that any pointer expression (e.g. `&local`) can
 * be passed safely.
 */
#define SPDK_SOCK_OPTS_FIELD_OK(opts, field) \
	(offsetof(struct spdk_sock_opts, field) + sizeof((opts)->field) <= ((opts)->opts_size))

/* Registered socket implementations; spdk_net_impl_register() inserts into this list. */
static STAILQ_HEAD(, spdk_net_impl) g_net_impls = STAILQ_HEAD_INITIALIZER(g_net_impls);

/* Implementation chosen by spdk_sock_set_default_impl(); NULL until one is selected. */
static struct spdk_net_impl *g_default_impl;
/* One element of a spdk_sock_map: ties a placement_id to a sock group impl. */
struct spdk_sock_placement_id_entry {
/* Key compared against by the spdk_sock_map_* functions. */
int placement_id;
/* Incremented by spdk_sock_map_insert() when a group is given, decremented by
 * spdk_sock_map_release(); the group is cleared when it drops to zero. */
uint32_t ref;
/* Group assigned to this placement_id, or NULL when none is assigned. */
struct spdk_sock_group_impl *group;
/* Linkage in the owning map's `entries` list. */
STAILQ_ENTRY(spdk_sock_placement_id_entry) link;
};
static inline struct spdk_sock_group_impl *
sock_get_group_impl_from_group(struct spdk_sock *sock, struct spdk_sock_group *group)
{
struct spdk_sock_group_impl *group_impl = NULL;
STAILQ_FOREACH_FROM(group_impl, &group->group_impls, link) {
if (sock->net_impl == group_impl->net_impl) {
return group_impl;
}
}
return NULL;
}
/*
 * Allocate a zeroed entry for `placement_id` and append it to map->entries.
 * The new entry has no group assigned and a reference count of zero.
 *
 * Called under map->mtx lock.
 *
 * Returns the new entry, or NULL if the allocation failed (an error is logged).
 */
static struct spdk_sock_placement_id_entry *
_sock_map_entry_alloc(struct spdk_sock_map *map, int placement_id)
{
	struct spdk_sock_placement_id_entry *entry;

	entry = calloc(1, sizeof(*entry));
	if (!entry) {
		/* placement_id is a signed int, so it must be printed with %d. */
		SPDK_ERRLOG("Cannot allocate an entry for placement_id=%d\n", placement_id);
		return NULL;
	}

	entry->placement_id = placement_id;
	STAILQ_INSERT_TAIL(&map->entries, entry, link);

	return entry;
}
/*
 * Record that `placement_id` is served by `group` and take a reference on it.
 *
 * - No entry for placement_id yet: one is created; if `group` is non-NULL it
 *   is assigned and referenced.
 * - Entry exists and `group` is NULL: succeeds only if the entry has no group.
 * - Entry exists with a different non-NULL group: fails.
 * - Otherwise the group is (re)assigned and the reference count incremented.
 *
 * Returns 0 on success, -EINVAL on a conflicting assignment, -ENOMEM if a new
 * entry could not be allocated.
 */
int
spdk_sock_map_insert(struct spdk_sock_map *map, int placement_id,
		     struct spdk_sock_group_impl *group)
{
	struct spdk_sock_placement_id_entry *found;
	int status = 0;

	pthread_mutex_lock(&map->mtx);

	STAILQ_FOREACH(found, &map->entries, link) {
		if (found->placement_id == placement_id) {
			break;
		}
	}

	if (found == NULL) {
		found = _sock_map_entry_alloc(map, placement_id);
		if (found == NULL) {
			status = -ENOMEM;
		} else if (group != NULL) {
			found->group = group;
			found->ref++;
		}
	} else if (group == NULL) {
		/* Can't set group to NULL if it is already not-NULL */
		if (found->group != NULL) {
			status = -EINVAL;
		}
	} else if (found->group != NULL && found->group != group) {
		status = -EINVAL;
	} else {
		/* Either unassigned so far or already assigned to this very group. */
		found->group = group;
		found->ref++;
	}

	pthread_mutex_unlock(&map->mtx);

	return status;
}
/*
 * Drop one reference on the entry for `placement_id`. When the last reference
 * goes away the entry's group is cleared; the entry itself stays in the map.
 * Does nothing if no entry matches.
 */
void
spdk_sock_map_release(struct spdk_sock_map *map, int placement_id)
{
	struct spdk_sock_placement_id_entry *cur;

	pthread_mutex_lock(&map->mtx);

	STAILQ_FOREACH(cur, &map->entries, link) {
		if (cur->placement_id != placement_id) {
			continue;
		}

		assert(cur->ref > 0);
		cur->ref -= 1;
		if (cur->ref == 0) {
			cur->group = NULL;
		}
		break;
	}

	pthread_mutex_unlock(&map->mtx);
}
/*
 * Look up the group assigned to `placement_id`.
 *
 * If an entry with an assigned group exists, it is stored in *group and 0 is
 * returned. Otherwise *group is left NULL and, when `hint` is non-NULL, the
 * entry (created if necessary) is assigned to `hint`.
 *
 * Returns 0 on success, -EINVAL when nothing is assigned and no hint was
 * given, -ENOMEM when a new entry could not be allocated.
 */
int
spdk_sock_map_lookup(struct spdk_sock_map *map, int placement_id,
		     struct spdk_sock_group_impl **group, struct spdk_sock_group_impl *hint)
{
	struct spdk_sock_placement_id_entry *match;
	int status = 0;

	*group = NULL;

	pthread_mutex_lock(&map->mtx);

	STAILQ_FOREACH(match, &map->entries, link) {
		if (match->placement_id == placement_id) {
			break;
		}
	}

	if (match != NULL && match->group != NULL) {
		/* Return previously assigned sock_group */
		*group = match->group;
	} else if (hint == NULL) {
		/* No entry with assigned sock_group, nor hint to use */
		status = -EINVAL;
	} else {
		/* Create new entry if there is none with matching placement_id */
		if (match == NULL) {
			match = _sock_map_entry_alloc(map, placement_id);
		}

		if (match == NULL) {
			status = -ENOMEM;
		} else {
			match->group = hint;
		}
	}

	pthread_mutex_unlock(&map->mtx);

	return status;
}
/* Remove and free every entry of the map, leaving map->entries empty. */
void
spdk_sock_map_cleanup(struct spdk_sock_map *map)
{
	struct spdk_sock_placement_id_entry *head;

	pthread_mutex_lock(&map->mtx);

	while (!STAILQ_EMPTY(&map->entries)) {
		head = STAILQ_FIRST(&map->entries);
		STAILQ_REMOVE_HEAD(&map->entries, link);
		free(head);
	}

	pthread_mutex_unlock(&map->mtx);
}
/*
 * Return the placement_id of the first entry that has no group assigned,
 * or -1 when every entry in the map is assigned (or the map is empty).
 */
int
spdk_sock_map_find_free(struct spdk_sock_map *map)
{
	struct spdk_sock_placement_id_entry *cur;
	int free_id = -1;

	pthread_mutex_lock(&map->mtx);

	STAILQ_FOREACH(cur, &map->entries, link) {
		if (cur->group != NULL) {
			continue;
		}
		free_id = cur->placement_id;
		break;
	}

	pthread_mutex_unlock(&map->mtx);

	return free_id;
}
/*
 * Ask the socket's implementation which sock group suits `sock` best.
 *
 * `hint`, when non-NULL, is translated to the matching group_impl and passed
 * to the implementation. *group is written only when the implementation
 * returns a group_impl; otherwise it is left untouched.
 *
 * Returns 0 on success, -EINVAL when `hint` has no group_impl for this
 * socket's net_impl.
 */
int
spdk_sock_get_optimal_sock_group(struct spdk_sock *sock, struct spdk_sock_group **group,
				 struct spdk_sock_group *hint)
{
	struct spdk_sock_group_impl *hinted = NULL;
	struct spdk_sock_group_impl *best;

	assert(group != NULL);

	if (hint != NULL) {
		hinted = sock_get_group_impl_from_group(sock, hint);
		if (hinted == NULL) {
			return -EINVAL;
		}
	}

	best = sock->net_impl->group_impl_get_optimal(sock, hinted);
	if (best != NULL) {
		*group = best->group;
	}

	return 0;
}
/* Forward the address query to the socket's implementation and return its result. */
int
spdk_sock_getaddr(struct spdk_sock *sock, char *saddr, int slen, uint16_t *sport,
		  char *caddr, int clen, uint16_t *cport)
{
	int rc;

	rc = sock->net_impl->getaddr(sock, saddr, slen, sport, caddr, clen, cport);

	return rc;
}
/*
 * Fill `opts` with default values. Only the fields that fit inside
 * opts->opts_size (which the caller must have set) are written.
 */
void
spdk_sock_get_default_opts(struct spdk_sock_opts *opts)
{
	assert(opts);

/* Assign `value` to opts->field when that field exists in the caller's struct. */
#define SOCK_OPTS_SET_DEFAULT(field, value) \
	do { \
		if (SPDK_SOCK_OPTS_FIELD_OK(opts, field)) { \
			opts->field = (value); \
		} \
	} while (0)

	SOCK_OPTS_SET_DEFAULT(priority, SPDK_SOCK_DEFAULT_PRIORITY);
	SOCK_OPTS_SET_DEFAULT(zcopy, SPDK_SOCK_DEFAULT_ZCOPY);
	SOCK_OPTS_SET_DEFAULT(ack_timeout, SPDK_SOCK_DEFAULT_ACK_TIMEOUT);
	SOCK_OPTS_SET_DEFAULT(impl_opts, NULL);
	SOCK_OPTS_SET_DEFAULT(impl_opts_size, 0);

#undef SOCK_OPTS_SET_DEFAULT
}
/*
* opts The opts allocated in the current library.
* opts_user The opts passed by the caller.
* */
static void
sock_init_opts(struct spdk_sock_opts *opts, struct spdk_sock_opts *opts_user)
{
assert(opts);
assert(opts_user);
opts->opts_size = sizeof(*opts);
spdk_sock_get_default_opts(opts);
/* reset the size according to the user */
opts->opts_size = opts_user->opts_size;
if (SPDK_SOCK_OPTS_FIELD_OK(opts, priority)) {
opts->priority = opts_user->priority;
}
if (SPDK_SOCK_OPTS_FIELD_OK(opts, zcopy)) {
opts->zcopy = opts_user->zcopy;
}
if (SPDK_SOCK_OPTS_FIELD_OK(opts, ack_timeout)) {
opts->ack_timeout = opts_user->ack_timeout;
}
if (SPDK_SOCK_OPTS_FIELD_OK(opts, impl_opts)) {
opts->impl_opts = opts_user->impl_opts;
}
if (SPDK_SOCK_OPTS_FIELD_OK(opts, impl_opts)) {
opts->impl_opts_size = opts_user->impl_opts_size;
}
}
struct spdk_sock *
spdk_sock_connect(const char *ip, int port, const char *impl_name)
{
struct spdk_sock_opts opts;
opts.opts_size = sizeof(opts);
spdk_sock_get_default_opts(&opts);
return spdk_sock_connect_ext(ip, port, impl_name, &opts);
}
/*
 * Create a client socket connected to ip:port.
 *
 * The implementation is chosen by `_impl_name` if given, else by the default
 * implementation if one is set; with neither, every registered implementation
 * is tried in list order. The first implementation whose connect() succeeds
 * provides the socket.
 *
 * Returns the new socket, or NULL if `opts` is NULL or no implementation
 * could connect.
 */
struct spdk_sock *
spdk_sock_connect_ext(const char *ip, int port, const char *_impl_name, struct spdk_sock_opts *opts)
{
	struct spdk_net_impl *impl = NULL;
	struct spdk_sock *sock;
	struct spdk_sock_opts opts_local;
	const char *impl_name = NULL;

	if (opts == NULL) {
		SPDK_ERRLOG("the opts should not be NULL pointer\n");
		return NULL;
	}

	if (_impl_name) {
		impl_name = _impl_name;
	} else if (g_default_impl) {
		impl_name = g_default_impl->name;
	}

	STAILQ_FOREACH_FROM(impl, &g_net_impls, link) {
		/* Skip implementations that do not match the requested name. */
		if (impl_name && strcmp(impl_name, impl->name) != 0) {
			continue;
		}

		SPDK_DEBUGLOG(sock, "Creating a client socket using impl %s\n", impl->name);
		sock_init_opts(&opts_local, opts);
		sock = impl->connect(ip, port, &opts_local);
		if (sock != NULL) {
			/* Copy the contents, both the two structures are the same ABI version */
			memcpy(&sock->opts, &opts_local, sizeof(sock->opts));
			/* Clear out impl_opts to make sure we don't keep reference to a dangling
			 * pointer */
			sock->opts.impl_opts = NULL;
			sock->net_impl = impl;
			TAILQ_INIT(&sock->queued_reqs);
			TAILQ_INIT(&sock->pending_reqs);

			return sock;
		}
	}

	return NULL;
}
struct spdk_sock *
spdk_sock_listen(const char *ip, int port, const char *impl_name)
{
struct spdk_sock_opts opts;
opts.opts_size = sizeof(opts);
spdk_sock_get_default_opts(&opts);
return spdk_sock_listen_ext(ip, port, impl_name, &opts);
}
struct spdk_sock *
spdk_sock_listen_ext(const char *ip, int port, const char *_impl_name, struct spdk_sock_opts *opts)
{
struct spdk_net_impl *impl = NULL;
struct spdk_sock *sock;
struct spdk_sock_opts opts_local;
const char *impl_name = NULL;
if (opts == NULL) {
SPDK_ERRLOG("the opts should not be NULL pointer\n");
return NULL;
}
if (_impl_name) {
impl_name = _impl_name;
} else if (g_default_impl) {
impl_name = g_default_impl->name;
}
STAILQ_FOREACH_FROM(impl, &g_net_impls, link) {
if (impl_name && strncmp(impl_name, impl->name, strlen(impl->name) + 1)) {
continue;
}
SPDK_DEBUGLOG(sock, "Creating a listening socket using impl %s\n", impl->name);
sock_init_opts(&opts_local, opts);
sock = impl->listen(ip, port, &opts_local);
if (sock != NULL) {
/* Copy the contents, both the two structures are the same ABI version */
memcpy(&sock->opts, &opts_local, sizeof(sock->opts));
/* Clear out impl_opts to make sure we don't keep reference to a dangling
* pointer */
sock->opts.impl_opts = NULL;
sock->net_impl = impl;
/* Don't need to initialize the request queues for listen
* sockets. */
return sock;
}
}
return NULL;
}
/*
 * Accept a connection on the listening socket `sock`.
 *
 * The accepted socket inherits the listener's options and net_impl, and its
 * request queues are initialized. Returns NULL when the implementation's
 * accept() returns NULL.
 */
struct spdk_sock *
spdk_sock_accept(struct spdk_sock *sock)
{
	struct spdk_sock *new_sock;

	new_sock = sock->net_impl->accept(sock);
	if (new_sock != NULL) {
		/* Inherit the opts from the "accept sock" (a single struct copy is
		 * sufficient; no additional memcpy is needed). */
		new_sock->opts = sock->opts;
		new_sock->net_impl = sock->net_impl;
		TAILQ_INIT(&new_sock->queued_reqs);
		TAILQ_INIT(&new_sock->pending_reqs);
	}

	return new_sock;
}
/*
 * Close the socket referenced by *_sock and set *_sock to NULL.
 *
 * Fails with errno EBADF when *_sock is NULL and with EBUSY when the socket
 * still has a callback registered (i.e. it is still in a sock group). If a
 * callback is currently executing (cb_cnt > 0) the socket is only marked
 * closed and 0 is returned; otherwise outstanding requests are aborted and
 * the implementation's close() result is returned.
 */
int
spdk_sock_close(struct spdk_sock **_sock)
{
	struct spdk_sock *target = *_sock;

	if (target == NULL) {
		errno = EBADF;
		return -1;
	}

	if (target->cb_fn != NULL) {
		/* This sock is still part of a sock_group. */
		errno = EBUSY;
		return -1;
	}

	/* Beyond this point the socket is considered closed. */
	*_sock = NULL;
	target->flags.closed = true;

	if (target->cb_cnt <= 0) {
		spdk_sock_abort_requests(target);
		return target->net_impl->close(target);
	}

	/* Let the callback unwind before destroying the socket */
	return 0;
}
/*
 * Receive into `buf` via the socket's implementation.
 * Returns -1 with errno EBADF when `sock` is NULL or already closed.
 */
ssize_t
spdk_sock_recv(struct spdk_sock *sock, void *buf, size_t len)
{
	if (sock != NULL && !sock->flags.closed) {
		return sock->net_impl->recv(sock, buf, len);
	}

	errno = EBADF;
	return -1;
}
/*
 * Scatter-read into `iov` via the socket's implementation.
 * Returns -1 with errno EBADF when `sock` is NULL or already closed.
 */
ssize_t
spdk_sock_readv(struct spdk_sock *sock, struct iovec *iov, int iovcnt)
{
	if (sock != NULL && !sock->flags.closed) {
		return sock->net_impl->readv(sock, iov, iovcnt);
	}

	errno = EBADF;
	return -1;
}
/*
 * Submit an asynchronous read request.
 *
 * The request's callback is invoked immediately with -EBADF when `sock` is
 * NULL or closed, and with -EPERM when the socket is not in a poll group;
 * otherwise the request is handed to the implementation.
 */
void
spdk_sock_readv_async(struct spdk_sock *sock, struct spdk_sock_request *req)
{
	int early_status = 0;

	assert(req->cb_fn != NULL);

	if (spdk_unlikely(sock == NULL || sock->flags.closed)) {
		early_status = -EBADF;
	} else if (spdk_unlikely(sock->group_impl == NULL)) {
		/* The socket needs to be part of a poll group */
		early_status = -EPERM;
	}

	if (early_status != 0) {
		req->cb_fn(req->cb_arg, early_status);
		return;
	}

	sock->net_impl->readv_async(sock, req);
}
/*
 * Gather-write `iov` via the socket's implementation.
 * Returns -1 with errno EBADF when `sock` is NULL or already closed.
 */
ssize_t
spdk_sock_writev(struct spdk_sock *sock, struct iovec *iov, int iovcnt)
{
	if (sock != NULL && !sock->flags.closed) {
		return sock->net_impl->writev(sock, iov, iovcnt);
	}

	errno = EBADF;
	return -1;
}
/*
 * Submit an asynchronous write request. The request's callback is invoked
 * immediately with -EBADF when `sock` is NULL or closed; otherwise the
 * request is handed to the implementation.
 */
void
spdk_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *req)
{
	assert(req->cb_fn != NULL);

	if (sock != NULL && !sock->flags.closed) {
		sock->net_impl->writev_async(sock, req);
		return;
	}

	req->cb_fn(req->cb_arg, -EBADF);
}
/*
 * Flush the socket via its implementation.
 * Returns -1 with errno EBADF when `sock` is NULL or already closed.
 */
int
spdk_sock_flush(struct spdk_sock *sock)
{
	if (sock != NULL && !sock->flags.closed) {
		return sock->net_impl->flush(sock);
	}

	errno = EBADF;
	return -1;
}
int
spdk_sock_set_recvlowat(struct spdk_sock *sock, int nbytes)
{
return sock->net_impl->set_recvlowat(sock, nbytes);
}
int
spdk_sock_set_recvbuf(struct spdk_sock *sock, int sz)
{
return sock->net_impl->set_recvbuf(sock, sz);
}
int
spdk_sock_set_sendbuf(struct spdk_sock *sock, int sz)
{
return sock->net_impl->set_sendbuf(sock, sz);
}
bool
spdk_sock_is_ipv6(struct spdk_sock *sock)
{
return sock->net_impl->is_ipv6(sock);
}
bool
spdk_sock_is_ipv4(struct spdk_sock *sock)
{
return sock->net_impl->is_ipv4(sock);
}
bool
spdk_sock_is_connected(struct spdk_sock *sock)
{
return sock->net_impl->is_connected(sock);
}
struct spdk_sock_group *
spdk_sock_group_create(void *ctx)
{
struct spdk_net_impl *impl = NULL;
struct spdk_sock_group *group;
struct spdk_sock_group_impl *group_impl;
group = calloc(1, sizeof(*group));
if (group == NULL) {
return NULL;
}
STAILQ_INIT(&group->group_impls);
STAILQ_FOREACH_FROM(impl, &g_net_impls, link) {
group_impl = impl->group_impl_create();
if (group_impl != NULL) {
STAILQ_INSERT_TAIL(&group->group_impls, group_impl, link);
TAILQ_INIT(&group_impl->socks);
group_impl->net_impl = impl;
group_impl->group = group;
}
}
group->ctx = ctx;
return group;
}
/* Return the context given at group creation, or NULL when `group` is NULL. */
void *
spdk_sock_group_get_ctx(struct spdk_sock_group *group)
{
	return (group != NULL) ? group->ctx : NULL;
}
/*
 * Add `sock` to `group`, registering `cb_fn`/`cb_arg` as its event callback.
 *
 * Returns -1 with errno EINVAL when cb_fn is NULL, when the socket already
 * belongs to a group, or when the group has no group_impl for the socket's
 * implementation. A non-zero result from the implementation's
 * group_impl_add_sock() is returned as-is. Returns 0 on success.
 */
int
spdk_sock_group_add_sock(struct spdk_sock_group *group, struct spdk_sock *sock,
			 spdk_sock_cb cb_fn, void *cb_arg)
{
	struct spdk_sock_group_impl *owner;
	int status;

	/* A callback is mandatory, and a sock may be part of only one sock_group. */
	if (cb_fn == NULL || sock->group_impl != NULL) {
		errno = EINVAL;
		return -1;
	}

	owner = sock_get_group_impl_from_group(sock, group);
	if (owner == NULL) {
		errno = EINVAL;
		return -1;
	}

	status = owner->net_impl->group_impl_add_sock(owner, sock);
	if (status != 0) {
		return status;
	}

	sock->cb_arg = cb_arg;
	sock->cb_fn = cb_fn;
	sock->group_impl = owner;
	TAILQ_INSERT_TAIL(&owner->socks, sock, link);

	return 0;
}
/*
 * Remove `sock` from `group`.
 *
 * Returns -1 with errno EINVAL when the group has no group_impl for the
 * socket's implementation. Otherwise returns the implementation's
 * group_impl_remove_sock() result; the socket's group membership and callback
 * are cleared only when that result is 0.
 */
int
spdk_sock_group_remove_sock(struct spdk_sock_group *group, struct spdk_sock *sock)
{
	struct spdk_sock_group_impl *owner;
	int status;

	owner = sock_get_group_impl_from_group(sock, group);
	if (owner == NULL) {
		errno = EINVAL;
		return -1;
	}

	assert(owner == sock->group_impl);

	status = owner->net_impl->group_impl_remove_sock(owner, sock);
	if (status != 0) {
		return status;
	}

	TAILQ_REMOVE(&owner->socks, sock, link);
	sock->cb_arg = NULL;
	sock->cb_fn = NULL;
	sock->group_impl = NULL;

	return status;
}
/* Poll the group for up to MAX_EVENTS_PER_POLL events per implementation. */
int
spdk_sock_group_poll(struct spdk_sock_group *group)
{
	int events = spdk_sock_group_poll_count(group, MAX_EVENTS_PER_POLL);

	return events;
}
static int
sock_group_impl_poll_count(struct spdk_sock_group_impl *group_impl,
struct spdk_sock_group *group,
int max_events)
{
struct spdk_sock *socks[MAX_EVENTS_PER_POLL];
int num_events, i;
if (TAILQ_EMPTY(&group_impl->socks)) {
return 0;
}
num_events = group_impl->net_impl->group_impl_poll(group_impl, max_events, socks);
if (num_events == -1) {
return -1;
}
for (i = 0; i < num_events; i++) {
struct spdk_sock *sock = socks[i];
sock: Fix the "sock remove assert bug" in spdk_sock_group_remove_sock The statement causes this issue is: assert(group_impl->num_removed_socks < MAX_EVENTS_PER_POLL); The call trace is: The previous solution is: commitid with: e71e81b6311772681a3f8bcc279bc7253c7c1d9b But with this solution, it will always add the sock into the removed_socks list even if it is not under polling context by sock_group_impl_poll_count. So it will exceed the size of removed_socks array if sock_group_impl_poll_count function will not be called. And we should not use a large array, because it is just a workaround, it just hides the bug. So our current solution is: 1 Remove the code in sock layer, i.e., rollback the commit e71e81b6311772681a3f8bcc279bc7253c7c1d9b. This patch is not the right fix. The sock->cb_fn's NULL pointer case is caused by the cb_fn of write operation (if the spdk_sock_group_remove_sock is inside the cb_fn). And it is not caused by the epoll related cache issue described in commit "e7181.." commit, but caused by the following situation: (1)The socket's cb_fn is set to NULL which is caused by spdk_sock_group_remove_sock by the socket itself inside a call back function from a write operation. (2) And the socket is already in the pending_recv list. It is not caused by the epoll event issue, e.g., socket A changes Socket B's cb_fn. By the way, A socket A should never remove a socket B from a polling group. If it really does it, it should use spdk_thread_sendmsg to make sure it happens in the next round. 2 Add the code check in each posix, uring implementation module. If sock->cb_fn is NULL, we will not return the socket to the active socks list. And this is enough to address the issue. 
Signed-off-by: Ziye Yang <ziye.yang@intel.com> Change-Id: I79187f2f1301c819c46a5c3bdd84372f75534f2f Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/6472 Reviewed-by: Xiaodong Liu <xiaodong.liu@intel.com> Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com> Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com> Community-CI: Mellanox Build Bot Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2021-02-19 12:02:07 +00:00
assert(sock->cb_fn != NULL);
sock->cb_fn(sock->cb_arg, group, sock);
}
sock: keep track of removed sockets during call to poll We have been intermittently hitting the assert where we check sock->cb_fn != NULL in spdk_sock_group_impl_poll_count. The only way we could be hitting this specific error is if we wereremoving a socket from a sock group within after receiving an event for it. Specifically, we are seeing this error on the NVMe-oF TCP target which relies on posix sockets using epoll. The man page for epoll states the following: If you use an event cache or store all the file descriptors returned from epoll_wait(2), then make sure to provide a way to mark its closure dynamically (i.e., caused by a previous event's processing). Suppose you receive 100 events from epoll_wait(2), and in event #47 a condition causes event #13 to be closed. If you remove the structure and close(2) the file descriptor for event #13, then your event cache might still say there are events waiting for that file descriptor causing confusion. One solution for this is to call, during the processing of event 47, epoll_ctl(EPOLL_CTL_DEL) to delete file descriptor 13 and close(2), then mark its associated data structure as removed and link it to a cleanup list. If you find another event for file descriptor 13 in your batch processing, you will discover the file descriptor had been previously removed and there will be no confusion. Since we do store all of the file descriptors returned from epoll_wait, we need to implement the tracking mentioned above. fixes issue #1294 Signed-off-by: Seth Howell <seth.howell@intel.com> Change-Id: Ib592ce19e3f0b691e3a825d02ebb42d7338e3ceb Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/1589 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Ben Walker <benjamin.walker@intel.com> Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com> Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com>
2020-03-30 21:54:02 +00:00
return num_events;
}
/*
 * Poll every group_impl of `group`, reaping at most `max_events` events from
 * each (clamped to MAX_EVENTS_PER_POLL).
 *
 * Returns the total number of events dispatched, or -1 if any group_impl
 * failed to poll (the remaining group_impls are still polled). Returns -1
 * with errno EINVAL when max_events is less than 1.
 */
int
spdk_sock_group_poll_count(struct spdk_sock_group *group, int max_events)
{
	struct spdk_sock_group_impl *group_impl = NULL;
	int rc, num_events = 0;

	if (max_events < 1) {
		/* errno values are positive; only return codes are negated. */
		errno = EINVAL;
		return -1;
	}

	/*
	 * Only poll for up to MAX_EVENTS_PER_POLL events at a time - if more
	 * events are pending, the next call to this function will reap them.
	 */
	if (max_events > MAX_EVENTS_PER_POLL) {
		max_events = MAX_EVENTS_PER_POLL;
	}

	STAILQ_FOREACH_FROM(group_impl, &group->group_impls, link) {
		rc = sock_group_impl_poll_count(group_impl, group, max_events);
		if (rc < 0) {
			/* Remember the failure but keep polling the other impls. */
			num_events = -1;
			SPDK_ERRLOG("group_impl_poll_count for net(%s) failed\n",
				    group_impl->net_impl->name);
		} else if (num_events >= 0) {
			num_events += rc;
		}
	}

	return num_events;
}
int
spdk_sock_group_close(struct spdk_sock_group **group)
{
struct spdk_sock_group_impl *group_impl = NULL, *tmp;
int rc;
if (*group == NULL) {
errno = EBADF;
return -1;
}
STAILQ_FOREACH_SAFE(group_impl, &(*group)->group_impls, link, tmp) {
if (!TAILQ_EMPTY(&group_impl->socks)) {
errno = EBUSY;
return -1;
}
}
STAILQ_FOREACH_SAFE(group_impl, &(*group)->group_impls, link, tmp) {
rc = group_impl->net_impl->group_impl_close(group_impl);
if (rc != 0) {
SPDK_ERRLOG("group_impl_close for net failed\n");
}
}
free(*group);
*group = NULL;
return 0;
}
/*
 * Return the registered implementation whose name equals `impl_name`
 * (which must not be NULL), or NULL when none matches.
 */
static inline struct spdk_net_impl *
sock_get_impl_by_name(const char *impl_name)
{
	struct spdk_net_impl *candidate;

	assert(impl_name != NULL);

	STAILQ_FOREACH(candidate, &g_net_impls, link) {
		if (strcmp(impl_name, candidate->name) == 0) {
			break;
		}
	}

	/* NULL when the whole list was scanned without a match. */
	return candidate;
}
/*
 * Fetch the options of the implementation named `impl_name`.
 *
 * Returns -1 with errno EINVAL on a NULL argument or an unknown name, -1 with
 * errno ENOTSUP when the implementation has no get_opts; otherwise the
 * implementation's get_opts() result.
 */
int
spdk_sock_impl_get_opts(const char *impl_name, struct spdk_sock_impl_opts *opts, size_t *len)
{
	struct spdk_net_impl *target;

	if (impl_name == NULL || opts == NULL || len == NULL) {
		errno = EINVAL;
		return -1;
	}

	target = sock_get_impl_by_name(impl_name);
	if (target == NULL) {
		errno = EINVAL;
		return -1;
	}

	if (target->get_opts == NULL) {
		errno = ENOTSUP;
		return -1;
	}

	return target->get_opts(opts, len);
}
/*
 * Apply options to the implementation named `impl_name`.
 *
 * Returns -1 with errno EINVAL on a NULL argument or an unknown name, -1 with
 * errno ENOTSUP when the implementation has no set_opts; otherwise the
 * implementation's set_opts() result.
 */
int
spdk_sock_impl_set_opts(const char *impl_name, const struct spdk_sock_impl_opts *opts, size_t len)
{
	struct spdk_net_impl *target;

	if (impl_name == NULL || opts == NULL) {
		errno = EINVAL;
		return -1;
	}

	target = sock_get_impl_by_name(impl_name);
	if (target == NULL) {
		errno = EINVAL;
		return -1;
	}

	if (target->set_opts == NULL) {
		errno = ENOTSUP;
		return -1;
	}

	return target->set_opts(opts, len);
}
/*
 * Write the socket layer configuration to `w` as a JSON array of RPC calls:
 * an optional "sock_set_default_impl" entry (only when a default is set),
 * followed by one "sock_impl_set_options" entry per registered implementation
 * that provides get_opts. An implementation whose get_opts fails is logged
 * and skipped.
 */
void
spdk_sock_write_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_net_impl *impl;
	struct spdk_sock_impl_opts opts;
	size_t len;

	assert(w != NULL);

	spdk_json_write_array_begin(w);

	if (g_default_impl) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "sock_set_default_impl");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "impl_name", g_default_impl->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}

	STAILQ_FOREACH(impl, &g_net_impls, link) {
		if (!impl->get_opts) {
			continue;
		}

		/* len is in/out: reset it to the full struct size for every impl. */
		len = sizeof(opts);
		if (impl->get_opts(&opts, &len) == 0) {
			spdk_json_write_object_begin(w);
			spdk_json_write_named_string(w, "method", "sock_impl_set_options");
			spdk_json_write_named_object_begin(w, "params");
			spdk_json_write_named_string(w, "impl_name", impl->name);
			spdk_json_write_named_uint32(w, "recv_buf_size", opts.recv_buf_size);
			spdk_json_write_named_uint32(w, "send_buf_size", opts.send_buf_size);
			spdk_json_write_named_bool(w, "enable_recv_pipe", opts.enable_recv_pipe);
			spdk_json_write_named_bool(w, "enable_quickack", opts.enable_quickack);
			spdk_json_write_named_uint32(w, "enable_placement_id", opts.enable_placement_id);
			spdk_json_write_named_bool(w, "enable_zerocopy_send_server", opts.enable_zerocopy_send_server);
			spdk_json_write_named_bool(w, "enable_zerocopy_send_client", opts.enable_zerocopy_send_client);
			spdk_json_write_named_uint32(w, "zerocopy_threshold", opts.zerocopy_threshold);
			spdk_json_write_named_uint32(w, "tls_version", opts.tls_version);
			spdk_json_write_named_bool(w, "enable_ktls", opts.enable_ktls);
			/* The PSK fields are optional and only emitted when set. */
			if (opts.psk_key) {
				spdk_json_write_named_string(w, "psk_key", opts.psk_key);
			}
			if (opts.psk_identity) {
				spdk_json_write_named_string(w, "psk_identity", opts.psk_identity);
			}
			spdk_json_write_object_end(w);
			spdk_json_write_object_end(w);
		} else {
			SPDK_ERRLOG("Failed to get socket options for socket implementation %s\n", impl->name);
		}
	}

	spdk_json_write_array_end(w);
}
/*
 * Register a socket implementation with the given priority.
 *
 * The list is kept ordered by descending priority: the new implementation is
 * placed after the last existing one whose priority is greater than or equal
 * to its own, or at the head when there is none.
 */
void
spdk_net_impl_register(struct spdk_net_impl *impl, int priority)
{
	struct spdk_net_impl *it;
	struct spdk_net_impl *insert_after = NULL;

	impl->priority = priority;

	STAILQ_FOREACH(it, &g_net_impls, link) {
		if (it->priority < impl->priority) {
			break;
		}
		insert_after = it;
	}

	if (insert_after == NULL) {
		STAILQ_INSERT_HEAD(&g_net_impls, impl, link);
	} else {
		STAILQ_INSERT_AFTER(&g_net_impls, insert_after, impl, link);
	}
}
/*
 * Make the implementation named `impl_name` the default one.
 *
 * Returns 0 on success (including when it already is the default), or -1
 * with errno EINVAL when the name is NULL or not registered.
 */
int
spdk_sock_set_default_impl(const char *impl_name)
{
	struct spdk_net_impl *chosen;

	if (impl_name == NULL) {
		errno = EINVAL;
		return -1;
	}

	chosen = sock_get_impl_by_name(impl_name);
	if (chosen == NULL) {
		errno = EINVAL;
		return -1;
	}

	if (chosen != g_default_impl) {
		if (g_default_impl == NULL) {
			SPDK_DEBUGLOG(sock, "Set default sock implementation to %s\n", impl_name);
		} else {
			SPDK_DEBUGLOG(sock, "Change the default sock impl from %s to %s\n", g_default_impl->name,
				      chosen->name);
		}
		g_default_impl = chosen;
	}

	return 0;
}
/* Presumably registers the `sock` log component that the SPDK_DEBUGLOG(sock, ...)
 * calls in this file refer to; the macro is defined outside this file (confirm in spdk/log.h). */
SPDK_LOG_REGISTER_COMPONENT(sock)