sock/posix: Avoid extra readv calls after draining recv_pipe
Move from a single flag indicating that the socket is on the pending_events list to two flags - pipe_has_data and socket_has_data. If either flag is true, the socket is on the socks_with_data list. This is necessary to track enough state to avoid doing extra recv() system calls. Change-Id: I65e5701dccb0a5bade19f266f164f26706b110d4 Signed-off-by: Ben Walker <benjamin.walker@intel.com> Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/7595 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Community-CI: Mellanox Build Bot Reviewed-by: Jim Harris <james.r.harris@intel.com> Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com>
This commit is contained in:
parent
4e9adb3bf9
commit
1ae601b573
@ -68,7 +68,8 @@ struct spdk_posix_sock {
|
|||||||
struct spdk_pipe *recv_pipe;
|
struct spdk_pipe *recv_pipe;
|
||||||
void *recv_buf;
|
void *recv_buf;
|
||||||
int recv_buf_sz;
|
int recv_buf_sz;
|
||||||
bool pending_events;
|
bool pipe_has_data;
|
||||||
|
bool socket_has_data;
|
||||||
bool zcopy;
|
bool zcopy;
|
||||||
|
|
||||||
int placement_id;
|
int placement_id;
|
||||||
@ -76,12 +77,12 @@ struct spdk_posix_sock {
|
|||||||
TAILQ_ENTRY(spdk_posix_sock) link;
|
TAILQ_ENTRY(spdk_posix_sock) link;
|
||||||
};
|
};
|
||||||
|
|
||||||
TAILQ_HEAD(spdk_pending_events_list, spdk_posix_sock);
|
TAILQ_HEAD(spdk_has_data_list, spdk_posix_sock);
|
||||||
|
|
||||||
struct spdk_posix_sock_group_impl {
|
struct spdk_posix_sock_group_impl {
|
||||||
struct spdk_sock_group_impl base;
|
struct spdk_sock_group_impl base;
|
||||||
int fd;
|
int fd;
|
||||||
struct spdk_pending_events_list pending_events;
|
struct spdk_has_data_list socks_with_data;
|
||||||
int placement_id;
|
int placement_id;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -904,13 +905,16 @@ posix_sock_recv_from_pipe(struct spdk_posix_sock *sock, struct iovec *diov, int
|
|||||||
|
|
||||||
spdk_pipe_reader_advance(sock->recv_pipe, bytes);
|
spdk_pipe_reader_advance(sock->recv_pipe, bytes);
|
||||||
|
|
||||||
/* If we drained the pipe, take it off the pending_events list. The socket may still have data buffered
|
/* If we drained the pipe, mark it appropriately */
|
||||||
* in the kernel to receive, but this will be handled on the next poll call when we get the same EPOLLIN
|
if (spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) {
|
||||||
* event again. */
|
assert(sock->pipe_has_data == true);
|
||||||
if (sock->base.group_impl && spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) {
|
|
||||||
group = __posix_group_impl(sock->base.group_impl);
|
group = __posix_group_impl(sock->base.group_impl);
|
||||||
TAILQ_REMOVE(&group->pending_events, sock, link);
|
if (group && !sock->socket_has_data) {
|
||||||
sock->pending_events = false;
|
TAILQ_REMOVE(&group->socks_with_data, sock, link);
|
||||||
|
}
|
||||||
|
|
||||||
|
sock->pipe_has_data = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return bytes;
|
return bytes;
|
||||||
@ -920,34 +924,46 @@ static inline ssize_t
|
|||||||
posix_sock_read(struct spdk_posix_sock *sock)
|
posix_sock_read(struct spdk_posix_sock *sock)
|
||||||
{
|
{
|
||||||
struct iovec iov[2];
|
struct iovec iov[2];
|
||||||
int bytes;
|
int bytes_avail, bytes_recvd;
|
||||||
struct spdk_posix_sock_group_impl *group;
|
struct spdk_posix_sock_group_impl *group;
|
||||||
|
|
||||||
bytes = spdk_pipe_writer_get_buffer(sock->recv_pipe, sock->recv_buf_sz, iov);
|
bytes_avail = spdk_pipe_writer_get_buffer(sock->recv_pipe, sock->recv_buf_sz, iov);
|
||||||
|
|
||||||
if (bytes > 0) {
|
if (bytes_avail <= 0) {
|
||||||
bytes = readv(sock->fd, iov, 2);
|
return bytes_avail;
|
||||||
if (bytes > 0) {
|
|
||||||
spdk_pipe_writer_advance(sock->recv_pipe, bytes);
|
|
||||||
|
|
||||||
/* For normal operation, this function is called in response to an EPOLLIN
|
|
||||||
* event, which already placed the socket onto the pending_events list.
|
|
||||||
* But between polls the user may repeatedly call posix_sock_read
|
|
||||||
* and if they clear the pipe on one of those earlier calls, the
|
|
||||||
* socket will be removed from the pending_events list. In that case,
|
|
||||||
* if we now found more data, put it back on.
|
|
||||||
* This essentially never happens in practice because the application
|
|
||||||
* will stop trying to receive and wait for the next EPOLLIN event, but
|
|
||||||
* for correctness let's handle it. */
|
|
||||||
if (!sock->pending_events && sock->base.group_impl) {
|
|
||||||
group = __posix_group_impl(sock->base.group_impl);
|
|
||||||
TAILQ_INSERT_TAIL(&group->pending_events, sock, link);
|
|
||||||
sock->pending_events = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return bytes;
|
bytes_recvd = readv(sock->fd, iov, 2);
|
||||||
|
|
||||||
|
assert(sock->pipe_has_data == false);
|
||||||
|
|
||||||
|
if (bytes_recvd <= 0) {
|
||||||
|
/* Errors count as draining the socket data */
|
||||||
|
if (sock->base.group_impl && sock->socket_has_data) {
|
||||||
|
group = __posix_group_impl(sock->base.group_impl);
|
||||||
|
TAILQ_REMOVE(&group->socks_with_data, sock, link);
|
||||||
|
}
|
||||||
|
|
||||||
|
sock->socket_has_data = false;
|
||||||
|
|
||||||
|
return bytes_recvd;
|
||||||
|
}
|
||||||
|
|
||||||
|
spdk_pipe_writer_advance(sock->recv_pipe, bytes_recvd);
|
||||||
|
|
||||||
|
#if DEBUG
|
||||||
|
if (sock->base.group_impl) {
|
||||||
|
assert(sock->socket_has_data == true);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
sock->pipe_has_data = true;
|
||||||
|
if (bytes_recvd < bytes_avail) {
|
||||||
|
/* We drained the kernel socket entirely. */
|
||||||
|
sock->socket_has_data = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return bytes_recvd;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ssize_t
|
static ssize_t
|
||||||
@ -959,26 +975,26 @@ posix_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt)
|
|||||||
size_t len;
|
size_t len;
|
||||||
|
|
||||||
if (sock->recv_pipe == NULL) {
|
if (sock->recv_pipe == NULL) {
|
||||||
if (group && sock->pending_events) {
|
assert(sock->pipe_has_data == false);
|
||||||
sock->pending_events = false;
|
if (group && sock->socket_has_data) {
|
||||||
TAILQ_REMOVE(&group->pending_events, sock, link);
|
sock->socket_has_data = false;
|
||||||
|
TAILQ_REMOVE(&group->socks_with_data, sock, link);
|
||||||
}
|
}
|
||||||
return readv(sock->fd, iov, iovcnt);
|
return readv(sock->fd, iov, iovcnt);
|
||||||
}
|
}
|
||||||
|
|
||||||
len = 0;
|
/* If the socket is not in a group, we must assume it always has
|
||||||
for (i = 0; i < iovcnt; i++) {
|
* data waiting for us because it is not epolled */
|
||||||
len += iov[i].iov_len;
|
if (!sock->pipe_has_data && (group == NULL || sock->socket_has_data)) {
|
||||||
}
|
|
||||||
|
|
||||||
if (spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) {
|
|
||||||
/* If the user is receiving a sufficiently large amount of data,
|
/* If the user is receiving a sufficiently large amount of data,
|
||||||
* receive directly to their buffers. */
|
* receive directly to their buffers. */
|
||||||
|
len = 0;
|
||||||
|
for (i = 0; i < iovcnt; i++) {
|
||||||
|
len += iov[i].iov_len;
|
||||||
|
}
|
||||||
|
|
||||||
if (len >= MIN_SOCK_PIPE_SIZE) {
|
if (len >= MIN_SOCK_PIPE_SIZE) {
|
||||||
if (group && sock->pending_events) {
|
/* TODO: Should this detect if kernel socket is drained? */
|
||||||
sock->pending_events = false;
|
|
||||||
TAILQ_REMOVE(&group->pending_events, sock, link);
|
|
||||||
}
|
|
||||||
return readv(sock->fd, iov, iovcnt);
|
return readv(sock->fd, iov, iovcnt);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1160,7 +1176,7 @@ posix_sock_group_impl_create(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
group_impl->fd = fd;
|
group_impl->fd = fd;
|
||||||
TAILQ_INIT(&group_impl->pending_events);
|
TAILQ_INIT(&group_impl->socks_with_data);
|
||||||
group_impl->placement_id = -1;
|
group_impl->placement_id = -1;
|
||||||
|
|
||||||
if (g_spdk_posix_sock_impl_opts.enable_placement_id == PLACEMENT_CPU) {
|
if (g_spdk_posix_sock_impl_opts.enable_placement_id == PLACEMENT_CPU) {
|
||||||
@ -1256,9 +1272,9 @@ posix_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group, struct spdk_
|
|||||||
/* switched from another polling group due to scheduling */
|
/* switched from another polling group due to scheduling */
|
||||||
if (spdk_unlikely(sock->recv_pipe != NULL &&
|
if (spdk_unlikely(sock->recv_pipe != NULL &&
|
||||||
(spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0))) {
|
(spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0))) {
|
||||||
assert(sock->pending_events == false);
|
sock->pipe_has_data = true;
|
||||||
sock->pending_events = true;
|
sock->socket_has_data = false;
|
||||||
TAILQ_INSERT_TAIL(&group->pending_events, sock, link);
|
TAILQ_INSERT_TAIL(&group->socks_with_data, sock, link);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (g_spdk_posix_sock_impl_opts.enable_placement_id == PLACEMENT_MARK) {
|
if (g_spdk_posix_sock_impl_opts.enable_placement_id == PLACEMENT_MARK) {
|
||||||
@ -1281,9 +1297,10 @@ posix_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group, struct sp
|
|||||||
struct spdk_posix_sock *sock = __posix_sock(_sock);
|
struct spdk_posix_sock *sock = __posix_sock(_sock);
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
if (sock->pending_events) {
|
if (sock->pipe_has_data || sock->socket_has_data) {
|
||||||
TAILQ_REMOVE(&group->pending_events, sock, link);
|
TAILQ_REMOVE(&group->socks_with_data, sock, link);
|
||||||
sock->pending_events = false;
|
sock->pipe_has_data = false;
|
||||||
|
sock->socket_has_data = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sock->placement_id != -1) {
|
if (sock->placement_id != -1) {
|
||||||
@ -1362,7 +1379,7 @@ posix_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events,
|
|||||||
*/
|
*/
|
||||||
int last_placement_id = -1;
|
int last_placement_id = -1;
|
||||||
|
|
||||||
TAILQ_FOREACH(psock, &group->pending_events, link) {
|
TAILQ_FOREACH(psock, &group->socks_with_data, link) {
|
||||||
if (psock->zcopy && psock->placement_id >= 0 &&
|
if (psock->zcopy && psock->placement_id >= 0 &&
|
||||||
psock->placement_id != last_placement_id) {
|
psock->placement_id != last_placement_id) {
|
||||||
struct pollfd pfd = {psock->fd, POLLIN | POLLERR, 0};
|
struct pollfd pfd = {psock->fd, POLLIN | POLLERR, 0};
|
||||||
@ -1433,16 +1450,16 @@ posix_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events,
|
|||||||
psock = __posix_sock(sock);
|
psock = __posix_sock(sock);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* If the socket does not already have recv pending, add it now */
|
/* If the socket is not already in the list, add it now */
|
||||||
if (!psock->pending_events) {
|
if (!psock->socket_has_data && !psock->pipe_has_data) {
|
||||||
psock->pending_events = true;
|
TAILQ_INSERT_TAIL(&group->socks_with_data, psock, link);
|
||||||
TAILQ_INSERT_TAIL(&group->pending_events, psock, link);
|
|
||||||
}
|
}
|
||||||
|
psock->socket_has_data = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
num_events = 0;
|
num_events = 0;
|
||||||
|
|
||||||
TAILQ_FOREACH_SAFE(psock, &group->pending_events, link, ptmp) {
|
TAILQ_FOREACH_SAFE(psock, &group->socks_with_data, link, ptmp) {
|
||||||
if (num_events == max_events) {
|
if (num_events == max_events) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -1450,15 +1467,16 @@ posix_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events,
|
|||||||
/* If the socket's cb_fn is NULL, just remove it from the
|
/* If the socket's cb_fn is NULL, just remove it from the
|
||||||
* list and do not add it to socks array */
|
* list and do not add it to socks array */
|
||||||
if (spdk_unlikely(psock->base.cb_fn == NULL)) {
|
if (spdk_unlikely(psock->base.cb_fn == NULL)) {
|
||||||
psock->pending_events = false;
|
psock->socket_has_data = false;
|
||||||
TAILQ_REMOVE(&group->pending_events, psock, link);
|
psock->pipe_has_data = false;
|
||||||
|
TAILQ_REMOVE(&group->socks_with_data, psock, link);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
socks[num_events++] = &psock->base;
|
socks[num_events++] = &psock->base;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Cycle the pending_events list so that each time we poll things aren't
|
/* Cycle the has_data list so that each time we poll things aren't
|
||||||
* in the same order. Say we have 6 sockets in the list, named as follows:
|
* in the same order. Say we have 6 sockets in the list, named as follows:
|
||||||
* A B C D E F
|
* A B C D E F
|
||||||
* And all 6 sockets had epoll events, but max_events is only 3. That means
|
* And all 6 sockets had epoll events, but max_events is only 3. That means
|
||||||
@ -1473,9 +1491,9 @@ posix_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events,
|
|||||||
|
|
||||||
/* Capture pointers to the elements we need */
|
/* Capture pointers to the elements we need */
|
||||||
pd = psock;
|
pd = psock;
|
||||||
pc = TAILQ_PREV(pd, spdk_pending_events_list, link);
|
pc = TAILQ_PREV(pd, spdk_has_data_list, link);
|
||||||
pa = TAILQ_FIRST(&group->pending_events);
|
pa = TAILQ_FIRST(&group->socks_with_data);
|
||||||
pf = TAILQ_LAST(&group->pending_events, spdk_pending_events_list);
|
pf = TAILQ_LAST(&group->socks_with_data, spdk_has_data_list);
|
||||||
|
|
||||||
/* Break the link between C and D */
|
/* Break the link between C and D */
|
||||||
pc->link.tqe_next = NULL;
|
pc->link.tqe_next = NULL;
|
||||||
@ -1486,8 +1504,8 @@ posix_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events,
|
|||||||
pa->link.tqe_prev = &pf->link.tqe_next;
|
pa->link.tqe_prev = &pf->link.tqe_next;
|
||||||
|
|
||||||
/* Fix up the list first/last pointers */
|
/* Fix up the list first/last pointers */
|
||||||
group->pending_events.tqh_first = pd;
|
group->socks_with_data.tqh_first = pd;
|
||||||
group->pending_events.tqh_last = &pc->link.tqe_next;
|
group->socks_with_data.tqh_last = &pc->link.tqe_next;
|
||||||
}
|
}
|
||||||
|
|
||||||
return num_events;
|
return num_events;
|
||||||
|
Loading…
Reference in New Issue
Block a user