[PATCH] fs: eventpoll: add empty event

From: Johannes Lundberg
Date: Mon Sep 27 2021 - 16:30:38 EST


The EPOLLEMPTY event will trigger when the TCP write buffer becomes
empty, i.e., when all outgoing data have been ACKed.

The need for this functionality comes from a business requirement
of measuring with higher precision how much time is spent
transmitting data to a client. For reference, similar functionality
was previously added to FreeBSD as the kqueue event EVFILT_EMPTY.

Signed-off-by: Johannes Lundberg <jlundberg@xxxxxxxx>
---
include/net/sock.h | 11 +++++++++++
include/uapi/linux/eventpoll.h | 1 +
net/core/sock.c | 5 +++++
net/core/stream.c | 14 ++++++++++++++
net/ipv4/tcp.c | 5 +++++
5 files changed, 36 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index c005c3c750e8..9047a9e225a9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -516,6 +516,7 @@ struct sock {
void (*sk_state_change)(struct sock *sk);
void (*sk_data_ready)(struct sock *sk);
void (*sk_write_space)(struct sock *sk);
+ void (*sk_empty)(struct sock *sk);
void (*sk_error_report)(struct sock *sk);
int (*sk_backlog_rcv)(struct sock *sk,
struct sk_buff *skb);
@@ -965,6 +966,7 @@ static inline void sk_wmem_queued_add(struct sock *sk, int val)
WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val);
}

+void sk_stream_empty(struct sock *sk);
void sk_stream_write_space(struct sock *sk);

/* OOB backlog add */
@@ -1288,6 +1290,11 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)

INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));

+static inline bool sk_stream_is_empty(const struct sock *sk)
+{
+ return (sk->sk_wmem_queued == 0);
+}
+
static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
@@ -1559,6 +1566,10 @@ DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
{
sk_wmem_queued_add(sk, -skb->truesize);
+
+ if (sk_stream_is_empty(sk))
+ sk->sk_empty(sk);
+
sk_mem_uncharge(sk, skb->truesize);
if (static_branch_unlikely(&tcp_tx_skb_cache_key) &&
!sk->sk_tx_skb_cache && !skb_cloned(skb)) {
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 8a3432d0f0dc..aab9f1f624d0 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -39,6 +39,7 @@
#define EPOLLWRNORM (__force __poll_t)0x00000100
#define EPOLLWRBAND (__force __poll_t)0x00000200
#define EPOLLMSG (__force __poll_t)0x00000400
+#define EPOLLEMPTY (__force __poll_t)0x00000800
#define EPOLLRDHUP (__force __poll_t)0x00002000

/* Set exclusive wakeup mode for the target file descriptor */
diff --git a/net/core/sock.c b/net/core/sock.c
index 512e629f9780..f917791d8149 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3062,6 +3062,10 @@ static void sock_def_write_space(struct sock *sk)
rcu_read_unlock();
}

+static void sock_def_empty(struct sock *sk)
+{
+}
+
static void sock_def_destruct(struct sock *sk)
{
}
@@ -3136,6 +3140,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
sk->sk_write_space = sock_def_write_space;
+ sk->sk_empty = sock_def_empty;
sk->sk_error_report = sock_def_error_report;
sk->sk_destruct = sock_def_destruct;

diff --git a/net/core/stream.c b/net/core/stream.c
index 4f1d4aa5fb38..c7e4135542a2 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -21,6 +21,20 @@
#include <linux/wait.h>
#include <net/sock.h>

+void sk_stream_empty(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+ struct socket_wq *wq;
+
+ if (sk_stream_is_empty(sk) && sock) {
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_poll(&wq->wait, EPOLLEMPTY);
+ rcu_read_unlock();
+ }
+}
+
/**
* sk_stream_write_space - stream socket write_space callback.
* @sk: socket
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e8b48df73c85..550bae79af06 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -453,6 +453,8 @@ void tcp_init_sock(struct sock *sk)
tp->tsoffset = 0;
tp->rack.reo_wnd_steps = 1;

+ sk->sk_empty = sk_stream_empty;
+
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

@@ -561,6 +563,9 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
tp->urg_data)
target++;

+ if (sk_stream_is_empty(sk))
+ mask |= EPOLLEMPTY;
+
if (tcp_stream_is_readable(sk, target))
mask |= EPOLLIN | EPOLLRDNORM;

--
2.17.1