[RFC PATCH net-next] tcp: add a tracepoint for tcp_listen_queue_drop
From: Ivan Babrou
Date: Tue Jul 11 2023 - 00:35:45 EST
There's already a way to count the overall numbers of queue overflows:
$ sudo netstat -s | grep 'listen queue'
4 times the listen queue of a socket overflowed
However, it's too coarse for monitoring and alerting when a user wants to
track errors per socket and route alerts to people responsible for those
sockets directly. For UDP there's udp_fail_queue_rcv_skb, which fills
a similar need for UDP sockets. This patch adds a TCP equivalent.
--
The goal is to use this new tracepoint with ebpf_exporter:
* https://github.com/cloudflare/ebpf_exporter
There's an example configuration for UDP drops there that we use:
* https://github.com/cloudflare/ebpf_exporter/blob/master/examples/udp-drops.bpf.c
* https://github.com/cloudflare/ebpf_exporter/blob/master/examples/udp-drops.yaml
Paolo Abeni asked whether we need the UDP tracepoint given that kfree_skb
and MIB counters already exist, and I covered that part in my reply here:
* https://lore.kernel.org/netdev/CABWYdi3DVex0wq2kM72QTZkhNzkh_Vjb4+T8mj8U7t06Na=5kA@xxxxxxxxxxxxxx/
I added a TCP example utilizing this patch here:
* https://github.com/cloudflare/ebpf_exporter/pull/221
Not so long ago we hit a bug in one of our services that broke its accept
loop, which resulted in a listen queue overflow. With this new
tracepoint we can have a metric for this and alert the service owners
directly, cutting out the middleman SRE and improving alert fidelity.
We don't really need a tracepoint for this, just a place to hook a kprobe
or an fprobe to. The existing tcp_listendrop is great for this, except
it's a short inlined function, so there's no way to attach a probe to it.
There are a few ways to approach this:
* Un-inline tcp_listendrop to allow probe attachment
* Un-inline tcp_listendrop and add a stable tracepoint
* Keep tcp_listendrop inlined, but add a tracepoint wrapper to call into
There is no option to keep tcp_listendrop inlined and call into the tracepoint
directly from it (it does not compile and it wouldn't be nice if it did):
* https://docs.kernel.org/trace/tracepoints.html
Therefore I went with the third option, which this patch implements.
Example output from perf:
$ sudo perf trace -a -e tcp:tcp_listen_queue_drop
0.000 sockfull/5459 tcp:tcp_listen_queue_drop(skaddr: 0xffffff90d7a25580, sport: 12345, saddr: 0x7faa1aed26, saddr_v6: 0x7faa1aed2a, sk_max_ack_backlog: 128, sk_ack_backlog: 129)
Example extracting the local port with bpftrace:
$ sudo ~/projects/bpftrace/src/bpftrace -e 'rawtracepoint:tcp_listen_queue_drop { $sk = (struct sock *) arg0; $lport = $sk->__sk_common.skc_num; printf("drop on lport = %d\n", $lport); }'
Attaching 1 probe...
drop on lport = 12345
Signed-off-by: Ivan Babrou <ivan@xxxxxxxxxxxxxx>
---
include/net/tcp.h | 7 ++++++
include/trace/events/tcp.h | 46 ++++++++++++++++++++++++++++++++++++++
net/ipv4/tcp.c | 7 ++++++
3 files changed, 60 insertions(+)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 226bce6d1e8c..810ad606641f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -46,6 +46,7 @@
#include <linux/bpf-cgroup.h>
#include <linux/siphash.h>
#include <linux/net_mm.h>
+#include <linux/tracepoint-defs.h>
extern struct inet_hashinfo tcp_hashinfo;
@@ -2259,6 +2260,10 @@ static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb)
WRITE_ONCE(tp->data_segs_in, tp->data_segs_in + segs_in);
}
+DECLARE_TRACEPOINT(tcp_listen_queue_drop);
+
+void do_trace_tcp_listen_queue_drop_wrapper(const struct sock *sk);
+
/*
* TCP listen path runs lockless.
* We forced "struct sock" to be const qualified to make sure
@@ -2270,6 +2275,8 @@ static inline void tcp_listendrop(const struct sock *sk)
{
atomic_inc(&((struct sock *)sk)->sk_drops);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ if (tracepoint_enabled(tcp_listen_queue_drop))
+ do_trace_tcp_listen_queue_drop_wrapper(sk);
}
enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index bf06db8d2046..646ad0bbd378 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -416,6 +416,52 @@ TRACE_EVENT(tcp_cong_state_set,
__entry->cong_state)
);
+TRACE_EVENT(tcp_listen_queue_drop,
+
+ TP_PROTO(const struct sock *sk),
+
+ TP_ARGS(sk),
+
+ TP_STRUCT__entry(
+ __field(const void *, skaddr)
+ __field(__u16, sport)
+ __array(__u8, saddr, 4)
+ __array(__u8, saddr_v6, 16)
+ __field(__u32, sk_max_ack_backlog)
+ __field(__u32, sk_ack_backlog)
+ ),
+
+ TP_fast_assign(
+ const struct inet_sock *inet = inet_sk(sk);
+ struct in6_addr *pin6;
+ __be32 *p32;
+
+ __entry->skaddr = sk;
+
+ __entry->sport = ntohs(inet->inet_sport);
+
+ p32 = (__be32 *) __entry->saddr;
+ *p32 = inet->inet_saddr;
+
+ pin6 = (struct in6_addr *)__entry->saddr_v6;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ *pin6 = sk->sk_v6_rcv_saddr;
+ else
+ ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
+#else
+ ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
+#endif
+
+ __entry->sk_max_ack_backlog = READ_ONCE(sk->sk_max_ack_backlog);
+ __entry->sk_ack_backlog = READ_ONCE(sk->sk_ack_backlog);
+ ),
+
+ TP_printk("sport=%hu saddr=%pI4 saddrv6=%pI6c sk_max_ack_backlog=%d sk_ack_backlog=%d",
+ __entry->sport, __entry->saddr, __entry->saddr_v6,
+ __entry->sk_max_ack_backlog, __entry->sk_ack_backlog)
+);
+
#endif /* _TRACE_TCP_H */
/* This part must be outside protection */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e03e08745308..29ecbc5248c3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -276,6 +276,8 @@
#include <net/ip.h>
#include <net/sock.h>
+#include <trace/events/tcp.h>
+
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>
@@ -1697,6 +1699,11 @@ int tcp_peek_len(struct socket *sock)
}
EXPORT_SYMBOL(tcp_peek_len);
+void do_trace_tcp_listen_queue_drop_wrapper(const struct sock *sk)
+{
+ trace_tcp_listen_queue_drop(sk);
+}
+
/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
int tcp_set_rcvlowat(struct sock *sk, int val)
{
--
2.41.0