[PATCH bpf] bpf,tcp: avoid infinite recursion in BPF_SOCK_OPS_HDR_OPT_LEN_CB

From: Jiayuan Chen

Date: Tue Apr 14 2026 - 07:01:06 EST


A BPF_PROG_TYPE_SOCK_OPS program can set BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG
to inject custom TCP header options. When the kernel builds a TCP packet,
it calls tcp_established_options() to calculate the header size, which
invokes bpf_skops_hdr_opt_len() to trigger the BPF_SOCK_OPS_HDR_OPT_LEN_CB
callback.

If the BPF program calls bpf_setsockopt(TCP_NODELAY) inside this callback,
__tcp_sock_set_nodelay() will call tcp_push_pending_frames(), which calls
tcp_current_mss(), which calls tcp_established_options() again,
re-triggering the same BPF callback. This creates an infinite recursion
that exhausts the kernel stack and causes a panic.

BPF_SOCK_OPS_HDR_OPT_LEN_CB
-> bpf_setsockopt(TCP_NODELAY)
-> tcp_push_pending_frames()
-> tcp_current_mss()
-> tcp_established_options()
-> bpf_skops_hdr_opt_len()
/* infinite recursion */
-> BPF_SOCK_OPS_HDR_OPT_LEN_CB

A similar reentrancy issue exists for TCP congestion control, which is
guarded by tp->bpf_chg_cc_inprogress. Adopt the same approach: introduce
tp->bpf_hdr_opt_len_cb_inprogress, set it before invoking the callback in
bpf_skops_hdr_opt_len() and clear it afterwards, and check it in
sol_tcp_sockopt() to reject bpf_setsockopt(TCP_NODELAY) and
bpf_setsockopt(TCP_CORK) with -EBUSY while the callback is running, since
those calls would trigger tcp_push_pending_frames() and cause the
recursion.

Reported-by: Quan Sun <2022090917019@xxxxxxxxxxxxxxxx>
Reported-by: Yinhao Hu <dddddd@xxxxxxxxxxx>
Reported-by: Kaiyan Mei <M202472210@xxxxxxxxxxx>
Reported-by: Dongliang Mu <dzm91@xxxxxxxxxxx>
Closes: https://lore.kernel.org/bpf/d1d523c9-6901-4454-a183-94462b8f3e4e@xxxxxxxxxxxxxxxx/
Fixes: 0813a841566f ("bpf: tcp: Allow bpf prog to write and parse TCP header option")
Signed-off-by: Jiayuan Chen <jiayuan.chen@xxxxxxxxx>
---
Documentation/networking/net_cachelines/tcp_sock.rst | 1 +
include/linux/tcp.h | 11 ++++++++++-
net/core/filter.c | 4 ++++
net/ipv4/tcp_minisocks.c | 1 +
net/ipv4/tcp_output.c | 3 +++
5 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst
index 563daea10d6c..07d3226d90cc 100644
--- a/Documentation/networking/net_cachelines/tcp_sock.rst
+++ b/Documentation/networking/net_cachelines/tcp_sock.rst
@@ -152,6 +152,7 @@ unsigned_int keepalive_intvl
int linger2
u8 bpf_sock_ops_cb_flags
u8:1 bpf_chg_cc_inprogress
+u8:1 bpf_hdr_opt_len_cb_inprogress
u16 timeout_rehash
u32 rcv_ooopack
u32 rcv_rtt_last_tsecr
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index f72eef31fa23..2bfb73cf922e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -475,12 +475,21 @@ struct tcp_sock {
u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs
* values defined in uapi/linux/tcp.h
*/
- u8 bpf_chg_cc_inprogress:1; /* In the middle of
+ u8 bpf_chg_cc_inprogress:1, /* In the middle of
* bpf_setsockopt(TCP_CONGESTION),
* it is to avoid the bpf_tcp_cc->init()
* to recur itself by calling
* bpf_setsockopt(TCP_CONGESTION, "itself").
*/
+ bpf_hdr_opt_len_cb_inprogress:1; /* It is set before invoking the
+ * callback so that a nested
+ * bpf_setsockopt(TCP_NODELAY) or
+ * bpf_setsockopt(TCP_CORK) cannot
+ * trigger tcp_push_pending_frames(),
+ * which would call tcp_current_mss()
+ * -> bpf_skops_hdr_opt_len(), causing
+ * infinite recursion.
+ */
#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
#else
#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
diff --git a/net/core/filter.c b/net/core/filter.c
index 78b548158fb0..518699429a7a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5483,6 +5483,10 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
if (sk->sk_protocol != IPPROTO_TCP)
return -EINVAL;

+ if ((optname == TCP_NODELAY || optname == TCP_CORK) &&
+ tcp_sk(sk)->bpf_hdr_opt_len_cb_inprogress)
+ return -EBUSY;
+
switch (optname) {
case TCP_NODELAY:
case TCP_MAXSEG:
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index dafb63b923d0..fb06c464ac16 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -663,6 +663,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);

newtp->bpf_chg_cc_inprogress = 0;
+ newtp->bpf_hdr_opt_len_cb_inprogress = 0;
tcp_bpf_clone(sk, newsk);

__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 326b58ff1118..c9654e690e1a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -475,6 +475,7 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
unsigned int *remaining)
{
struct bpf_sock_ops_kern sock_ops;
+ struct tcp_sock *tp = tcp_sk(sk);
int err;

if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
@@ -519,7 +520,9 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
if (skb)
bpf_skops_init_skb(&sock_ops, skb, 0);

+ tp->bpf_hdr_opt_len_cb_inprogress = 1;
err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+ tp->bpf_hdr_opt_len_cb_inprogress = 0;

if (err || sock_ops.remaining_opt_len == *remaining)
return;
--
2.43.0