Re: [PATCH bpf] bpf,tcp: avoid infinite recursion in BPF_SOCK_OPS_HDR_OPT_LEN_CB

From: Jiayuan Chen

Date: Tue Apr 14 2026 - 21:49:01 EST



On 4/14/26 11:37 PM, mkf wrote:
On Tue, 2026-04-14 at 18:57 +0800, Jiayuan Chen wrote:


[...]

--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -475,12 +475,21 @@ struct tcp_sock {
  u8 bpf_sock_ops_cb_flags;  /* Control calling BPF programs
  * values defined in uapi/linux/tcp.h
  */
- u8 bpf_chg_cc_inprogress:1; /* In the middle of
+ u8 bpf_chg_cc_inprogress:1, /* In the middle of
    * bpf_setsockopt(TCP_CONGESTION),
    * it is to avoid the bpf_tcp_cc->init()
    * to recur itself by calling
    * bpf_setsockopt(TCP_CONGESTION, "itself").
    */
+ bpf_hdr_opt_len_cb_inprogress:1; /* It is set before invoking the
+   * callback so that a nested
+   * bpf_setsockopt(TCP_NODELAY) or
+   * bpf_setsockopt(TCP_CORK) cannot
+   * trigger tcp_push_pending_frames(),
+   * which would call tcp_current_mss()
+   * -> bpf_skops_hdr_opt_len(), causing
+   * infinite recursion.
+   */
 #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
 #else
 #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
diff --git a/net/core/filter.c b/net/core/filter.c
index 78b548158fb0..518699429a7a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5483,6 +5483,10 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
  if (sk->sk_protocol != IPPROTO_TCP)
  return -EINVAL;
+ if ((optname == TCP_NODELAY || optname == TCP_CORK) &&
+     tcp_sk(sk)->bpf_hdr_opt_len_cb_inprogress)
+ return -EBUSY;
+
TCP_CORK is not supported in sol_tcp_sockopt(); it returns -EINVAL by default. Also, putting the check here
would prevent us from calling getsockopt(TCP_NODELAY) below as well.

  switch (optname) {
  case TCP_NODELAY:
  case TCP_MAXSEG:
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index dafb63b923d0..fb06c464ac16 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -663,6 +663,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
  RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
  newtp->bpf_chg_cc_inprogress = 0;
+ newtp->bpf_hdr_opt_len_cb_inprogress = 0;
  tcp_bpf_clone(sk, newsk);
  __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 326b58ff1118..c9654e690e1a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -475,6 +475,7 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
    unsigned int *remaining)
 {
  struct bpf_sock_ops_kern sock_ops;
+ struct tcp_sock *tp = tcp_sk(sk);
  int err;
  if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
@@ -519,7 +520,9 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
  if (skb)
  bpf_skops_init_skb(&sock_ops, skb, 0);
+ tp->bpf_hdr_opt_len_cb_inprogress = 1;
We already check BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG before calling BPF_CGROUP_RUN_PROG_SOCK_OPS_SK —
could that flag be used for the same purpose, so that we don't need to add an extra field?

if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
!*remaining)
return;


Hi Martin, I saw your patch. Your solution is better, please ignore mine :)