Re: [PATCH net-next] tcp: replace per-packet memset in __tcp_transmit_skb()
From: Eric Dumazet
Date: Tue Mar 03 2026 - 03:55:08 EST
On Tue, Mar 3, 2026 at 8:49 AM Keita Morisaki <kmta1236@xxxxxxxxx> wrote:
>
> Replace memset(&opts, 0, sizeof(opts)) with targeted initialization of
> only the three fields read unconditionally by tcp_options_write() and
> bpf_skops_write_hdr_opt(): mss, num_sack_blocks, and bpf_opt_len.
>
> struct tcp_out_options is 40 bytes without MPTCP and 96 bytes with
> CONFIG_MPTCP=y (typical distro config). Every remaining field is either
> assigned before first use by tcp_established_options()/tcp_syn_options(),
> or gated behind its OPTION_* flag in tcp_options_write(). This memset
> runs on every transmitted TCP packet, so removing it reduces per-packet
> overhead on the hot path.
>
> Assembly comparison (x86-64, GCC 13, CONFIG_MPTCP=n):
>
> Before: 5 store instructions zeroing 40 bytes
> After: 3 store instructions zeroing 4 bytes
>
> With CONFIG_MPTCP=y the savings are larger: 12 stores (96 bytes) become
> 3 stores (4 bytes).
>
> A BUILD_BUG_ON guards against future field additions: if the struct size
> changes, the assertion fires and forces the developer to audit whether
> the new field needs explicit zeroing.
>
> Also add opts->options = 0 at the top of tcp_syn_options(), which
> already used |= without a prior clear. tcp_established_options() already
> clears opts->options at its top.
>
> Signed-off-by: Keita Morisaki <kmta1236@xxxxxxxxx>
> ---
> net/ipv4/tcp_output.c | 16 ++++++++++++++++--
> 1 file changed, 14 insertions(+), 2 deletions(-)
>
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 326b58ff1118d..ae04c697dbacc 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -965,6 +965,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
> struct tcp_fastopen_request *fastopen = tp->fastopen_req;
> bool timestamps;
>
> + opts->options = 0;
> +
> /* Better than switch (key.type) as it has static branches */
> if (tcp_key_is_md5(key)) {
> timestamps = false;
> @@ -1549,7 +1551,20 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
>
> inet = inet_sk(sk);
> tcb = TCP_SKB_CB(skb);
> - memset(&opts, 0, sizeof(opts));
> + /* Only zero fields read unconditionally by tcp_options_write()
> + * or bpf_skops_write_hdr_opt(). Other fields are set by
> + * tcp_established_options()/tcp_syn_options() before use, or
> + * gated behind their OPTION_* flag.
> + *
> + * If you add a field to tcp_out_options, this will fire —
> + * audit whether the new field needs zeroing here.
> + */
> + BUILD_BUG_ON(sizeof(opts) !=
> + sizeof(struct mptcp_out_options) + 40);
> + opts.mss = 0;
> + opts.num_sack_blocks = 0;
> + opts.bpf_opt_len = 0;
>
> tcp_get_current_key(sk, &key);
> if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
>
Interesting idea, but the implementation is a bit fragile.
I would instead add a group in struct tcp_out_options to clearly mark
which group is zeroed in __tcp_transmit_skb().
Partial patch to give the idea :
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 46bd48cf776a6a37e6ab2664245cd1e35a88d4f8..b96168f50170b7359d26de2389128bc80d4d549e
100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -429,14 +429,19 @@ static void smc_options_write(__be32 *ptr, u16 *options)
}
struct tcp_out_options {
+ /* Following group is cleared in __tcp_transmit_skb() */
+ struct_group(cleared,
+ u16 mss; /* 0 to disable */
+ u8 bpf_opt_len; /* length of BPF hdr option */
+ u8 num_sack_blocks; /* number of SACK blocks to include */
+ );
+
+ /* Caution : Following fields are not cleared in
__tcp_transmit_skb(). */
u16 options; /* bit field of OPTION_* */
- u16 mss; /* 0 to disable */
u8 ws; /* window scale, 0 to disable */
- u8 num_sack_blocks; /* number of SACK blocks to include */
u8 num_accecn_fields:7, /* number of AccECN fields needed */
use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */
u8 hash_size; /* bytes in hash_location */
- u8 bpf_opt_len; /* length of BPF hdr option */
__u8 *hash_location; /* temporary pointer, overloaded */
__u32 tsval, tsecr; /* need to include OPTION_TS */
struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
@@ -1565,7 +1570,7 @@ static int __tcp_transmit_skb(struct sock *sk,
struct sk_buff *skb,
inet = inet_sk(sk);
tcb = TCP_SKB_CB(skb);
- memset(&opts, 0, sizeof(opts));
+ memset(&opts.cleared, 0, sizeof(opts.cleared));
tcp_get_current_key(sk, &key);
if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {