Re: [PATCH net v2] net: add xmit recursion limit to tunnel drivers

From: Paolo Abeni

Date: Thu Mar 05 2026 - 09:34:59 EST


On 3/3/26 5:43 PM, bestswngs@xxxxxxxxx wrote:
> From: Weiming Shi <bestswngs@xxxxxxxxx>
>
> __dev_queue_xmit() has two transmit code paths depending on whether the
> device has a qdisc attached:
>
> 1. Qdisc path (q->enqueue): calls __dev_xmit_skb()
> 2. No-qdisc path: calls dev_hard_start_xmit() directly
>
> Commit 745e20f1b626 ("net: add a recursion limit in xmit path") added
> recursion protection to the no-qdisc path via dev_xmit_recursion()
> check and dev_xmit_recursion_inc()/dec() tracking. However, the qdisc
> path performs no recursion depth checking at all.
>
> This allows unbounded recursion through qdisc-attached devices. For
> example, a bond interface in broadcast mode with gretap slaves whose
> remote endpoints route back through the bond creates an infinite
> transmit loop that exhausts the kernel stack:
>
> BUG: KASAN: stack-out-of-bounds in blake2s.constprop.0+0xe7/0x160
> Write of size 32 at addr ffff88810033fed0 by task kworker/0:1/11
> Workqueue: mld mld_ifc_work
> Call Trace:
> <TASK>
> __build_flow_key.constprop.0 (net/ipv4/route.c:515)
> ip_rt_update_pmtu (net/ipv4/route.c:1073)
> iptunnel_xmit (net/ipv4/ip_tunnel_core.c:84)
> ip_tunnel_xmit (net/ipv4/ip_tunnel.c:847)
> gre_tap_xmit (net/ipv4/ip_gre.c:779)
> dev_hard_start_xmit (net/core/dev.c:3887)
> sch_direct_xmit (net/sched/sch_generic.c:347)
> __dev_queue_xmit (net/core/dev.c:4802)
> bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312)
> bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279)
> bond_start_xmit (drivers/net/bonding/bond_main.c:5530)
> dev_hard_start_xmit (net/core/dev.c:3887)
> __dev_queue_xmit (net/core/dev.c:4841)
> ip_finish_output2 (net/ipv4/ip_output.c:237)
> ip_output (net/ipv4/ip_output.c:438)
> iptunnel_xmit (net/ipv4/ip_tunnel_core.c:86)
> gre_tap_xmit (net/ipv4/ip_gre.c:779)
> dev_hard_start_xmit (net/core/dev.c:3887)
> sch_direct_xmit (net/sched/sch_generic.c:347)
> __dev_queue_xmit (net/core/dev.c:4802)
> bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312)
> bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279)
> bond_start_xmit (drivers/net/bonding/bond_main.c:5530)
> dev_hard_start_xmit (net/core/dev.c:3887)
> __dev_queue_xmit (net/core/dev.c:4841)
> ip_finish_output2 (net/ipv4/ip_output.c:237)
> ip_output (net/ipv4/ip_output.c:438)
> iptunnel_xmit (net/ipv4/ip_tunnel_core.c:86)
> ip_tunnel_xmit (net/ipv4/ip_tunnel.c:847)
> gre_tap_xmit (net/ipv4/ip_gre.c:779)
> dev_hard_start_xmit (net/core/dev.c:3887)
> sch_direct_xmit (net/sched/sch_generic.c:347)
> __dev_queue_xmit (net/core/dev.c:4802)
> bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312)
> bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279)
> bond_start_xmit (drivers/net/bonding/bond_main.c:5530)
> dev_hard_start_xmit (net/core/dev.c:3887)
> __dev_queue_xmit (net/core/dev.c:4841)
> mld_sendpack
> mld_ifc_work
> process_one_work
> worker_thread
> </TASK>
>
> Rather than adding an expensive recursion check to the common qdisc
> fast path, add recursion detection directly in tunnel drivers where
> route lookups can create these loops, as suggested by Eric Dumazet.
>
> Use a lower limit (IP_TUNNEL_RECURSION_LIMIT=4) than XMIT_RECURSION_LIMIT
> (8) because each tunnel recursion level involves route lookups and full
> IP output processing, consuming significantly more stack per level.
>
> Move dev_xmit_recursion helpers from the private net/core/dev.h header
> to the public include/linux/netdevice.h so tunnel drivers can use them.
>
> Fixes: 745e20f1b626 ("net: add a recursion limit in xmit path")
> Reported-by: Xiang Mei <xmei5@xxxxxxx>
> Signed-off-by: Weiming Shi <bestswngs@xxxxxxxxx>
> ---
> v2:
> - Move recursion check from qdisc path to tunnel drivers (Eric Dumazet)
> - Add IPv6 tunnel (ip6_tunnel) coverage
> - Use lower recursion limit (4) for tunnel-specific stack consumption
>
> include/linux/netdevice.h | 32 ++++++++++++++++++++++++++++++++
> include/net/ip_tunnels.h | 7 +++++++
> net/core/dev.h | 34 ----------------------------------
> net/ipv4/ip_tunnel.c | 10 ++++++++++
> net/ipv6/ip6_tunnel.c | 10 ++++++++++
> 5 files changed, 59 insertions(+), 34 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index d4e6e00bb90a..1a4d2542dbab 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -3576,17 +3576,49 @@ struct page_pool_bh {
> };
> DECLARE_PER_CPU(struct page_pool_bh, system_page_pool);
>
> +#define XMIT_RECURSION_LIMIT 8
> +
> #ifndef CONFIG_PREEMPT_RT
> static inline int dev_recursion_level(void)
> {
> return this_cpu_read(softnet_data.xmit.recursion);
> }
> +
> +static inline bool dev_xmit_recursion(void)
> +{
> + return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
> + XMIT_RECURSION_LIMIT);
> +}
> +
> +static inline void dev_xmit_recursion_inc(void)
> +{
> + __this_cpu_inc(softnet_data.xmit.recursion);
> +}
> +
> +static inline void dev_xmit_recursion_dec(void)
> +{
> + __this_cpu_dec(softnet_data.xmit.recursion);
> +}
> #else
> static inline int dev_recursion_level(void)
> {
> return current->net_xmit.recursion;
> }
>
> +static inline bool dev_xmit_recursion(void)
> +{
> + return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT);
> +}
> +
> +static inline void dev_xmit_recursion_inc(void)
> +{
> + current->net_xmit.recursion++;
> +}
> +
> +static inline void dev_xmit_recursion_dec(void)
> +{
> + current->net_xmit.recursion--;
> +}
> #endif
>
> void __netif_schedule(struct Qdisc *q);
> diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
> index 4021e6a73e32..80662f812080 100644
> --- a/include/net/ip_tunnels.h
> +++ b/include/net/ip_tunnels.h
> @@ -27,6 +27,13 @@
> #include <net/ip6_route.h>
> #endif
>
> +/* Recursion limit for tunnel xmit to detect routing loops.
> + * Unlike XMIT_RECURSION_LIMIT (8) used in the no-qdisc path, tunnel
> + * recursion involves route lookups and full IP output, consuming much
> + * more stack per level, so a lower limit is needed.
> + */
> +#define IP_TUNNEL_RECURSION_LIMIT 4
> +
> /* Keep error state on tunnel for 30 sec */
> #define IPTUNNEL_ERR_TIMEO (30*HZ)
>
> diff --git a/net/core/dev.h b/net/core/dev.h
> index 98793a738f43..ec974b3c42d9 100644
> --- a/net/core/dev.h
> +++ b/net/core/dev.h
> @@ -366,40 +366,6 @@ static inline void napi_assert_will_not_race(const struct napi_struct *napi)
>
> void kick_defer_list_purge(unsigned int cpu);
>
> -#define XMIT_RECURSION_LIMIT 8
> -
> -#ifndef CONFIG_PREEMPT_RT
> -static inline bool dev_xmit_recursion(void)
> -{
> - return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
> - XMIT_RECURSION_LIMIT);
> -}
> -
> -static inline void dev_xmit_recursion_inc(void)
> -{
> - __this_cpu_inc(softnet_data.xmit.recursion);
> -}
> -
> -static inline void dev_xmit_recursion_dec(void)
> -{
> - __this_cpu_dec(softnet_data.xmit.recursion);
> -}
> -#else
> -static inline bool dev_xmit_recursion(void)
> -{
> - return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT);
> -}
> -
> -static inline void dev_xmit_recursion_inc(void)
> -{
> - current->net_xmit.recursion++;
> -}
> -
> -static inline void dev_xmit_recursion_dec(void)
> -{
> - current->net_xmit.recursion--;
> -}
> -#endif
>
> int dev_set_hwtstamp_phylib(struct net_device *dev,
> struct kernel_hwtstamp_config *cfg,
> diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
> index 50d0f5fe4e4c..39822e845a06 100644
> --- a/net/ipv4/ip_tunnel.c
> +++ b/net/ipv4/ip_tunnel.c
> @@ -683,6 +683,14 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
> __be32 dst;
> __be16 df;
>
> + if (dev_recursion_level() > IP_TUNNEL_RECURSION_LIMIT) {
> + net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
> + dev->name);
> + DEV_STATS_INC(dev, tx_errors);
> + kfree_skb(skb);
> + return;
> + }
> +
> inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
> connected = (tunnel->parms.iph.daddr != 0);
> payload_protocol = skb_protocol(skb, true);
> @@ -842,8 +850,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
>
> ip_tunnel_adj_headroom(dev, max_headroom);
>
> + dev_xmit_recursion_inc();
> iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
> df, !net_eq(tunnel->net, dev_net(dev)), 0);

I think this check should go in iptunnel_xmit(), or UDP tunnels will
remain unprotected.

Same thing for the IPv6 case.

> + dev_xmit_recursion_dec();
> return;
>
> #if IS_ENABLED(CONFIG_IPV6)
> diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
> index 4c29aa94e86e..55bedd5cd656 100644
> --- a/net/ipv6/ip6_tunnel.c
> +++ b/net/ipv6/ip6_tunnel.c
> @@ -1101,6 +1101,14 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
> u8 hop_limit;
> int err = -1;
>
> + if (dev_recursion_level() > IP_TUNNEL_RECURSION_LIMIT) {
> + net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
> + dev->name);
> + DEV_STATS_INC(dev, tx_errors);
> + kfree_skb(skb);
> + return -1;

I think a standard error code is expected here. Possibly -ELOOP?

/P