[PATCH net v2] net: add xmit recursion limit to tunnel drivers

From: bestswngs

Date: Tue Mar 03 2026 - 11:53:41 EST


From: Weiming Shi <bestswngs@xxxxxxxxx>

__dev_queue_xmit() has two transmit code paths depending on whether the
device has a qdisc attached:

1. Qdisc path (q->enqueue): calls __dev_xmit_skb()
2. No-qdisc path: calls dev_hard_start_xmit() directly

Commit 745e20f1b626 ("net: add a recursion limit in xmit path") added
recursion protection to the no-qdisc path via dev_xmit_recursion()
check and dev_xmit_recursion_inc()/dec() tracking. However, the qdisc
path performs no recursion depth checking at all.

This allows unbounded recursion through qdisc-attached devices. For
example, a bond interface in broadcast mode with gretap slaves whose
remote endpoints route back through the bond creates an infinite
transmit loop that exhausts the kernel stack:

BUG: KASAN: stack-out-of-bounds in blake2s.constprop.0+0xe7/0x160
Write of size 32 at addr ffff88810033fed0 by task kworker/0:1/11
Workqueue: mld mld_ifc_work
Call Trace:
<TASK>
__build_flow_key.constprop.0 (net/ipv4/route.c:515)
ip_rt_update_pmtu (net/ipv4/route.c:1073)
iptunnel_xmit (net/ipv4/ip_tunnel_core.c:84)
ip_tunnel_xmit (net/ipv4/ip_tunnel.c:847)
gre_tap_xmit (net/ipv4/ip_gre.c:779)
dev_hard_start_xmit (net/core/dev.c:3887)
sch_direct_xmit (net/sched/sch_generic.c:347)
__dev_queue_xmit (net/core/dev.c:4802)
bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312)
bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279)
bond_start_xmit (drivers/net/bonding/bond_main.c:5530)
dev_hard_start_xmit (net/core/dev.c:3887)
__dev_queue_xmit (net/core/dev.c:4841)
ip_finish_output2 (net/ipv4/ip_output.c:237)
ip_output (net/ipv4/ip_output.c:438)
iptunnel_xmit (net/ipv4/ip_tunnel_core.c:86)
gre_tap_xmit (net/ipv4/ip_gre.c:779)
dev_hard_start_xmit (net/core/dev.c:3887)
sch_direct_xmit (net/sched/sch_generic.c:347)
__dev_queue_xmit (net/core/dev.c:4802)
bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312)
bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279)
bond_start_xmit (drivers/net/bonding/bond_main.c:5530)
dev_hard_start_xmit (net/core/dev.c:3887)
__dev_queue_xmit (net/core/dev.c:4841)
ip_finish_output2 (net/ipv4/ip_output.c:237)
ip_output (net/ipv4/ip_output.c:438)
iptunnel_xmit (net/ipv4/ip_tunnel_core.c:86)
ip_tunnel_xmit (net/ipv4/ip_tunnel.c:847)
gre_tap_xmit (net/ipv4/ip_gre.c:779)
dev_hard_start_xmit (net/core/dev.c:3887)
sch_direct_xmit (net/sched/sch_generic.c:347)
__dev_queue_xmit (net/core/dev.c:4802)
bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312)
bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279)
bond_start_xmit (drivers/net/bonding/bond_main.c:5530)
dev_hard_start_xmit (net/core/dev.c:3887)
__dev_queue_xmit (net/core/dev.c:4841)
mld_sendpack
mld_ifc_work
process_one_work
worker_thread
</TASK>

Rather than adding an expensive recursion check to the common qdisc
fast path, add recursion detection directly in tunnel drivers where
route lookups can create these loops, as suggested by Eric Dumazet.

Use a lower limit (IP_TUNNEL_RECURSION_LIMIT=4) than XMIT_RECURSION_LIMIT
(8) because each tunnel recursion level involves route lookups and full
IP output processing, consuming significantly more stack per level.

Move dev_xmit_recursion helpers from the private net/core/dev.h header
to the public include/linux/netdevice.h so tunnel drivers can use them.

Fixes: 745e20f1b626 ("net: add a recursion limit in xmit path")
Reported-by: Xiang Mei <xmei5@xxxxxxx>
Signed-off-by: Weiming Shi <bestswngs@xxxxxxxxx>
---
v2:
- Move recursion check from qdisc path to tunnel drivers (Eric Dumazet)
- Add IPv6 tunnel (ip6_tunnel) coverage
- Use lower recursion limit (4) for tunnel-specific stack consumption

include/linux/netdevice.h | 32 ++++++++++++++++++++++++++++++++
include/net/ip_tunnels.h | 7 +++++++
net/core/dev.h | 34 ----------------------------------
net/ipv4/ip_tunnel.c | 10 ++++++++++
net/ipv6/ip6_tunnel.c | 10 ++++++++++
5 files changed, 59 insertions(+), 34 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d4e6e00bb90a..1a4d2542dbab 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3576,17 +3576,49 @@ struct page_pool_bh {
};
DECLARE_PER_CPU(struct page_pool_bh, system_page_pool);

+#define XMIT_RECURSION_LIMIT 8
+
#ifndef CONFIG_PREEMPT_RT
static inline int dev_recursion_level(void)
{
return this_cpu_read(softnet_data.xmit.recursion);
}
+
+static inline bool dev_xmit_recursion(void)
+{
+ return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
+ XMIT_RECURSION_LIMIT);
+}
+
+static inline void dev_xmit_recursion_inc(void)
+{
+ __this_cpu_inc(softnet_data.xmit.recursion);
+}
+
+static inline void dev_xmit_recursion_dec(void)
+{
+ __this_cpu_dec(softnet_data.xmit.recursion);
+}
#else
static inline int dev_recursion_level(void)
{
return current->net_xmit.recursion;
}

+static inline bool dev_xmit_recursion(void)
+{
+ return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT);
+}
+
+static inline void dev_xmit_recursion_inc(void)
+{
+ current->net_xmit.recursion++;
+}
+
+static inline void dev_xmit_recursion_dec(void)
+{
+ current->net_xmit.recursion--;
+}
#endif

void __netif_schedule(struct Qdisc *q);
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 4021e6a73e32..80662f812080 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -27,6 +27,13 @@
#include <net/ip6_route.h>
#endif

+/* Recursion limit for tunnel xmit to detect routing loops.
+ * Unlike XMIT_RECURSION_LIMIT (8) used in the no-qdisc path, tunnel
+ * recursion involves route lookups and full IP output, consuming much
+ * more stack per level, so a lower limit is needed.
+ */
+#define IP_TUNNEL_RECURSION_LIMIT 4
+
/* Keep error state on tunnel for 30 sec */
#define IPTUNNEL_ERR_TIMEO (30*HZ)

diff --git a/net/core/dev.h b/net/core/dev.h
index 98793a738f43..ec974b3c42d9 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -366,40 +366,6 @@ static inline void napi_assert_will_not_race(const struct napi_struct *napi)

void kick_defer_list_purge(unsigned int cpu);

-#define XMIT_RECURSION_LIMIT 8
-
-#ifndef CONFIG_PREEMPT_RT
-static inline bool dev_xmit_recursion(void)
-{
- return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
- XMIT_RECURSION_LIMIT);
-}
-
-static inline void dev_xmit_recursion_inc(void)
-{
- __this_cpu_inc(softnet_data.xmit.recursion);
-}
-
-static inline void dev_xmit_recursion_dec(void)
-{
- __this_cpu_dec(softnet_data.xmit.recursion);
-}
-#else
-static inline bool dev_xmit_recursion(void)
-{
- return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT);
-}
-
-static inline void dev_xmit_recursion_inc(void)
-{
- current->net_xmit.recursion++;
-}
-
-static inline void dev_xmit_recursion_dec(void)
-{
- current->net_xmit.recursion--;
-}
-#endif

int dev_set_hwtstamp_phylib(struct net_device *dev,
struct kernel_hwtstamp_config *cfg,
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 50d0f5fe4e4c..39822e845a06 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -683,6 +683,14 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
__be32 dst;
__be16 df;

+ if (dev_recursion_level() > IP_TUNNEL_RECURSION_LIMIT) {
+ net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
+ dev->name);
+ DEV_STATS_INC(dev, tx_errors);
+ kfree_skb(skb);
+ return;
+ }
+
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
connected = (tunnel->parms.iph.daddr != 0);
payload_protocol = skb_protocol(skb, true);
@@ -842,8 +850,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,

ip_tunnel_adj_headroom(dev, max_headroom);

+ dev_xmit_recursion_inc();
iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
df, !net_eq(tunnel->net, dev_net(dev)), 0);
+ dev_xmit_recursion_dec();
return;

#if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 4c29aa94e86e..55bedd5cd656 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1101,6 +1101,14 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
u8 hop_limit;
int err = -1;

+ if (dev_recursion_level() > IP_TUNNEL_RECURSION_LIMIT) {
+ net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
+ dev->name);
+ DEV_STATS_INC(dev, tx_errors);
+ kfree_skb(skb);
+ return -1;
+ }
+
payload_protocol = skb_protocol(skb, true);

if (t->parms.collect_md) {
@@ -1277,7 +1285,9 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
ipv6h->nexthdr = proto;
ipv6h->saddr = fl6->saddr;
ipv6h->daddr = fl6->daddr;
+ dev_xmit_recursion_inc();
ip6tunnel_xmit(NULL, skb, dev, 0);
+ dev_xmit_recursion_dec();
return 0;
tx_err_link_failure:
DEV_STATS_INC(dev, tx_carrier_errors);
--
2.43.0