Kmemleak false positives in mld_newpack() and ndisc_alloc_skb()

From: Gavrilov Ilia
Date: Mon Sep 09 2024 - 10:24:19 EST


Hi,
my local syzbot reported memory leak issues at mld_newpack():

BUG: memory leak
unreferenced object 0xffff888105083000 (size 2048):
comm "softirq", pid 0, jiffies 4295566459 (age 39.329s)
hex dump (first 32 bytes):
00 00 33 33 00 00 00 16 aa aa aa aa aa 0c 86 dd ..33............
60 00 00 00 00 4c 00 01 00 00 00 00 00 00 00 00 `....L..........
backtrace:
[<00000000998fe539>] __kmalloc_reserve net/core/skbuff.c:142 [inline]
[<00000000998fe539>] __alloc_skb+0xac/0x630 net/core/skbuff.c:210
[<0000000096d01817>] alloc_skb include/linux/skbuff.h:1096 [inline]
[<0000000096d01817>] alloc_skb_with_frags+0x92/0x570 net/core/skbuff.c:5896
[<000000005e7dbbfe>] sock_alloc_send_pskb+0x797/0x920 net/core/sock.c:2348
[<0000000021266c13>] mld_newpack+0x1d2/0x760 net/ipv6/mcast.c:1604
[<000000007bf59075>] add_grhead+0x269/0x340 net/ipv6/mcast.c:1707
[<0000000080a93257>] add_grec+0xe30/0x10a0 net/ipv6/mcast.c:1838
[<00000000206e567d>] mld_send_cr net/ipv6/mcast.c:1964 [inline]
[<00000000206e567d>] mld_ifc_timer_expire+0x596/0xf10 net/ipv6/mcast.c:2471
[<0000000023c64f57>] call_timer_fn+0x181/0x5f0 kernel/time/timer.c:1414
[<000000001a53ddf1>] expire_timers kernel/time/timer.c:1459 [inline]
[<000000001a53ddf1>] __run_timers.part.0+0x66b/0xa50 kernel/time/timer.c:1753
[<0000000070064c8e>] __run_timers kernel/time/timer.c:1731 [inline]
[<0000000070064c8e>] run_timer_softirq+0xb3/0x1d0 kernel/time/timer.c:1766
[<00000000d67c96e4>] __do_softirq+0x286/0x9a3 kernel/softirq.c:298
[<0000000002a81fe9>] asm_call_irq_on_stack+0xf/0x20
[<000000001b74b134>] __run_on_irqstack arch/x86/include/asm/irq_stack.h:26 [inline]
[<000000001b74b134>] run_on_irqstack_cond arch/x86/include/asm/irq_stack.h:77 [inline]
[<000000001b74b134>] do_softirq_own_stack+0xaa/0xe0 arch/x86/kernel/irq_64.c:77
[<000000009c0f9a71>] invoke_softirq kernel/softirq.c:393 [inline]
[<000000009c0f9a71>] __irq_exit_rcu kernel/softirq.c:423 [inline]
[<000000009c0f9a71>] irq_exit_rcu+0x136/0x200 kernel/softirq.c:435
[<0000000072169c58>] sysvec_apic_timer_interrupt+0x4d/0x100 arch/x86/kernel/apic/apic.c:1095
[<000000006d64a7a1>] asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:635

I reproduced these issues and confirmed that they still exist in the latest
upstream and in 5.10 stable releases. As it turned out later, all these leaks
are false positives.

I noticed that the detected leaks relate only to network packets.
The struct sk_buff itself is released and isn't detected as a memory leak,
but the kmemleak detector signals a leak of the "head" buffer containing
struct skb_shared_info and pointed to by sk_buff->head.

I found a similar issue
https://syzkaller.appspot.com/bug?id=ceee5d7bbf373a903551c396c66c0f7cb98d9bdd
which causes the same false positive leaks.

In all cases a network bridge is used.

In most cases, three types of leaks are detected
(all packets belong to the IPv6 protocol):

1) When sending Router Solicitation type 133
(ICMPv6, Neighbor Discovery Protocol (NDP))

BUG: memory leak
unreferenced object ffff88801a526300 (size 704):

comm "softirq", pid 0, jiffies 4294749186
hex dump (first 32 bytes):
00 00 33 33 00 00 00 02 aa aa aa aa aa 0c 86 dd ..33............
60 00 00 00 00 10 3a ff fe 80 00 00 00 00 00 00 `.....:.........
backtrace (crc 3ab17827):
[<ffffffff81d3ebd5>] kmem_cache_alloc_node+0x2d5/0x380
[<ffffffff868cda3a>] kmalloc_reserve+0x16a/0x260
[<ffffffff868d6d76>] __alloc_skb+0x126/0x330
[<ffffffff875de515>] ndisc_alloc_skb+0x135/0x340
[<ffffffff875eb81e>] ndisc_send_rs+0x7e/0x910
[<ffffffff87577ffb>] addrconf_rs_timer+0x2fb/0x7e0
[<ffffffff8175f42a>] call_timer_fn+0x17a/0x500
[<ffffffff8175fe44>] __run_timers.part.0+0x684/0x970
[<ffffffff817601fa>] run_timer_softirq+0xba/0x1d0
[<ffffffff88a1932f>] __do_softirq+0x1df/0x8ce

2) When sending Neighbor Solicitation (Type 135)

BUG: memory leak
unreferenced object ffff888015215500 (size 704):
comm "kworker/1:1", pid 49, jiffies 4294733392
hex dump (first 32 bytes):
00 a0 33 33 ff aa aa 0a aa aa aa aa aa 0a 86 dd ..33............
60 00 00 00 00 20 3a ff 00 00 00 00 00 00 00 00 `.... :.........
backtrace (crc 7d46e68f):
[<ffffffff81d3ebd5>] kmem_cache_alloc_node+0x2d5/0x380
[<ffffffff868cda3a>] kmalloc_reserve+0x16a/0x260
[<ffffffff868d6d76>] __alloc_skb+0x126/0x330
[<ffffffff875de705>] ndisc_alloc_skb+0x135/0x340
[<ffffffff875def82>] ndisc_ns_create+0x192/0xc40
[<ffffffff875eb538>] ndisc_send_ns+0x98/0x130
[<ffffffff8757a2bf>] addrconf_dad_work+0xc5f/0x14a0
[<ffffffff81541cca>] process_one_work+0x7ca/0x1450
[<ffffffff815431ce>] worker_thread+0x86e/0x1230
[<ffffffff81564639>] kthread+0x339/0x440
[<ffffffff8131a088>] ret_from_fork+0x48/0x80
[<ffffffff81004ddb>] ret_from_fork_asm+0x1b/0x30

3) When sending MLDv2 Multicast Listener Report (Type 143)

BUG: memory leak
unreferenced object ffff8881055df000 (size 2048):
comm "kworker/3:2", pid 272, jiffies 4294729488
hex dump (first 32 bytes):
00 00 33 33 00 00 00 16 de c9 ae f0 47 15 86 dd ..33........G...
60 00 00 00 00 24 00 01 00 00 00 00 00 00 00 00 `....$..........
backtrace (crc 981e6671):
[<ffffffff81d40443>] __kmalloc_node_track_caller+0x3b3/0x4b0
[<ffffffff868cd9ef>] kmalloc_reserve+0xef/0x260
[<ffffffff868d6d56>] __alloc_skb+0x126/0x330
[<ffffffff868eed04>] alloc_skb_with_frags+0xe4/0x710
[<ffffffff868b76f8>] sock_alloc_send_pskb+0x7e8/0x970
[<ffffffff87616d63>] mld_newpack.isra.0+0x1e3/0xa90
[<ffffffff876178ac>] add_grhead+0x28c/0x380
[<ffffffff87620437>] add_grec+0x11e7/0x18a0
[<ffffffff8762528f>] mld_ifc_work+0x41f/0xcd0
[<ffffffff81541cca>] process_one_work+0x7ca/0x1450
[<ffffffff815431ce>] worker_thread+0x86e/0x1230
[<ffffffff81564639>] kthread+0x339/0x440
[<ffffffff8131a088>] ret_from_fork+0x48/0x80
[<ffffffff81004ddb>] ret_from_fork_asm+0x1b/0x30

To make sure that these aren't leaks, I made a patch that adds fields
to struct skb_shared_info for debugging. The patch sets a unique ID for
the network packet when the packet is allocated and prints this ID when
the packet is deleted in the skb_release_data() function.

Patch fragment:

+struct skb_dbg {
+ long id; // a unique ID for skb_shared_info
+ atomic_t clone_count; // the number of cloning
+};

@@ -603,6 +622,7 @@ struct skb_shared_info {
unsigned int gso_type;
u32 tskey;

+ struct skb_dbg dbg;


I also tried to dump leaked objects in this way:

BUG: memory leak
unreferenced object 0xffff888082831000 (size 2048):
...
BUG: memory leak
unreferenced object 0xffff888103c3de00 (size 704):
...
BUG: memory leak
unreferenced object 0xffff888081ecd000 (size 2048):
...

root@syzkaller:~# echo dump=0xffff888082831000 > /sys/kernel/debug/kmemleak
[ 192.540842] kmemleak: Unknown object at 0xffff888082831000

root@syzkaller:~# echo dump=0xffff888103c3de00 > /sys/kernel/debug/kmemleak
[ 211.628825] kmemleak: Unknown object at 0xffff888103c3de00

root@syzkaller:~# echo dump=0xffff888081ecd000 > /sys/kernel/debug/kmemleak
[ 220.300779] kmemleak: Unknown object at 0xffff888081ecd000

As one can see, these objects aren't in the list of kmemleak detector objects.

I assume that one of the reasons for the leaks is that after cloning a packet,
the kmemleak detector somehow doesn't take sk_buff->head links into account
during the scan. In scenarios with bridge, cloning happens quite a lot.

Interestingly, if I add a single line to __skb_clone() that increases
the cloning counter, the leaks are no longer detected:

@@ -1614,6 +1619,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
refcount_set(&n->users, 1);

atomic_inc(&(skb_shinfo(skb)->dataref));
+ atomic_inc(&(skb_shinfo(skb)->dbg.clone_count)); // when you add this line, the leaks disappear
skb->cloned = 1;

I'm attaching a slightly simplified reproducer:

=* repro.c =*
#define _GNU_SOURCE

#include <arpa/inet.h>
#include <dirent.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <netinet/in.h>
#include <sched.h>
#include <setjmp.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#include <linux/capability.h>
#include <linux/genetlink.h>
#include <linux/if_addr.h>
#include <linux/if_ether.h>
#include <linux/if_link.h>

#include <linux/in6.h>
#include <linux/ip.h>
#include <linux/neighbour.h>
#include <linux/net.h>
#include <linux/netlink.h>
//#include <linux/nl80211.h>
#include <linux/rfkill.h>
#include <linux/rtnetlink.h>
#include <linux/tcp.h>
#include <linux/veth.h>

/*
 * Numeric id of this executor; only used to build the per-process cgroup
 * directory names ("syz%llu"). Nothing in this file assigns it, so it keeps
 * its zero initial value.
 */
static unsigned long long procid;

/* Per-thread counter: non-zero while a NONFAILING() region is executing. */
static __thread int skip_segv;
/* Per-thread jump buffer armed by NONFAILING() and used by segv_handler(). */
static __thread jmp_buf segv_env;

/*
 * SIGSEGV/SIGBUS handler.
 *
 * If the fault happened inside a NONFAILING() region (skip_segv != 0) and the
 * faulting address lies outside the [1 MiB, 100 MiB] window, jump back to the
 * _setjmp() point saved in segv_env. In every other case terminate the
 * process, using the signal number as the exit status.
 */
static void segv_handler(int sig, siginfo_t *info, void *ctx)
{
	const uintptr_t window_lo = 1 << 20;
	const uintptr_t window_hi = 100 << 20;
	const uintptr_t fault_addr = (uintptr_t)info->si_addr;
	const int inside_window = fault_addr >= window_lo && fault_addr <= window_hi;
	const int guarded = __atomic_load_n(&skip_segv, __ATOMIC_RELAXED) != 0;

	if (!guarded || inside_window)
		exit(sig);
	_longjmp(segv_env, 1);
}

static void install_segv_handler(void)
{
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_handler = SIG_IGN;
syscall(SYS_rt_sigaction, 0x20, &sa, NULL, 8);
syscall(SYS_rt_sigaction, 0x21, &sa, NULL, 8);
memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = segv_handler;
sa.sa_flags = SA_NODEFER | SA_SIGINFO;
sigaction(SIGSEGV, &sa, NULL);
sigaction(SIGBUS, &sa, NULL);
}

/*
 * NONFAILING(stmts...): run the given statements and survive a SIGSEGV/SIGBUS
 * raised by them.
 *
 * The macro bumps skip_segv, arms segv_env with _setjmp(), runs the
 * statements, and drops skip_segv again. If segv_handler() longjmps back,
 * the statements are abandoned and `ok` becomes 0. The whole expression
 * evaluates to `ok` (1 = completed, 0 = faulted). Relies on the GNU
 * statement-expression extension.
 */
#define NONFAILING(...) \
({ \
int ok = 1; \
__atomic_fetch_add(&skip_segv, 1, __ATOMIC_SEQ_CST); \
if (_setjmp(segv_env) == 0) { \
__VA_ARGS__; \
} else \
ok = 0; \
__atomic_fetch_sub(&skip_segv, 1, __ATOMIC_SEQ_CST); \
ok; \
})

/* Sleep for `ms` milliseconds (converted to microseconds for usleep()). */
static void sleep_ms(uint64_t ms)
{
	const uint64_t usec = ms * 1000;

	usleep(usec);
}

/*
 * Return the CLOCK_MONOTONIC time in milliseconds.
 * Exits with status 1 if the clock cannot be read.
 */
static uint64_t current_time_ms(void)
{
	struct timespec now;
	uint64_t sec_part;
	uint64_t nsec_part;

	if (clock_gettime(CLOCK_MONOTONIC, &now) != 0)
		exit(1);
	sec_part = (uint64_t)now.tv_sec * 1000;
	nsec_part = (uint64_t)now.tv_nsec / 1000000;
	return sec_part + nsec_part;
}

/*
 * Create a fresh "./syzkaller.XXXXXX" directory, make it world-accessible
 * (mode 0777) and chdir into it. Any failure terminates with status 1.
 */
static void use_temporary_dir(void)
{
	char path[] = "./syzkaller.XXXXXX";

	/* mkdtemp() fills in the template in place and returns it on success. */
	if (mkdtemp(path) == NULL)
		exit(1);
	if (chmod(path, 0777) != 0 || chdir(path) != 0)
		exit(1);
}

/*
 * Format a printf-style message (at most 1023 bytes) and write it to an
 * existing file in a single write().
 *
 * Returns true when the whole message was written. Returns false when the
 * file cannot be opened or the write is short/failed; in the latter case
 * errno from write() is preserved across close().
 */
static bool write_file(const char *file, const char *what, ...)
{
	char text[1024];
	va_list ap;

	va_start(ap, what);
	vsnprintf(text, sizeof(text), what, ap);
	va_end(ap);
	text[sizeof(text) - 1] = 0;

	const int text_len = strlen(text);
	const int fd = open(file, O_WRONLY | O_CLOEXEC);
	if (fd == -1)
		return false;

	const bool complete = write(fd, text, text_len) == text_len;
	if (complete) {
		close(fd);
	} else {
		const int saved_errno = errno;
		close(fd);
		errno = saved_errno;
	}
	return complete;
}

/*
 * Scratch state for building one netlink request in place; the same buffer
 * is reused by netlink_send_ext() to receive the reply.
 */
struct nlmsg {
/* Write cursor: next free byte inside buf. */
char *pos;
/* Number of nested attributes currently open (index into nested[]). */
int nesting;
/* Stack of open nested attributes; netlink_done() pops and sizes them. */
struct nlattr *nested[8];
/* Message bytes: nlmsghdr, fixed payload, then attributes. */
char buf[4096];
};

/*
 * Reset `nlmsg` and start a new request.
 *
 * Writes a netlink header with message type `typ` and flags
 * NLM_F_REQUEST | NLM_F_ACK | `flags`, copies the `size`-byte fixed payload
 * `data` right behind it, and leaves the cursor at the aligned end of that
 * payload so attributes can be appended.
 */
static void netlink_init(struct nlmsg *nlmsg, int typ, int flags,
			 const void *data, int size)
{
	struct nlmsghdr *hdr;
	char *payload;

	memset(nlmsg, 0, sizeof(*nlmsg));

	hdr = (struct nlmsghdr *)nlmsg->buf;
	hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags;
	hdr->nlmsg_type = typ;

	payload = (char *)(hdr + 1);
	memcpy(payload, data, size);
	nlmsg->pos = payload + NLMSG_ALIGN(size);
}

/*
 * Append one attribute of type `typ` carrying `size` bytes of `data` at the
 * current cursor, then advance the cursor by the aligned attribute length.
 * No bounds check happens here; netlink_send_ext() verifies the cursor later.
 */
static void netlink_attr(struct nlmsg *nlmsg, int typ, const void *data,
			 int size)
{
	struct nlattr *const attr = (struct nlattr *)nlmsg->pos;

	attr->nla_type = typ;
	attr->nla_len = sizeof(*attr) + size;
	if (size > 0) {
		void *payload = attr + 1;
		memcpy(payload, data, size);
	}
	nlmsg->pos += NLMSG_ALIGN(attr->nla_len);
}

/*
 * Open a nested attribute of type `typ` at the cursor: write its header
 * (length is filled in by netlink_done()), push it on the nesting stack and
 * move the cursor past the header.
 */
static void netlink_nest(struct nlmsg *nlmsg, int typ)
{
	struct nlattr *const container = (struct nlattr *)nlmsg->pos;
	const int slot = nlmsg->nesting;

	container->nla_type = typ;
	nlmsg->nested[slot] = container;
	nlmsg->nesting = slot + 1;
	nlmsg->pos += sizeof(*container);
}

/*
 * Close the most recently opened nested attribute: pop it from the stack and
 * set its length to everything written since its header.
 */
static void netlink_done(struct nlmsg *nlmsg)
{
	struct nlattr *container;

	nlmsg->nesting -= 1;
	container = nlmsg->nested[nlmsg->nesting];
	container->nla_len = nlmsg->pos - (char *)container;
}

/*
 * Finalize the request in `nlmsg`, send it on `sock` and read one reply into
 * the same buffer.
 *
 * Returns 0 when the reply is NLMSG_DONE, or when `reply_len` is non-NULL and
 * the reply has type `reply_type` (then *reply_len is the received size).
 * For an NLMSG_ERROR reply, errno is set to the positive error code and its
 * negation is returned (0 for a plain ACK). On transport failures or a
 * malformed reply the function returns -1, or exits with status 1 when
 * `dofail` is set. A message that overflowed the buffer or still has open
 * nests always exits.
 */
static int netlink_send_ext(struct nlmsg *nlmsg, int sock, uint16_t reply_type,
			    int *reply_len, bool dofail)
{
	struct nlmsghdr *const hdr = (struct nlmsghdr *)nlmsg->buf;
	char *const buf_end = nlmsg->buf + sizeof(nlmsg->buf);
	struct sockaddr_nl kernel;
	ssize_t got;

	if (nlmsg->nesting || nlmsg->pos > buf_end)
		exit(1);
	hdr->nlmsg_len = nlmsg->pos - nlmsg->buf;

	memset(&kernel, 0, sizeof(kernel));
	kernel.nl_family = AF_NETLINK;
	got = sendto(sock, nlmsg->buf, hdr->nlmsg_len, 0,
		     (struct sockaddr *)&kernel, sizeof(kernel));
	if (got != (ssize_t)hdr->nlmsg_len)
		goto fail;

	got = recv(sock, nlmsg->buf, sizeof(nlmsg->buf), 0);
	if (reply_len != NULL)
		*reply_len = 0;
	if (got < 0)
		goto fail;
	if (got < (ssize_t)sizeof(struct nlmsghdr))
		goto fail_invalid;

	if (hdr->nlmsg_type == NLMSG_DONE)
		return 0;
	if (reply_len != NULL && hdr->nlmsg_type == reply_type) {
		*reply_len = got;
		return 0;
	}

	/* Anything else must be a complete NLMSG_ERROR record. */
	if (got < (ssize_t)(sizeof(struct nlmsghdr) + sizeof(struct nlmsgerr)))
		goto fail_invalid;
	if (hdr->nlmsg_type != NLMSG_ERROR)
		goto fail_invalid;

	errno = -((struct nlmsgerr *)(hdr + 1))->error;
	return -errno;

fail_invalid:
	errno = EINVAL;
fail:
	if (dofail)
		exit(1);
	return -1;
}

/*
 * Send the request and wait for its reply with no expected reply type and
 * dofail = true, i.e. transport errors and malformed replies terminate the
 * process. Returns netlink_send_ext()'s result (0 or a negative error code).
 */
static int netlink_send(struct nlmsg *nlmsg, int sock)
{
return netlink_send_ext(nlmsg, sock, 0, NULL, true);
}


/*
 * Start an RTM_NEWLINK (create, exclusive) request for a device of kind
 * `type`, optionally named `name`. The IFLA_LINKINFO nest is left OPEN so the
 * caller can add kind-specific data; the caller must close it with
 * netlink_done().
 */
static void netlink_add_device_impl(struct nlmsg *nlmsg, const char *type,
				    const char *name)
{
	struct ifinfomsg link;

	memset(&link, 0, sizeof(link));
	netlink_init(nlmsg, RTM_NEWLINK, NLM_F_EXCL | NLM_F_CREATE, &link,
		     sizeof(link));
	if (name != NULL)
		netlink_attr(nlmsg, IFLA_IFNAME, name, strlen(name));

	netlink_nest(nlmsg, IFLA_LINKINFO);
	netlink_attr(nlmsg, IFLA_INFO_KIND, type, strlen(type));
}

/*
 * Create a network device of kind `type` (optionally named `name`).
 * A negative reply from the kernel is deliberately ignored (best effort).
 */
static void netlink_add_device(struct nlmsg *nlmsg, int sock, const char *type,
			       const char *name)
{
	netlink_add_device_impl(nlmsg, type, name);
	netlink_done(nlmsg);
	/* Result intentionally discarded: the device may already exist. */
	(void)netlink_send(nlmsg, sock);
}

/*
 * Create a veth pair: `name` on one side and `peer` on the other.
 *
 * The peer is described inside IFLA_LINKINFO > IFLA_INFO_DATA >
 * VETH_INFO_PEER; the cursor is advanced over an ifinfomsg-sized area (left
 * as zeroed by netlink_init()) before the peer's IFLA_IFNAME. A negative
 * kernel reply is ignored.
 */
static void netlink_add_veth(struct nlmsg *nlmsg, int sock, const char *name,
			     const char *peer)
{
	int open_nests;

	netlink_add_device_impl(nlmsg, "veth", name);
	netlink_nest(nlmsg, IFLA_INFO_DATA);
	netlink_nest(nlmsg, VETH_INFO_PEER);
	nlmsg->pos += sizeof(struct ifinfomsg);
	netlink_attr(nlmsg, IFLA_IFNAME, peer, strlen(peer));

	/* Close VETH_INFO_PEER, IFLA_INFO_DATA and IFLA_LINKINFO. */
	for (open_nests = 3; open_nests > 0; open_nests--)
		netlink_done(nlmsg);

	(void)netlink_send(nlmsg, sock);
}

/*
 * Modify the existing device `name` with one RTM_NEWLINK request.
 *
 *  up       - also bring the interface up (IFF_UP)
 *  master   - if non-NULL, enslave the device to this master interface
 *  mac      - link-layer address to set; applied only when macsize != 0
 *  new_name - if non-NULL, rename the device
 *
 * A negative kernel reply is ignored.
 */
static void netlink_device_change(struct nlmsg *nlmsg, int sock,
				  const char *name, bool up, const char *master,
				  const void *mac, int macsize,
				  const char *new_name)
{
	struct ifinfomsg link;

	memset(&link, 0, sizeof(link));
	if (up) {
		link.ifi_change = IFF_UP;
		link.ifi_flags = IFF_UP;
	}
	link.ifi_index = if_nametoindex(name);
	netlink_init(nlmsg, RTM_NEWLINK, 0, &link, sizeof(link));

	if (new_name != NULL)
		netlink_attr(nlmsg, IFLA_IFNAME, new_name, strlen(new_name));
	if (master != NULL) {
		const int master_index = if_nametoindex(master);
		netlink_attr(nlmsg, IFLA_MASTER, &master_index,
			     sizeof(master_index));
	}
	if (macsize != 0)
		netlink_attr(nlmsg, IFLA_ADDRESS, mac, macsize);

	(void)netlink_send(nlmsg, sock);
}

/*
 * Add (or replace) an address on device `dev`.
 *
 * An `addrsize` of 4 selects AF_INET with a /24 prefix; any other size is
 * treated as AF_INET6 with a /120 prefix. Returns netlink_send()'s result.
 */
static int netlink_add_addr(struct nlmsg *nlmsg, int sock, const char *dev,
			    const void *addr, int addrsize)
{
	const bool is_ipv4 = addrsize == 4;
	struct ifaddrmsg req;

	memset(&req, 0, sizeof(req));
	req.ifa_family = is_ipv4 ? AF_INET : AF_INET6;
	req.ifa_prefixlen = is_ipv4 ? 24 : 120;
	req.ifa_scope = RT_SCOPE_UNIVERSE;
	req.ifa_index = if_nametoindex(dev);

	netlink_init(nlmsg, RTM_NEWADDR, NLM_F_CREATE | NLM_F_REPLACE, &req,
		     sizeof(req));
	netlink_attr(nlmsg, IFA_LOCAL, addr, addrsize);
	netlink_attr(nlmsg, IFA_ADDRESS, addr, addrsize);
	return netlink_send(nlmsg, sock);
}

/*
 * Parse the dotted-quad string `addr` and assign it to `dev`.
 * Neither the inet_pton() result nor the kernel's reply is checked.
 */
static void netlink_add_addr4(struct nlmsg *nlmsg, int sock, const char *dev,
			      const char *addr)
{
	struct in_addr parsed;

	inet_pton(AF_INET, addr, &parsed);
	(void)netlink_add_addr(nlmsg, sock, dev, &parsed, sizeof(parsed));
}

/*
 * Parse the textual IPv6 address `addr` and assign it to `dev`.
 * Neither the inet_pton() result nor the kernel's reply is checked.
 */
static void netlink_add_addr6(struct nlmsg *nlmsg, int sock, const char *dev,
			      const char *addr)
{
	struct in6_addr parsed;

	inet_pton(AF_INET6, addr, &parsed);
	(void)netlink_add_addr(nlmsg, sock, dev, &parsed, sizeof(parsed));
}

/* Single shared request buffer used by initialize_netdevices(). */
static struct nlmsg nlmsg;

/* printf templates for per-device addresses; the argument is (index + 10). */
#define DEV_IPV4 "172.20.20.%d"
#define DEV_IPV6 "fe80::%02x"
/*
 * Base value for the per-device MAC; initialize_netdevices() adds
 * (index + 10) << 40 and passes the in-memory bytes of the result as the
 * address.
 */
#define DEV_MAC 0x00aaaaaaaaaa

/*
 * Build the virtual network used by the reproducer, in the current network
 * namespace:
 *
 *   1. create "bridge0" and one unnamed veth device;
 *   2. create the veth pairs bridge_slave_0 <-> veth0_to_bridge and
 *      bridge_slave_1 <-> veth1_to_bridge, and enslave both bridge_slave_*
 *      ends to bridge0;
 *   3. bring bridge_slave_0/1 up;
 *   4. give bridge0, veth0_to_bridge and veth1_to_bridge an IPv4 address,
 *      an IPv6 address and a MAC derived from their index, bring them up,
 *      and print one summary line per device.
 *
 * Exits with status 1 if the rtnetlink socket cannot be created; individual
 * netlink requests are best effort.
 *
 * Changes from the original: the unused outer `addr` buffer (shadowed by the
 * per-device one) is gone, all name/address formatting is bounded with
 * snprintf(), and the "%d" conversion now receives an int.
 */
static void initialize_netdevices(void)
{
	const struct {
		const char *type;
		const char *dev;
	} devtypes[] = {
		{ "bridge", "bridge0" },
		{ "veth", 0 },
	};

	const char *devmasters[] = {
		"bridge",
	};

	const struct {
		const char *name;
		int macsize;
	} devices[] = {
		//{ "lo", ETH_ALEN },
		{ "bridge0", ETH_ALEN },
		// { "veth0", ETH_ALEN },
		// { "veth1", ETH_ALEN },
		{ "veth0_to_bridge", ETH_ALEN },
		{ "veth1_to_bridge", ETH_ALEN },
	};
	unsigned i;
	int sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (sock == -1)
		exit(1);

	for (i = 0; i < sizeof(devtypes) / sizeof(devtypes[0]); i++)
		netlink_add_device(&nlmsg, sock, devtypes[i].type,
				   devtypes[i].dev);

	for (i = 0; i < sizeof(devmasters) / sizeof(devmasters[0]); i++) {
		char master[32], slave0[32], veth0[32], slave1[32], veth1[32];

		snprintf(slave0, sizeof(slave0), "%s_slave_0", devmasters[i]);
		snprintf(veth0, sizeof(veth0), "veth0_to_%s", devmasters[i]);
		netlink_add_veth(&nlmsg, sock, slave0, veth0);

		snprintf(slave1, sizeof(slave1), "%s_slave_1", devmasters[i]);
		snprintf(veth1, sizeof(veth1), "veth1_to_%s", devmasters[i]);
		netlink_add_veth(&nlmsg, sock, slave1, veth1);

		snprintf(master, sizeof(master), "%s0", devmasters[i]);
		netlink_device_change(&nlmsg, sock, slave0, false, master, 0, 0,
				      NULL);
		netlink_device_change(&nlmsg, sock, slave1, false, master, 0, 0,
				      NULL);
	}
	netlink_device_change(&nlmsg, sock, "bridge_slave_0", true, 0, 0, 0,
			      NULL);
	netlink_device_change(&nlmsg, sock, "bridge_slave_1", true, 0, 0, 0,
			      NULL);

	printf("-------------------------\n");
	for (i = 0; i < sizeof(devices) / sizeof(devices[0]); i++) {
		char addr[32];
		/* The low six in-memory bytes of macaddr are used as the MAC. */
		uint64_t macaddr = DEV_MAC + ((i + 10ull) << 40);
		const uint8_t *mac = (const uint8_t *)&macaddr;

		snprintf(addr, sizeof(addr), DEV_IPV4, (int)(i + 10));
		netlink_add_addr4(&nlmsg, sock, devices[i].name, addr);

		/* addr is reused: from here on it holds the IPv6 address. */
		snprintf(addr, sizeof(addr), DEV_IPV6, i + 10);
		netlink_add_addr6(&nlmsg, sock, devices[i].name, addr);
		printf("device: %16s addr %s mac %02x:%02x:%02x:%02x:%02x:%02x\n",
		       devices[i].name, addr,
		       mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);

		netlink_device_change(&nlmsg, sock, devices[i].name, true, 0,
				      &macaddr, devices[i].macsize, NULL);
	}
	printf("-------------------------\n");
	close(sock);
}

/*
 * Sizes and byte offsets named after io_uring submission/completion ring
 * fields.
 * NOTE(review): nothing in this reproducer references these macros;
 * presumably they are leftovers from the generated syzkaller template.
 */
#define SIZEOF_IO_URING_SQE 64
#define SIZEOF_IO_URING_CQE 16
#define SQ_HEAD_OFFSET 0
#define SQ_TAIL_OFFSET 64
#define SQ_RING_MASK_OFFSET 256
#define SQ_RING_ENTRIES_OFFSET 264
#define SQ_FLAGS_OFFSET 276
#define SQ_DROPPED_OFFSET 272
#define CQ_HEAD_OFFSET 128
#define CQ_TAIL_OFFSET 192
#define CQ_RING_MASK_OFFSET 260
#define CQ_RING_ENTRIES_OFFSET 268
#define CQ_RING_OVERFLOW_OFFSET 284
#define CQ_FLAGS_OFFSET 280
#define CQ_CQES_OFFSET 320

/*
 * Local declaration of the submission-queue offsets record embedded in
 * struct io_uring_params below.
 * NOTE(review): presumably mirrors the kernel UAPI struct of the same name;
 * it is not used by any code in this file.
 */
struct io_sqring_offsets {
uint32_t head;
uint32_t tail;
uint32_t ring_mask;
uint32_t ring_entries;
uint32_t flags;
uint32_t dropped;
uint32_t array;
uint32_t resv1;
uint64_t resv2;
};

/*
 * Local declaration of the completion-queue offsets record embedded in
 * struct io_uring_params below.
 * NOTE(review): presumably mirrors the kernel UAPI struct of the same name;
 * it is not used by any code in this file.
 */
struct io_cqring_offsets {
uint32_t head;
uint32_t tail;
uint32_t ring_mask;
uint32_t ring_entries;
uint32_t overflow;
uint32_t cqes;
uint64_t resv[2];
};

/*
 * Local declaration of an io_uring setup parameter block, ending with the
 * two offsets records declared above.
 * NOTE(review): presumably mirrors the kernel UAPI struct of the same name;
 * it is not used by any code in this file.
 */
struct io_uring_params {
uint32_t sq_entries;
uint32_t cq_entries;
uint32_t flags;
uint32_t sq_thread_cpu;
uint32_t sq_thread_idle;
uint32_t features;
uint32_t resv[4];
struct io_sqring_offsets sq_off;
struct io_cqring_offsets cq_off;
};

/* NOTE(review): the two IORING_OFF_* macros are not referenced in this file. */
#define IORING_OFF_SQ_RING 0
#define IORING_OFF_SQES 0x10000000ULL


/* close_fds() closes descriptors 3 .. MAX_FDS-1. */
#define MAX_FDS 30

/*
 * Mount a cgroup-v1 hierarchy at `dir` containing every controller from
 * `controllers[0..count)` that the kernel accepts.
 *
 * Each controller is probed with a trial mount/umount; the accepted ones are
 * collected into a comma-separated list that is used for the final mount.
 * If none is accepted, nothing is mounted. All failures are ignored.
 */
static void mount_cgroups(const char *dir, const char **controllers, int count)
{
	char accepted[128] = { 0 };
	int idx;

	(void)mkdir(dir, 0777);

	for (idx = 0; idx < count; idx++) {
		const char *ctrl = controllers[idx];

		if (mount("none", dir, "cgroup", 0, ctrl) == 0) {
			umount(dir);
			/* Stored with a leading comma; skipped below. */
			strcat(accepted, ",");
			strcat(accepted, ctrl);
		}
	}
	if (accepted[0] == 0)
		return;

	(void)mount("none", dir, "cgroup", 0, accepted + 1);
	(void)chmod(dir, 0777);
}

/*
 * One-time cgroup setup under /syzcgroup: mount a cgroup2 hierarchy at
 * /syzcgroup/unified and enable the listed controllers in its
 * subtree_control, then mount cgroup-v1 hierarchies at /syzcgroup/net and
 * /syzcgroup/cpu. Every step is best effort; failures are ignored.
 */
static void setup_cgroups()
{
	const char *unified_controllers[] = { "+cpu", "+memory", "+io",
					      "+pids" };
	const char *net_controllers[] = { "net", "net_prio", "devices", "blkio",
					  "freezer" };
	const char *cpu_controllers[] = { "cpuset", "cpuacct", "hugetlb",
					  "rlimit" };
	const unsigned n_unified =
		sizeof(unified_controllers) / sizeof(unified_controllers[0]);
	int ctl_fd;

	(void)mkdir("/syzcgroup", 0777);
	(void)mkdir("/syzcgroup/unified", 0777);
	(void)mount("none", "/syzcgroup/unified", "cgroup2", 0, NULL);
	(void)chmod("/syzcgroup/unified", 0777);

	ctl_fd = open("/syzcgroup/unified/cgroup.subtree_control", O_WRONLY);
	if (ctl_fd != -1) {
		for (unsigned k = 0; k < n_unified; k++) {
			const char *entry = unified_controllers[k];

			if (write(ctl_fd, entry, strlen(entry)) < 0) {
				/* controller not available: ignore */
			}
		}
		close(ctl_fd);
	}

	mount_cgroups("/syzcgroup/net", net_controllers,
		      sizeof(net_controllers) / sizeof(net_controllers[0]));
	mount_cgroups("/syzcgroup/cpu", cpu_controllers,
		      sizeof(cpu_controllers) / sizeof(cpu_controllers[0]));
	write_file("/syzcgroup/cpu/cgroup.clone_children", "1");
	write_file("/syzcgroup/cpu/cpuset.memory_pressure_enabled", "1");
}

/*
 * Create this executor's cgroups ("syz<procid>") in the unified, cpu and net
 * hierarchies and move the calling process into each of them. In the unified
 * group it also sets pids.max = 32 and memory.low/high/max to 298/299/300 MiB.
 * All writes are best effort.
 */
static void setup_cgroups_loop()
{
	const int self = getpid();
	char group[64];
	char path[128];

	/* Unified (cgroup2) hierarchy: limits plus membership. */
	snprintf(group, sizeof(group), "/syzcgroup/unified/syz%llu", procid);
	(void)mkdir(group, 0777);
	snprintf(path, sizeof(path), "%s/pids.max", group);
	write_file(path, "32");
	snprintf(path, sizeof(path), "%s/memory.low", group);
	write_file(path, "%d", 298 << 20);
	snprintf(path, sizeof(path), "%s/memory.high", group);
	write_file(path, "%d", 299 << 20);
	snprintf(path, sizeof(path), "%s/memory.max", group);
	write_file(path, "%d", 300 << 20);
	snprintf(path, sizeof(path), "%s/cgroup.procs", group);
	write_file(path, "%d", self);

	/* cpu hierarchy: membership only. */
	snprintf(group, sizeof(group), "/syzcgroup/cpu/syz%llu", procid);
	(void)mkdir(group, 0777);
	snprintf(path, sizeof(path), "%s/cgroup.procs", group);
	write_file(path, "%d", self);

	/* net hierarchy: membership only. */
	snprintf(group, sizeof(group), "/syzcgroup/net/syz%llu", procid);
	(void)mkdir(group, 0777);
	snprintf(path, sizeof(path), "%s/cgroup.procs", group);
	write_file(path, "%d", self);
}

/*
 * Inside the per-test working directory, create the symlinks ./cgroup,
 * ./cgroup.cpu and ./cgroup.net pointing at this executor's cgroup
 * directories. Failures are ignored.
 */
static void setup_cgroups_test()
{
	char target[64];

	snprintf(target, sizeof(target), "/syzcgroup/unified/syz%llu", procid);
	if (symlink(target, "./cgroup") != 0) {
		/* ignored */
	}

	snprintf(target, sizeof(target), "/syzcgroup/cpu/syz%llu", procid);
	if (symlink(target, "./cgroup.cpu") != 0) {
		/* ignored */
	}

	snprintf(target, sizeof(target), "/syzcgroup/net/syz%llu", procid);
	if (symlink(target, "./cgroup.net") != 0) {
		/* ignored */
	}
}

/* Mount fusectl at /sys/fs/fuse/connections; a failure is ignored. */
static void setup_common()
{
	(void)mount(0, "/sys/fs/fuse/connections", "fusectl", 0, 0);
}

/* Create /dev/binderfs and mount a binder filesystem there (best effort). */
static void setup_binderfs()
{
	(void)mkdir("/dev/binderfs", 0777);
	(void)mount("binder", "/dev/binderfs", "binder", 0, NULL);
}

/* Forward declaration: do_sandbox_none() calls loop(), which is defined further down. */
static void loop();

/*
 * Common sandbox preparation for the test process:
 *  - die with SIGKILL when the parent dies, and start a new session;
 *  - apply resource limits (address space 200 MiB, memlock 32 MiB, file size
 *    136 MiB, stack 1 MiB, no core dumps, 256 open files);
 *  - unshare the mount namespace and make "/" recursively private, then
 *    unshare IPC, the namespace selected by flag 0x02000000, UTS, and the
 *    SysV semaphore undo list;
 *  - write a fixed set of SysV IPC sysctls.
 * Every step is best effort.
 */
static void sandbox_common()
{
	static const struct {
		int resource;
		rlim_t value;
	} limits[] = {
		{ RLIMIT_AS, (200 << 20) },
		{ RLIMIT_MEMLOCK, 32 << 20 },
		{ RLIMIT_FSIZE, 136 << 20 },
		{ RLIMIT_STACK, 1 << 20 },
		{ RLIMIT_CORE, 0 },
		{ RLIMIT_NOFILE, 256 },
	};
	typedef struct {
		const char *name;
		const char *value;
	} sysctl_t;
	static const sysctl_t sysctls[] = {
		{ "/proc/sys/kernel/shmmax", "16777216" },
		{ "/proc/sys/kernel/shmall", "536870912" },
		{ "/proc/sys/kernel/shmmni", "1024" },
		{ "/proc/sys/kernel/msgmax", "8192" },
		{ "/proc/sys/kernel/msgmni", "1024" },
		{ "/proc/sys/kernel/msgmnb", "1024" },
		{ "/proc/sys/kernel/sem", "1024 1048576 500 1024" },
	};
	unsigned k;

	prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
	setsid();

	for (k = 0; k < sizeof(limits) / sizeof(limits[0]); k++) {
		struct rlimit rlim;

		rlim.rlim_cur = rlim.rlim_max = limits[k].value;
		setrlimit(limits[k].resource, &rlim);
	}

	(void)unshare(CLONE_NEWNS);
	(void)mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
	(void)unshare(CLONE_NEWIPC);
	(void)unshare(0x02000000);
	(void)unshare(CLONE_NEWUTS);
	(void)unshare(CLONE_SYSVSEM);

	for (k = 0; k < sizeof(sysctls) / sizeof(sysctls[0]); k++)
		write_file(sysctls[k].name, sysctls[k].value);
}

/*
 * Block until child `pid` has been reaped and return its exit status.
 * A negative `pid` (failed fork) terminates the process with status 1.
 */
static int wait_for_loop(int pid)
{
	int status = 0;

	if (pid < 0)
		exit(1);
	for (;;) {
		/* Other children reaped by waitpid(-1, ...) are discarded. */
		if (waitpid(-1, &status, __WALL) == pid)
			break;
	}
	return WEXITSTATUS(status);
}

/*
 * Remove CAP_SYS_PTRACE and CAP_SYS_NICE from the effective, permitted and
 * inheritable sets of the calling process. Exits with status 1 if the
 * capabilities cannot be read or written.
 */
static void drop_caps(void)
{
	const int dropped = (1 << CAP_SYS_PTRACE) | (1 << CAP_SYS_NICE);
	struct __user_cap_header_struct hdr;
	struct __user_cap_data_struct data[2];

	memset(&hdr, 0, sizeof(hdr));
	memset(data, 0, sizeof(data));
	hdr.version = _LINUX_CAPABILITY_VERSION_3;
	hdr.pid = getpid();

	if (syscall(SYS_capget, &hdr, &data) != 0)
		exit(1);

	data[0].effective &= ~dropped;
	data[0].permitted &= ~dropped;
	data[0].inheritable &= ~dropped;

	if (syscall(SYS_capset, &hdr, &data) != 0)
		exit(1);
}

/*
 * Fork the test process into a new PID namespace (best effort).
 *
 * Parent: waits for the child and returns its exit status.
 * Child:  applies the common sandbox, drops capabilities, enters a new
 *         network namespace, rebuilds the virtual devices there, and runs
 *         loop(); it never returns (exits with status 1 if loop() does).
 */
static int do_sandbox_none(void)
{
	int child;

	(void)unshare(CLONE_NEWPID);

	child = fork();
	if (child != 0)
		return wait_for_loop(child);

	setup_common();
	sandbox_common();
	drop_caps();
	(void)unshare(CLONE_NEWNET);
	initialize_netdevices();
	setup_binderfs();
	loop();
	exit(1);
}

/* Local definition of the "set inode flags" ioctl request used by remove_dir(). */
#define FS_IOC_SETFLAGS _IOW('f', 2, long)
/*
 * Recursively delete directory `dir` and everything below it.
 *
 * Mounts found on `dir` or on any entry are lazily detached first. Removal
 * failures are handled by errno:
 *   EPERM     - reset the inode flags to 0 via FS_IOC_SETFLAGS and retry
 *   EROFS     - give up on that entry without failing
 *   EBUSY     - detach the mount sitting on it and retry
 *   ENOTEMPTY - (rmdir only) rescan the directory, at most 100 times
 * Any other failure, or running out of retries, exits with status 1.
 */
static void remove_dir(const char *dir)
{
/* Counts ENOTEMPTY rescans of this directory. */
int iter = 0;
DIR *dp = 0;
retry:
/* Peel off every mount stacked on dir itself. */
while (umount2(dir, MNT_DETACH | UMOUNT_NOFOLLOW) == 0) {
}
dp = opendir(dir);
if (dp == NULL) {
/* Both paths exit; the EMFILE test has no separate effect here. */
if (errno == EMFILE) {
exit(1);
}
exit(1);
}
struct dirent *ep = 0;
while ((ep = readdir(dp))) {
if (strcmp(ep->d_name, ".") == 0 ||
strcmp(ep->d_name, "..") == 0)
continue;
char filename[FILENAME_MAX];
snprintf(filename, sizeof(filename), "%s/%s", dir, ep->d_name);
/* Peel off every mount stacked on this entry. */
while (umount2(filename, MNT_DETACH | UMOUNT_NOFOLLOW) == 0) {
}
struct stat st;
if (lstat(filename, &st))
exit(1);
if (S_ISDIR(st.st_mode)) {
remove_dir(filename);
continue;
}
int i;
/*
 * Retry unlink until it succeeds or hits EROFS.
 * NOTE(review): the EPERM path `continue`s before the `i > 100` check,
 * so it looks unbounded if unlink keeps failing with EPERM - confirm.
 */
for (i = 0;; i++) {
if (unlink(filename) == 0)
break;
if (errno == EPERM) {
int fd = open(filename, O_RDONLY);
if (fd != -1) {
long flags = 0;
if (ioctl(fd, FS_IOC_SETFLAGS,
&flags) == 0) {
}
close(fd);
continue;
}
}
if (errno == EROFS) {
break;
}
if (errno != EBUSY || i > 100)
exit(1);
if (umount2(filename, MNT_DETACH | UMOUNT_NOFOLLOW))
exit(1);
}
}
closedir(dp);
/* Finally remove the now (hopefully) empty directory; at most 100 retries. */
for (int i = 0;; i++) {
if (rmdir(dir) == 0)
break;
if (i < 100) {
if (errno == EPERM) {
int fd = open(dir, O_RDONLY);
if (fd != -1) {
long flags = 0;
if (ioctl(fd, FS_IOC_SETFLAGS,
&flags) == 0) {
}
close(fd);
continue;
}
}
if (errno == EROFS) {
break;
}
if (errno == EBUSY) {
if (umount2(dir, MNT_DETACH | UMOUNT_NOFOLLOW))
exit(1);
continue;
}
if (errno == ENOTEMPTY) {
/* New entries appeared: start over from the directory scan. */
if (iter < 100) {
iter++;
goto retry;
}
}
}
exit(1);
}
}

/*
 * Kill test process `pid` (and its process group) and reap it into *status.
 *
 * First polls for about 100 ms. If the child has not exited by then, writes
 * to the "abort" file of every entry under /sys/fs/fuse/connections and then
 * blocks until the child is reaped.
 */
static void kill_and_wait(int pid, int *status)
{
	int attempt;
	DIR *conns;

	kill(-pid, SIGKILL);
	kill(pid, SIGKILL);

	for (attempt = 0; attempt < 100; attempt++) {
		if (waitpid(-1, status, WNOHANG | __WALL) == pid)
			return;
		usleep(1000);
	}

	conns = opendir("/sys/fs/fuse/connections");
	if (conns != NULL) {
		struct dirent *ent;

		while ((ent = readdir(conns)) != NULL) {
			char abort_path[300];
			int fd;

			if (!strcmp(ent->d_name, ".") ||
			    !strcmp(ent->d_name, ".."))
				continue;
			snprintf(abort_path, sizeof(abort_path),
				 "/sys/fs/fuse/connections/%s/abort",
				 ent->d_name);
			fd = open(abort_path, O_WRONLY);
			if (fd == -1)
				continue;
			/* One byte is written; its source is the path buffer. */
			if (write(fd, abort_path, 1) < 0) {
				/* ignored */
			}
			close(fd);
		}
		closedir(conns);
	}

	while (waitpid(-1, status, __WALL) != pid) {
	}
}

/* One-time preparation done by loop() before its first iteration: cgroup setup only. */
static void setup_loop()
{
setup_cgroups_loop();
}

/*
 * Per-iteration setup, run in the forked test child: die with the parent,
 * become a process-group leader, create the cgroup symlinks, set
 * oom_score_adj to 1000, and link ./binderfs to /dev/binderfs (best effort).
 */
static void setup_test()
{
	prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
	setpgrp();
	setup_cgroups_test();
	write_file("/proc/self/oom_score_adj", "1000");
	if (symlink("/dev/binderfs", "./binderfs") != 0) {
		/* ignored */
	}
}

/* Close every descriptor from 3 up to (but not including) MAX_FDS. */
static void close_fds()
{
	int fd = 3;

	while (fd < MAX_FDS) {
		close(fd);
		fd++;
	}
}

#define KMEMLEAK_FILE "/sys/kernel/debug/kmemleak"

/*
 * Trigger two kmemleak scans five seconds apart before the tests start.
 * Exits with status 1 if the kmemleak file cannot be written.
 */
static void setup_leak()
{
	bool scan_ok;

	scan_ok = write_file(KMEMLEAK_FILE, "scan");
	if (!scan_ok)
		exit(1);

	sleep(5);

	scan_ok = write_file(KMEMLEAK_FILE, "scan");
	if (!scan_ok)
		exit(1);
}

/*
 * Scan for kernel memory leaks and report them on stderr.
 *
 * Two scans are triggered at least four seconds apart. If the report is
 * non-empty, a third scan is run after another second and the report is read
 * again from the start. Each "unreferenced object" record is printed with a
 * "BUG: memory leak" banner. Exits with status 1 if any record was printed
 * or on any I/O error.
 */
static void check_leaks(void)
{
	static char report[128 << 10];
	int nleaks = 0;
	ssize_t got;
	uint64_t started;
	const int fd = open(KMEMLEAK_FILE, O_RDWR);

	if (fd == -1)
		exit(1);

	started = current_time_ms();
	if (write(fd, "scan", 4) != 4)
		exit(1);
	sleep(1);
	while (current_time_ms() - started < 4 * 1000)
		sleep(1);
	if (write(fd, "scan", 4) != 4)
		exit(1);

	got = read(fd, report, sizeof(report) - 1);
	if (got < 0)
		exit(1);
	if (got == 0) {
		/* Nothing reported: no leaks. */
		close(fd);
		return;
	}

	/* Re-scan once more to filter out objects that were only in flight. */
	sleep(1);
	if (write(fd, "scan", 4) != 4)
		exit(1);
	if (lseek(fd, 0, SEEK_SET) < 0)
		exit(1);
	got = read(fd, report, sizeof(report) - 1);
	if (got < 0)
		exit(1);
	report[got] = 0;

	char *const end = report + got;
	for (char *cur = report; cur < end; nleaks++) {
		/* Temporarily terminate the current record at the next one. */
		char *next = strstr(cur + 1, "unreferenced object");
		if (next == NULL)
			next = end;
		const char saved = *next;
		*next = 0;
		fprintf(stderr, "BUG: memory leak\n%s\n", cur);
		*next = saved;
		cur = next;
	}

	close(fd);
	if (nleaks)
		exit(1);
}

/*
 * Mount binfmt_misc and register the two handlers "syz0" and "syz1"
 * (magic bytes 0x01 at offset 0 and 0x02 at offset 1, interpreter ./file0).
 * All steps are best effort.
 */
static void setup_binfmt_misc()
{
	static const char *const reg_file = "/proc/sys/fs/binfmt_misc/register";

	(void)mount(0, "/proc/sys/fs/binfmt_misc", "binfmt_misc", 0, 0);
	write_file(reg_file, ":syz0:M:0:\x01::./file0:");
	write_file(reg_file, ":syz1:M:1:\x02::./file0:POC");
}

/*
 * Write a fixed list of kernel tunables (plus cad_pid = our own pid).
 * A failed write is reported on stdout and otherwise ignored.
 */
static void setup_sysctl()
{
	char mypid[32];
	snprintf(mypid, sizeof(mypid), "%d", getpid());

	const struct {
		const char *name;
		const char *data;
	} files[] = {
		{ "/sys/kernel/debug/x86/nmi_longest_ns", "10000000000" },
		{ "/proc/sys/kernel/hung_task_check_interval_secs", "20" },
		//{ "/proc/sys/net/core/bpf_jit_kallsyms", "1" },
		//{ "/proc/sys/net/core/bpf_jit_harden", "0" },
		{ "/proc/sys/kernel/kptr_restrict", "0" },
		{ "/proc/sys/kernel/softlockup_all_cpu_backtrace", "1" },
		{ "/proc/sys/fs/mount-max", "100" },
		{ "/proc/sys/vm/oom_dump_tasks", "0" },
		{ "/proc/sys/debug/exception-trace", "0" },
		{ "/proc/sys/kernel/printk", "7 4 1 3" },
		{ "/proc/sys/net/ipv4/ping_group_range", "0 65535" },
		{ "/proc/sys/kernel/keys/gc_delay", "1" },
		{ "/proc/sys/vm/oom_kill_allocating_task", "1" },
		{ "/proc/sys/kernel/ctrl-alt-del", "0" },
		{ "/proc/sys/kernel/cad_pid", mypid },
	};
	const size_t nfiles = sizeof(files) / sizeof(files[0]);

	for (size_t k = 0; k < nfiles; k++) {
		if (write_file(files[k].name, files[k].data))
			continue;
		printf("write to %s failed: %s\n", files[k].name,
		       strerror(errno));
	}
}

/* Forward declaration: loop() runs execute_one() in each forked child. */
static void execute_one(void);

/* Extra waitpid() flags used by loop() when polling for the test child. */
#define WAIT_FLAGS __WALL

/*
 * Main test loop; never returns.
 *
 * Each iteration creates a working directory "./<iter>", forks a child that
 * runs execute_one() inside it, waits up to 5 seconds for the child (killing
 * it on timeout), removes the directory, and finally runs the kmemleak
 * check, which exits the process if leaks are reported.
 */
static void loop(void)
{
	setup_loop();

	for (int iter = 0;; iter++) {
		char cwdbuf[32];
		int status = 0;
		int pid;
		uint64_t started;

		sprintf(cwdbuf, "./%d", iter);
		if (mkdir(cwdbuf, 0777) != 0)
			exit(1);

		pid = fork();
		if (pid < 0)
			exit(1);
		if (pid == 0) {
			/* Child: run one test and leave. */
			if (chdir(cwdbuf) != 0)
				exit(1);
			setup_test();
			execute_one();
			close_fds();
			exit(0);
		}

		started = current_time_ms();
		while (waitpid(-1, &status, WNOHANG | WAIT_FLAGS) != pid) {
			sleep_ms(1);
			if (current_time_ms() - started >= 5000) {
				kill_and_wait(pid, &status);
				break;
			}
		}

		remove_dir(cwdbuf);
		check_leaks();
	}
}

/* Results of the two socket() calls in execute_one(); all-ones means "not created". */
uint64_t r[2] = { 0xffffffffffffffff, 0xffffffffffffffff };

/*
 * The syscall sequence that provokes the reported kmemleak output.
 *
 *  1. socket(0x10, 3, 0) -> r[0]; numerically the same arguments that
 *     initialize_netdevices() passes as AF_NETLINK, SOCK_RAW, NETLINK_ROUTE.
 *  2. getpgrp (result unused).
 *  3. socket(0xa, 1, 0) -> r[1].
 *     NOTE(review): presumably AF_INET6/SOCK_STREAM - confirm against headers.
 *  4. Put the name "veth1_to_bridge" at 0x20000040 and issue ioctl 0x8933
 *     on r[1].
 *  5. Overwrite the same 16 bytes with "bridge0" and issue ioctl 0x89a2 on
 *     r[0].
 *     NOTE(review): 0x8933/0x89a2 look like SIOCGIFINDEX/SIOCBRADDIF, in
 *     which case the interface index stored by step 4 just behind the name
 *     is what step 5 adds to bridge0 - confirm against linux/sockios.h and
 *     the struct ifreq layout.
 *
 * After each checked call a "### <func>:<line> errno=<n>" line is written to
 * stderr; the line number comes from __LINE__, so it identifies the call
 * site and shifts whenever lines are added above it. The memcpy()s run under
 * NONFAILING() because the target is a fixed address mapped by main().
 */
void execute_one(void)
{
intptr_t res = 0;
res = syscall(__NR_socket, 0x10ul, 3ul, 0);
fprintf(stderr, "### %s:%u errno=%u\n", __func__, __LINE__, res == -1 ? errno : 0);
if (res != -1)
r[0] = res;
syscall(__NR_getpgrp, -1);
res = syscall(__NR_socket, 0xaul, 1ul, 0);
fprintf(stderr, "### %s:%u errno=%u\n", __func__, __LINE__, res == -1 ? errno : 0);
if (res != -1)
r[1] = res;
NONFAILING(memcpy((void *)0x20000040, "veth1_to_bridge\000", 16));
res = syscall(__NR_ioctl, r[1], 0x8933, 0x20000040ul);
fprintf(stderr, "### %s:%u errno=%u\n", __func__, __LINE__, res == -1 ? errno : 0);
NONFAILING(memcpy((void *)0x20000040,
"bridge0\000\000\000\000\000\000\000\000\000", 16));
res = syscall(__NR_ioctl, r[0], 0x89a2, 0x20000040ul);
fprintf(stderr, "### %s:%u errno=%u\n", __func__, __LINE__, res == -1 ? errno : 0);
//syscall(__NR_migrate_pages, 0, 3ul, 0ul, 0ul);
}

/*
 * Entry point of the reproducer.
 *
 * Maps the fixed data area used by execute_one() at 0x20000000 (16 MiB,
 * protection 7) with a 4 KiB protection-0 page on each side, builds the
 * virtual network devices, applies sysctl/cgroup/binfmt_misc setup, primes
 * kmemleak, installs the fault handler, switches to a temporary directory
 * and hands control to the sandboxed test loop.
 */
int main(void)
{
	const unsigned long map_flags = 0x32ul;
	const unsigned long guard_len = 0x1000ul;
	const unsigned long data_base = 0x20000000ul;
	const unsigned long data_len = 0x1000000ul;

	syscall(__NR_mmap, 0x1ffff000ul, guard_len, 0ul, map_flags, -1, 0ul);
	syscall(__NR_mmap, data_base, data_len, 7ul, map_flags, -1, 0ul);
	syscall(__NR_mmap, 0x21000000ul, guard_len, 0ul, map_flags, -1, 0ul);

	initialize_netdevices();
	setup_sysctl();
	setup_cgroups();
	setup_binfmt_misc();
	setup_leak();
	install_segv_handler();
	use_temporary_dir();
	do_sandbox_none();
	return 0;
}

Thanks, please let me know if you need any additional detailed information.

Regards,
Ilia Gavrilov.