net/ipv4: inconsistent lock state in tcp_conn_request/inet_ehash_insert
From: Andrey Konovalov
Date: Wed Mar 01 2017 - 10:27:14 EST
Hi,
I've got the following error report while fuzzing the kernel with syzkaller.
On commit e5d56efc97f8240d0b5d66c03949382b6d7e5570 (Feb 26).
A reproducer and .config are attached.
=================================
[ INFO: inconsistent lock state ]
4.10.0+ #60 Not tainted
---------------------------------
inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage.
syz-executor0/5090 [HC0[0]:SC0[0]:HE1:SE1] takes:
(&(&hashinfo->ehash_locks[i])->rlock){+.?...}, at:
[<ffffffff83a6a370>] spin_lock include/linux/spinlock.h:299 [inline]
(&(&hashinfo->ehash_locks[i])->rlock){+.?...}, at:
[<ffffffff83a6a370>] inet_ehash_insert+0x240/0xad0
net/ipv4/inet_hashtables.c:407
{IN-SOFTIRQ-W} state was registered at:
mark_irqflags kernel/locking/lockdep.c:2923 [inline]
__lock_acquire+0xbcf/0x3270 kernel/locking/lockdep.c:3295
lock_acquire+0x241/0x580 kernel/locking/lockdep.c:3753
__raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
_raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151
spin_lock include/linux/spinlock.h:299 [inline]
inet_ehash_insert+0x240/0xad0 net/ipv4/inet_hashtables.c:407
reqsk_queue_hash_req net/ipv4/inet_connection_sock.c:753 [inline]
inet_csk_reqsk_queue_hash_add+0x1b7/0x2a0 net/ipv4/inet_connection_sock.c:764
tcp_conn_request+0x25cc/0x3310 net/ipv4/tcp_input.c:6399
tcp_v4_conn_request+0x157/0x220 net/ipv4/tcp_ipv4.c:1262
tcp_rcv_state_process+0x802/0x4130 net/ipv4/tcp_input.c:5889
tcp_v4_do_rcv+0x56b/0x940 net/ipv4/tcp_ipv4.c:1433
tcp_v4_rcv+0x2e12/0x3210 net/ipv4/tcp_ipv4.c:1711
ip_local_deliver_finish+0x4ce/0xc40 net/ipv4/ip_input.c:216
NF_HOOK include/linux/netfilter.h:257 [inline]
ip_local_deliver+0x1ce/0x710 net/ipv4/ip_input.c:257
dst_input include/net/dst.h:492 [inline]
ip_rcv_finish+0xb1d/0x2110 net/ipv4/ip_input.c:396
NF_HOOK include/linux/netfilter.h:257 [inline]
ip_rcv+0xd90/0x19c0 net/ipv4/ip_input.c:487
__netif_receive_skb_core+0x1ad1/0x3400 net/core/dev.c:4179
__netif_receive_skb+0x2a/0x170 net/core/dev.c:4217
netif_receive_skb_internal+0x1d6/0x430 net/core/dev.c:4245
napi_skb_finish net/core/dev.c:4602 [inline]
napi_gro_receive+0x4e6/0x680 net/core/dev.c:4636
e1000_receive_skb drivers/net/ethernet/intel/e1000/e1000_main.c:4033 [inline]
e1000_clean_rx_irq+0x5e0/0x1490
drivers/net/ethernet/intel/e1000/e1000_main.c:4489
e1000_clean+0xb9a/0x2910 drivers/net/ethernet/intel/e1000/e1000_main.c:3834
napi_poll net/core/dev.c:5171 [inline]
net_rx_action+0xe70/0x1900 net/core/dev.c:5236
__do_softirq+0x2fb/0xb7d kernel/softirq.c:284
invoke_softirq kernel/softirq.c:364 [inline]
irq_exit+0x19e/0x1d0 kernel/softirq.c:405
exiting_irq arch/x86/include/asm/apic.h:658 [inline]
do_IRQ+0x81/0x1a0 arch/x86/kernel/irq.c:250
ret_from_intr+0x0/0x20
native_safe_halt+0x6/0x10 arch/x86/include/asm/irqflags.h:53
arch_safe_halt arch/x86/include/asm/paravirt.h:98 [inline]
default_idle+0x8f/0x410 arch/x86/kernel/process.c:271
arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:262
default_idle_call+0x36/0x60 kernel/sched/idle.c:96
cpuidle_idle_call kernel/sched/idle.c:154 [inline]
do_idle+0x348/0x440 kernel/sched/idle.c:243
cpu_startup_entry+0x18/0x20 kernel/sched/idle.c:345
start_secondary+0x344/0x440 arch/x86/kernel/smpboot.c:272
verify_cpu+0x0/0xfc
irq event stamp: 1741
hardirqs last enabled at (1741): [<ffffffff84d49d77>]
__raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:160
[inline]
hardirqs last enabled at (1741): [<ffffffff84d49d77>]
_raw_spin_unlock_irqrestore+0xf7/0x1a0 kernel/locking/spinlock.c:191
hardirqs last disabled at (1740): [<ffffffff84d4a732>]
__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:108 [inline]
hardirqs last disabled at (1740): [<ffffffff84d4a732>]
_raw_spin_lock_irqsave+0xa2/0x110 kernel/locking/spinlock.c:159
softirqs last enabled at (1738): [<ffffffff84d4deff>]
__do_softirq+0x7cf/0xb7d kernel/softirq.c:310
softirqs last disabled at (1571): [<ffffffff84d4b92c>]
do_softirq_own_stack+0x1c/0x30 arch/x86/entry/entry_64.S:902
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(&(&hashinfo->ehash_locks[i])->rlock);
<Interrupt>
lock(&(&hashinfo->ehash_locks[i])->rlock);
*** DEADLOCK ***
1 lock held by syz-executor0/5090:
#0: (sk_lock-AF_INET6){+.+.+.}, at: [<ffffffff83406b43>] lock_sock
include/net/sock.h:1460 [inline]
#0: (sk_lock-AF_INET6){+.+.+.}, at: [<ffffffff83406b43>]
sock_setsockopt+0x233/0x1e40 net/core/sock.c:683
stack backtrace:
CPU: 1 PID: 5090 Comm: syz-executor0 Not tainted 4.10.0+ #60
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:15 [inline]
dump_stack+0x292/0x398 lib/dump_stack.c:51
print_usage_bug+0x3ef/0x450 kernel/locking/lockdep.c:2387
valid_state kernel/locking/lockdep.c:2400 [inline]
mark_lock_irq kernel/locking/lockdep.c:2602 [inline]
mark_lock+0xf30/0x1410 kernel/locking/lockdep.c:3065
mark_irqflags kernel/locking/lockdep.c:2941 [inline]
__lock_acquire+0x6dc/0x3270 kernel/locking/lockdep.c:3295
lock_acquire+0x241/0x580 kernel/locking/lockdep.c:3753
__raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
_raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151
spin_lock include/linux/spinlock.h:299 [inline]
inet_ehash_insert+0x240/0xad0 net/ipv4/inet_hashtables.c:407
reqsk_queue_hash_req net/ipv4/inet_connection_sock.c:753 [inline]
inet_csk_reqsk_queue_hash_add+0x1b7/0x2a0 net/ipv4/inet_connection_sock.c:764
dccp_v6_conn_request+0xada/0x11b0 net/dccp/ipv6.c:380
dccp_rcv_state_process+0x51e/0x1660 net/dccp/input.c:606
dccp_v6_do_rcv+0x213/0x350 net/dccp/ipv6.c:632
sk_backlog_rcv include/net/sock.h:896 [inline]
__release_sock+0x127/0x3a0 net/core/sock.c:2052
release_sock+0xa5/0x2b0 net/core/sock.c:2539
sock_setsockopt+0x60f/0x1e40 net/core/sock.c:1016
SYSC_setsockopt net/socket.c:1782 [inline]
SyS_setsockopt+0x2fb/0x3a0 net/socket.c:1765
entry_SYSCALL_64_fastpath+0x1f/0xc2
RIP: 0033:0x4458b9
RSP: 002b:00007fe8b26c2b58 EFLAGS: 00000292 ORIG_RAX: 0000000000000036
RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00000000004458b9
RDX: 000000000000001a RSI: 0000000000000001 RDI: 0000000000000006
RBP: 00000000006e2110 R08: 0000000000000010 R09: 0000000000000000
R10: 00000000208c3000 R11: 0000000000000292 R12: 0000000000708000
R13: 0000000020000000 R14: 0000000000001000 R15: 0000000000000000
// autogenerated by syzkaller (http://github.com/google/syzkaller)
#ifndef __NR_listen
#define __NR_listen 50
#endif
#ifndef __NR_setsockopt
#define __NR_setsockopt 54
#endif
#ifndef __NR_connect
#define __NR_connect 42
#endif
#ifndef __NR_mmap
#define __NR_mmap 9
#endif
#ifndef __NR_socket
#define __NR_socket 41
#endif
#ifndef __NR_bind
#define __NR_bind 49
#endif
#define _GNU_SOURCE
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <linux/capability.h>
#include <linux/if.h>
#include <linux/if_tun.h>
#include <linux/kvm.h>
#include <linux/sched.h>
#include <net/if_arp.h>
#include <assert.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <pthread.h>
#include <setjmp.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/* Process exit statuses handed to doexit() by the error-reporting helpers. */
const int kFailStatus = 67; /* used by fail() when the saved errno is not ENOMEM */
const int kErrorStatus = 68; /* not referenced anywhere in the visible source */
const int kRetryStatus = 69; /* used by exitf(), and by fail() when errno is ENOMEM */
/*
 * Terminate the process with `status` through the raw exit_group system
 * call. If the call ever returns, spin forever so the noreturn promise
 * still holds.
 */
__attribute__((noreturn)) void doexit(int status)
{
    volatile unsigned spin;

    syscall(__NR_exit_group, status);
    /* Volatile counter keeps the fallback loop from being optimised out. */
    spin = 0;
    while (1) {
        spin++;
    }
}
/*
 * Report a fatal error and terminate.
 *
 * Prints the printf-style message to stderr, followed by the errno value
 * that was current on entry, then exits with kRetryStatus if that errno
 * was ENOMEM and with kFailStatus otherwise.
 */
__attribute__((noreturn)) void fail(const char* msg, ...)
{
    /* Snapshot errno before any stdio call has a chance to overwrite it. */
    const int saved_errno = errno;
    va_list ap;

    fflush(stdout);
    va_start(ap, msg);
    vfprintf(stderr, msg, ap);
    va_end(ap);
    fprintf(stderr, " (errno %d)\n", saved_errno);

    if (saved_errno == ENOMEM)
        doexit(kRetryStatus);
    doexit(kFailStatus);
}
/*
 * Report an error and terminate with kRetryStatus.
 *
 * Same output format as fail(): the printf-style message goes to stderr,
 * followed by the errno value that was current on entry. Unlike fail(),
 * the exit status does not depend on errno.
 */
__attribute__((noreturn)) void exitf(const char* msg, ...)
{
    /* Snapshot errno before any stdio call has a chance to overwrite it. */
    const int saved_errno = errno;
    va_list ap;

    fflush(stdout);
    va_start(ap, msg);
    vfprintf(stderr, msg, ap);
    va_end(ap);
    fprintf(stderr, " (errno %d)\n", saved_errno);

    doexit(kRetryStatus);
}
/* Non-zero enables debug() output; nothing in the visible source sets it. */
static int flag_debug;

/*
 * printf-style trace helper: writes the formatted message to stdout and
 * flushes it, but only while flag_debug is non-zero.
 */
void debug(const char* msg, ...)
{
    if (flag_debug) {
        va_list ap;

        va_start(ap, msg);
        vprintf(msg, ap);
        va_end(ap);
        fflush(stdout);
    }
}
/* Per-thread nesting count of active NONFAILING() regions. */
__thread int skip_segv;
/* Per-thread jump target saved by NONFAILING() before running its body. */
__thread jmp_buf segv_env;

/*
 * Signal handler installed for SIGSEGV and SIGBUS.
 *
 * If the current thread is inside a NONFAILING() region (skip_segv != 0)
 * and the faulting address lies outside [1 MiB, 100 MiB], the fault is
 * swallowed by jumping back to segv_env. Any other fault terminates the
 * process, using the signal number as the exit status.
 *
 * sig:  signal number being delivered.
 * info: siginfo for the fault; si_addr supplies the faulting address.
 * uctx: unused.
 */
static void segv_handler(int sig, siginfo_t* info, void* uctx)
{
    uintptr_t addr = (uintptr_t)info->si_addr;
    const uintptr_t prog_start = 1 << 20;
    const uintptr_t prog_end = 100 << 20;

    (void)uctx;
    if (__atomic_load_n(&skip_segv, __ATOMIC_RELAXED) &&
        (addr < prog_start || addr > prog_end)) {
        /* %p requires a void* argument, so convert the integer back. */
        debug("SIGSEGV on %p, skipping\n", (void*)addr);
        _longjmp(segv_env, 1);
    }
    debug("SIGSEGV on %p, exiting\n", (void*)addr);
    /* doexit() is declared noreturn, so nothing follows it. */
    doexit(sig);
}
static void install_segv_handler()
{
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = segv_handler;
sa.sa_flags = SA_NODEFER | SA_SIGINFO;
sigaction(SIGSEGV, &sa, NULL);
sigaction(SIGBUS, &sa, NULL);
}
/*
 * NONFAILING(stmts): execute `stmts` so that a memory fault inside them is
 * skipped rather than fatal.
 *
 * The macro increments the thread-local skip_segv counter, saves a jump
 * target in segv_env with _setjmp(), and runs the statements only on the
 * direct return from _setjmp() (value 0). segv_handler() jumps back to
 * segv_env when skip_segv is non-zero and the faulting address is outside
 * its [1 MiB, 100 MiB] window, in which case the statements are abandoned.
 * Either way the counter is decremented again afterwards.
 *
 * The expansion is a bare brace block, not a do { } while (0) wrapper.
 */
#define NONFAILING(...) \
{ \
__atomic_fetch_add(&skip_segv, 1, __ATOMIC_SEQ_CST); \
if (_setjmp(segv_env) == 0) { \
__VA_ARGS__; \
} \
__atomic_fetch_sub(&skip_segv, 1, __ATOMIC_SEQ_CST); \
}
/* Value of `type` with the low bf_len bits set (bf_len must be below 64). */
#define BITMASK_LEN(type, bf_len) (type)(~(~0ull << (bf_len)))

/* Value of `type` with bf_len bits set, starting at bit bf_off. */
#define BITMASK_LEN_OFF(type, bf_off, bf_len) \
    (type)(BITMASK_LEN(type, (bf_len)) << (bf_off))

/*
 * Store `val` into the bit-field [bf_off, bf_off + bf_len) of the `type`
 * object at `addr`, leaving the other bits untouched. When bf_off and
 * bf_len are both 0 the whole object is overwritten with `val` instead.
 *
 * Expands to a bare if/else statement (no do { } while (0) wrapper), and
 * evaluates its arguments more than once.
 */
#define STORE_BY_BITMASK(type, addr, val, bf_off, bf_len) \
    if ((bf_off) != 0 || (bf_len) != 0) { \
        type bf_word = *(type*)(addr); \
        bf_word = bf_word & ~BITMASK_LEN_OFF(type, (bf_off), (bf_len)); \
        bf_word = bf_word | (((type)(val)&BITMASK_LEN(type, (bf_len))) << (bf_off)); \
        *(type*)(addr) = bf_word; \
    } else { \
        *(type*)(addr) = (type)(val); \
    }
/*
 * Issue system call `nr` with arguments a0..a5 and return syscall()'s
 * result. a6..a8 are accepted for a uniform call signature but are not
 * forwarded.
 */
static uintptr_t execute_syscall(int nr, uintptr_t a0, uintptr_t a1,
                                 uintptr_t a2, uintptr_t a3,
                                 uintptr_t a4, uintptr_t a5,
                                 uintptr_t a6, uintptr_t a7,
                                 uintptr_t a8)
{
    (void)a6;
    (void)a7;
    (void)a8;
    /* No call number gets special handling; every one goes straight through. */
    return syscall(nr, a0, a1, a2, a3, a4, a5);
}
static void setup_main_process()
{
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_handler = SIG_IGN;
syscall(SYS_rt_sigaction, 0x20, &sa, NULL, 8);
syscall(SYS_rt_sigaction, 0x21, &sa, NULL, 8);
install_segv_handler();
char tmpdir_template[] = "./syzkaller.XXXXXX";
char* tmpdir = mkdtemp(tmpdir_template);
if (!tmpdir)
fail("failed to mkdtemp");
if (chmod(tmpdir, 0777))
fail("failed to chmod");
if (chdir(tmpdir))
fail("failed to chdir");
}
static void loop();

/*
 * Common sandbox setup for the forked worker: arrange to be killed when
 * the parent dies, start a new process group and session, cap address
 * space (128 MiB), file size (1 MiB), stack (1 MiB) and core dumps (0),
 * then unshare the mount namespace, IPC namespace and I/O context.
 * Return values of all calls are deliberately not checked.
 */
static void sandbox_common()
{
    struct rlimit limit;

    prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
    setpgrp();
    setsid();

    limit = (struct rlimit){.rlim_cur = 128 << 20, .rlim_max = 128 << 20};
    setrlimit(RLIMIT_AS, &limit);

    limit = (struct rlimit){.rlim_cur = 1 << 20, .rlim_max = 1 << 20};
    setrlimit(RLIMIT_FSIZE, &limit);

    limit = (struct rlimit){.rlim_cur = 1 << 20, .rlim_max = 1 << 20};
    setrlimit(RLIMIT_STACK, &limit);

    limit = (struct rlimit){.rlim_cur = 0, .rlim_max = 0};
    setrlimit(RLIMIT_CORE, &limit);

    unshare(CLONE_NEWNS);
    unshare(CLONE_NEWIPC);
    unshare(CLONE_IO);
}
/*
 * Fork a worker that applies sandbox_common() and then runs loop().
 *
 * Returns fork()'s result to the caller: the worker's pid in the parent,
 * or a negative value if fork failed. The child never returns from here.
 * Both parameters are currently unused.
 */
static int do_sandbox_none(int executor_pid, bool enable_tun)
{
    const int child = fork();

    (void)executor_pid;
    (void)enable_tun;

    if (child != 0)
        return child;

    /* Child only from here on. */
    sandbox_common();
    loop();
    doexit(1);
}
/*
 * Recursively delete `dir` and everything beneath it.
 *
 * Entries are removed with unlink() (sub-directories by recursion). An
 * EBUSY failure triggers a lazy unmount (umount2 with MNT_DETACH) of the
 * path followed by a retry; EROFS is ignored. If the final rmdir() reports
 * ENOTEMPTY the whole directory scan is restarted, up to 100 times.
 * Any other failure terminates the process through exitf().
 */
static void remove_dir(const char* dir)
{
    DIR* dp;
    struct dirent* ep;
    int iter = 0; /* number of full rescans done after ENOTEMPTY */
    int i;

retry:
    dp = opendir(dir);
    if (dp == NULL) {
        if (errno == EMFILE) {
            /* The format has a %s, so the directory name must be supplied. */
            exitf("opendir(%s) failed due to NOFILE, exiting", dir);
        }
        exitf("opendir(%s) failed", dir);
    }
    while ((ep = readdir(dp))) {
        if (strcmp(ep->d_name, ".") == 0 || strcmp(ep->d_name, "..") == 0)
            continue;
        char filename[FILENAME_MAX];
        snprintf(filename, sizeof(filename), "%s/%s", dir, ep->d_name);
        struct stat st;
        if (lstat(filename, &st))
            exitf("lstat(%s) failed", filename);
        if (S_ISDIR(st.st_mode)) {
            remove_dir(filename);
            continue;
        }
        int attempt;
        for (attempt = 0;; attempt++) {
            debug("unlink(%s)\n", filename);
            if (unlink(filename) == 0)
                break;
            if (errno == EROFS) {
                debug("ignoring EROFS\n");
                break;
            }
            /* Only EBUSY is retried, and only a bounded number of times. */
            if (errno != EBUSY || attempt > 100)
                exitf("unlink(%s) failed", filename);
            debug("umount(%s)\n", filename);
            if (umount2(filename, MNT_DETACH))
                exitf("umount(%s) failed", filename);
        }
    }
    closedir(dp);
    for (i = 0;; i++) {
        debug("rmdir(%s)\n", dir);
        if (rmdir(dir) == 0)
            break;
        if (i < 100) {
            if (errno == EROFS) {
                debug("ignoring EROFS\n");
                break;
            }
            if (errno == EBUSY) {
                debug("umount(%s)\n", dir);
                if (umount2(dir, MNT_DETACH))
                    exitf("umount(%s) failed", dir);
                continue;
            }
            if (errno == ENOTEMPTY) {
                /* Something reappeared in the directory: scan it again. */
                if (iter < 100) {
                    iter++;
                    goto retry;
                }
            }
        }
        exitf("rmdir(%s) failed", dir);
    }
}
/*
 * Return the CLOCK_MONOTONIC time in whole milliseconds.
 * A clock_gettime() failure is fatal via fail().
 */
static uint64_t current_time_ms()
{
    struct timespec now;
    uint64_t millis;

    if (clock_gettime(CLOCK_MONOTONIC, &now) != 0)
        fail("clock_gettime failed");

    millis = (uint64_t)now.tv_sec * 1000;
    millis += (uint64_t)now.tv_nsec / 1000000;
    return millis;
}
static void test();

/*
 * Run test() over and over, each time in a freshly forked child working
 * inside its own "./<iteration>" directory.
 *
 * The parent polls for the child about once a millisecond; if more than
 * five seconds pass it kills the child's process group and the child
 * itself, then reaps it. The per-iteration directory is removed
 * afterwards. This function never returns.
 */
void loop()
{
    int iter = 0;

    while (1) {
        char cwdbuf[256];
        int status = 0;
        uint64_t start;
        int pid;

        sprintf(cwdbuf, "./%d", iter);
        if (mkdir(cwdbuf, 0777) != 0)
            fail("failed to mkdir");

        pid = fork();
        if (pid < 0)
            fail("clone failed");
        if (pid == 0) {
            /* Child: own process group, own directory, one test run. */
            prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
            setpgrp();
            if (chdir(cwdbuf) != 0)
                fail("failed to chdir");
            test();
            doexit(0);
        }

        start = current_time_ms();
        while (waitpid(-1, &status, __WALL | WNOHANG) != pid) {
            usleep(1000);
            if (current_time_ms() - start > 5 * 1000) {
                /* Timed out: kill the group and the child, then reap it. */
                kill(-pid, SIGKILL);
                kill(pid, SIGKILL);
                while (waitpid(-1, &status, __WALL) != pid) {
                }
                break;
            }
        }

        remove_dir(cwdbuf);
        iter++;
    }
}
/*
 * Results of the reproducer's system calls, indexed by the slot numbers the
 * generator assigned. test() fills the array with -1 bytes before starting
 * the threads, so a slot that was never written reads back as -1.
 */
long r[25];
/*
 * Thread body: performs step number `arg` (0..6) of the reproducer program.
 * test() starts one thread per step, 10 ms apart, so later steps normally
 * see the descriptors stored in r[] by earlier ones. Always returns 0.
 *
 * The constants below are raw values emitted by syzkaller; the symbolic
 * names given in the comments are presumed x86-64 Linux meanings and should
 * be confirmed against the kernel headers.
 */
void* thr(void* arg)
{
switch ((long)arg) {
case 0:
/*
 * Map 0xe69000 bytes at 0x20000000 (fd -1, offset 0). Every fixed address
 * written by the later steps falls inside this range. prot 0x3 / flags 0x32
 * are presumably PROT_READ|PROT_WRITE and MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS.
 */
r[0] =
execute_syscall(__NR_mmap, 0x20000000ul, 0xe69000ul, 0x3ul,
0x32ul, 0xfffffffffffffffful, 0x0ul, 0, 0, 0);
break;
case 1:
/*
 * First socket (used by connect in step 6). Domain 0xa / type 0x6 are
 * presumably AF_INET6 / SOCK_DCCP, consistent with the dccp_v6_* frames in
 * the report above.
 */
r[1] = execute_syscall(__NR_socket, 0xaul, 0x6ul, 0x0ul, 0, 0, 0, 0,
0, 0);
break;
case 2:
/* Second socket with the same parameters (used by bind/listen/setsockopt). */
r[2] = execute_syscall(__NR_socket, 0xaul, 0x6ul, 0x0ul, 0, 0, 0, 0,
0, 0);
break;
case 3:
/*
 * Build a socket address at 0x20e62fe0 and bind r[2] to it with length 0x20.
 * Looks like a sockaddr_in6 (family 0xa, a port field, an address whose
 * only non-zero byte is the last one) - TODO confirm.
 */
NONFAILING(*(uint16_t*)0x20e62fe0 = (uint16_t)0xa);
NONFAILING(*(uint16_t*)0x20e62fe2 = (uint16_t)0x224e);
NONFAILING(*(uint32_t*)0x20e62fe4 = (uint32_t)0x0);
NONFAILING(*(uint64_t*)0x20e62fe8 = (uint64_t)0x0);
NONFAILING(*(uint64_t*)0x20e62ff0 = (uint64_t)0x100000000000000);
NONFAILING(*(uint32_t*)0x20e62ff8 = (uint32_t)0x0);
r[9] = execute_syscall(__NR_bind, r[2], 0x20e62fe0ul, 0x20ul, 0, 0,
0, 0, 0, 0);
break;
case 4:
/* Put r[2] into the listening state with a backlog of 5. */
r[10] =
execute_syscall(__NR_listen, r[2], 0x5ul, 0, 0, 0, 0, 0, 0, 0);
break;
case 5:
/*
 * setsockopt(r[2], level 0x1, option 0x1a) with a 0x10-byte argument at
 * 0x208c3000 that holds a count of 1 and a pointer to 0x20c58000, where an
 * 8-byte record is written. Presumably SOL_SOCKET / SO_ATTACH_FILTER with a
 * one-instruction filter program - TODO confirm. This is the setsockopt
 * call seen in the report's stack trace.
 * Note: (uint8_t)0xfe9 truncates to 0xe9.
 */
NONFAILING(*(uint16_t*)0x208c3000 = (uint16_t)0x1);
NONFAILING(*(uint64_t*)0x208c3008 = (uint64_t)0x20c58000);
NONFAILING(*(uint16_t*)0x20c58000 = (uint16_t)0x6);
NONFAILING(*(uint8_t*)0x20c58002 = (uint8_t)0x3);
NONFAILING(*(uint8_t*)0x20c58003 = (uint8_t)0xfe9);
NONFAILING(*(uint32_t*)0x20c58004 = (uint32_t)0x6);
r[17] = execute_syscall(__NR_setsockopt, r[2], 0x1ul, 0x1aul,
0x208c3000ul, 0x10ul, 0, 0, 0, 0);
break;
case 6:
/*
 * Build the same address bytes as step 3 at 0x20e5afe0 and connect r[1]
 * to them, i.e. to the address r[2] was bound to.
 */
NONFAILING(*(uint16_t*)0x20e5afe0 = (uint16_t)0xa);
NONFAILING(*(uint16_t*)0x20e5afe2 = (uint16_t)0x224e);
NONFAILING(*(uint32_t*)0x20e5afe4 = (uint32_t)0x0);
NONFAILING(*(uint64_t*)0x20e5afe8 = (uint64_t)0x0);
NONFAILING(*(uint64_t*)0x20e5aff0 = (uint64_t)0x100000000000000);
NONFAILING(*(uint32_t*)0x20e5aff8 = (uint32_t)0x0);
r[24] = execute_syscall(__NR_connect, r[1], 0x20e5afe0ul, 0x20ul, 0,
0, 0, 0, 0, 0);
break;
}
return 0;
}
/*
 * Run one pass of the reproducer: reset r[] to all -1 bytes, then start
 * seven threads, one per thr() step, 10 ms apart. The threads are not
 * joined; the function just sleeps another 100 ms before returning.
 */
void test()
{
    pthread_t th[14];
    long step = 0;

    memset(r, -1, sizeof(r));
    srand(getpid());

    while (step < 7) {
        /* The step number is passed through the thread's void* argument. */
        pthread_create(&th[step], 0, thr, (void*)step);
        usleep(10000);
        step++;
    }
    usleep(100000);
}
/*
 * Entry point: fork eight executor processes. Each one sets itself up,
 * starts a sandboxed worker with do_sandbox_none() and waits for that
 * worker to exit. The original process simply sleeps for a long time.
 */
int main()
{
    int slot;

    for (slot = 0; slot < 8; slot++) {
        int worker;
        int status = 0;

        if (fork() != 0)
            continue; /* parent (or failed fork): move on to the next slot */

        setup_main_process();
        worker = do_sandbox_none(slot, false);
        while (waitpid(worker, &status, __WALL) != worker) {
        }
        return 0;
    }

    sleep(1000000);
    return 0;
}
Attachment:
.config
Description: Binary data