Re: INFO: rcu detected stall in ext4_file_write_iter

From: Theodore Y. Ts'o
Date: Tue Feb 26 2019 - 10:17:59 EST


TL;DR: This doesn't appear to be ext4 specific, and seems to involve
an unholy combination of the perf_event_open(2) and sendfile(2) system
calls.

On Mon, Feb 25, 2019 at 10:50:05PM -0800, syzbot wrote:
> syzbot found the following crash on:
>
> HEAD commit: 8a61716ff2ab Merge tag 'ceph-for-5.0-rc8' of git://github...
> git tree: upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=161b71d4c00000
> kernel config: https://syzkaller.appspot.com/x/.config?x=7132344728e7ec3f
> dashboard link: https://syzkaller.appspot.com/bug?extid=7d19c5fe6a3f1161abb7
> compiler: gcc (GCC) 9.0.0 20181231 (experimental)
> syz repro: https://syzkaller.appspot.com/x/repro.syz?x=103908f8c00000
> C reproducer: https://syzkaller.appspot.com/x/repro.c?x=105e5cd0c00000
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+7d19c5fe6a3f1161abb7@xxxxxxxxxxxxxxxxxxxxxxxxx
>
> audit: type=1400 audit(1550814986.750:36): avc: denied { map } for
> pid=8058 comm="syz-executor004" path="/root/syz-executor004991115"
> dev="sda1" ino=1426 scontext=unconfined_u:system_r:insmod_t:s0-s0:c0.c1023
> tcontext=unconfined_u:object_r:user_home_t:s0 tclass=file permissive=1
> hrtimer: interrupt took 42841 ns
> rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
> rcu: (detected by 1, t=10502 jiffies, g=5873, q=2)
> rcu: All QSes seen, last rcu_preempt kthread activity 10502
> (4295059997-4295049495), jiffies_till_next_fqs=1, root ->qsmask 0x0
> syz-executor004 R running task 26448 8069 8060 0x00000000

This particular repro seems to induce similar failures when I tried it
with xfs and btrfs as well as ext4.

The repro seems to involve the perf_event_open(2) and sendfile(2)
system calls, and killing the process which is performing the
sendfile(2). The repro also uses the sched_setattr(2) system call,
but when I commented it out, the failure still happened, so this
appears to be another case of "Syzkaller? We don't need to bug
developers with a minimal test case! Open source developers are a
free unlimited resource, after all!" :-)

Commenting out the perf_event_open(2) does seem to make the problem go
away.

Since there are zillions of ways to self-DOS a Linux server without
having to resert to exotic combination of system calls, this isn't
something I'm going to prioritize for myself, but I'm hoping someone
else has time and curiosity.

- Ted
// autogenerated by syzkaller (https://github.com/google/syzkaller)

#define _GNU_SOURCE

#include <dirent.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#include <linux/futex.h>

unsigned long long procid;

static void sleep_ms(uint64_t ms)
{
usleep(ms * 1000);
}

static uint64_t current_time_ms(void)
{
struct timespec ts;
if (clock_gettime(CLOCK_MONOTONIC, &ts))
exit(1);
return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
}

static void use_temporary_dir(void)
{
char tmpdir_template[] = "./syzkaller.XXXXXX";
char* tmpdir = mkdtemp(tmpdir_template);
if (!tmpdir)
exit(1);
if (chmod(tmpdir, 0777))
exit(1);
if (chdir(tmpdir))
exit(1);
}

static void thread_start(void* (*fn)(void*), void* arg)
{
pthread_t th;
pthread_attr_t attr;
pthread_attr_init(&attr);
pthread_attr_setstacksize(&attr, 128 << 10);
int i;
for (i = 0; i < 100; i++) {
if (pthread_create(&th, &attr, fn, arg) == 0) {
pthread_attr_destroy(&attr);
return;
}
if (errno == EAGAIN) {
usleep(50);
continue;
}
break;
}
exit(1);
}

#define BITMASK(bf_off, bf_len) (((1ull << (bf_len)) - 1) << (bf_off))
#define STORE_BY_BITMASK(type, htobe, addr, val, bf_off, bf_len) \
*(type*)(addr) = \
htobe((htobe(*(type*)(addr)) & ~BITMASK((bf_off), (bf_len))) | \
(((type)(val) << (bf_off)) & BITMASK((bf_off), (bf_len))))

typedef struct {
int state;
} event_t;

static void event_init(event_t* ev)
{
ev->state = 0;
}

static void event_reset(event_t* ev)
{
ev->state = 0;
}

static void event_set(event_t* ev)
{
if (ev->state)
exit(1);
__atomic_store_n(&ev->state, 1, __ATOMIC_RELEASE);
syscall(SYS_futex, &ev->state, FUTEX_WAKE | FUTEX_PRIVATE_FLAG);
}

static void event_wait(event_t* ev)
{
while (!__atomic_load_n(&ev->state, __ATOMIC_ACQUIRE))
syscall(SYS_futex, &ev->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, 0);
}

static int event_isset(event_t* ev)
{
return __atomic_load_n(&ev->state, __ATOMIC_ACQUIRE);
}

static int event_timedwait(event_t* ev, uint64_t timeout)
{
uint64_t start = current_time_ms();
uint64_t now = start;
for (;;) {
uint64_t remain = timeout - (now - start);
struct timespec ts;
ts.tv_sec = remain / 1000;
ts.tv_nsec = (remain % 1000) * 1000 * 1000;
syscall(SYS_futex, &ev->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, &ts);
if (__atomic_load_n(&ev->state, __ATOMIC_RELAXED))
return 1;
now = current_time_ms();
if (now - start > timeout)
return 0;
}
}

#define FS_IOC_SETFLAGS _IOW('f', 2, long)
static void remove_dir(const char* dir)
{
DIR* dp;
struct dirent* ep;
int iter = 0;
retry:
while (umount2(dir, MNT_DETACH) == 0) {
}
dp = opendir(dir);
if (dp == NULL) {
if (errno == EMFILE) {
exit(1);
}
exit(1);
}
while ((ep = readdir(dp))) {
if (strcmp(ep->d_name, ".") == 0 || strcmp(ep->d_name, "..") == 0)
continue;
char filename[FILENAME_MAX];
snprintf(filename, sizeof(filename), "%s/%s", dir, ep->d_name);
while (umount2(filename, MNT_DETACH) == 0) {
}
struct stat st;
if (lstat(filename, &st))
exit(1);
if (S_ISDIR(st.st_mode)) {
remove_dir(filename);
continue;
}
int i;
for (i = 0;; i++) {
if (unlink(filename) == 0)
break;
if (errno == EPERM) {
int fd = open(filename, O_RDONLY);
if (fd != -1) {
long flags = 0;
if (ioctl(fd, FS_IOC_SETFLAGS, &flags) == 0)
close(fd);
continue;
}
}
if (errno == EROFS) {
break;
}
if (errno != EBUSY || i > 100)
exit(1);
if (umount2(filename, MNT_DETACH))
exit(1);
}
}
closedir(dp);
int i;
for (i = 0;; i++) {
if (rmdir(dir) == 0)
break;
if (i < 100) {
if (errno == EPERM) {
int fd = open(dir, O_RDONLY);
if (fd != -1) {
long flags = 0;
if (ioctl(fd, FS_IOC_SETFLAGS, &flags) == 0)
close(fd);
continue;
}
}
if (errno == EROFS) {
break;
}
if (errno == EBUSY) {
if (umount2(dir, MNT_DETACH))
exit(1);
continue;
}
if (errno == ENOTEMPTY) {
if (iter < 100) {
iter++;
goto retry;
}
}
}
exit(1);
}
}

static void kill_and_wait(int pid, int* status)
{
kill(-pid, SIGKILL);
kill(pid, SIGKILL);
int i;
for (i = 0; i < 100; i++) {
if (waitpid(-1, status, WNOHANG | __WALL) == pid)
return;
usleep(1000);
}
DIR* dir = opendir("/sys/fs/fuse/connections");
if (dir) {
for (;;) {
struct dirent* ent = readdir(dir);
if (!ent)
break;
if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0)
continue;
char abort[300];
snprintf(abort, sizeof(abort), "/sys/fs/fuse/connections/%s/abort",
ent->d_name);
int fd = open(abort, O_WRONLY);
if (fd == -1) {
continue;
}
if (write(fd, abort, 1) < 0) {
}
close(fd);
}
closedir(dir);
} else {
}
while (waitpid(-1, status, __WALL) != pid) {
}
}

#define SYZ_HAVE_SETUP_TEST 1
static void setup_test()
{
prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
setpgrp();
}

#define SYZ_HAVE_RESET_TEST 1
static void reset_test()
{
int fd;
for (fd = 3; fd < 30; fd++)
close(fd);
}

struct thread_t {
int created, call;
event_t ready, done;
};

static struct thread_t threads[16];
static void execute_call(int call);
static int running;

static void* thr(void* arg)
{
struct thread_t* th = (struct thread_t*)arg;
for (;;) {
event_wait(&th->ready);
event_reset(&th->ready);
execute_call(th->call);
__atomic_fetch_sub(&running, 1, __ATOMIC_RELAXED);
event_set(&th->done);
}
return 0;
}

static void execute_one(void)
{
int i, call, thread;
int collide = 0;
again:
for (call = 0; call < 5; call++) {
for (thread = 0; thread < (int)(sizeof(threads) / sizeof(threads[0]));
thread++) {
struct thread_t* th = &threads[thread];
if (!th->created) {
th->created = 1;
event_init(&th->ready);
event_init(&th->done);
event_set(&th->done);
thread_start(thr, th);
}
if (!event_isset(&th->done))
continue;
event_reset(&th->done);
th->call = call;
__atomic_fetch_add(&running, 1, __ATOMIC_RELAXED);
event_set(&th->ready);
if (collide && (call % 2) == 0)
break;
event_timedwait(&th->done, 45);
break;
}
}
for (i = 0; i < 100 && __atomic_load_n(&running, __ATOMIC_RELAXED); i++)
sleep_ms(1);
if (!collide) {
collide = 1;
goto again;
}
}

static void execute_one(void);

#define WAIT_FLAGS __WALL

static void loop(void)
{
int iter;
for (iter = 0;; iter++) {
char cwdbuf[32];
sprintf(cwdbuf, "./%d", iter);
if (mkdir(cwdbuf, 0777))
exit(1);
int pid = fork();
if (pid < 0)
exit(1);
if (pid == 0) {
if (chdir(cwdbuf))
exit(1);
setup_test();
execute_one();
reset_test();
exit(0);
}
int status = 0;
uint64_t start = current_time_ms();
for (;;) {
if (waitpid(-1, &status, WNOHANG | WAIT_FLAGS) == pid)
break;
sleep_ms(1);
if (current_time_ms() - start < 5 * 1000)
continue;
kill_and_wait(pid, &status);
break;
}
remove_dir(cwdbuf);
}
}
#ifndef __NR_sched_setattr
#define __NR_sched_setattr 314
#endif

uint64_t r[1] = {0xffffffffffffffff};

void execute_call(int call)
{
long res;
switch (call) {
case 0:
memcpy((void*)0x20000040, "./bus\000", 6);
res = syscall(__NR_open, 0x20000040, 0x1fe, 0);
if (res != -1)
r[0] = res;
break;
case 1:
memcpy((void*)0x200001c0, "\x15\x8a\xdd\x00", 4);
syscall(__NR_write, r[0], 0x200001c0, 4);
break;
case 2:
*(uint32_t*)0x20000800 = 1;
*(uint32_t*)0x20000804 = 0x70;
*(uint8_t*)0x20000808 = 0;
*(uint8_t*)0x20000809 = 0;
*(uint8_t*)0x2000080a = 0;
*(uint8_t*)0x2000080b = 0;
*(uint32_t*)0x2000080c = 0;
*(uint64_t*)0x20000810 = 0x50a;
*(uint64_t*)0x20000818 = 0;
*(uint64_t*)0x20000820 = 0;
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 0, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 1, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 2, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 3, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 4, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 7, 5, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 6, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 7, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 8, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 9, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 10, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 11, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 12, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 13, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0x7fff, 14, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 15, 2);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 17, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 18, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 19, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 20, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 21, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 22, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 23, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 24, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 25, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 26, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 27, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 28, 1);
STORE_BY_BITMASK(uint64_t, , 0x20000828, 0, 29, 35);
*(uint32_t*)0x20000830 = 0;
*(uint32_t*)0x20000834 = 0;
*(uint64_t*)0x20000838 = 0;
*(uint64_t*)0x20000840 = 0;
*(uint64_t*)0x20000848 = 0;
*(uint64_t*)0x20000850 = 0;
*(uint32_t*)0x20000858 = 0;
*(uint32_t*)0x2000085c = 0;
*(uint64_t*)0x20000860 = 0;
*(uint32_t*)0x20000868 = 0xfffffffd;
*(uint16_t*)0x2000086c = 0;
*(uint16_t*)0x2000086e = 0;
syscall(__NR_perf_event_open, 0x20000800, 0, -1, -1, 0);
break;
case 3:
*(uint64_t*)0x20000000 = 0;
syscall(__NR_sendfile, r[0], r[0], 0x20000000, 0x8080fffffffe);
break;
#if 0
case 4:
*(uint32_t*)0x200002c0 = 0;
*(uint32_t*)0x200002c4 = 2;
*(uint64_t*)0x200002c8 = 0;
*(uint32_t*)0x200002d0 = 0;
*(uint32_t*)0x200002d4 = 3;
*(uint64_t*)0x200002d8 = 0;
*(uint64_t*)0x200002e0 = 0;
*(uint64_t*)0x200002e8 = 0;
syscall(__NR_sched_setattr, 0, 0x200002c0, 0);
break;
#endif
}
}
int main(void)
{
syscall(__NR_mmap, 0x20000000, 0x1000000, 3, 0x32, -1, 0);
for (procid = 0; procid < 6; procid++) {
if (fork() == 0) {
use_temporary_dir();
loop();
}
}
sleep(1000000);
return 0;
}