Re: Filesystem kernel hangup, 2.6.3 (bad: scheduling while atomic!)

From: Mikael Wahlberg
Date: Mon Feb 23 2004 - 08:48:11 EST


On Mon, 2004-02-23 at 13:19, Christoph Hellwig wrote:
> On Sun, Feb 22, 2004 at 04:49:41PM +0100, Mikael Wahlberg wrote:
> > Description:
> >
> > On heavy FTP Load (About 1Gbit/s) running both reads and writes on two ServeRAID6m Raid5 controllers merged together to one filesystem with Raidtools we see the error below. The filesystem gets totally hanged up. Currently with XFS, but JFS gets the same problem (Actually even more often).
>
> What does the JFS oops look like?

Just tried it without HyperThreading, with JFS, the filesystem hanged
without any kernel oops after about 10 minutes of 1Gbit/s FTP input.

What I got was:

Feb 23 14:19:15 mserv4 kernel:
Feb 23 14:19:15 mserv4 kernel: swapper: page allocation failure.
order:0, mode:0x20
Feb 23 14:19:15 mserv4 kernel: Call Trace:
Feb 23 14:19:15 mserv4 kernel: [<c0143160>] __alloc_pages+0x320/0x370
Feb 23 14:19:15 mserv4 kernel: [<c01431d5>] __get_free_pages+0x25/0x40
Feb 23 14:19:15 mserv4 kernel: [<c01462ac>] cache_grow+0xcc/0x330
Feb 23 14:19:15 mserv4 kernel: [<c014660e>]
cache_alloc_refill+0xfe/0x2a0
Feb 23 14:19:15 mserv4 kernel: [<c0146b4e>] __kmalloc+0x7e/0x90
Feb 23 14:19:15 mserv4 kernel: [<c032e797>] alloc_skb+0x47/0xf0
Feb 23 14:19:15 mserv4 kernel: [<c02ad599>]
e1000_alloc_rx_buffers+0x59/0x100
Feb 23 14:19:15 mserv4 kernel: [<c02ad18e>]
e1000_clean_rx_irq+0xfe/0x4b0
Feb 23 14:19:15 mserv4 kernel: [<c02acfe6>]
e1000_clean_tx_irq+0x1d6/0x280
Feb 23 14:19:15 mserv4 kernel: [<c02acd9a>] e1000_intr+0x4a/0xc0
Feb 23 14:19:16 mserv4 kernel: [<c010d579>] handle_IRQ_event+0x49/0x80
Feb 23 14:19:16 mserv4 kernel: [<c010d952>] do_IRQ+0xd2/0x1d0
Feb 23 14:19:16 mserv4 kernel: [<c010b9f8>] common_interrupt+0x18/0x20
Feb 23 14:19:16 mserv4 kernel: [<c0146a51>] kmem_cache_alloc+0x31/0x50
Feb 23 14:19:16 mserv4 kernel: [<c032e775>] alloc_skb+0x25/0xf0
Feb 23 14:19:16 mserv4 kernel: [<c0368ee1>] tcp_send_ack+0x31/0xd0
Feb 23 14:19:16 mserv4 kernel: [<c0333ebd>]
netif_receive_skb+0x1cd/0x250
Feb 23 14:19:16 mserv4 kernel: [<c0369d9f>]
tcp_delack_timer+0x10f/0x220
Feb 23 14:19:16 mserv4 kernel: [<c02acfe6>]
e1000_clean_tx_irq+0x1d6/0x280
Feb 23 14:19:16 mserv4 kernel: [<c0369c90>] tcp_delack_timer+0x0/0x220
Feb 23 14:19:16 mserv4 kernel: [<c012c5d8>]
run_timer_softirq+0xf8/0x1e0
Feb 23 14:19:16 mserv4 kernel: [<c0278db1>]
add_interrupt_randomness+0x31/0x40
Feb 23 14:19:16 mserv4 kernel: [<c012793a>] do_softirq+0xca/0xd0
Feb 23 14:19:16 mserv4 kernel: [<c010d9dd>] do_IRQ+0x15d/0x1d0
Feb 23 14:19:16 mserv4 kernel: [<c0108a60>] default_idle+0x0/0x40
Feb 23 14:19:16 mserv4 kernel: [<c010b9f8>] common_interrupt+0x18/0x20
Feb 23 14:19:16 mserv4 kernel: [<c0108a60>] default_idle+0x0/0x40
Feb 23 14:19:17 mserv4 kernel: [<c0108a8d>] default_idle+0x2d/0x40
Feb 23 14:19:18 mserv4 kernel: [<c0108b26>] cpu_idle+0x46/0x50
Feb 23 14:19:18 mserv4 kernel: [<c0105000>] rest_init+0x0/0x70
Feb 23 14:19:18 mserv4 kernel: [<c0456930>] start_kernel+0x1a0/0x1e0
Feb 23 14:19:18 mserv4 kernel: [<c04564a0>]
unknown_bootoption+0x0/0x120
Feb 23 14:19:18 mserv4 kernel:
Feb 23 14:19:18 mserv4 kernel: swapper: page allocation failure.
order:0, mode:0x20
Feb 23 14:19:18 mserv4 kernel: Call Trace:
Feb 23 14:19:18 mserv4 kernel: [<c0143160>] __alloc_pages+0x320/0x370
Feb 23 14:19:18 mserv4 kernel: [<c03510c0>]
ip_local_deliver_finish+0x0/0x220
Feb 23 14:19:18 mserv4 kernel: [<c01431d5>] __get_free_pages+0x25/0x40
Feb 23 14:19:18 mserv4 kernel: [<c01462ac>] cache_grow+0xcc/0x330
Feb 23 14:19:18 mserv4 kernel: [<c014660e>]
cache_alloc_refill+0xfe/0x2a0
Feb 23 14:19:18 mserv4 kernel: [<c0146b4e>] __kmalloc+0x7e/0x90
Feb 23 14:19:19 mserv4 kernel: [<c032e797>] alloc_skb+0x47/0xf0
Feb 23 14:19:19 mserv4 kernel: [<c02ad599>]
e1000_alloc_rx_buffers+0x59/0x100
Feb 23 14:19:19 mserv4 kernel: [<c02ad18e>]
e1000_clean_rx_irq+0xfe/0x4b0
Feb 23 14:19:19 mserv4 kernel: [<c0333ebd>]
netif_receive_skb+0x1cd/0x250
Feb 23 14:19:19 mserv4 kernel: [<c02acd9a>] e1000_intr+0x4a/0xc0
Feb 23 14:19:19 mserv4 kernel: [<c010d579>] handle_IRQ_event+0x49/0x80
Feb 23 14:19:19 mserv4 kernel: [<c010d952>] do_IRQ+0xd2/0x1d0
Feb 23 14:19:19 mserv4 kernel: [<c0108a60>] default_idle+0x0/0x40
Feb 23 14:19:19 mserv4 kernel: [<c010b9f8>] common_interrupt+0x18/0x20
Feb 23 14:19:19 mserv4 kernel: [<c0108a60>] default_idle+0x0/0x40
Feb 23 14:19:19 mserv4 kernel: [<c0108a8d>] default_idle+0x2d/0x40
Feb 23 14:19:19 mserv4 kernel: [<c0108b26>] cpu_idle+0x46/0x50
Feb 23 14:19:19 mserv4 kernel: [<c0105000>] rest_init+0x0/0x70
Feb 23 14:19:19 mserv4 kernel: [<c0456930>] start_kernel+0x1a0/0x1e0
Feb 23 14:19:20 mserv4 kernel: [<c04564a0>]
unknown_bootoption+0x0/0x120
Feb 23 14:19:20 mserv4 kernel:



^[
Feb 23 14:31:57 mserv4 proftpd[22053]: mserv4 - ProFTPD killed (signal
15)

(I restardet proftpd when nothing happened)

Then if I try to read a file on the filesystem:

(Strace of less)
[root@mserv4 clips]# strace less f5.in.09
execve("/usr/bin/less", ["less", "f5.in.09"], [/* 26 vars */]) = 0
uname({sys="Linux", node="mserv4", ...}) = 0
brk(0) = 0x806b000
old_mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS,
-1, 0) = 0x40013000
open("/etc/ld.so.preload", O_RDONLY) = -1 ENOENT (No such file or
directory)
open("/etc/ld.so.cache", O_RDONLY) = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=62008, ...}) = 0
old_mmap(NULL, 62008, PROT_READ, MAP_PRIVATE, 3, 0) = 0x40014000
close(3) = 0
open("/usr/lib/libncurses.so.5", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\300\341"...,
512) = 512fstat64(3, {st_mode=S_IFREG|0755, st_size=255300, ...}) = 0
old_mmap(NULL, 256076, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) =
0x40024000
old_mmap(0x4005a000, 36864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED,
3, 0x36000) = 0x4005a000
close(3) = 0
open("/lib/libc.so.6", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\310V\1"...,
512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=1476244, ...}) = 0
old_mmap(NULL, 1207972, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) =
0x40063000
old_mmap(0x40184000, 16384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED,
3, 0x120000) = 0x40184000
old_mmap(0x40188000, 7844, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x40188000
close(3) = 0
munmap(0x40014000, 62008) = 0
ioctl(1, SNDCTL_TMR_TIMEBASE, {B38400 opost isig icanon echo ...}) = 0
brk(0) = 0x806b000
brk(0x806c000) = 0x806c000
brk(0) = 0x806c000
access("/root/.terminfo/x/xterm", R_OK) = -1 ENOENT (No such file or
directory)
access("/usr/share/terminfo/x/xterm", R_OK) = 0
open("/usr/share/terminfo/x/xterm", O_RDONLY) = 3
read(3, "\32\1\34\0\35\0\17\0i\1\273\3", 12) = 12
read(3, "xterm|X11 terminal emulator\0", 28) = 28
read(3, "\0\1\0\0\1\0\0\0\1\0\0\0\0\1\1\0\0\0\0\0\0\0\1\0\0\0\0"..., 29)
= 29
read(3, "\0", 1) = 1
read(3, "P\0\10\0\30\0\377\377\377\377\377\377\377\377\377\377\377"...,
30) = 30read(3,
"\0\0\4\0\6\0\10\0\31\0\36\0&\0*\0.\0\377\3779\0J\0L\0P"..., 722) = 722
read(3, "\33[Z\0\7\0\r\0\33[%i%p1%d;%p2%dr\0\33[3g\0\33["..., 955) = 955
read(3, "", 1) = 0
read(3, "", 10) = 0
close(3) = 0
ioctl(1, SNDCTL_TMR_TIMEBASE, {B38400 opost isig icanon echo ...}) = 0
ioctl(1, SNDCTL_TMR_TIMEBASE, {B38400 opost isig icanon echo ...}) = 0
ioctl(1, SNDCTL_TMR_TIMEBASE, {B38400 opost isig icanon echo ...}) = 0
ioctl(1, SNDCTL_TMR_TIMEBASE, {B38400 opost isig icanon echo ...}) = 0
ioctl(1, TIOCGWINSZ, {ws_row=61, ws_col=80, ws_xpixel=0, ws_ypixel=0}) =
0
ioctl(2, TIOCGWINSZ, {ws_row=61, ws_col=80, ws_xpixel=0, ws_ypixel=0}) =
0
open("/usr/bin/.sysless", O_RDONLY|O_LARGEFILE) = -1 ENOENT (No such
file or directory)
open("/etc/sysless", O_RDONLY|O_LARGEFILE) = -1 ENOENT (No such file or
directory)
open("/root/.less", O_RDONLY|O_LARGEFILE) = -1 ENOENT (No such file or
directory)
brk(0) = 0x806c000
brk(0x806d000) = 0x806d000
open("/usr/lib/locale/locale-archive", O_RDONLY|O_LARGEFILE) = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=31202800, ...}) = 0
mmap2(NULL, 2097152, PROT_READ, MAP_PRIVATE, 3, 0) = 0x4018a000
close(3) = 0
brk(0) = 0x806d000
brk(0x806e000) = 0x806e000
open("/dev/tty", O_RDONLY|O_LARGEFILE) = 3
ioctl(3, SNDCTL_TMR_TIMEBASE, {B38400 opost isig icanon echo ...}) = 0
fsync(3) = -1 EINVAL (Invalid argument)
ioctl(3, SNDCTL_TMR_STOP, {B38400 opost isig -icanon -echo ...}) = 0
rt_sigaction(SIGINT, {0x8059e90, [INT], SA_RESTORER|SA_RESTART,
0x40089cf8}, {SIG_DFL}, 8) = 0
rt_sigaction(SIGTSTP, {0x8059ed0, [TSTP], SA_RESTORER|SA_RESTART,
0x40089cf8}, {SIG_DFL}, 8) = 0
rt_sigaction(SIGWINCH, {0x8059f10, [WINCH], SA_RESTORER|SA_RESTART,
0x40089cf8}, {SIG_DFL}, 8) = 0
pipe([4, 5]) = 0
vfork() = 22348
close(5) = 0
read(4, "", 1) = 0
--- SIGCHLD (Child exited) @ 0 (0) ---
close(4) = 0
wait4(22348, [WIFEXITED(s) && WEXITSTATUS(s) == 127], 0, NULL) = 22348
stat64("f5.in.09", {st_mode=S_IFREG|0644, st_size=67719168, ...}) = 0
stat64("f5.in.09", {st_mode=S_IFREG|0644, st_size=67719168, ...}) = 0
open("f5.in.09", O_RDONLY|O_LARGEFILE) = 4
_llseek(4, 1,


then hang..

Anybody got an idea?

/Mikael

--
-----------------------------------------------------------------------
Mikael Wahlberg, M.Sc. Ardendo
Unit Manager Professional Services/ e-mail: mikael@xxxxxxxxxx
Technical Project Manager GSM: +46 733 279 274

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/