[PATCH -rt] rt-slab: fix cpu inconsistency case

From: Hiroshi Shimamoto
Date: Thu Mar 20 2008 - 14:00:52 EST


Hi Ingo,

I encountered BUG at mm/slab.c.
It tells me there is an inconsistency case.

One of the panics, it shows obviously exclusive control in slab is not right.

The console log;
Unable to handle kernel paging request at ffff8108cf81d3c8 RIP:
[<ffffffff8029e92f>] kmem_cache_alloc+0x5e/0xdb
PGD 8063 PUD 0
Oops: 0000 [1] PREEMPT SMP
CPU 1
Modules linked in:
Pid: 32268, comm: dcachebench Not tainted 2.6.24.3-rt3 #3
RIP: 0010:[<ffffffff8029e92f>] [<ffffffff8029e92f>] kmem_cache_alloc+0x5e/0xdb
RSP: 0018:ffff810048895e58 EFLAGS: 00010097
RAX: 00000000ffffffff RBX: 0000000000000246 RCX: 0000000000000000
RDX: ffff8100cf81d3b8 RSI: 0000000000000000 RDI: ffff8100cf80ee40
RBP: ffff8100cf80ee40 R08: 0000000000000c62 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00000000000000d0
R13: ffffffff802aa83d R14: 0000000000000006 R15: 0000000000000000
FS: 00002adc7b6686f0(0000) GS:ffff8100cf81f340(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: ffff8108cf81d3c8 CR3: 00000000b36e0000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process dcachebench (pid: 32268, threadinfo ffff810048894000, task ffff8100cd248080)
Stack: 0000000000000000 ffff810097d1c000 0000000100000282 0000000000000006
00000000ffffff9c 0000000000000000 00000000005038b0 ffffffff802aa83d
ffff8100cf811080 0000000000000006 00000000ffffff9c 0000000000000000
Call Trace:
[<ffffffff802aa83d>] getname+0x1e/0x1f0
[<ffffffff802ac38c>] do_rmdir+0x17/0xfd
[<ffffffff8026fb5f>] audit_syscall_entry+0x163/0x189
[<ffffffff8020c2f3>] tracesys+0x71/0xe1
[<ffffffff8020c35e>] tracesys+0xdc/0xe1
---------------------------
| preempt count: 00000001 ]
| 1-level deep critical section nesting:
----------------------------------------
.. [<ffffffff80640737>] .... __spin_trylock+0xe/0x49
.....[<00000000>] .. ( <= 0x0)
Code: 48 8b 54 c2 18 53 9d 4c 89 e9 44 89 e6 48 89 ef e8 f1 e1 ff
RIP [<ffffffff8029e92f>] kmem_cache_alloc+0x5e/0xdb
RSP <ffff810048895e58>


The kernel dump;
(gdb) info thr
4 process 32262 flush_tlb_others (cpumask={bits = {4}}, mm=0xffff8100cd16d450,
va=46966178668544) at arch/x86/kernel/smp_64.c:195
3 process 32325 0xffffffff8020cad0 in invalidate_interrupt3 ()
2 process 32268 0xffffffff8029e92f in kmem_cache_alloc (cachep=0xffff8100cf80ee40,
flags=208) at mm/slab.c:3320
* 1 process 32326 0xffffffff806408c4 in __spin_lock (lock=0xffff8100cdc9a4d0)
at kernel/spinlock.c:333
(gdb) thr 2
[Switching to thread 2 (process 32268)]#0 0xffffffff8029e92f in kmem_cache_alloc (
cachep=0xffff8100cf80ee40, flags=208) at mm/slab.c:3320
3320 mm/slab.c: No such file or directory.
in mm/slab.c

0xffffffff8029e903 <kmem_cache_alloc+50>: callq 0xffffffff8029c292 <check_irq_off>
0xffffffff8029e908 <kmem_cache_alloc+55>: movslq 0x14(%rsp),%rax
0xffffffff8029e90d <kmem_cache_alloc+60>: mov 0x0(%rbp,%rax,8),%rdx
0xffffffff8029e912 <kmem_cache_alloc+65>: mov (%rdx),%eax
0xffffffff8029e914 <kmem_cache_alloc+67>: test %eax,%eax
0xffffffff8029e916 <kmem_cache_alloc+69>: je 0xffffffff8029e990 <kmem_cache_alloc+191>

1st. %eax, ac->avail is not 0.

0xffffffff8029e918 <kmem_cache_alloc+71>: lock incl 0xf8(%rbp)
0xffffffff8029e91f <kmem_cache_alloc+78>: movl $0x1,0xc(%rdx)
0xffffffff8029e926 <kmem_cache_alloc+85>: mov (%rdx),%eax
0xffffffff8029e928 <kmem_cache_alloc+87>: sub $0x1,%eax
0xffffffff8029e92b <kmem_cache_alloc+90>: mov %eax,(%rdx)
0xffffffff8029e92d <kmem_cache_alloc+92>: mov %eax,%eax
0xffffffff8029e92f <kmem_cache_alloc+94>: mov 0x18(%rdx,%rax,8),%rdx

2nd. %eax, ac->avail = 0xffffffff, it causes this panic.

0xffffffff8029e934 <kmem_cache_alloc+99>: push %rbx
0xffffffff8029e935 <kmem_cache_alloc+100>: popfq

static inline void *
____cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu)
{
void *objp;
struct array_cache *ac;

check_irq_off();

ac = cpu_cache_get(cachep, *this_cpu);
if (likely(ac->avail)) { ---------------------------> 1st. check
STATS_INC_ALLOCHIT(cachep);
ac->touched = 1;
objp = ac->entry[--ac->avail]; -------------> PANIC HERE
} else {
STATS_INC_ALLOCMISS(cachep);
objp = cache_alloc_refill(cachep, flags, this_cpu);
}
return objp;
}

It means that ac->avail is changed by the other cpu.
So I looked for who touch another cpu's data and found the root cause.

I think, calling cache_grow() in cache_alloc_refill() can change cpu,
but cache_grow() doesn't update this_cpu variable in spite of changing cpu.
It makes the kernel access other cpu's data in cache_alloc_refill().

I'm testing this patch now.
---
From: Hiroshi Shimamoto <h-shimamoto@xxxxxxxxxxxxx>

On !PREEMPT_RT, an inconsistency case may be occurred.
After releasing cpu by slab_irq_enable_nort() the cpu can be changed.
In slab_irq_disable_nort() new cpu id should be gotten.
If not, it makes data of other cpu corrupted.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@xxxxxxxxxxxxx>
---
mm/slab.c | 18 +++++++++---------
1 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 4bf6ca8..7beb378 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -138,8 +138,8 @@
*
* (On PREEMPT_RT, these are NOPs, but we have to drop/get the irq locks.)
*/
-# define slab_irq_disable_nort() local_irq_disable()
-# define slab_irq_enable_nort() local_irq_enable()
+# define slab_irq_disable_nort(cpu) slab_irq_disable(cpu)
+# define slab_irq_enable_nort(cpu) slab_irq_enable(cpu)
# define slab_irq_disable_rt(flags) do { (void)(flags); } while (0)
# define slab_irq_enable_rt(flags) do { (void)(flags); } while (0)
# define slab_spin_lock_irq(lock, cpu) \
@@ -160,8 +160,8 @@ DEFINE_PER_CPU_LOCKED(int, slab_irq_locks) = { 0, };
do { slab_irq_enable(cpu); (void) (flags); } while (0)
# define slab_irq_disable_rt(cpu) slab_irq_disable(cpu)
# define slab_irq_enable_rt(cpu) slab_irq_enable(cpu)
-# define slab_irq_disable_nort() do { } while (0)
-# define slab_irq_enable_nort() do { } while (0)
+# define slab_irq_disable_nort(cpu) do { } while (0)
+# define slab_irq_enable_nort(cpu) do { } while (0)
# define slab_spin_lock_irq(lock, cpu) \
do { slab_irq_disable(cpu); spin_lock(lock); } while (0)
# define slab_spin_unlock_irq(lock, cpu) \
@@ -2899,7 +2899,7 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid,
offset *= cachep->colour_off;

if (local_flags & __GFP_WAIT)
- slab_irq_enable_nort();
+ slab_irq_enable_nort(*this_cpu);
slab_irq_enable_rt(*this_cpu);

/*
@@ -2932,7 +2932,7 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid,

slab_irq_disable_rt(*this_cpu);
if (local_flags & __GFP_WAIT)
- slab_irq_disable_nort();
+ slab_irq_disable_nort(*this_cpu);

check_irq_off();
spin_lock(&l3->list_lock);
@@ -2948,7 +2948,7 @@ opps1:
failed:
slab_irq_disable_rt(*this_cpu);
if (local_flags & __GFP_WAIT)
- slab_irq_disable_nort();
+ slab_irq_disable_nort(*this_cpu);
return 0;
}

@@ -3397,7 +3397,7 @@ retry:
* set and go into memory reserves if necessary.
*/
if (local_flags & __GFP_WAIT)
- slab_irq_enable_nort();
+ slab_irq_enable_nort(*this_cpu);
slab_irq_enable_rt(*this_cpu);

kmem_flagcheck(cache, flags);
@@ -3405,7 +3405,7 @@ retry:

slab_irq_disable_rt(*this_cpu);
if (local_flags & __GFP_WAIT)
- slab_irq_disable_nort();
+ slab_irq_disable_nort(*this_cpu);

if (obj) {
/*
--
1.5.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/