Re: [patch] Fixed the race that was oopsing Linux-2.2.0

Andrea Arcangeli (andrea@e-mind.com)
Wed, 27 Jan 1999 21:14:35 +0100 (CET)


On Wed, 27 Jan 1999, Linus Torvalds wrote:

> Don't bother sending me patches that do the old task "grab" operation.
> They won't be applied. By applying that patch, you just open yourself up
> to overrunning your stack, which is much worse than any normal oops.

Are you sure you were not being hurt by the munmap bug that removed the pgd
when no mmap was present and munmap was run? The /proc code does not look
very recursive on the stack (and it can't be less recursive for me than for
you)... I have never had problems here (but maybe I don't have enough irq
handlers running at the same time... ;).

But yes, obviously I agree not to use the stack. So here is the fixed
patch against 2.2.0. I checked it and it works fine too.
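
In short: mm->count goes back to being a plain int, every change to it (and
to tsk->mm) is serialized by the new mm_lock spinlock, and mmget() now
fails instead of resurrecting an mm whose count has already reached zero.
A minimal sketch of the reader/exit interaction (simplified from the patch
below, not the literal code):

	/* /proc reader side (see get_mm()/grab_task() in the patch): */
	struct mm_struct * mm = NULL;

	spin_lock(&mm_lock);
	if (tsk->mm && tsk->mm != &init_mm)
		if (!mmget(mm = tsk->mm))
			mm = NULL;	/* lost the race: count was already 0 */
	spin_unlock(&mm_lock);

	/* exit side (__exit_mm()): the final mmput() also runs under
	   mm_lock, so the reader above either gets a valid extra
	   reference or sees count == 0 and backs off. */
	spin_lock(&mm_lock);
	mmput(mm);
	spin_unlock(&mm_lock);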

Index: arch/i386/kernel/ldt.c
===================================================================
RCS file: /var/cvs/linux/arch/i386/kernel/ldt.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 ldt.c
--- ldt.c 1999/01/18 01:28:57 1.1.1.1
+++ linux/arch/i386/kernel/ldt.c 1999/01/27 19:36:10
@@ -83,7 +83,7 @@
set_ldt_desc(i, ldt, LDT_ENTRIES);
current->tss.ldt = _LDT(i);
load_ldt(i);
- if (atomic_read(&mm->count) > 1)
+ if (mm->count > 1)
printk(KERN_WARNING
"LDT allocated for cloned task!\n");
} else {
Index: fs/binfmt_aout.c
===================================================================
RCS file: /var/cvs/linux/fs/binfmt_aout.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 binfmt_aout.c
--- binfmt_aout.c 1999/01/18 01:26:49 1.1.1.1
+++ linux/fs/binfmt_aout.c 1999/01/27 20:06:16
@@ -101,7 +101,7 @@
# define START_STACK(u) (u.start_stack)
#endif

- if (!current->dumpable || atomic_read(&current->mm->count) != 1)
+ if (!current->dumpable || current->mm->count != 1)
return 0;
current->dumpable = 0;

Index: fs/binfmt_elf.c
===================================================================
RCS file: /var/cvs/linux/fs/binfmt_elf.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 binfmt_elf.c
--- binfmt_elf.c 1999/01/18 01:26:49 1.1.1.1
+++ linux/fs/binfmt_elf.c 1999/01/27 19:36:10
@@ -1067,7 +1067,7 @@

if (!current->dumpable ||
limit < ELF_EXEC_PAGESIZE ||
- atomic_read(&current->mm->count) != 1)
+ current->mm->count != 1)
return 0;
current->dumpable = 0;

Index: fs/exec.c
===================================================================
RCS file: /var/cvs/linux/fs/exec.c,v
retrieving revision 1.1.1.2
diff -u -r1.1.1.2 exec.c
--- exec.c 1999/01/23 16:28:07 1.1.1.2
+++ linux/fs/exec.c 1999/01/27 19:36:10
@@ -378,7 +378,7 @@
struct mm_struct * mm, * old_mm;
int retval, nr;

- if (atomic_read(&current->mm->count) == 1) {
+ if (current->mm->count == 1) {
flush_cache_mm(current->mm);
mm_release();
release_segments(current->mm);
@@ -409,7 +409,9 @@
activate_context(current);
up(&mm->mmap_sem);
mm_release();
+ spin_lock(&mm_lock);
mmput(old_mm);
+ spin_unlock(&mm_lock);
return 0;

/*
Index: fs/proc/array.c
===================================================================
RCS file: /var/cvs/linux/fs/proc/array.c,v
retrieving revision 1.1.1.4
diff -u -r1.1.1.4 array.c
--- array.c 1999/01/26 18:30:44 1.1.1.4
+++ linux/fs/proc/array.c 1999/01/27 19:59:22
@@ -43,6 +43,8 @@
* <Alan.Cox@linux.org>
*
* Andi Kleen : Race Fixes.
+ *
+ * Andrea Arcangeli : do_exit/mmput/mmget race fix on the Race Fixes ;).
*
*/

@@ -65,6 +67,7 @@
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/signal.h>
+#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -77,6 +80,7 @@
int get_malloc(char * buffer);
#endif

+static kmem_cache_t * task_struct_head_cache;

static ssize_t read_core(struct file * file, char * buf,
size_t count, loff_t *ppos)
@@ -399,8 +403,11 @@

read_lock(&tasklist_lock);
tsk = find_task_by_pid(pid);
+ spin_lock(&mm_lock);
if (tsk && tsk->mm && tsk->mm != &init_mm)
- mmget(mm = tsk->mm);
+ if (!mmget(mm = tsk->mm))
+ mm = NULL;
+ spin_unlock(&mm_lock);
read_unlock(&tasklist_lock);
if (mm != NULL)
down(&mm->mmap_sem);
@@ -410,7 +417,9 @@
static void release_mm(struct mm_struct *mm)
{
up(&mm->mmap_sem);
+ spin_lock(&mm_lock);
mmput(mm);
+ spin_unlock(&mm_lock);
}

static unsigned long get_phys_addr(struct mm_struct *mm, unsigned long ptr)
@@ -504,17 +513,9 @@
return res;
}

-/*
- * These bracket the sleeping functions..
- */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
-#define first_sched ((unsigned long) scheduling_functions_start_here)
-#define last_sched ((unsigned long) scheduling_functions_end_here)
-
static unsigned long get_wchan(struct task_struct *p)
{
- if (!p || p == current || p->state == TASK_RUNNING)
+ if (!p || real_tsk(p) == current || p->state == TASK_RUNNING)
return 0;
#if defined(__i386__)
{
@@ -522,7 +523,7 @@
unsigned long stack_page;
int count = 0;

- stack_page = (unsigned long)p;
+ stack_page = (unsigned long)real_tsk(p);
esp = p->tss.esp;
if (!stack_page || esp < stack_page || esp >= 8188+stack_page)
return 0;
@@ -564,7 +565,7 @@
unsigned long stack_page;
int count = 0;

- stack_page = (unsigned long)p;
+ stack_page = (unsigned long)real_tsk(p);
fp = ((struct switch_stack *)p->tss.ksp)->a6;
do {
if (fp < stack_page+sizeof(struct task_struct) ||
@@ -585,7 +586,7 @@
unsigned long stack_page;
int count = 0;

- stack_page = 4096 + (unsigned long)p;
+ stack_page = 4096 + (unsigned long)real_tsk(p);
fp = get_css_fp (&p->tss);
do {
if (fp < stack_page || fp > 4092+stack_page)
@@ -599,7 +600,7 @@
#elif defined (__sparc__)
{
unsigned long pc, fp, bias = 0;
- unsigned long task_base = (unsigned long) p;
+ unsigned long task_base = (unsigned long) real_tsk(p);
struct reg_window *rw;
int count = 0;

@@ -624,8 +625,8 @@
}

#if defined(__i386__)
-# define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1019])
-# define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1022])
+# define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned long)real_tsk(tsk)))[1019])
+# define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)real_tsk(tsk)))[1022])
#elif defined(__alpha__)
/*
* See arch/alpha/kernel/ptrace.c for details.
@@ -633,11 +634,11 @@
# define PT_REG(reg) (PAGE_SIZE - sizeof(struct pt_regs) \
+ (long)&((struct pt_regs *)0)->reg)
# define KSTK_EIP(tsk) \
- (*(unsigned long *)(PT_REG(pc) + PAGE_SIZE + (unsigned long)(tsk)))
+ (*(unsigned long *)(PT_REG(pc) + PAGE_SIZE + (unsigned long)real_tsk(tsk)))
# define KSTK_ESP(tsk) ((tsk) == current ? rdusp() : (tsk)->tss.usp)
#elif defined(CONFIG_ARM)
-# define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1022])
-# define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1020])
+# define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned long)real_tsk(tsk)))[1022])
+# define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)real_tsk(tsk)))[1020])
#elif defined(__mc68000__)
#define KSTK_EIP(tsk) \
({ \
@@ -646,7 +647,7 @@
MAP_NR((tsk)->tss.esp0) < max_mapnr) \
eip = ((struct pt_regs *) (tsk)->tss.esp0)->pc; \
eip; })
-#define KSTK_ESP(tsk) ((tsk) == current ? rdusp() : (tsk)->tss.usp)
+#define KSTK_ESP(tsk) (real_tsk(tsk) == current ? rdusp() : (tsk)->tss.usp)
#elif defined(__powerpc__)
#define KSTK_EIP(tsk) ((tsk)->tss.regs->nip)
#define KSTK_ESP(tsk) ((tsk)->tss.regs->gpr[1])
@@ -851,21 +852,42 @@

static struct task_struct *grab_task(int pid)
{
- struct task_struct *tsk = current;
+ struct task_struct *tsk = current, *dst;
if (pid != tsk->pid) {
+ dst = kmem_cache_alloc(task_struct_head_cache, SLAB_KERNEL);
+ if (!dst)
+ return NULL;
read_lock(&tasklist_lock);
tsk = find_task_by_pid(pid);
- if (tsk && tsk->mm && tsk->mm != &init_mm)
- mmget(tsk->mm);
+ if (tsk) {
+ spin_lock(&mm_lock);
+ if (tsk->mm && tsk->mm != &init_mm)
+ if (!mmget(tsk->mm))
+ tsk = NULL;
+ spin_unlock(&mm_lock);
+ if (tsk)
+ {
+ memcpy(dst, tsk, sizeof(struct task_struct));
+ tsk = dst;
+ }
+ }
read_unlock(&tasklist_lock);
- }
+ if (!tsk)
+ kmem_cache_free(task_struct_head_cache, dst);
+ }
return tsk;
}

static void release_task(struct task_struct *tsk)
{
- if (tsk != current && tsk->mm && tsk->mm != &init_mm)
- mmput(tsk->mm);
+ if (tsk != current)
+ {
+ spin_lock(&mm_lock);
+ if (tsk->mm && tsk->mm != &init_mm)
+ mmput(tsk->mm);
+ spin_unlock(&mm_lock);
+ kmem_cache_free(task_struct_head_cache, tsk);
+ }
}

static int get_status(int pid, char * buffer)
@@ -1149,7 +1171,7 @@
goto getlen_out;

/* Check whether the mmaps could change if we sleep */
- volatile_task = (p != current || atomic_read(&p->mm->count) > 1);
+ volatile_task = (p != current || p->mm->count > 1);

/* decode f_pos */
lineno = *ppos >> MAPS_LINE_SHIFT;
@@ -1600,3 +1622,14 @@
NULL, /* truncate */
NULL /* permission */
};
+
+void __init proc_array_init(void)
+{
+ task_struct_head_cache = kmem_cache_create("task_struct",
+ sizeof(struct task_struct),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (!task_struct_head_cache)
+ panic("cannot create task_struct cache");
+}
Index: fs/proc/root.c
===================================================================
RCS file: /var/cvs/linux/fs/proc/root.c,v
retrieving revision 1.1.1.2
diff -u -r1.1.1.2 root.c
--- root.c 1999/01/18 13:39:15 1.1.1.2
+++ linux/fs/proc/root.c 1999/01/27 19:49:40
@@ -671,6 +671,7 @@

__initfunc(void proc_root_init(void))
{
+ proc_array_init();
proc_base_init();
proc_register(&proc_root, &proc_root_loadavg);
proc_register(&proc_root, &proc_root_uptime);
Index: include/asm-i386/pgtable.h
===================================================================
RCS file: /var/cvs/linux/include/asm-i386/pgtable.h,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 pgtable.h
--- pgtable.h 1999/01/18 01:27:15 1.1.1.1
+++ linux/include/asm-i386/pgtable.h 1999/01/27 19:45:56
@@ -101,7 +101,7 @@
static inline void flush_tlb_current_task(void)
{
/* just one copy of this mm? */
- if (atomic_read(&current->mm->count) == 1)
+ if (current->mm->count == 1)
local_flush_tlb(); /* and that's us, so.. */
else
smp_flush_tlb();
@@ -113,7 +113,7 @@

static inline void flush_tlb_mm(struct mm_struct * mm)
{
- if (mm == current->mm && atomic_read(&mm->count) == 1)
+ if (mm == current->mm && mm->count == 1)
local_flush_tlb();
else
smp_flush_tlb();
@@ -122,7 +122,7 @@
static inline void flush_tlb_page(struct vm_area_struct * vma,
unsigned long va)
{
- if (vma->vm_mm == current->mm && atomic_read(&current->mm->count) == 1)
+ if (vma->vm_mm == current->mm && current->mm->count == 1)
__flush_tlb_one(va);
else
smp_flush_tlb();
Index: include/linux/proc_fs.h
===================================================================
RCS file: /var/cvs/linux/include/linux/proc_fs.h,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 proc_fs.h
--- proc_fs.h 1999/01/18 01:27:09 1.1.1.1
+++ linux/include/linux/proc_fs.h 1999/01/27 19:48:49
@@ -312,6 +312,7 @@
extern struct inode_operations proc_scsi_inode_operations;

extern void proc_root_init(void);
+extern void proc_array_init(void);
extern void proc_base_init(void);

extern int proc_register(struct proc_dir_entry *, struct proc_dir_entry *);
Index: include/linux/sched.h
===================================================================
RCS file: /var/cvs/linux/include/linux/sched.h,v
retrieving revision 1.1.1.3
diff -u -r1.1.1.3 sched.h
--- sched.h 1999/01/23 16:29:58 1.1.1.3
+++ linux/include/linux/sched.h 1999/01/27 19:45:56
@@ -114,6 +114,15 @@
extern rwlock_t tasklist_lock;
extern spinlock_t scheduler_lock;

+/*
+ * These bracket the sleeping functions..
+ */
+extern void scheduling_functions_start_here(void);
+extern void scheduling_functions_end_here(void);
+#define first_sched ((unsigned long) scheduling_functions_start_here)
+#define last_sched ((unsigned long) scheduling_functions_end_here)
+#define real_tsk(tsk) (*(tsk)->tarray_ptr)
+
extern void sched_init(void);
extern void show_state(void);
extern void trap_init(void);
@@ -164,7 +173,7 @@
struct vm_area_struct *mmap_avl; /* tree of VMAs */
struct vm_area_struct *mmap_cache; /* last find_vma result */
pgd_t * pgd;
- atomic_t count;
+ int count;
int map_count; /* number of VMAs */
struct semaphore mmap_sem;
unsigned long context;
@@ -184,7 +193,7 @@
#define INIT_MM { \
&init_mmap, NULL, NULL, \
swapper_pg_dir, \
- ATOMIC_INIT(1), 1, \
+ 1, 1, \
MUTEX, \
0, \
0, 0, 0, 0, \
@@ -391,6 +400,7 @@

extern struct task_struct **tarray_freelist;
extern spinlock_t taskslot_lock;
+extern spinlock_t mm_lock;

extern __inline__ void add_free_taskslot(struct task_struct **t)
{
@@ -609,11 +619,8 @@
* Routines for handling mm_structs
*/
extern struct mm_struct * mm_alloc(void);
-static inline void mmget(struct mm_struct * mm)
-{
- atomic_inc(&mm->count);
-}
-extern void mmput(struct mm_struct *);
+extern int FASTCALL(mmget(struct mm_struct *));
+extern void FASTCALL(mmput(struct mm_struct *));
/* Remove the current tasks stale references to the old mm_struct */
extern void mm_release(void);

Index: kernel/exit.c
===================================================================
RCS file: /var/cvs/linux/kernel/exit.c,v
retrieving revision 1.1.1.2
diff -u -r1.1.1.2 exit.c
--- exit.c 1999/01/23 16:30:27 1.1.1.2
+++ linux/kernel/exit.c 1999/01/27 19:36:10
@@ -248,8 +248,10 @@

static inline void __exit_mm(struct task_struct * tsk)
{
- struct mm_struct * mm = tsk->mm;
+ struct mm_struct * mm;

+ spin_lock(&mm_lock);
+ mm = tsk->mm;
/* Set us up to use the kernel mm state */
if (mm != &init_mm) {
flush_cache_mm(mm);
@@ -261,6 +263,7 @@
mm_release();
mmput(mm);
}
+ spin_unlock(&mm_lock);
}

void exit_mm(struct task_struct *tsk)
Index: kernel/fork.c
===================================================================
RCS file: /var/cvs/linux/kernel/fork.c,v
retrieving revision 1.1.1.3
diff -u -r1.1.1.3 fork.c
--- fork.c 1999/01/23 16:30:27 1.1.1.3
+++ linux/kernel/fork.c 1999/01/27 19:36:23
@@ -10,6 +10,11 @@
* Fork is rather simple, once you get the hang of it, but the memory
* management can be a bitch. See 'mm/mm.c': 'copy_page_tables()'
*/
+/*
+ * Fixed a race between mmget() and mmput(): added the mm_lock spinlock
+ * to serialize accesses to the tsk->mm field.
+ * Copyright (C) 1999 Andrea Arcangeli
+ */

#include <linux/malloc.h>
#include <linux/init.h>
@@ -39,6 +44,7 @@

struct task_struct **tarray_freelist = NULL;
spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t mm_lock = SPIN_LOCK_UNLOCKED;

/* UID task count cache, to prevent walking entire process list every
* single fork() operation.
@@ -261,7 +267,7 @@
if (mm) {
*mm = *current->mm;
init_new_context(mm);
- atomic_set(&mm->count, 1);
+ mm->count = 1;
mm->map_count = 0;
mm->def_flags = 0;
mm->mmap_sem = MUTEX_LOCKED;
@@ -303,12 +309,25 @@
}
}

+int mmget(struct mm_struct * mm)
+{
+ int retval = 0;
+
+ if (mm->count)
+ {
+ mm->count++;
+ retval = 1;
+ }
+
+ return retval;
+}
+
/*
* Decrement the use count and release all resources for an mm.
*/
void mmput(struct mm_struct *mm)
{
- if (atomic_dec_and_test(&mm->count)) {
+ if (!--mm->count) {
release_segments(mm);
exit_mmap(mm);
free_page_tables(mm);
@@ -322,7 +341,9 @@
int retval;

if (clone_flags & CLONE_VM) {
+ spin_lock(&mm_lock);
mmget(current->mm);
+ spin_unlock(&mm_lock);
/*
* Set up the LDT descriptor for the clone task.
*/
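
For completeness, this is how the new grab_task()/release_task() pair is
meant to be used by the /proc code (a rough sketch, error paths
simplified): the task_struct gets snapshotted into a slab-allocated buffer
instead of the caller's stack, and real_tsk() (i.e. *tsk->tarray_ptr) gives
back the live task whenever the real task address or stack page is needed:

	struct task_struct * tsk;

	tsk = grab_task(pid);	/* slab copy of the task + mmget() on its mm */
	if (!tsk)
		return 0;	/* task gone, mm dying, or allocation failed */
	/* ... read the snapshot through tsk->..., but use real_tsk(tsk)
	   wherever the live task address matters (get_wchan(),
	   KSTK_EIP()/KSTK_ESP(), stack page arithmetic) ... */
	release_task(tsk);	/* mmput() under mm_lock + free the copy */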

Andrea Arcangeli
