Re: [PATCH] mm: Fix boot crash in mm_alloc()

From: Linus Torvalds
Date: Sun May 29 2011 - 13:20:41 EST


On Sun, May 29, 2011 at 9:22 AM, Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
>
> Or, in fact, we could just do something like the attached (UNTESTED!)

So I did warn you that it was untested.

It still is, but I walked through it a bit more, and I realized that
while I had gotten rid of the extra allocations of the
cpu_vm_mask_var, I hadn't gotten rid of the freeing.

So that patch would definitely not have worked very well with
CONFIG_CPUMASK_OFFSTACK.

And I noticed that I had moved the cpu_vm_mask_var back to the wrong
place: it should likely be as close as possible to the mm_context_t,
since the main user is likely the task switching code that touches
that anyway.

So here's a slightly updated patch.

STILL TOTALLY UNTESTED! The fixes were just from eyeballing it a bit
more, not from any actual testing.

Linus

include/linux/mm_types.h | 14 ++++++++++++--
include/linux/sched.h | 1 -
init/main.c | 2 +-
kernel/fork.c | 42 ++++++++++--------------------------------
4 files changed, 23 insertions(+), 36 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 2a78aae78c69..027935c86c68 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -264,6 +264,8 @@ struct mm_struct {

struct linux_binfmt *binfmt;

+ cpumask_var_t cpu_vm_mask_var;
+
/* Architecture-specific MM context */
mm_context_t context;

@@ -311,10 +313,18 @@ struct mm_struct {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
-
- cpumask_var_t cpu_vm_mask_var;
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ struct cpumask cpumask_allocation;
+#endif
};

+static inline void mm_init_cpumask(struct mm_struct *mm)
+{
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ mm->cpu_vm_mask_var = &mm->cpumask_allocation;
+#endif
+}
+
/* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
{
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bcddd0138105..2a8621c4be1e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2194,7 +2194,6 @@ static inline void mmdrop(struct mm_struct * mm)
if (unlikely(atomic_dec_and_test(&mm->mm_count)))
__mmdrop(mm);
}
-extern int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm);

/* mmput gets rid of the mappings and all user-space */
extern void mmput(struct mm_struct *);
diff --git a/init/main.c b/init/main.c
index d2f1e086bf33..cafba67c13bf 100644
--- a/init/main.c
+++ b/init/main.c
@@ -487,6 +487,7 @@ asmlinkage void __init start_kernel(void)
printk(KERN_NOTICE "%s", linux_banner);
setup_arch(&command_line);
mm_init_owner(&init_mm, &init_task);
+ mm_init_cpumask(&init_mm);
setup_command_line(command_line);
setup_nr_cpu_ids();
setup_per_cpu_areas();
@@ -510,7 +511,6 @@ asmlinkage void __init start_kernel(void)
sort_main_extable();
trap_init();
mm_init();
- BUG_ON(mm_init_cpumask(&init_mm, 0));

/*
* Set up the scheduler prior starting any interrupts (such as the
diff --git a/kernel/fork.c b/kernel/fork.c
index ca406d916713..0276c30401a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -484,20 +484,6 @@ static void mm_init_aio(struct mm_struct *mm)
#endif
}

-int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm)
-{
-#ifdef CONFIG_CPUMASK_OFFSTACK
- if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL))
- return -ENOMEM;
-
- if (oldmm)
- cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm));
- else
- memset(mm_cpumask(mm), 0, cpumask_size());
-#endif
- return 0;
-}
-
static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
{
atomic_set(&mm->mm_users, 1);
@@ -538,17 +524,8 @@ struct mm_struct * mm_alloc(void)
return NULL;

memset(mm, 0, sizeof(*mm));
- mm = mm_init(mm, current);
- if (!mm)
- return NULL;
-
- if (mm_init_cpumask(mm, NULL)) {
- mm_free_pgd(mm);
- free_mm(mm);
- return NULL;
- }
-
- return mm;
+ mm_init_cpumask(mm);
+ return mm_init(mm, current);
}

/*
@@ -559,7 +536,6 @@ struct mm_struct * mm_alloc(void)
void __mmdrop(struct mm_struct *mm)
{
BUG_ON(mm == &init_mm);
- free_cpumask_var(mm->cpu_vm_mask_var);
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
@@ -753,6 +729,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
goto fail_nomem;

memcpy(mm, oldmm, sizeof(*mm));
+ mm_init_cpumask(mm);

/* Initializing for Swap token stuff */
mm->token_priority = 0;
@@ -765,9 +742,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
if (!mm_init(mm, tsk))
goto fail_nomem;

- if (mm_init_cpumask(mm, oldmm))
- goto fail_nocpumask;
-
if (init_new_context(tsk, mm))
goto fail_nocontext;

@@ -794,9 +768,6 @@ fail_nomem:
return NULL;

fail_nocontext:
- free_cpumask_var(mm->cpu_vm_mask_var);
-
-fail_nocpumask:
/*
* If init_new_context() failed, we cannot use mmput() to free the mm
* because it calls destroy_context()
@@ -1591,6 +1562,13 @@ void __init proc_caches_init(void)
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ /*
+ * FIXME! The "sizeof(struct mm_struct)" currently includes the
+ * whole struct cpumask for the OFFSTACK case. We could change
+ * this to *only* allocate as much of it as required by the
+ * maximum number of CPU's we can ever have. The cpumask_allocation
+ * is at the end of the structure, exactly for that reason.
+ */
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);