More problems in dup_mmap

Bill Hawes (whawes@star.net)
Tue, 05 Aug 1997 16:36:10 -0400


It appears that there are yet more problems in dup_mmap -- as far as I
can see, an error return from copy_page_range leaves the current vma
struct not linked into the mmap list, so it won't be released. The
leaked vma struct itself is the lesser problem; because the vma is
never released, any pages copy_page_range had already referenced are
left stuck with elevated use counts. This would certainly exacerbate a
low-memory situation ...
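
To illustrate, here is the failing path in the current dup_mmap,
condensed from the code the attached patch replaces. When
copy_page_range returns an error, tmp has not yet been linked into
mm->mmap (the "*p = tmp" assignment only happens further down), so the
exit_mmap call can neither free the struct nor drop the page
references copy_page_range had already taken:

	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
		tmp = (struct vm_area_struct *)
			kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
		if (!tmp) {
			exit_mmap(mm);
			return -ENOMEM;
		}
		*tmp = *mpnt;
		/* ... vm_flags, vm_mm, vm_inode bookkeeping ... */
		if (copy_page_range(mm, current->mm, tmp)) {
			/* tmp is not on mm->mmap yet, so it leaks,
			   along with any page counts already bumped */
			exit_mmap(mm);
			return -ENOMEM;
		}
		if (tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);
		*p = tmp;		/* linked only on success */
		p = &tmp->vm_next;
	}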

I've attached the latest (but still imperfect) patch. The changes so
far are to
(1) initialize the new mmap_sem,

(2) clear tsk->mm if the error exit is taken. The cleanup path taken
when fork fails includes two blocking calls, so it's not a good idea to
leave a stale pointer in tsk->mm.

(3) Preallocate the vma structs, and do the copying while holding the
mmap semaphore (a condensed sketch follows this list). This seems not
entirely safe, as copy_page_range allocates memory and may generate
page faults.

(4) Link the vma struct into the mmap list even if copy_page_range
fails, so that everything gets cleaned up.
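
In outline, the patched dup_mmap then becomes the following (condensed
from the attached patch, with the counting/allocation details and the
error paths abbreviated):

	down(&current->mm->mmap_sem);
	/* count the vmas under the semaphore; if we don't yet have
	   enough preallocated structs on the private free list, drop
	   the semaphore, kmalloc more, and recount, since a shared
	   mmap may have changed in the meantime */
	...
	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
		tmp = free;		/* take a preallocated struct */
		free = tmp->vm_next;
		*tmp = *mpnt;
		/* ... vm_flags, vm_mm, vm_inode bookkeeping ... */
		retval = copy_page_range(mm, current->mm, tmp);
		if (!retval && tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);
		/* link tmp in even on failure, so exit_mmap can
		   release it and the page references it holds */
		*p = tmp;
		p = &tmp->vm_next;
		if (retval)
			goto exit_mmap_out;	/* exit_mmap(mm), then up() */
	}
	build_mmap_avl(mm);
	up(&current->mm->mmap_sem);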

To make this really solid, we need to make it safe to allocate memory
while holding mmap_sem, and we need to check that exit_mmap is up to
the job of cleaning up a partially-copied vma struct.

I've tested the patch a little, but if anyone has ideas on a better
way to do this, I'd appreciate hearing them.

Regards,
Bill
[Attachment: fork_30-patch]

--- kernel/fork.c.old Sun Apr 27 15:54:44 1997
+++ kernel/fork.c Tue Aug 5 15:48:59 1997
@@ -77,22 +77,64 @@
return last_pid;
}

+/*
+ * There are some serious race conditions here if the current process is
+ * a cloned process, as its (shared) mmap may change while we're trying
+ * to copy it.
+ */
static inline int dup_mmap(struct mm_struct * mm)
{
struct vm_area_struct * mpnt, **p, *tmp;
+ struct vm_area_struct * free = NULL;
+ int need, have=0;
+ int retval = -ENOMEM;
+
+ /*
+ * Acquire the mmap semaphore, and then count how many vm_area_structs
+ * we need to do the copy. If we don't have enough yet, release the
+ * semaphore and allocate more.
+ */
+count:
+ down(&current->mm->mmap_sem);
+ need = 0;
+ for (mpnt = current->mm->mmap; mpnt ; mpnt = mpnt->vm_next)
+ need++;
+ if (need <= have)
+ goto begin_copy;
+ up(&current->mm->mmap_sem);
+
+ /*
+ * We don't have enough vm_area_structs ... allocate more and put
+ * them on our private free list.
+ */
+ while (need > have++) {
+ tmp = (struct vm_area_struct *)
+ kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+ if (!tmp)
+ goto free_mem_out;
+ tmp->vm_next = free;
+ free = tmp;
+ }
+ goto count;

+ /*
+ * We now have enough vm_area_structs and are holding the mmap
+ * semaphore, so it's safe to start the copy operation.
+ */
+begin_copy:
mm->mmap = NULL;
p = &mm->mmap;
for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
- tmp = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
- if (!tmp) {
- exit_mmap(mm);
- return -ENOMEM;
- }
+ tmp = free;
+ free = tmp->vm_next;
*tmp = *mpnt;
tmp->vm_flags &= ~VM_LOCKED;
tmp->vm_mm = mm;
tmp->vm_next = NULL;
+
+ /*
+ * Can this be deferred until copy_page_range succeeds?
+ */
if (tmp->vm_inode) {
tmp->vm_inode->i_count++;
/* insert tmp into the share list, just after mpnt */
@@ -100,43 +142,96 @@
mpnt->vm_next_share = tmp;
tmp->vm_prev_share = mpnt;
}
- if (copy_page_range(mm, current->mm, tmp)) {
- exit_mmap(mm);
- return -ENOMEM;
+
+ /*
+ * If this blocks, vmtruncate() could mess with the pages.
+ * Check that out ...
+ */
+ retval = copy_page_range(mm, current->mm, tmp);
+
+ if (!retval) {
+ /* do we need a flag saying it was opened? */
+ if (tmp->vm_ops && tmp->vm_ops->open)
+ tmp->vm_ops->open(tmp);
}
- if (tmp->vm_ops && tmp->vm_ops->open)
- tmp->vm_ops->open(tmp);
+
+ /*
+ * Link the vma struct into the list even if we had an error,
+ * so exit_mmap can clean up the mess.
+ */
*p = tmp;
p = &tmp->vm_next;
+ if (retval)
+ goto exit_mmap_out;
}
build_mmap_avl(mm);
- return 0;
+ retval = 0;
+
+up_and_out:
+ up(&current->mm->mmap_sem);
+
+ /*
+ * It's possible that we have leftover vm_area_structs, so just
+ * release everything on the free list.
+ */
+free_mem_out:
+ while ((tmp = free) != NULL) {
+ free = tmp->vm_next;
+ kfree(tmp);
+ }
+ return retval;
+
+ /*
+ * If we have a partially built mmap, release it here.
+ */
+exit_mmap_out:
+ printk("dup_mmap: failure, cleaning up mm\n");
+ exit_mmap(mm);
+ goto up_and_out;
}

static inline int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
- if (!(clone_flags & CLONE_VM)) {
- struct mm_struct * mm = kmalloc(sizeof(*tsk->mm), GFP_KERNEL);
- if (!mm)
- return -1;
- *mm = *current->mm;
- mm->count = 1;
- mm->def_flags = 0;
- tsk->mm = mm;
- tsk->min_flt = tsk->maj_flt = 0;
- tsk->cmin_flt = tsk->cmaj_flt = 0;
- tsk->nswap = tsk->cnswap = 0;
- if (new_page_tables(tsk))
- return -1;
- if (dup_mmap(mm)) {
- free_page_tables(mm);
- return -1;
- }
+ struct mm_struct * mm;
+ int retval;
+
+ if (clone_flags & CLONE_VM) {
+ SET_PAGE_DIR(tsk, current->mm->pgd);
+ current->mm->count++;
return 0;
}
- SET_PAGE_DIR(tsk, current->mm->pgd);
- current->mm->count++;
+
+ retval = -ENOMEM;
+ mm = kmalloc(sizeof(*tsk->mm), GFP_KERNEL);
+ if (!mm)
+ goto fail_nomem;
+ *mm = *current->mm;
+ mm->count = 1;
+ mm->def_flags = 0;
+ /*
+ * If current is a cloned process, the semaphore may be
+ * in use, so we don't want a copy.
+ */
+ mm->mmap_sem = MUTEX;
+ tsk->mm = mm;
+ tsk->min_flt = tsk->maj_flt = 0;
+ tsk->cmin_flt = tsk->cmaj_flt = 0;
+ tsk->nswap = tsk->cnswap = 0;
+ if (new_page_tables(tsk))
+ goto cleanup_mm;
+
+ retval = dup_mmap(mm);
+ if (retval)
+ goto cleanup_page_tables;
return 0;
+
+cleanup_page_tables:
+ free_page_tables(mm);
+cleanup_mm:
+ tsk->mm = NULL;
+ kfree(mm);
+fail_nomem:
+ return retval;
}

static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
@@ -290,9 +385,9 @@
bad_fork_cleanup_sighand:
exit_sighand(p);
bad_fork_cleanup_fs:
- exit_fs(p);
+ exit_fs(p); /* blocking */
bad_fork_cleanup_files:
- exit_files(p);
+ exit_files(p); /* blocking */
bad_fork_cleanup:
if (p->exec_domain && p->exec_domain->use_count)
(*p->exec_domain->use_count)--;
