Re: Deadlock on the mm->mmap_sem

From: Andrea Arcangeli (andrea@suse.de)
Date: Wed Sep 19 2001 - 23:37:00 EST


On Thu, Sep 20, 2001 at 04:07:02AM +0200, Andrea Arcangeli wrote:
> On Wed, Sep 19, 2001 at 08:19:09PM +0200, Manfred Spraul wrote:
> > > if we go generic then I strongly recommend my version of the generic
> > > semaphores is _much_ faster (and cleaner) than this one (it even
> > allows
> > > more than 2^31 concurrent readers on 64 bit archs ;).
> > >
> > Andrea,
> >
> > implementing recursive semaphores is trivial, but do you have any idea
> > how to fix the latency problem?
>
> yes, one solution to the latency problem without writing the
> ugly code would be simply to add a per-process counter to pass to a
> modified rwsem api, then to hide the trickery in a mm_down_read macro.
> such way it will be recursive _and_ fair.

ok, here it is attached a rwsem patch (now with the fast path inlined :)
of my version of the rwsem-spinlock semaphores (as only option all over
the ports). It's against pre12 (it is not inlined in the email since
it's not readable anyways)

and below you find (this time inlined) an incremental patch to be
applied on top of the attachment that implements the read recursive and
at the same time fair rw semaphores. This should close all the problems.

But keep in mind the rwsem by default are _fair_, so you cannot do read
recursion unless you use the recursive version and you pass a
rw_sem_recursor to it.

diff -urN rwsem/arch/alpha/mm/fault.c rwsem-recurisve/arch/alpha/mm/fault.c
--- rwsem/arch/alpha/mm/fault.c Thu Sep 20 01:43:26 2001
+++ rwsem-recurisve/arch/alpha/mm/fault.c Thu Sep 20 06:31:37 2001
@@ -113,7 +113,7 @@
                 goto vmalloc_fault;
 #endif
 
- down_read(&mm->mmap_sem);
+ down_read_recursive(&mm->mmap_sem, &current->mm_recursor);
         vma = find_vma(mm, address);
         if (!vma)
                 goto bad_area;
@@ -147,7 +147,7 @@
          * the fault.
          */
         fault = handle_mm_fault(mm, vma, address, cause > 0);
- up_read(&mm->mmap_sem);
+ up_read_recursive(&mm->mmap_sem, &current->mm_recursor);
 
         if (fault < 0)
                 goto out_of_memory;
@@ -161,7 +161,7 @@
  * Fix it, but check if it's kernel or user first..
  */
 bad_area:
- up_read(&mm->mmap_sem);
+ up_read_recursive(&mm->mmap_sem, &current->mm_recursor);
 
         if (user_mode(regs)) {
                 force_sig(SIGSEGV, current);
@@ -198,7 +198,7 @@
         if (current->pid == 1) {
                 current->policy |= SCHED_YIELD;
                 schedule();
- down_read(&mm->mmap_sem);
+ down_read_recursive(&mm->mmap_sem, &current->mm_recursor);
                 goto survive;
         }
         printk(KERN_ALERT "VM: killing process %s(%d)\n",
diff -urN rwsem/arch/i386/mm/fault.c rwsem-recurisve/arch/i386/mm/fault.c
--- rwsem/arch/i386/mm/fault.c Thu Sep 20 01:43:27 2001
+++ rwsem-recurisve/arch/i386/mm/fault.c Thu Sep 20 06:31:53 2001
@@ -191,7 +191,7 @@
         if (in_interrupt() || !mm)
                 goto no_context;
 
- down_read(&mm->mmap_sem);
+ down_read_recursive(&mm->mmap_sem, &current->mm_recursor);
 
         vma = find_vma(mm, address);
         if (!vma)
@@ -265,7 +265,7 @@
                 if (bit < 32)
                         tsk->thread.screen_bitmap |= 1 << bit;
         }
- up_read(&mm->mmap_sem);
+ up_read_recursive(&mm->mmap_sem, &current->mm_recursor);
         return;
 
 /*
@@ -273,7 +273,7 @@
  * Fix it, but check if it's kernel or user first..
  */
 bad_area:
- up_read(&mm->mmap_sem);
+ up_read_recursive(&mm->mmap_sem, &current->mm_recursor);
 
         /* User mode accesses just cause a SIGSEGV */
         if (error_code & 4) {
@@ -341,11 +341,11 @@
  * us unable to handle the page fault gracefully.
  */
 out_of_memory:
- up_read(&mm->mmap_sem);
+ up_read_recursive(&mm->mmap_sem, &current->mm_recursor);
         if (tsk->pid == 1) {
                 tsk->policy |= SCHED_YIELD;
                 schedule();
- down_read(&mm->mmap_sem);
+ down_read_recursive(&mm->mmap_sem, &current->mm_recursor);
                 goto survive;
         }
         printk("VM: killing process %s\n", tsk->comm);
@@ -354,7 +354,7 @@
         goto no_context;
 
 do_sigbus:
- up_read(&mm->mmap_sem);
+ up_read_recursive(&mm->mmap_sem, &current->mm_recursor);
 
         /*
          * Send a sigbus, regardless of whether we were in kernel
diff -urN rwsem/arch/ia64/mm/fault.c rwsem-recurisve/arch/ia64/mm/fault.c
--- rwsem/arch/ia64/mm/fault.c Tue May 1 19:35:18 2001
+++ rwsem-recurisve/arch/ia64/mm/fault.c Thu Sep 20 06:04:45 2001
@@ -60,7 +60,7 @@
         if (in_interrupt() || !mm)
                 goto no_context;
 
- down_read(&mm->mmap_sem);
+ down_read_recursive(&mm->mmap_sem, &current->mm_recursor);
 
         vma = find_vma_prev(mm, address, &prev_vma);
         if (!vma)
@@ -112,7 +112,7 @@
               default:
                 goto out_of_memory;
         }
- up_read(&mm->mmap_sem);
+ up_read_recursive(&mm->mmap_sem, &current->mm_recursor);
         return;
 
   check_expansion:
@@ -135,7 +135,7 @@
         goto good_area;
 
   bad_area:
- up_read(&mm->mmap_sem);
+ up_read_recursive(&mm->mmap_sem, &current->mm_recursor);
         if (isr & IA64_ISR_SP) {
                 /*
                  * This fault was due to a speculative load set the "ed" bit in the psr to
@@ -184,7 +184,7 @@
         return;
 
   out_of_memory:
- up_read(&mm->mmap_sem);
+ up_read_recursive(&mm->mmap_sem, &current->mm_recursor);
         printk("VM: killing process %s\n", current->comm);
         if (user_mode(regs))
                 do_exit(SIGKILL);
diff -urN rwsem/arch/ppc/mm/fault.c rwsem-recurisve/arch/ppc/mm/fault.c
--- rwsem/arch/ppc/mm/fault.c Wed Jul 4 04:03:45 2001
+++ rwsem-recurisve/arch/ppc/mm/fault.c Thu Sep 20 06:10:09 2001
@@ -103,7 +103,7 @@
                 bad_page_fault(regs, address, SIGSEGV);
                 return;
         }
- down_read(&mm->mmap_sem);
+ down_read(&mm->mmap_sem, &current->mm_recursor);
         vma = find_vma(mm, address);
         if (!vma)
                 goto bad_area;
@@ -163,7 +163,7 @@
                 goto out_of_memory;
         }
 
- up_read(&mm->mmap_sem);
+ up_read_recursive(&mm->mmap_sem, &current->mm_recursor);
         /*
          * keep track of tlb+htab misses that are good addrs but
          * just need pte's created via handle_mm_fault()
@@ -173,7 +173,7 @@
         return;
 
 bad_area:
- up_read(&mm->mmap_sem);
+ up_read_recursive(&mm->mmap_sem, &current->mm_recursor);
         pte_errors++;
 
         /* User mode accesses cause a SIGSEGV */
@@ -194,7 +194,7 @@
  * us unable to handle the page fault gracefully.
  */
 out_of_memory:
- up_read(&mm->mmap_sem);
+ up_read_recursive(&mm->mmap_sem, &current->mm_recursor);
         printk("VM: killing process %s\n", current->comm);
         if (user_mode(regs))
                 do_exit(SIGKILL);
@@ -202,7 +202,7 @@
         return;
 
 do_sigbus:
- up_read(&mm->mmap_sem);
+ up_read_recursive(&mm->mmap_sem, &current->mm_recursor);
         info.si_signo = SIGBUS;
         info.si_errno = 0;
         info.si_code = BUS_ADRERR;
diff -urN rwsem/fs/exec.c rwsem-recurisve/fs/exec.c
--- rwsem/fs/exec.c Thu Sep 20 01:44:06 2001
+++ rwsem-recurisve/fs/exec.c Thu Sep 20 06:31:06 2001
@@ -969,9 +969,9 @@
         if (do_truncate(file->f_dentry, 0) != 0)
                 goto close_fail;
 
- down_read(&current->mm->mmap_sem);
+ down_read_recursive(&current->mm->mmap_sem, &current->mm_recursor);
         retval = binfmt->core_dump(signr, regs, file);
- up_read(&current->mm->mmap_sem);
+ up_read_recursive(&current->mm->mmap_sem, &current->mm_recursor);
 
 close_fail:
         filp_close(file, NULL);
diff -urN rwsem/fs/proc/array.c rwsem-recurisve/fs/proc/array.c
--- rwsem/fs/proc/array.c Sat Aug 11 08:04:22 2001
+++ rwsem-recurisve/fs/proc/array.c Thu Sep 20 06:22:35 2001
@@ -577,7 +577,10 @@
         column = *ppos & (MAPS_LINE_LENGTH-1);
 
         /* quickly go to line lineno */
- down_read(&mm->mmap_sem);
+ if (task == current)
+ down_read_recursive(&mm->mmap_sem, &current->mm_recursor);
+ else
+ down_read(&mm->mmap_sem);
         for (map = mm->mmap, i = 0; map && (i < lineno); map = map->vm_next, i++)
                 continue;
 
@@ -658,7 +661,10 @@
                 if (volatile_task)
                         break;
         }
- up_read(&mm->mmap_sem);
+ if (task == current)
+ up_read_recursive(&mm->mmap_sem, &current->mm_recursor);
+ else
+ up_read(&mm->mmap_sem);
 
         /* encode f_pos */
         *ppos = (lineno << MAPS_LINE_SHIFT) + column;
diff -urN rwsem/include/linux/rwsem.h rwsem-recurisve/include/linux/rwsem.h
--- rwsem/include/linux/rwsem.h Thu Sep 20 05:08:56 2001
+++ rwsem-recurisve/include/linux/rwsem.h Thu Sep 20 06:25:49 2001
@@ -18,6 +18,11 @@
 #endif
 };
 
+struct rw_sem_recursor
+{
+ int counter;
+};
+
 #if RWSEM_DEBUG
 #define __SEM_DEBUG_INIT(name) \
         , (long)&(name).__magic
@@ -42,6 +47,7 @@
         __SEM_DEBUG_INIT(name) \
 }
 #define RWSEM_INITIALIZER(name) __RWSEM_INITIALIZER(name, 0)
+#define RWSEM_RECURSOR_INITIALIZER ((struct rw_sem_recursor) { 0, })
 
 #define __DECLARE_RWSEM(name, count) \
         struct rw_semaphore name = __RWSEM_INITIALIZER(name, count)
@@ -112,6 +118,34 @@
         spin_lock(&sem->lock);
         sem->count -= RWSEM_WRITE_BIAS;
         if (unlikely(sem->count))
+ rwsem_wake(sem);
+ spin_unlock(&sem->lock);
+}
+
+static inline void down_read_recursive(struct rw_semaphore *sem,
+ struct rw_sem_recursor * recursor)
+{
+ int count, counter;
+ CHECK_MAGIC(sem->__magic);
+
+ spin_lock(&sem->lock);
+ count = sem->count;
+ sem->count += RWSEM_READ_BIAS;
+ counter = recursor->counter++;
+ if (unlikely(count < 0 && !counter && !(count & RWSEM_READ_MASK)))
+ rwsem_down_failed(sem, RWSEM_READ_BLOCKING_BIAS);
+ spin_unlock(&sem->lock);
+}
+
+static inline void up_read_recursive(struct rw_semaphore *sem,
+ struct rw_sem_recursor * recursor)
+{
+ CHECK_MAGIC(sem->__magic);
+
+ spin_lock(&sem->lock);
+ sem->count -= RWSEM_READ_BIAS;
+ recursor->counter--;
+ if (unlikely(sem->count < 0 && !(sem->count & RWSEM_READ_MASK)))
                 rwsem_wake(sem);
         spin_unlock(&sem->lock);
 }
diff -urN rwsem/include/linux/sched.h rwsem-recurisve/include/linux/sched.h
--- rwsem/include/linux/sched.h Thu Sep 20 05:09:07 2001
+++ rwsem-recurisve/include/linux/sched.h Thu Sep 20 06:25:50 2001
@@ -315,6 +315,7 @@
 
         struct task_struct *next_task, *prev_task;
         struct mm_struct *active_mm;
+ struct rw_sem_recursor mm_recursor;
         struct list_head local_pages;
         unsigned int allocation_order, nr_local_pages;
 
@@ -460,6 +461,7 @@
     policy: SCHED_OTHER, \
     mm: NULL, \
     active_mm: &init_mm, \
+ mm_recursor: RWSEM_RECURSOR_INITIALIZER, \
     cpus_allowed: -1, \
     run_list: LIST_HEAD_INIT(tsk.run_list), \
     next_task: &tsk, \

Andrea



-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Sun Sep 23 2001 - 21:00:35 EST