[PATCH] anobjrmap 9 priority mjb tree
From: Hugh Dickins
Date:  Sun Apr 04 2004 - 07:38:40 EST
This anobjrmap 9 (or anon_mm9) patch adds Rajesh's radix priority search
tree on top of Martin's 2.6.5-rc3-mjb2 tree, making a priority mjb tree!
Approximately equivalent to Andrea's 2.6.5-aa1, but using anonmm instead
of anon_vma, and of course each tree has its own additional features.
 arch/arm/mm/fault-armv.c        |   80 ++---
 arch/mips/mm/cache.c            |    9 
 arch/parisc/kernel/cache.c      |   86 ++---
 arch/parisc/kernel/sys_parisc.c |   14 
 arch/s390/kernel/compat_exec.c  |    2 
 arch/sparc64/mm/init.c          |    8 
 arch/x86_64/ia32/ia32_binfmt.c  |    2 
 fs/exec.c                       |    2 
 fs/hugetlbfs/inode.c            |   14 
 fs/inode.c                      |    5 
 fs/locks.c                      |    8 
 fs/proc/task_mmu.c              |    2 
 fs/xfs/linux/xfs_vnode.h        |    5 
 include/asm-arm/cacheflush.h    |    8 
 include/asm-parisc/cacheflush.h |   10 
 include/asm-sh/pgalloc.h        |    5 
 include/linux/fs.h              |    6 
 include/linux/mm.h              |  167 +++++++++++
 include/linux/prio_tree.h       |   78 +++++
 init/main.c                     |    2 
 kernel/fork.c                   |    4 
 kernel/kexec.c                  |    2 
 mm/Makefile                     |    3 
 mm/filemap.c                    |    3 
 mm/fremap.c                     |   14 
 mm/memory.c                     |   15 -
 mm/mmap.c                       |  100 +++---
 mm/mremap.c                     |   42 ++
 mm/page_io.c                    |    4 
 mm/prio_tree.c                  |  577 ++++++++++++++++++++++++++++++++++++++++
 mm/rmap.c                       |  164 ++++++-----
 mm/shmem.c                      |    3 
 mm/vmscan.c                     |    6 
 33 files changed, 1172 insertions(+), 278 deletions(-)
--- 2.6.5-rc3-mjb2/arch/arm/mm/fault-armv.c	2004-04-02 21:01:43.725406016 +0100
+++ anobjrmap9/arch/arm/mm/fault-armv.c	2004-04-04 13:05:41.369476056 +0100
@@ -16,6 +16,7 @@
 #include <linux/bitops.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
+#include <linux/pagemap.h>
 
 #include <asm/cacheflush.h>
 #include <asm/io.h>
@@ -186,47 +187,47 @@ no_pmd:
 
 void __flush_dcache_page(struct page *page)
 {
+	struct address_space *mapping = page_mapping(page);
 	struct mm_struct *mm = current->active_mm;
-	struct list_head *l;
+	struct vm_area_struct *mpnt;
+	struct prio_tree_iter iter;
+	unsigned long offset;
+	pgoff_t pgoff;
 
 	__cpuc_flush_dcache_page(page_address(page));
 
-	if (!page_mapping(page))
+	if (!mapping)
 		return;
 
 	/*
 	 * With a VIVT cache, we need to also write back
 	 * and invalidate any user data.
 	 */
-	list_for_each(l, &page->mapping->i_mmap_shared) {
-		struct vm_area_struct *mpnt;
-		unsigned long off;
-
-		mpnt = list_entry(l, struct vm_area_struct, shared);
-
+	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	mpnt = __vma_prio_tree_first(&mapping->i_mmap_shared,
+					&iter, pgoff, pgoff);
+	while (mpnt) {
 		/*
 		 * If this VMA is not in our MM, we can ignore it.
 		 */
-		if (mpnt->vm_mm != mm)
-			continue;
-
-		if (page->index < mpnt->vm_pgoff)
-			continue;
-
-		off = page->index - mpnt->vm_pgoff;
-		if (off >= (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT)
-			continue;
-
-		flush_cache_page(mpnt, mpnt->vm_start + (off << PAGE_SHIFT));
+		if (mpnt->vm_mm == mm) {
+			offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+			flush_cache_page(mpnt, mpnt->vm_start + offset);
+		}
+		mpnt = __vma_prio_tree_next(mpnt, &mapping->i_mmap_shared,
+						&iter, pgoff, pgoff);
 	}
 }
 
 static void
 make_coherent(struct vm_area_struct *vma, unsigned long addr, struct page *page, int dirty)
 {
-	struct list_head *l;
+	struct address_space *mapping = page->mapping;
 	struct mm_struct *mm = vma->vm_mm;
-	unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+	struct vm_area_struct *mpnt;
+	struct prio_tree_iter iter;
+	unsigned long offset;
+	pgoff_t pgoff;
 	int aliases = 0;
 
 	/*
@@ -234,36 +235,21 @@ make_coherent(struct vm_area_struct *vma
 	 * space, then we need to handle them specially to maintain
 	 * cache coherency.
 	 */
-	list_for_each(l, &page->mapping->i_mmap_shared) {
-		struct vm_area_struct *mpnt;
-		unsigned long off;
-
-		mpnt = list_entry(l, struct vm_area_struct, shared);
-
+	pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+	mpnt = __vma_prio_tree_first(&mapping->i_mmap_shared,
+					&iter, pgoff, pgoff);
+	while (mpnt) {
 		/*
 		 * If this VMA is not in our MM, we can ignore it.
-		 * Note that we intentionally don't mask out the VMA
+		 * Note that we intentionally mask out the VMA
 		 * that we are fixing up.
 		 */
-		if (mpnt->vm_mm != mm || mpnt == vma)
-			continue;
-
-		/*
-		 * If the page isn't in this VMA, we can also ignore it.
-		 */
-		if (pgoff < mpnt->vm_pgoff)
-			continue;
-
-		off = pgoff - mpnt->vm_pgoff;
-		if (off >= (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT)
-			continue;
-
-		off = mpnt->vm_start + (off << PAGE_SHIFT);
-
-		/*
-		 * Ok, it is within mpnt.  Fix it up.
-		 */
-		aliases += adjust_pte(mpnt, off);
+		if (mpnt->vm_mm == mm && mpnt != vma) {
+			offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+			aliases += adjust_pte(mpnt, mpnt->vm_start + offset);
+		}
+		mpnt = __vma_prio_tree_next(mpnt, &mapping->i_mmap_shared,
+						&iter, pgoff, pgoff);
 	}
 	if (aliases)
 		adjust_pte(vma, addr);
--- 2.6.5-rc3-mjb2/arch/mips/mm/cache.c	2004-04-02 21:01:44.347311472 +0100
+++ anobjrmap9/arch/mips/mm/cache.c	2004-04-04 13:05:41.370475904 +0100
@@ -55,13 +55,14 @@ asmlinkage int sys_cacheflush(void *addr
 
 void flush_dcache_page(struct page *page)
 {
+	struct address_space *mapping = page_mapping(page);
 	unsigned long addr;
 
-	if (page_mapping(page) &&
-	    list_empty(&page->mapping->i_mmap) &&
-	    list_empty(&page->mapping->i_mmap_shared)) {
+	if (mapping &&
+	    prio_tree_empty(&mapping->i_mmap) &&
+	    prio_tree_empty(&mapping->i_mmap_shared) &&
+	    list_empty(&mapping->i_mmap_nonlinear)) {
 		SetPageDcacheDirty(page);
-
 		return;
 	}
 
--- 2.6.5-rc3-mjb2/arch/parisc/kernel/cache.c	2004-04-02 21:01:44.356310104 +0100
+++ anobjrmap9/arch/parisc/kernel/cache.c	2004-04-04 13:05:41.371475752 +0100
@@ -17,6 +17,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
+#include <linux/pagemap.h>
 
 #include <asm/pdc.h>
 #include <asm/cache.h>
@@ -229,67 +230,60 @@ void disable_sr_hashing(void)
 
 void __flush_dcache_page(struct page *page)
 {
+	struct address_space *mapping = page_mapping(page);
 	struct mm_struct *mm = current->active_mm;
-	struct list_head *l;
+	struct vm_area_struct *mpnt;
+	struct prio_tree_iter iter;
+	unsigned long offset;
+	pgoff_t pgoff;
 
 	flush_kernel_dcache_page(page_address(page));
 
-	if (!page_mapping(page))
+	if (!mapping)
 		return;
-	/* check shared list first if it's not empty...it's usually
-	 * the shortest */
-	list_for_each(l, &page->mapping->i_mmap_shared) {
-		struct vm_area_struct *mpnt;
-		unsigned long off;
 
-		mpnt = list_entry(l, struct vm_area_struct, shared);
+	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 
+	/* check shared list first if it's not empty...it's usually
+	 * the shortest */
+	mpnt = __vma_prio_tree_first(&mapping->i_mmap_shared,
+					&iter, pgoff, pgoff);
+	while (mpnt) {
 		/*
 		 * If this VMA is not in our MM, we can ignore it.
 		 */
-		if (mpnt->vm_mm != mm)
-			continue;
-
-		if (page->index < mpnt->vm_pgoff)
-			continue;
-
-		off = page->index - mpnt->vm_pgoff;
-		if (off >= (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT)
-			continue;
-
-		flush_cache_page(mpnt, mpnt->vm_start + (off << PAGE_SHIFT));
-
-		/* All user shared mappings should be equivalently mapped,
-		 * so once we've flushed one we should be ok
-		 */
-		return;
+		if (mpnt->vm_mm == mm) {
+			offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+			flush_cache_page(mpnt, mpnt->vm_start + offset);
+
+			/* All user shared mappings should be equivalently
+			 * mapped, so once we've flushed one we should be ok
+			 */
+			return;
+		}
+		mpnt = __vma_prio_tree_next(mpnt, &mapping->i_mmap_shared,
+						&iter, pgoff, pgoff);
 	}
 
 	/* then check private mapping list for read only shared mappings
 	 * which are flagged by VM_MAYSHARE */
-	list_for_each(l, &page->mapping->i_mmap) {
-		struct vm_area_struct *mpnt;
-		unsigned long off;
-
-		mpnt = list_entry(l, struct vm_area_struct, shared);
-
-
-		if (mpnt->vm_mm != mm || !(mpnt->vm_flags & VM_MAYSHARE))
-			continue;
-
-		if (page->index < mpnt->vm_pgoff)
-			continue;
-
-		off = page->index - mpnt->vm_pgoff;
-		if (off >= (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT)
-			continue;
-
-		flush_cache_page(mpnt, mpnt->vm_start + (off << PAGE_SHIFT));
-
-		/* All user shared mappings should be equivalently mapped,
-		 * so once we've flushed one we should be ok
+	mpnt = __vma_prio_tree_first(&mapping->i_mmap,
+					&iter, pgoff, pgoff);
+	while (mpnt) {
+		/*
+		 * If this VMA is not in our MM, we can ignore it.
 		 */
-		break;
+		if (mpnt->vm_mm == mm && (mpnt->vm_flags & VM_MAYSHARE)) {
+			offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
+			flush_cache_page(mpnt, mpnt->vm_start + offset);
+
+			/* All user shared mappings should be equivalently
+			 * mapped, so once we've flushed one we should be ok
+			 */
+			return;
+		}
+		mpnt = __vma_prio_tree_next(mpnt, &mapping->i_mmap_shared,
+						&iter, pgoff, pgoff);
 	}
 }
 EXPORT_SYMBOL(__flush_dcache_page);
--- 2.6.5-rc3-mjb2/arch/parisc/kernel/sys_parisc.c	2004-03-30 13:04:00.000000000 +0100
+++ anobjrmap9/arch/parisc/kernel/sys_parisc.c	2004-04-04 13:05:41.372475600 +0100
@@ -68,17 +68,8 @@ static unsigned long get_unshared_area(u
  * existing mapping and use the same offset.  New scheme is to use the
  * address of the kernel data structure as the seed for the offset.
  * We'll see how that works...
- */
-#if 0
-static int get_offset(struct address_space *mapping)
-{
-	struct vm_area_struct *vma = list_entry(mapping->i_mmap_shared.next,
-			struct vm_area_struct, shared);
-	return (vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT)) &
-		(SHMLBA - 1);
-}
-#else
-/* The mapping is cacheline aligned, so there's no information in the bottom
+ *
+ * The mapping is cacheline aligned, so there's no information in the bottom
  * few bits of the address.  We're looking for 10 bits (4MB / 4k), so let's
  * drop the bottom 8 bits and use bits 8-17.  
  */
@@ -87,7 +78,6 @@ static int get_offset(struct address_spa
 	int offset = (unsigned long) mapping << (PAGE_SHIFT - 8);
 	return offset & 0x3FF000;
 }
-#endif
 
 static unsigned long get_shared_area(struct address_space *mapping,
 		unsigned long addr, unsigned long len, unsigned long pgoff)
--- 2.6.5-rc3-mjb2/arch/s390/kernel/compat_exec.c	2003-07-10 21:16:28.000000000 +0100
+++ anobjrmap9/arch/s390/kernel/compat_exec.c	2004-04-04 13:05:41.372475600 +0100
@@ -71,7 +71,7 @@ int setup_arg_pages32(struct linux_binpr
 		mpnt->vm_ops = NULL;
 		mpnt->vm_pgoff = 0;
 		mpnt->vm_file = NULL;
-		INIT_LIST_HEAD(&mpnt->shared);
+		INIT_VMA_SHARED(mpnt);
 		mpnt->vm_private_data = (void *) 0;
 		insert_vm_struct(mm, mpnt);
 		mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
--- 2.6.5-rc3-mjb2/arch/sparc64/mm/init.c	2004-04-02 21:01:44.535282896 +0100
+++ anobjrmap9/arch/sparc64/mm/init.c	2004-04-04 13:05:41.375475144 +0100
@@ -224,12 +224,14 @@ void update_mmu_cache(struct vm_area_str
 
 void flush_dcache_page(struct page *page)
 {
+	struct address_space *mapping = page_mapping(page);
 	int dirty = test_bit(PG_dcache_dirty, &page->flags);
 	int dirty_cpu = dcache_dirty_cpu(page);
 
-	if (page_mapping(page) &&
-	    list_empty(&page->mapping->i_mmap) &&
-	    list_empty(&page->mapping->i_mmap_shared)) {
+	if (mapping &&
+	    prio_tree_empty(&mapping->i_mmap) &&
+	    prio_tree_empty(&mapping->i_mmap_shared) &&
+	    list_empty(&mapping->i_mmap_nonlinear)) {
 		if (dirty) {
 			if (dirty_cpu == smp_processor_id())
 				return;
--- 2.6.5-rc3-mjb2/arch/x86_64/ia32/ia32_binfmt.c	2004-03-30 13:04:03.000000000 +0100
+++ anobjrmap9/arch/x86_64/ia32/ia32_binfmt.c	2004-04-04 13:05:41.375475144 +0100
@@ -360,7 +360,7 @@ int setup_arg_pages(struct linux_binprm 
 		mpnt->vm_ops = NULL;
 		mpnt->vm_pgoff = 0;
 		mpnt->vm_file = NULL;
-		INIT_LIST_HEAD(&mpnt->shared);
+		INIT_VMA_SHARED(mpnt);
 		mpnt->vm_private_data = (void *) 0;
 		insert_vm_struct(mm, mpnt);
 		mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
--- 2.6.5-rc3-mjb2/fs/exec.c	2004-04-02 21:01:45.785092896 +0100
+++ anobjrmap9/fs/exec.c	2004-04-04 13:05:41.377474840 +0100
@@ -423,7 +423,7 @@ int setup_arg_pages(struct linux_binprm 
 		mpnt->vm_ops = NULL;
 		mpnt->vm_pgoff = 0;
 		mpnt->vm_file = NULL;
-		INIT_LIST_HEAD(&mpnt->shared);
+		INIT_VMA_SHARED(mpnt);
 		mpnt->vm_private_data = (void *) 0;
 		insert_vm_struct(mm, mpnt);
 		mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
--- 2.6.5-rc3-mjb2/fs/hugetlbfs/inode.c	2004-04-02 21:01:45.794091528 +0100
+++ anobjrmap9/fs/hugetlbfs/inode.c	2004-04-04 13:05:41.378474688 +0100
@@ -270,11 +270,13 @@ static void hugetlbfs_drop_inode(struct 
  * vma->vm_pgoff is in PAGE_SIZE units.
  */
 static void
-hugetlb_vmtruncate_list(struct list_head *list, unsigned long h_pgoff)
+hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
 {
 	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
 
-	list_for_each_entry(vma, list, shared) {
+	vma = __vma_prio_tree_first(root, &iter, h_pgoff, ULONG_MAX);
+	while (vma) {
 		unsigned long h_vm_pgoff;
 		unsigned long v_length;
 		unsigned long h_length;
@@ -306,6 +308,8 @@ hugetlb_vmtruncate_list(struct list_head
 		zap_hugepage_range(vma,
 				vma->vm_start + v_offset,
 				v_length - v_offset);
+
+		vma = __vma_prio_tree_next(vma, root, &iter, h_pgoff, ULONG_MAX);
 	}
 }
 
@@ -325,9 +329,11 @@ static int hugetlb_vmtruncate(struct ino
 
 	inode->i_size = offset;
 	down(&mapping->i_shared_sem);
-	if (!list_empty(&mapping->i_mmap))
+	/* Protect against page fault */
+	atomic_inc(&mapping->truncate_count);
+	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
 		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
-	if (!list_empty(&mapping->i_mmap_shared))
+	if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
 		hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff);
 	up(&mapping->i_shared_sem);
 	truncate_hugepages(mapping, offset);
--- 2.6.5-rc3-mjb2/fs/inode.c	2004-03-30 13:04:15.000000000 +0100
+++ anobjrmap9/fs/inode.c	2004-04-04 13:05:41.380474384 +0100
@@ -189,8 +189,9 @@ void inode_init_once(struct inode *inode
 	atomic_set(&inode->i_data.truncate_count, 0);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
-	INIT_LIST_HEAD(&inode->i_data.i_mmap);
-	INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
+	INIT_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
+	INIT_PRIO_TREE_ROOT(&inode->i_data.i_mmap_shared);
+	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
 	spin_lock_init(&inode->i_lock);
 	i_size_ordered_init(inode);
 }
--- 2.6.5-rc3-mjb2/fs/locks.c	2004-03-11 01:56:12.000000000 +0000
+++ anobjrmap9/fs/locks.c	2004-04-04 13:05:41.382474080 +0100
@@ -1455,8 +1455,8 @@ int fcntl_setlk(struct file *filp, unsig
 	if (IS_MANDLOCK(inode) &&
 	    (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) {
 		struct address_space *mapping = filp->f_mapping;
-
-		if (!list_empty(&mapping->i_mmap_shared)) {
+		if (!prio_tree_empty(&mapping->i_mmap_shared) ||
+			!list_empty(&mapping->i_mmap_nonlinear)) {
 			error = -EAGAIN;
 			goto out;
 		}
@@ -1593,8 +1593,8 @@ int fcntl_setlk64(struct file *filp, uns
 	if (IS_MANDLOCK(inode) &&
 	    (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) {
 		struct address_space *mapping = filp->f_mapping;
-
-		if (!list_empty(&mapping->i_mmap_shared)) {
+		if (!prio_tree_empty(&mapping->i_mmap_shared) ||
+			!list_empty(&mapping->i_mmap_nonlinear)) {
 			error = -EAGAIN;
 			goto out;
 		}
--- 2.6.5-rc3-mjb2/fs/proc/task_mmu.c	2004-04-02 21:01:45.864080888 +0100
+++ anobjrmap9/fs/proc/task_mmu.c	2004-04-04 13:05:41.383473928 +0100
@@ -65,7 +65,7 @@ int task_statm(struct mm_struct *mm, int
 				*shared += pages;
 			continue;
 		}
-		if (vma->vm_flags & VM_SHARED || !list_empty(&vma->shared))
+		if (vma->vm_flags & VM_SHARED || !vma_shared_empty(vma))
 			*shared += pages;
 		if (vma->vm_flags & VM_EXECUTABLE)
 			*text += pages;
--- 2.6.5-rc3-mjb2/fs/xfs/linux/xfs_vnode.h	2004-02-04 02:45:43.000000000 +0000
+++ anobjrmap9/fs/xfs/linux/xfs_vnode.h	2004-04-04 13:05:41.384473776 +0100
@@ -597,8 +597,9 @@ static __inline__ void vn_flagclr(struct
  * Some useful predicates.
  */
 #define VN_MAPPED(vp)	\
-	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \
-	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared))))
+	(!prio_tree_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \
+	 !prio_tree_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared)) || \
+	 !list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_nonlinear)))
 #define VN_CACHED(vp)	(LINVFS_GET_IP(vp)->i_mapping->nrpages)
 #define VN_DIRTY(vp)	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->dirty_pages)))
 #define VMODIFY(vp)	VN_FLAGSET(vp, VMODIFIED)
--- 2.6.5-rc3-mjb2/include/asm-arm/cacheflush.h	2004-04-02 21:01:45.998060520 +0100
+++ anobjrmap9/include/asm-arm/cacheflush.h	2004-04-04 13:05:41.385473624 +0100
@@ -292,8 +292,12 @@ flush_cache_page(struct vm_area_struct *
  * about to change to user space.  This is the same method as used on SPARC64.
  * See update_mmu_cache for the user space part.
  */
-#define mapping_mapped(map)	(!list_empty(&(map)->i_mmap) || \
-				 !list_empty(&(map)->i_mmap_shared))
+static inline int mapping_mapped(struct address_space *mapping)
+{
+	return	!prio_tree_empty(&mapping->i_mmap) ||
+		!prio_tree_empty(&mapping->i_mmap_shared) ||
+		!list_empty(&mapping->i_mmap_nonlinear);
+}
 
 extern void __flush_dcache_page(struct page *);
 
--- 2.6.5-rc3-mjb2/include/asm-parisc/cacheflush.h	2004-04-02 21:01:46.794939376 +0100
+++ anobjrmap9/include/asm-parisc/cacheflush.h	2004-04-04 13:05:41.385473624 +0100
@@ -65,12 +65,18 @@ flush_user_icache_range(unsigned long st
 #endif
 }
 
+static inline int mapping_mapped(struct address_space *mapping)
+{
+	return	!prio_tree_empty(&mapping->i_mmap) ||
+		!prio_tree_empty(&mapping->i_mmap_shared) ||
+		!list_empty(&mapping->i_mmap_nonlinear);
+}
+
 extern void __flush_dcache_page(struct page *page);
 
 static inline void flush_dcache_page(struct page *page)
 {
-	if (page_mapping(page) && list_empty(&page->mapping->i_mmap) &&
-			list_empty(&page->mapping->i_mmap_shared)) {
+	if (page_mapping(page) && !mapping_mapped(page->mapping)) {
 		set_bit(PG_dcache_dirty, &page->flags);
 	} else {
 		__flush_dcache_page(page);
--- 2.6.5-rc3-mjb2/include/asm-sh/pgalloc.h	2004-04-02 21:01:46.963913688 +0100
+++ anobjrmap9/include/asm-sh/pgalloc.h	2004-04-04 13:05:41.386473472 +0100
@@ -101,8 +101,9 @@ static inline pte_t ptep_get_and_clear(p
 		unsigned long pfn = pte_pfn(pte);
 		if (pfn_valid(pfn)) {
 			page = pfn_to_page(pfn);
-			if (!page_mapping(page)
-			    || list_empty(&page->mapping->i_mmap_shared))
+			if (!page_mapping(page) ||
+			    (prio_tree_empty(&page->mapping->i_mmap_shared) &&
+			     list_empty(&page->mapping->i_mmap_nonlinear)))
 				__clear_bit(PG_mapped, &page->flags);
 		}
 	}
--- 2.6.5-rc3-mjb2/include/linux/fs.h	2004-03-30 13:04:18.000000000 +0100
+++ anobjrmap9/include/linux/fs.h	2004-04-04 13:05:41.388473168 +0100
@@ -18,6 +18,7 @@
 #include <linux/stat.h>
 #include <linux/cache.h>
 #include <linux/radix-tree.h>
+#include <linux/prio_tree.h>
 #include <linux/kobject.h>
 #include <asm/atomic.h>
 
@@ -329,8 +330,9 @@ struct address_space {
 	struct list_head	io_pages;	/* being prepared for I/O */
 	unsigned long		nrpages;	/* number of total pages */
 	struct address_space_operations *a_ops;	/* methods */
-	struct list_head	i_mmap;		/* list of private mappings */
-	struct list_head	i_mmap_shared;	/* list of shared mappings */
+	struct prio_tree_root	i_mmap;		/* tree of private mappings */
+	struct prio_tree_root	i_mmap_shared;	/* tree of shared mappings */
+	struct list_head	i_mmap_nonlinear;/*list of nonlinear mappings */
 	struct semaphore	i_shared_sem;	/* protect both above lists */
 	atomic_t		truncate_count;	/* Cover race condition with truncate */
 	unsigned long		flags;		/* error bits/gfp mask */
--- 2.6.5-rc3-mjb2/include/linux/mm.h	2004-04-02 21:01:47.316860032 +0100
+++ anobjrmap9/include/linux/mm.h	2004-04-04 13:05:41.390472864 +0100
@@ -11,6 +11,7 @@
 #include <linux/list.h>
 #include <linux/mmzone.h>
 #include <linux/rbtree.h>
+#include <linux/prio_tree.h>
 #include <linux/fs.h>
 
 #ifndef CONFIG_DISCONTIGMEM          /* Don't use mapnrs, do it properly */
@@ -68,7 +69,26 @@ struct vm_area_struct {
 	 * one of the address_space->i_mmap{,shared} lists,
 	 * for shm areas, the list of attaches, otherwise unused.
 	 */
-	struct list_head shared;
+	union {
+		struct {
+			struct list_head list;
+			void *parent;
+		} vm_set;
+
+		struct prio_tree_node  prio_tree_node;
+
+		struct {
+			void *first;
+			void *second;
+			void *parent;
+		} both;
+	} shared;
+
+	/*
+	 * shared.vm_set : list of vmas that map exactly the same set of pages
+	 * vm_set_head   : head of the vm_set list
+	 */
+	struct vm_area_struct *vm_set_head;
 
 	/* Function pointers to deal with this struct. */
 	struct vm_operations_struct * vm_ops;
@@ -130,6 +150,150 @@ struct vm_area_struct {
 #define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
 
 /*
+ * The following macros are used for implementing prio_tree for i_mmap{_shared}
+ */
+
+#define	RADIX_INDEX(vma)  ((vma)->vm_pgoff)
+#define	VMA_SIZE(vma)	  (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
+/* avoid overflow */
+#define	HEAP_INDEX(vma)	  ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
+
+#define GET_INDEX_VMA(vma, radix, heap)		\
+do {						\
+	radix = RADIX_INDEX(vma);		\
+	heap = HEAP_INDEX(vma);			\
+} while (0)
+
+#define GET_INDEX(node, radix, heap)		\
+do { 						\
+	struct vm_area_struct *__tmp = 		\
+	  prio_tree_entry(node, struct vm_area_struct, shared.prio_tree_node);\
+	GET_INDEX_VMA(__tmp, radix, heap); 	\
+} while (0)
+
+#define	INIT_VMA_SHARED_LIST(vma)			\
+do {							\
+	INIT_LIST_HEAD(&(vma)->shared.vm_set.list);	\
+	(vma)->shared.vm_set.parent = NULL;		\
+	(vma)->vm_set_head = NULL;			\
+} while (0)
+
+#define INIT_VMA_SHARED(vma)			\
+do {						\
+	(vma)->shared.both.first = NULL;	\
+	(vma)->shared.both.second = NULL;	\
+	(vma)->shared.both.parent = NULL;	\
+	(vma)->vm_set_head = NULL;		\
+} while (0)
+
+extern void __vma_prio_tree_insert(struct prio_tree_root *,
+	struct vm_area_struct *);
+
+extern void __vma_prio_tree_remove(struct prio_tree_root *,
+	struct vm_area_struct *);
+
+static inline int vma_shared_empty(struct vm_area_struct *vma)
+{
+	return vma->shared.both.first == NULL;
+}
+
+/*
+ * Helps to add a new vma that maps the same (identical) set of pages as the
+ * old vma to an i_mmap tree.
+ */
+static inline void __vma_prio_tree_add(struct vm_area_struct *vma,
+	struct vm_area_struct *old)
+{
+	INIT_VMA_SHARED_LIST(vma);
+
+	/* Leave these BUG_ONs till prio_tree patch stabilizes */
+	BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
+	BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
+
+	if (old->shared.both.parent) {
+		if (old->vm_set_head) {
+			list_add_tail(&vma->shared.vm_set.list,
+					&old->vm_set_head->shared.vm_set.list);
+			return;
+		}
+		else {
+			old->vm_set_head = vma;
+			vma->vm_set_head = old;
+		}
+	}
+	else
+		list_add(&vma->shared.vm_set.list, &old->shared.vm_set.list);
+}
+
+/*
+ * We cannot modify vm_start, vm_end, vm_pgoff fields of a vma that has been
+ * already present in an i_mmap{_shared} tree without modifying the tree. The
+ * following helper function should be used when such modifications are
+ * necessary. We should hold the mapping's i_shared_sem.
+ *
+ * This function can be (micro)optimized for some special cases (maybe later).
+ */
+static inline void __vma_modify(struct prio_tree_root *root,
+	struct vm_area_struct *vma, unsigned long start, unsigned long end,
+	unsigned long pgoff)
+{
+	if (root)
+		__vma_prio_tree_remove(root, vma);
+	vma->vm_start = start;
+	vma->vm_end = end;
+	vma->vm_pgoff = pgoff;
+	if (root)
+		__vma_prio_tree_insert(root, vma);
+}
+
+/*
+ * Helper functions to enumerate vmas that map a given file page or a set of
+ * contiguous file pages. The functions return vmas that at least map a single
+ * page in the given range of contiguous file pages.
+ */
+static inline struct vm_area_struct *__vma_prio_tree_first(
+	struct prio_tree_root *root, struct prio_tree_iter *iter,
+	unsigned long begin, unsigned long end)
+{
+	struct prio_tree_node *ptr;
+
+	ptr = prio_tree_first(root, iter, begin, end);
+
+	if (ptr)
+		return prio_tree_entry(ptr, struct vm_area_struct,
+				shared.prio_tree_node);
+	else
+		return NULL;
+}
+
+static inline struct vm_area_struct *__vma_prio_tree_next(
+	struct vm_area_struct *vma, struct prio_tree_root *root,
+	struct prio_tree_iter *iter, unsigned long begin, unsigned long end)
+{
+	struct prio_tree_node *ptr;
+	struct vm_area_struct *next;
+
+	if (vma->shared.both.parent) {
+		if (vma->vm_set_head)
+			return vma->vm_set_head;
+	}
+	else {
+		next = list_entry(vma->shared.vm_set.list.next,
+				struct vm_area_struct, shared.vm_set.list);
+		if (!(next->vm_set_head))
+			return next;
+	}
+
+	ptr = prio_tree_next(root, iter, begin, end);
+
+	if (ptr)
+		return prio_tree_entry(ptr, struct vm_area_struct,
+				shared.prio_tree_node);
+	else
+		return NULL;
+}
+
+/*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
  */
@@ -520,7 +684,6 @@ extern void __vma_link_rb(struct mm_stru
 	struct rb_node **, struct rb_node *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct *,
 	unsigned long addr, unsigned long len, unsigned long pgoff);
-extern void vma_relink_file(struct vm_area_struct *, struct vm_area_struct *);
 extern void exit_mmap(struct mm_struct *);
 
 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
--- 2.6.5-rc3-mjb2/include/linux/prio_tree.h	1970-01-01 01:00:00.000000000 +0100
+++ anobjrmap9/include/linux/prio_tree.h	2004-04-04 13:05:41.391472712 +0100
@@ -0,0 +1,78 @@
+#ifndef _LINUX_PRIO_TREE_H
+#define _LINUX_PRIO_TREE_H
+
+struct prio_tree_node {
+	struct prio_tree_node *left;
+	struct prio_tree_node *right;
+	struct prio_tree_node *parent;
+};
+
+struct prio_tree_root {
+	struct prio_tree_node	*prio_tree_node;
+	unsigned int 		index_bits;
+};
+
+struct prio_tree_iter {
+	struct prio_tree_node	*cur;
+	unsigned long		mask;
+	unsigned long		value;
+	int			size_level;
+};
+
+#define PRIO_TREE_ROOT	(struct prio_tree_root) {NULL, 1}
+
+#define PRIO_TREE_ROOT_INIT	{NULL, 1}
+
+#define INIT_PRIO_TREE_ROOT(ptr)	\
+do {					\
+	(ptr)->prio_tree_node = NULL;	\
+	(ptr)->index_bits = 1;		\
+} while (0)
+
+#define PRIO_TREE_NODE_INIT(name)	{&(name), &(name), &(name)}
+
+#define PRIO_TREE_NODE(name) \
+	struct prio_tree_node name = PRIO_TREE_NODE_INIT(name)
+
+#define INIT_PRIO_TREE_NODE(ptr)				\
+do {								\
+	(ptr)->left = (ptr)->right = (ptr)->parent = (ptr);	\
+} while (0)
+
+#define	prio_tree_entry(ptr, type, member) \
+       ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+#define	PRIO_TREE_ITER	(struct prio_tree_iter) {NULL, 0UL, 0UL, 0}
+
+static inline int prio_tree_empty(const struct prio_tree_root *root)
+{
+	return root->prio_tree_node == NULL;
+}
+
+static inline int prio_tree_root(const struct prio_tree_node *node)
+{
+	return node->parent == node;
+}
+
+static inline int prio_tree_left_empty(const struct prio_tree_node *node)
+{
+	return node->left == node;
+}
+
+static inline int prio_tree_right_empty(const struct prio_tree_node *node)
+{
+	return node->right == node;
+}
+
+extern struct prio_tree_node *prio_tree_insert(struct prio_tree_root *,
+	struct prio_tree_node *);
+
+extern void prio_tree_remove(struct prio_tree_root *, struct prio_tree_node *);
+
+extern struct prio_tree_node *prio_tree_first(struct prio_tree_root *,
+	struct prio_tree_iter *, unsigned long, unsigned long);
+
+extern struct prio_tree_node *prio_tree_next(struct prio_tree_root *,
+	struct prio_tree_iter *, unsigned long, unsigned long);
+
+#endif
--- 2.6.5-rc3-mjb2/init/main.c	2004-04-02 21:01:47.345855624 +0100
+++ anobjrmap9/init/main.c	2004-04-04 13:05:41.392472560 +0100
@@ -85,6 +85,7 @@ extern void buffer_init(void);
 extern void pidhash_init(void);
 extern void pidmap_init(void);
 extern void radix_tree_init(void);
+extern void prio_tree_init(void);
 extern void free_initmem(void);
 extern void populate_rootfs(void);
 extern void driver_init(void);
@@ -466,6 +467,7 @@ asmlinkage void __init start_kernel(void
 	calibrate_delay();
 	pidmap_init();
 	pgtable_cache_init();
+	prio_tree_init();
 #ifdef CONFIG_X86
 	if (efi_enabled)
 		efi_enter_virtual_mode();
--- 2.6.5-rc3-mjb2/kernel/fork.c	2004-04-02 21:01:47.350854864 +0100
+++ anobjrmap9/kernel/fork.c	2004-04-04 13:05:41.394472256 +0100
@@ -314,7 +314,7 @@ static inline int dup_mmap(struct mm_str
 		tmp->vm_mm = mm;
 		tmp->vm_next = NULL;
 		file = tmp->vm_file;
-		INIT_LIST_HEAD(&tmp->shared);
+		INIT_VMA_SHARED(tmp);
 		if (file) {
 			struct inode *inode = file->f_dentry->d_inode;
 			get_file(file);
@@ -323,7 +323,7 @@ static inline int dup_mmap(struct mm_str
       
 			/* insert tmp into the share list, just after mpnt */
 			down(&file->f_mapping->i_shared_sem);
-			list_add(&tmp->shared, &mpnt->shared);
+			__vma_prio_tree_add(tmp, mpnt);
 			up(&file->f_mapping->i_shared_sem);
 		}
 
--- 2.6.5-rc3-mjb2/kernel/kexec.c	2004-04-02 21:01:47.354854256 +0100
+++ anobjrmap9/kernel/kexec.c	2004-04-04 13:05:41.395472104 +0100
@@ -191,7 +191,7 @@ static int identity_map_pages(struct pag
 	vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
 	vma->vm_file = NULL;
 	vma->vm_private_data = NULL;
-	INIT_LIST_HEAD(&vma->shared);
+	INIT_VMA_SHARED(vma);
 	insert_vm_struct(mm, vma);
 
 	error = remap_page_range(vma, vma->vm_start, vma->vm_start,
--- 2.6.5-rc3-mjb2/mm/Makefile	2004-04-02 21:01:47.392848480 +0100
+++ anobjrmap9/mm/Makefile	2004-04-04 13:05:41.395472104 +0100
@@ -9,7 +9,8 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o 
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   page_alloc.o page-writeback.o pdflush.o readahead.o \
-			   slab.o swap.o truncate.o vmscan.o $(mmu-y)
+			   slab.o swap.o truncate.o vmscan.o prio_tree.o \
+			   $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
 obj-$(CONFIG_X86_4G)	+= usercopy.o
--- 2.6.5-rc3-mjb2/mm/filemap.c	2004-04-02 21:01:47.395848024 +0100
+++ anobjrmap9/mm/filemap.c	2004-04-04 13:05:41.397471800 +0100
@@ -632,7 +632,8 @@ page_ok:
 		 * virtual addresses, take care about potential aliasing
 		 * before reading the page on the kernel side.
 		 */
-		if (!list_empty(&mapping->i_mmap_shared))
+		if (!prio_tree_empty(&mapping->i_mmap_shared) ||
+			!list_empty(&mapping->i_mmap_nonlinear))
 			flush_dcache_page(page);
 
 		/*
--- 2.6.5-rc3-mjb2/mm/fremap.c	2004-04-02 21:01:47.397847720 +0100
+++ anobjrmap9/mm/fremap.c	2004-04-04 13:05:41.398471648 +0100
@@ -157,6 +157,8 @@ asmlinkage long sys_remap_file_pages(uns
 	unsigned long __prot, unsigned long pgoff, unsigned long flags)
 {
 	struct mm_struct *mm = current->mm;
+	struct address_space *mapping;
+	unsigned long linear_pgoff;
 	unsigned long end = start + size;
 	struct vm_area_struct *vma;
 	int err = -EINVAL;
@@ -197,8 +199,18 @@ asmlinkage long sys_remap_file_pages(uns
 				end <= vma->vm_end) {
 
 		/* Must set VM_NONLINEAR before any pages are populated. */
-		if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff)
+		linear_pgoff = vma->vm_pgoff;
+		linear_pgoff += ((start - vma->vm_start) >> PAGE_SHIFT);
+		if (pgoff != linear_pgoff && !(vma->vm_flags & VM_NONLINEAR)) {
+			mapping = vma->vm_file->f_mapping;
+			down(&mapping->i_shared_sem);
 			vma->vm_flags |= VM_NONLINEAR;
+			__vma_prio_tree_remove(&mapping->i_mmap_shared, vma);
+			INIT_VMA_SHARED_LIST(vma);
+			list_add_tail(&vma->shared.vm_set.list,
+					&mapping->i_mmap_nonlinear);
+			up(&mapping->i_shared_sem);
+		}
 
 		/* ->populate can take a long time, so downgrade the lock. */
 		downgrade_write(&mm->mmap_sem);
--- 2.6.5-rc3-mjb2/mm/memory.c	2004-04-02 21:01:47.402846960 +0100
+++ anobjrmap9/mm/memory.c	2004-04-04 13:05:41.400471344 +0100
@@ -1077,11 +1077,11 @@ no_new_page:
  * An hlen of zero blows away the entire portion file after hba.
  */
 static void
-invalidate_mmap_range_list(struct list_head *head,
+invalidate_mmap_range_list(struct prio_tree_root *root,
 			   unsigned long const hba,
 			   unsigned long const hlen)
 {
-	struct list_head *curr;
+	struct prio_tree_iter iter;
 	unsigned long hea;	/* last page of hole. */
 	unsigned long vba;
 	unsigned long vea;	/* last page of corresponding uva hole. */
@@ -1092,17 +1092,16 @@ invalidate_mmap_range_list(struct list_h
 	hea = hba + hlen - 1;	/* avoid overflow. */
 	if (hea < hba)
 		hea = ULONG_MAX;
-	list_for_each(curr, head) {
-		vp = list_entry(curr, struct vm_area_struct, shared);
+	vp = __vma_prio_tree_first(root, &iter, hba, hea);
+	while(vp) {
 		vba = vp->vm_pgoff;
 		vea = vba + ((vp->vm_end - vp->vm_start) >> PAGE_SHIFT) - 1;
-		if (hea < vba || vea < hba)
-		    	continue;	/* Mapping disjoint from hole. */
 		zba = (hba <= vba) ? vba : hba;
 		zea = (vea <= hea) ? vea : hea;
 		zap_page_range(vp,
 			       ((zba - vba) << PAGE_SHIFT) + vp->vm_start,
 			       (zea - zba + 1) << PAGE_SHIFT);
+		vp = __vma_prio_tree_next(vp, root, &iter, hba, hea);
 	}
 }
 
@@ -1137,9 +1136,9 @@ void invalidate_mmap_range(struct addres
 	down(&mapping->i_shared_sem);
 	/* Protect against page fault */
 	atomic_inc(&mapping->truncate_count);
-	if (unlikely(!list_empty(&mapping->i_mmap)))
+	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
 		invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
-	if (unlikely(!list_empty(&mapping->i_mmap_shared)))
+	if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared)))
 		invalidate_mmap_range_list(&mapping->i_mmap_shared, hba, hlen);
 	up(&mapping->i_shared_sem);
 }
--- 2.6.5-rc3-mjb2/mm/mmap.c	2004-04-02 21:01:47.406846352 +0100
+++ anobjrmap9/mm/mmap.c	2004-04-04 13:05:41.403470888 +0100
@@ -68,12 +68,20 @@ int mmap_hugepages_map_sz = 256;
  * Requires inode->i_mapping->i_shared_sem
  */
 static inline void
-__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode)
+__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode,
+			  struct address_space * mapping)
 {
 	if (inode) {
 		if (vma->vm_flags & VM_DENYWRITE)
 			atomic_inc(&inode->i_writecount);
-		list_del_init(&vma->shared);
+		if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
+			list_del_init(&vma->shared.vm_set.list);
+			INIT_VMA_SHARED(vma);
+		}
+		else if (vma->vm_flags & VM_SHARED)
+			__vma_prio_tree_remove(&mapping->i_mmap_shared, vma);
+		else
+			__vma_prio_tree_remove(&mapping->i_mmap, vma);
 	}
 }
 
@@ -87,7 +95,8 @@ static void remove_shared_vm_struct(stru
 	if (file) {
 		struct address_space *mapping = file->f_mapping;
 		down(&mapping->i_shared_sem);
-		__remove_shared_vm_struct(vma, file->f_dentry->d_inode);
+		__remove_shared_vm_struct(vma, file->f_dentry->d_inode,
+				mapping);
 		up(&mapping->i_shared_sem);
 	}
 }
@@ -261,10 +270,15 @@ static inline void __vma_link_file(struc
 		if (vma->vm_flags & VM_DENYWRITE)
 			atomic_dec(&file->f_dentry->d_inode->i_writecount);
 
-		if (vma->vm_flags & VM_SHARED)
-			list_add_tail(&vma->shared, &mapping->i_mmap_shared);
+		if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
+			INIT_VMA_SHARED_LIST(vma);
+			list_add_tail(&vma->shared.vm_set.list,
+					&mapping->i_mmap_nonlinear);
+		}
+		else if (vma->vm_flags & VM_SHARED)
+			__vma_prio_tree_insert(&mapping->i_mmap_shared, vma);
 		else
-			list_add_tail(&vma->shared, &mapping->i_mmap);
+			__vma_prio_tree_insert(&mapping->i_mmap, vma);
 	}
 }
 
@@ -393,7 +407,9 @@ static struct vm_area_struct *vma_merge(
 {
 	spinlock_t *lock = &mm->page_table_lock;
 	struct inode *inode = file ? file->f_dentry->d_inode : NULL;
+	struct address_space *mapping = file ? file->f_mapping : NULL;
 	struct semaphore *i_shared_sem;
+	struct prio_tree_root *root = NULL;
 
 	/*
 	 * We later require that vma->vm_flags == vm_flags, so this tests
@@ -404,6 +420,15 @@ static struct vm_area_struct *vma_merge(
 
 	i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL;
 
+	if (mapping) {
+		if (vm_flags & VM_SHARED) {
+			if (likely(!(vm_flags & VM_NONLINEAR)))
+				root = &mapping->i_mmap_shared;
+		}
+		else
+			root = &mapping->i_mmap;
+	}
+
 	if (!prev) {
 		prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
 		goto merge_next;
@@ -423,18 +448,18 @@ static struct vm_area_struct *vma_merge(
 			need_up = 1;
 		}
 		spin_lock(lock);
-		prev->vm_end = end;
 
 		/*
 		 * OK, it did.  Can we now merge in the successor as well?
 		 */
 		next = prev->vm_next;
-		if (next && prev->vm_end == next->vm_start &&
+		if (next && end == next->vm_start &&
 				can_vma_merge_before(next, vm_flags, file,
 					pgoff, (end - addr) >> PAGE_SHIFT)) {
-			prev->vm_end = next->vm_end;
 			__vma_unlink(mm, next, prev);
-			__remove_shared_vm_struct(next, inode);
+			__vma_modify(root, prev, prev->vm_start,
+					next->vm_end, prev->vm_pgoff);
+			__remove_shared_vm_struct(next, inode, mapping);
 			spin_unlock(lock);
 			if (need_up)
 				up(i_shared_sem);
@@ -445,6 +470,8 @@ static struct vm_area_struct *vma_merge(
 			kmem_cache_free(vm_area_cachep, next);
 			return prev;
 		}
+
+		__vma_modify(root, prev, prev->vm_start, end, prev->vm_pgoff);
 		spin_unlock(lock);
 		if (need_up)
 			up(i_shared_sem);
@@ -464,8 +491,8 @@ static struct vm_area_struct *vma_merge(
 			if (file)
 				down(i_shared_sem);
 			spin_lock(lock);
-			prev->vm_start = addr;
-			prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT;
+			__vma_modify(root, prev, addr, prev->vm_end,
+				prev->vm_pgoff - ((end - addr) >> PAGE_SHIFT));
 			spin_unlock(lock);
 			if (file)
 				up(i_shared_sem);
@@ -698,7 +725,7 @@ munmap_back:
 	vma->vm_file = NULL;
 	vma->vm_private_data = NULL;
 	vma->vm_next = NULL;
-	INIT_LIST_HEAD(&vma->shared);
+	INIT_VMA_SHARED(vma);
 
 	if (file) {
 		error = -EINVAL;
@@ -1289,6 +1316,7 @@ int split_vma(struct mm_struct * mm, str
 {
 	struct vm_area_struct *new;
 	struct address_space *mapping = NULL;
+	struct prio_tree_root *root = NULL;
 
 	if (mm->map_count >= MAX_MAP_COUNT)
 		return -ENOMEM;
@@ -1300,7 +1328,7 @@ int split_vma(struct mm_struct * mm, str
 	/* most fields are the same, copy all, and then fixup */
 	*new = *vma;
 
-	INIT_LIST_HEAD(&new->shared);
+	INIT_VMA_SHARED(new);
 
 	if (new_below)
 		new->vm_end = addr;
@@ -1315,18 +1343,25 @@ int split_vma(struct mm_struct * mm, str
 	if (new->vm_ops && new->vm_ops->open)
 		new->vm_ops->open(new);
 
-	if (vma->vm_file)
+	if (vma->vm_file) {
 		 mapping = vma->vm_file->f_mapping;
+		 if (vma->vm_flags & VM_SHARED) {
+			 if (likely(!(vma->vm_flags & VM_NONLINEAR)))
+			 	root = &mapping->i_mmap_shared;
+		 }
+		 else
+			 root = &mapping->i_mmap;
+	}
 
 	if (mapping)
 		down(&mapping->i_shared_sem);
 	spin_lock(&mm->page_table_lock);
 
-	if (new_below) {
-		vma->vm_start = addr;
-		vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT);
-	} else
-		vma->vm_end = addr;
+	if (new_below)
+		__vma_modify(root, vma, addr, vma->vm_end,
+			vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT));
+	else
+		__vma_modify(root, vma, vma->vm_start, addr, vma->vm_pgoff);
 
 	__insert_vm_struct(mm, new);
 
@@ -1499,7 +1534,7 @@ unsigned long do_brk(unsigned long addr,
 	vma->vm_pgoff = 0;
 	vma->vm_file = NULL;
 	vma->vm_private_data = NULL;
-	INIT_LIST_HEAD(&vma->shared);
+	INIT_VMA_SHARED(vma);
 
 	vma_link(mm, vma, prev, rb_link, rb_parent);
 
@@ -1597,7 +1632,7 @@ struct vm_area_struct *copy_vma(struct v
 		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 		if (new_vma) {
 			*new_vma = *vma;
-			INIT_LIST_HEAD(&new_vma->shared);
+			INIT_VMA_SHARED(new_vma);
 			new_vma->vm_start = addr;
 			new_vma->vm_end = addr + len;
 			new_vma->vm_pgoff = pgoff;
@@ -1610,24 +1645,3 @@ struct vm_area_struct *copy_vma(struct v
 	}
 	return new_vma;
 }
-
-/*
- * Position vma after prev in shared file list:
- * for mremap move error recovery racing against vmtruncate.
- */
-void vma_relink_file(struct vm_area_struct *vma, struct vm_area_struct *prev)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	struct address_space *mapping;
-
-	if (vma->vm_file) {
-		mapping = vma->vm_file->f_mapping;
-		if (mapping) {
-			down(&mapping->i_shared_sem);
-			spin_lock(&mm->page_table_lock);
-			list_move(&vma->shared, &prev->shared);
-			spin_unlock(&mm->page_table_lock);
-			up(&mapping->i_shared_sem);
-		}
-	}
-}
--- 2.6.5-rc3-mjb2/mm/mremap.c	2004-04-02 21:01:47.409845896 +0100
+++ anobjrmap9/mm/mremap.c	2004-04-04 13:05:41.405470584 +0100
@@ -237,6 +237,7 @@ static int move_page_tables(struct vm_ar
 	 * only a few pages.. This also makes error recovery easier.
 	 */
 	while (offset < len) {
+		cond_resched();
 		ret = move_one_page(vma, old_addr+offset, new_addr+offset);
 		if (!ret) {
 			offset += PAGE_SIZE;
@@ -266,6 +267,7 @@ static unsigned long move_vma(struct vm_
 		unsigned long new_len, unsigned long new_addr)
 {
 	struct mm_struct *mm = vma->vm_mm;
+	struct address_space *mapping = NULL;
 	struct vm_area_struct *new_vma;
 	unsigned long vm_flags = vma->vm_flags;
 	unsigned long new_pgoff;
@@ -285,26 +287,31 @@ static unsigned long move_vma(struct vm_
 	if (!new_vma)
 		return -ENOMEM;
 
+	if (vma->vm_file) {
+		/*
+		 * Subtle point from Rajesh Venkatasubramanian: before
+		 * moving file-based ptes, we must lock vmtruncate out,
+		 * since it might clean the dst vma before the src vma,
+		 * and we propagate stale pages into the dst afterward.
+		 */
+		mapping = vma->vm_file->f_mapping;
+		down(&mapping->i_shared_sem);
+	}
 	moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
 	if (moved_len < old_len) {
 		/*
 		 * On error, move entries back from new area to old,
 		 * which will succeed since page tables still there,
 		 * and then proceed to unmap new area instead of old.
-		 *
-		 * Subtle point from Rajesh Venkatasubramanian: before
-		 * moving file-based ptes, move new_vma before old vma
-		 * in the i_mmap or i_mmap_shared list, so when racing
-		 * against vmtruncate we cannot propagate pages to be
-		 * truncated back from new_vma into just cleaned old.
 		 */
-		vma_relink_file(vma, new_vma);
 		move_page_tables(new_vma, old_addr, new_addr, moved_len);
 		vma = new_vma;
 		old_len = new_len;
 		old_addr = new_addr;
 		new_addr = -ENOMEM;
 	}
+	if (mapping)
+		up(&mapping->i_shared_sem);
 
 	/* Conceal VM_ACCOUNT so old reservation is not undone */
 	if (vm_flags & VM_ACCOUNT) {
@@ -351,6 +358,8 @@ unsigned long do_mremap(unsigned long ad
 	unsigned long flags, unsigned long new_addr)
 {
 	struct vm_area_struct *vma;
+	struct address_space *mapping = NULL;
+	struct prio_tree_root *root = NULL;
 	unsigned long ret = -EINVAL;
 	unsigned long charged = 0;
 
@@ -458,9 +467,26 @@ unsigned long do_mremap(unsigned long ad
 		/* can we just expand the current mapping? */
 		if (max_addr - addr >= new_len) {
 			int pages = (new_len - old_len) >> PAGE_SHIFT;
+
+			if (vma->vm_file) {
+				mapping = vma->vm_file->f_mapping;
+				if (vma->vm_flags & VM_SHARED) {
+					if (likely(!(vma->vm_flags & VM_NONLINEAR)))
+						root = &mapping->i_mmap_shared;
+				}
+				else
+					root = &mapping->i_mmap;
+				down(&mapping->i_shared_sem);
+			}
+
 			spin_lock(&vma->vm_mm->page_table_lock);
-			vma->vm_end = addr + new_len;
+			__vma_modify(root, vma, vma->vm_start,
+					addr + new_len, vma->vm_pgoff);
 			spin_unlock(&vma->vm_mm->page_table_lock);
+
+			if(mapping)
+				up(&mapping->i_shared_sem);
+
 			current->mm->total_vm += pages;
 			if (vma->vm_flags & VM_LOCKED) {
 				current->mm->locked_vm += pages;
--- 2.6.5-rc3-mjb2/mm/page_io.c	2004-04-02 21:01:47.416844832 +0100
+++ anobjrmap9/mm/page_io.c	2004-04-04 13:05:41.405470584 +0100
@@ -135,12 +135,14 @@ out:
 int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
 {
 	int ret;
+	unsigned long save_private;
 	struct writeback_control swap_wbc = {
 		.sync_mode = WB_SYNC_ALL,
 	};
 
 	lock_page(page);
 	SetPageSwapCache(page);
+	save_private = page->private;
 	page->private = entry.val;
 
 	if (rw == READ) {
@@ -150,7 +152,9 @@ int rw_swap_page_sync(int rw, swp_entry_
 		ret = swap_writepage(page, &swap_wbc);
 		wait_on_page_writeback(page);
 	}
+
 	ClearPageSwapCache(page);
+	page->private = save_private;
 	if (ret == 0 && (!PageUptodate(page) || PageError(page)))
 		ret = -EIO;
 	return ret;
--- 2.6.5-rc3-mjb2/mm/prio_tree.c	1970-01-01 01:00:00.000000000 +0100
+++ anobjrmap9/mm/prio_tree.c	2004-04-04 13:05:41.408470128 +0100
@@ -0,0 +1,577 @@
+/*
+ * mm/prio_tree.c - priority search tree for mapping->i_mmap{,_shared}
+ *
+ * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@xxxxxxxxx>
+ *
+ * Based on the radix priority search tree proposed by Edward M. McCreight
+ * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
+ *
+ * 02Feb2004	Initial version
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/prio_tree.h>
+
+/*
+ * A clever mix of heap and radix trees forms a radix priority search tree (PST)
+ * which is useful for storing intervals, e.g, we can consider a vma as a closed
+ * interval of file pages [offset_begin, offset_end], and store all vmas that
+ * map a file in a PST. Then, using the PST, we can answer a stabbing query,
+ * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a
+ * given input interval X (a set of consecutive file pages), in "O(log n + m)"
+ * time where 'log n' is the height of the PST, and 'm' is the number of stored
+ * intervals (vmas) that overlap (map) with the input interval X (the set of
+ * consecutive file pages).
+ *
+ * In our implementation, we store closed intervals of the form [radix_index,
+ * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST
+ * is designed for storing intervals with unique radix indices, i.e., each
+ * interval have different radix_index. However, this limitation can be easily
+ * overcome by using the size, i.e., heap_index - radix_index, as part of the
+ * index, so we index the tree using [(radix_index,size), heap_index].
+ *
+ * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit
+ * machine, the maximum height of a PST can be 64. We can use a balanced version
+ * of the priority search tree to optimize the tree height, but the balanced
+ * tree proposed by McCreight is too complex and memory-hungry for our purpose.
+ */
+
+static unsigned long index_bits_to_maxindex[BITS_PER_LONG];
+
+/*
+ * Maximum heap_index that can be stored in a PST with index_bits bits
+ */
+static inline unsigned long prio_tree_maxindex(unsigned int bits)
+{
+	return index_bits_to_maxindex[bits - 1];
+}
+
+/*
+ * Extend a priority search tree so that it can store a node with heap_index
+ * max_heap_index. In the worst case, this algorithm takes O((log n)^2).
+ * However, this function is used rarely and the common case performance is
+ * not bad.
+ */
+static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root,
+	struct prio_tree_node *node, unsigned long max_heap_index)
+{
+	struct prio_tree_node *first = NULL, *prev, *last = NULL;
+
+	if (max_heap_index > prio_tree_maxindex(root->index_bits))
+		root->index_bits++;
+
+	while (max_heap_index > prio_tree_maxindex(root->index_bits)) {
+		root->index_bits++;
+
+		if (prio_tree_empty(root))
+			continue;
+
+		if (first == NULL) {
+			first = root->prio_tree_node;
+			prio_tree_remove(root, root->prio_tree_node);
+			INIT_PRIO_TREE_NODE(first);
+			last = first;
+		}
+		else {
+			prev = last;
+			last = root->prio_tree_node;
+			prio_tree_remove(root, root->prio_tree_node);
+			INIT_PRIO_TREE_NODE(last);
+			prev->left = last;
+			last->parent = prev;
+		}
+	}
+
+	INIT_PRIO_TREE_NODE(node);
+
+	if (first) {
+		node->left = first;
+		first->parent = node;
+	}
+	else
+		last = node;
+
+	if (!prio_tree_empty(root)) {
+		last->left = root->prio_tree_node;
+		last->left->parent = last;
+	}
+
+	root->prio_tree_node = node;
+	return node;
+}
+
+/*
+ * Replace a prio_tree_node with a new node and return the old node
+ */
+static inline struct prio_tree_node *prio_tree_replace(
+	struct prio_tree_root *root, struct prio_tree_node *old,
+	struct prio_tree_node *node)
+{
+	INIT_PRIO_TREE_NODE(node);
+
+	if (prio_tree_root(old)) {
+		BUG_ON(root->prio_tree_node != old);
+		/*
+		 * We can reduce root->index_bits here. However, it is complex
+		 * and does not help much to improve performance (IMO).
+		 */
+		node->parent = node;
+		root->prio_tree_node = node;
+	}
+	else {
+		node->parent = old->parent;
+		if (old->parent->left == old)
+			old->parent->left = node;
+		else {
+			BUG_ON(old->parent->right != old);
+			old->parent->right = node;
+		}
+	}
+
+	if (!prio_tree_left_empty(old)) {
+		node->left = old->left;
+		old->left->parent = node;
+	}
+
+	if (!prio_tree_right_empty(old)) {
+		node->right = old->right;
+		old->right->parent = node;
+	}
+
+	return old;
+}
+
+#undef	swap
+#define	swap(x,y,z)	do {z = x; x = y; y = z; } while (0)
+
+/*
+ * Insert a prio_tree_node @node into a radix priority search tree @root. The
+ * algorithm typically takes O(log n) time where 'log n' is the number of bits
+ * required to represent the maximum heap_index. In the worst case, the algo
+ * can take O((log n)^2) - check prio_tree_expand.
+ *
+ * If a prior node with same radix_index and heap_index is already found in
+ * the tree, then returns the address of the prior node. Otherwise, inserts
+ * @node into the tree and returns @node.
+ */
+
+struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root,
+			struct prio_tree_node *node)
+{
+	struct prio_tree_node *cur, *res = node;
+	unsigned long radix_index, heap_index;
+	unsigned long r_index, h_index, index, mask;
+	int size_flag = 0;
+
+	GET_INDEX(node, radix_index, heap_index);
+
+	if (prio_tree_empty(root) ||
+			heap_index > prio_tree_maxindex(root->index_bits))
+		return prio_tree_expand(root, node, heap_index);
+
+	cur = root->prio_tree_node;
+	mask = 1UL << (root->index_bits - 1);
+
+	while (mask) {
+		GET_INDEX(cur, r_index, h_index);
+
+		if (r_index == radix_index && h_index == heap_index)
+			return cur;
+
+                if (h_index < heap_index || (h_index == heap_index &&
+						r_index > radix_index))
+		{
+			struct prio_tree_node *tmp = node;
+			node = prio_tree_replace(root, cur, node);
+			cur = tmp;
+			swap(r_index, radix_index, index);
+			swap(h_index, heap_index, index);
+		}
+
+		if (size_flag)
+			index = heap_index - radix_index;
+		else
+			index = radix_index;
+
+		if (index & mask) {
+			if (prio_tree_right_empty(cur)) {
+				INIT_PRIO_TREE_NODE(node);
+				cur->right = node;
+				node->parent = cur;
+				return res;
+			}
+			else
+				cur = cur->right;
+		}
+		else {
+			if (prio_tree_left_empty(cur)) {
+				INIT_PRIO_TREE_NODE(node);
+				cur->left = node;
+				node->parent = cur;
+				return res;
+			}
+			else
+				cur = cur->left;
+		}
+
+		mask >>= 1;
+
+		if (!mask) {
+			mask = 1UL << (root->index_bits - 1);
+			size_flag = 1;
+		}
+	}
+	/* Should not reach here */
+	BUG();
+	return NULL;
+}
+
+/*
+ * Remove a prio_tree_node @node from a radix priority search tree @root. The
+ * algorithm takes O(log n) time where 'log n' is the number of bits required
+ * to represent the maximum heap_index.
+ */
+
+void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node)
+{
+	struct prio_tree_node *cur;
+	unsigned long r_index, h_index_right, h_index_left;
+
+	cur = node;
+
+	while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) {
+		if (!prio_tree_left_empty(cur))
+			GET_INDEX(cur->left, r_index, h_index_left);
+		else {
+			cur = cur->right;
+			continue;
+		}
+
+		if (!prio_tree_right_empty(cur))
+			GET_INDEX(cur->right, r_index, h_index_right);
+		else {
+			cur = cur->left;
+			continue;
+		}
+
+		/* both h_index_left and h_index_right cannot be 0 */
+		if (h_index_left >= h_index_right)
+			cur = cur->left;
+		else
+			cur = cur->right;
+	}
+
+	if (prio_tree_root(cur)) {
+		BUG_ON(root->prio_tree_node != cur);
+		*root = PRIO_TREE_ROOT;
+		return;
+	}
+
+	if (cur->parent->right == cur)
+		cur->parent->right = cur->parent;
+	else {
+		BUG_ON(cur->parent->left != cur);
+		cur->parent->left = cur->parent;
+	}
+
+	while (cur != node)
+		cur = prio_tree_replace(root, cur->parent, cur);
+
+	return;
+}
+
+/*
+ * Following functions help to enumerate all prio_tree_nodes in the tree that
+ * overlap with the input interval X [radix_index, heap_index]. The enumeration
+ * takes O(log n + m) time where 'log n' is the height of the tree (which is
+ * proportional to # of bits required to represent the maximum heap_index) and
+ * 'm' is the number of prio_tree_nodes that overlap the interval X.
+ */
+
+static inline struct prio_tree_node *__prio_tree_left(
+	struct prio_tree_root *root, struct prio_tree_iter *iter,
+	unsigned long radix_index, unsigned long heap_index,
+	unsigned long *r_index, unsigned long *h_index)
+{
+	if (prio_tree_left_empty(iter->cur))
+		return NULL;
+
+	GET_INDEX(iter->cur->left, *r_index, *h_index);
+
+	if (radix_index <= *h_index) {
+		iter->cur = iter->cur->left;
+		iter->mask >>= 1;
+		if (iter->mask) {
+			if (iter->size_level)
+				iter->size_level++;
+		}
+		else {
+			iter->size_level = 1;
+			iter->mask = 1UL << (root->index_bits - 1);
+		}
+		return iter->cur;
+	}
+
+	return NULL;
+}
+
+
+static inline struct prio_tree_node *__prio_tree_right(
+	struct prio_tree_root *root, struct prio_tree_iter *iter,
+	unsigned long radix_index, unsigned long heap_index,
+	unsigned long *r_index, unsigned long *h_index)
+{
+	unsigned long value;
+
+	if (prio_tree_right_empty(iter->cur))
+		return NULL;
+
+	if (iter->size_level)
+		value = iter->value;
+	else
+		value = iter->value | iter->mask;
+
+	if (heap_index < value)
+		return NULL;
+
+	GET_INDEX(iter->cur->right, *r_index, *h_index);
+
+	if (radix_index <= *h_index) {
+		iter->cur = iter->cur->right;
+		iter->mask >>= 1;
+		iter->value = value;
+		if (iter->mask) {
+			if (iter->size_level)
+				iter->size_level++;
+		}
+		else {
+			iter->size_level = 1;
+			iter->mask = 1UL << (root->index_bits - 1);
+		}
+		return iter->cur;
+	}
+
+	return NULL;
+}
+
+static inline struct prio_tree_node *__prio_tree_parent(
+	struct prio_tree_iter *iter)
+{
+	iter->cur = iter->cur->parent;
+	iter->mask <<= 1;
+	if (iter->size_level) {
+		if (iter->size_level == 1)
+			iter->mask = 1UL;
+		iter->size_level--;
+	}
+	else if (iter->value & iter->mask)
+		iter->value ^= iter->mask;
+	return iter->cur;
+}
+
+static inline int overlap(unsigned long radix_index, unsigned long heap_index,
+	unsigned long r_index, unsigned long h_index)
+{
+	if (heap_index < r_index || radix_index > h_index)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * prio_tree_first:
+ *
+ * Get the first prio_tree_node that overlaps with the interval [radix_index,
+ * heap_index]. Note that always radix_index <= heap_index. We do a pre-order
+ * traversal of the tree.
+ */
+struct prio_tree_node *prio_tree_first(struct prio_tree_root *root,
+	struct prio_tree_iter *iter, unsigned long radix_index,
+	unsigned long heap_index)
+{
+	unsigned long r_index, h_index;
+
+	*iter = PRIO_TREE_ITER;
+
+	if (prio_tree_empty(root))
+		return NULL;
+
+	GET_INDEX(root->prio_tree_node, r_index, h_index);
+
+	if (radix_index > h_index)
+		return NULL;
+
+	iter->mask = 1UL << (root->index_bits - 1);
+	iter->cur = root->prio_tree_node;
+
+	while (1) {
+		if (overlap(radix_index, heap_index, r_index, h_index))
+			return iter->cur;
+
+		if (__prio_tree_left(root, iter, radix_index, heap_index,
+					&r_index, &h_index))
+			continue;
+
+		if (__prio_tree_right(root, iter, radix_index, heap_index,
+					&r_index, &h_index))
+			continue;
+
+		break;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(prio_tree_first);
+
+/* Get the next prio_tree_node that overlaps with the input interval in iter */
+struct prio_tree_node *prio_tree_next(struct prio_tree_root *root,
+	struct prio_tree_iter *iter, unsigned long radix_index,
+	unsigned long heap_index)
+{
+	unsigned long r_index, h_index;
+
+repeat:
+	while (__prio_tree_left(root, iter, radix_index, heap_index,
+				&r_index, &h_index))
+		if (overlap(radix_index, heap_index, r_index, h_index))
+			return iter->cur;
+
+	while (!__prio_tree_right(root, iter, radix_index, heap_index,
+				&r_index, &h_index)) {
+	    	while (!prio_tree_root(iter->cur) &&
+				iter->cur->parent->right == iter->cur)
+			__prio_tree_parent(iter);
+
+		if (prio_tree_root(iter->cur))
+			return NULL;
+
+		__prio_tree_parent(iter);
+	}
+
+	if (overlap(radix_index, heap_index, r_index, h_index))
+			return iter->cur;
+
+	goto repeat;
+}
+EXPORT_SYMBOL(prio_tree_next);
+
+/*
+ * Radix priority search tree for address_space->i_mmap_{_shared}
+ *
+ * For each vma that map a unique set of file pages i.e., unique [radix_index,
+ * heap_index] value, we have a corresponing priority search tree node. If
+ * multiple vmas have identical [radix_index, heap_index] value, then one of
+ * them is used as a tree node and others are stored in a vm_set list. The tree
+ * node points to the first vma (head) of the list using vm_set_head.
+ *
+ * prio_tree_root
+ *      |
+ *      A       vm_set_head
+ *     / \      /
+ *    L   R -> H-I-J-K-M-N-O-P-Q-S
+ *    ^   ^    <-- vm_set.list -->
+ *  tree nodes
+ *
+ * We need some way to identify whether a vma is a tree node, head of a vm_set
+ * list, or just a member of a vm_set list. We cannot use vm_flags to store
+ * such information. The reason is, in the above figure, it is possible that
+ * vm_flags' of R and H are covered by the different mmap_sems. When R is
+ * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
+ * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
+ * That's why some trick involving shared.both.parent is used for identifying
+ * tree nodes and list head nodes. We can possibly use the least significant
+ * bit of the vm_set_head field to mark tree and list head nodes. I was worried
+ * about the alignment of vm_area_struct in various architectures.
+ *
+ * vma radix priority search tree node rules:
+ *
+ * vma->shared.both.parent != NULL	==>	a tree node
+ *
+ * vma->shared.both.parent == NULL
+ * 	vma->vm_set_head != NULL  ==>  list head of vmas that map same pages
+ * 	vma->vm_set_head == NULL  ==>  a list node
+ */
+
+void __vma_prio_tree_insert(struct prio_tree_root *root,
+	struct vm_area_struct *vma)
+{
+	struct prio_tree_node *ptr;
+	struct vm_area_struct *old;
+
+	ptr = prio_tree_insert(root, &vma->shared.prio_tree_node);
+
+	if (ptr == &vma->shared.prio_tree_node) {
+		vma->vm_set_head = NULL;
+		return;
+	}
+
+	old = prio_tree_entry(ptr, struct vm_area_struct,
+			shared.prio_tree_node);
+
+	__vma_prio_tree_add(vma, old);
+}
+
+void __vma_prio_tree_remove(struct prio_tree_root *root,
+	struct vm_area_struct *vma)
+{
+	struct vm_area_struct *node, *head, *new_head;
+
+	if (vma->shared.both.parent == NULL && vma->vm_set_head == NULL) {
+		list_del_init(&vma->shared.vm_set.list);
+		INIT_VMA_SHARED(vma);
+		return;
+	}
+
+	if (vma->vm_set_head) {
+		/* Leave this BUG_ON till prio_tree patch stabilizes */
+		BUG_ON(vma->vm_set_head->vm_set_head != vma);
+		if (vma->shared.both.parent) {
+			head = vma->vm_set_head;
+			if (!list_empty(&head->shared.vm_set.list)) {
+				new_head = list_entry(
+					head->shared.vm_set.list.next,
+					struct vm_area_struct,
+					shared.vm_set.list);
+				list_del_init(&head->shared.vm_set.list);
+			}
+			else
+				new_head = NULL;
+
+			prio_tree_replace(root, &vma->shared.prio_tree_node,
+					&head->shared.prio_tree_node);
+			head->vm_set_head = new_head;
+			if (new_head)
+				new_head->vm_set_head = head;
+
+		}
+		else {
+			node = vma->vm_set_head;
+			if (!list_empty(&vma->shared.vm_set.list)) {
+				new_head = list_entry(
+					vma->shared.vm_set.list.next,
+					struct vm_area_struct,
+					shared.vm_set.list);
+				list_del_init(&vma->shared.vm_set.list);
+				node->vm_set_head = new_head;
+				new_head->vm_set_head = node;
+			}
+			else
+				node->vm_set_head = NULL;
+		}
+		INIT_VMA_SHARED(vma);
+		return;
+	}
+
+	prio_tree_remove(root, &vma->shared.prio_tree_node);
+	INIT_VMA_SHARED(vma);
+}
+
+void __init prio_tree_init(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++)
+		index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1;
+	index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL;
+}
--- 2.6.5-rc3-mjb2/mm/rmap.c	2004-04-02 21:01:47.423843768 +0100
+++ anobjrmap9/mm/rmap.c	2004-04-04 13:05:41.411469672 +0100
@@ -154,19 +154,16 @@ static inline void clear_page_anon(struc
  **/
 
 /*
- * At what user virtual address is page expected in file-backed vma?
+ * At what user virtual address is pgoff expected in file-backed vma?
  */
-#define NOADDR	(~0UL)		/* impossible user virtual address */
 static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+vma_address(struct vm_area_struct *vma, unsigned long pgoff)
 {
-	unsigned long pgoff;
 	unsigned long address;
 
-	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-	return (address >= vma->vm_start && address < vma->vm_end)?
-		address: NOADDR;
+	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+	return address;
 }
 
 /**
@@ -182,8 +179,14 @@ static int page_referenced_one(struct pa
 	pte_t *pte;
 	int referenced = 0;
 
-	if (!spin_trylock(&mm->page_table_lock))
+	if (!spin_trylock(&mm->page_table_lock)) {
+		/*
+		 * For debug we're currently warning if not all found,
+		 * but in this case that's expected: suppress warning.
+		 */
+		*mapcount = -1;
 		return 0;
+	}
 
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
@@ -246,6 +249,8 @@ static inline int page_referenced_anon(s
 		if (!*mapcount)
 			goto out;
 	}
+	
+	WARN_ON(*mapcount > 0);
 out:
 	spin_unlock(&anonhd->lock);
 	return referenced;
@@ -268,45 +273,54 @@ out:
 static inline int page_referenced_obj(struct page *page, int *mapcount)
 {
 	struct address_space *mapping = page->mapping;
+	unsigned long pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
 	unsigned long address;
 	int referenced = 0;
 
 	if (down_trylock(&mapping->i_shared_sem))
 		return 0;
 
-	list_for_each_entry(vma, &mapping->i_mmap, shared) {
-		if (!vma->vm_mm->rss)
-			continue;
-		address = vma_address(page, vma);
-		if (address == NOADDR)
-			continue;
-		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) ==
-					(VM_LOCKED|VM_MAYSHARE)) {
+	vma = __vma_prio_tree_first(&mapping->i_mmap,
+					&iter, pgoff, pgoff);
+	while (vma) {
+		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
+				  == (VM_LOCKED|VM_MAYSHARE)) {
 			referenced++;
 			goto out;
 		}
-		referenced += page_referenced_one(
-			page, vma->vm_mm, address, mapcount);
-		if (!*mapcount)
-			goto out;
+		if (vma->vm_mm->rss) {
+			address = vma_address(vma, pgoff);
+			referenced += page_referenced_one(
+				page, vma->vm_mm, address, mapcount);
+			if (!*mapcount)
+				goto out;
+		}
+		vma = __vma_prio_tree_next(vma, &mapping->i_mmap,
+						&iter, pgoff, pgoff);
 	}
 
-	list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
-		if (!vma->vm_mm->rss || (vma->vm_flags & VM_NONLINEAR))
-			continue;
-		address = vma_address(page, vma);
-		if (address == NOADDR)
-			continue;
+	vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
+					&iter, pgoff, pgoff);
+	while (vma) {
 		if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) {
 			referenced++;
 			goto out;
 		}
-		referenced += page_referenced_one(
-			page, vma->vm_mm, address, mapcount);
-		if (!*mapcount)
-			goto out;
+		if (vma->vm_mm->rss) {
+			address = vma_address(vma, pgoff);
+			referenced += page_referenced_one(
+				page, vma->vm_mm, address, mapcount);
+			if (!*mapcount)
+				goto out;
+		}
+		vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
+						&iter, pgoff, pgoff);
 	}
+
+	if (list_empty(&mapping->i_mmap_nonlinear))
+		WARN_ON(*mapcount > 0);
 out:
 	up(&mapping->i_shared_sem);
 	return referenced;
@@ -688,7 +702,9 @@ out:
 static inline int try_to_unmap_obj(struct page *page, int *mapcount)
 {
 	struct address_space *mapping = page->mapping;
+	unsigned long pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
 	unsigned long address;
 	int ret = SWAP_AGAIN;
 	unsigned long cursor;
@@ -698,47 +714,50 @@ static inline int try_to_unmap_obj(struc
 	if (down_trylock(&mapping->i_shared_sem))
 		return ret;
 
-	list_for_each_entry(vma, &mapping->i_mmap, shared) {
-		if (!vma->vm_mm->rss)
-			continue;
-		address = vma_address(page, vma);
-		if (address == NOADDR)
-			continue;
-		ret = try_to_unmap_one(
-			page, vma->vm_mm, address, mapcount, vma);
-		if (ret == SWAP_FAIL || !*mapcount)
-			goto out;
+	vma = __vma_prio_tree_first(&mapping->i_mmap,
+					&iter, pgoff, pgoff);
+	while (vma) {
+		if (vma->vm_mm->rss) {
+			address = vma_address(vma, pgoff);
+			ret = try_to_unmap_one(
+				page, vma->vm_mm, address, mapcount, vma);
+			if (ret == SWAP_FAIL || !*mapcount)
+				goto out;
+		}
+		vma = __vma_prio_tree_next(vma, &mapping->i_mmap,
+						&iter, pgoff, pgoff);
 	}
 
-	list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
-		if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
-			/*
-			 * Defer unmapping nonlinear to the next loop,
-			 * but take notes while we're here e.g. don't
-			 * want to loop again when no nonlinear vmas.
-			 */
-			if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
-				continue;
-			cursor = (unsigned long) vma->vm_private_data;
-			if (cursor > max_nl_cursor)
-				max_nl_cursor = cursor;
-			cursor = vma->vm_end - vma->vm_start;
-			if (cursor > max_nl_size)
-				max_nl_size = cursor;
-			continue;
+	vma = __vma_prio_tree_first(&mapping->i_mmap_shared,
+					&iter, pgoff, pgoff);
+	while (vma) {
+		if (vma->vm_mm->rss) {
+			address = vma_address(vma, pgoff);
+			ret = try_to_unmap_one(
+				page, vma->vm_mm, address, mapcount, vma);
+			if (ret == SWAP_FAIL || !*mapcount)
+				goto out;
 		}
-		if (!vma->vm_mm->rss)
-			continue;
-		address = vma_address(page, vma);
-		if (address == NOADDR)
-			continue;
-		ret = try_to_unmap_one(
-			page, vma->vm_mm, address, mapcount, vma);
-		if (ret == SWAP_FAIL || !*mapcount)
-			goto out;
+		vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
+						&iter, pgoff, pgoff);
 	}
 
-	if (max_nl_size == 0)	/* no nonlinear vmas of this file */
+	if (list_empty(&mapping->i_mmap_nonlinear))
+		goto out;
+
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+						shared.vm_set.list) {
+		if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
+			continue;
+		cursor = (unsigned long) vma->vm_private_data;
+		if (cursor > max_nl_cursor)
+			max_nl_cursor = cursor;
+		cursor = vma->vm_end - vma->vm_start;
+		if (cursor > max_nl_size)
+			max_nl_size = cursor;
+	}
+
+	if (max_nl_size == 0)
 		goto out;
 
 	/*
@@ -755,9 +774,9 @@ static inline int try_to_unmap_obj(struc
 		max_nl_cursor = CLUSTER_SIZE;
 
 	do {
-		list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
-			if (VM_NONLINEAR != (vma->vm_flags &
-			     (VM_NONLINEAR|VM_LOCKED|VM_RESERVED)))
+		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+						shared.vm_set.list) {
+			if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
 				continue;
 			cursor = (unsigned long) vma->vm_private_data;
 			while (vma->vm_mm->rss &&
@@ -771,6 +790,7 @@ static inline int try_to_unmap_obj(struc
 				vma->vm_private_data = (void *) cursor;
 				if (*mapcount <= 0)
 					goto relock;
+				cond_resched();
 			}
 			if (ret != SWAP_FAIL)
 				vma->vm_private_data =
@@ -785,9 +805,9 @@ static inline int try_to_unmap_obj(struc
 	 * in locked vmas).  Reset cursor on all unreserved nonlinear
 	 * vmas, now forgetting on which ones it had fallen behind.
 	 */
-	list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
-		if ((vma->vm_flags & (VM_NONLINEAR|VM_RESERVED)) ==
-				VM_NONLINEAR)
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+						shared.vm_set.list) {
+		if (!(vma->vm_flags & VM_RESERVED))
 			vma->vm_private_data = 0;
 	}
 relock:
--- 2.6.5-rc3-mjb2/mm/shmem.c	2004-04-02 21:01:47.427843160 +0100
+++ anobjrmap9/mm/shmem.c	2004-04-04 13:05:41.413469368 +0100
@@ -1351,7 +1351,8 @@ static void do_shmem_file_read(struct fi
 			 * virtual addresses, take care about potential aliasing
 			 * before reading the page on the kernel side.
 			 */
-			if (!list_empty(&mapping->i_mmap_shared))
+			if (!prio_tree_empty(&mapping->i_mmap_shared) ||
+				!list_empty(&mapping->i_mmap_nonlinear))
 				flush_dcache_page(page);
 			/*
 			 * Mark the page accessed if we read the beginning.
--- 2.6.5-rc3-mjb2/mm/vmscan.c	2004-04-02 21:01:47.443840728 +0100
+++ anobjrmap9/mm/vmscan.c	2004-04-04 13:05:41.415469064 +0100
@@ -191,9 +191,11 @@ static inline int page_mapping_inuse(str
 		return 0;
 
 	/* File is mmap'd by somebody. */
-	if (!list_empty(&mapping->i_mmap))
+	if (!prio_tree_empty(&mapping->i_mmap))
 		return 1;
-	if (!list_empty(&mapping->i_mmap_shared))
+	if (!prio_tree_empty(&mapping->i_mmap_shared))
+		return 1;
+	if (!list_empty(&mapping->i_mmap_nonlinear))
 		return 1;
 
 	return 0;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/