Re: missing madvise functionality

From: Eric Dumazet
Date: Wed Apr 04 2007 - 05:35:50 EST


On Wed, 04 Apr 2007 18:55:18 +1000
Nick Piggin <nickpiggin@xxxxxxxxxxxx> wrote:

> Peter Zijlstra wrote:
> > On Wed, 2007-04-04 at 12:22 +1000, Nick Piggin wrote:
> >
> >>Eric Dumazet wrote:
> >
> >
> >>>I do think such workloads might benefit from a vma_cache not shared by
> >>>all threads but private to each thread. A sequence could invalidate the
> >>>cache(s).
> >>>
> >>>ie instead of a mm->mmap_cache, having a mm->sequence, and each thread
> >>>having a current->mmap_cache and current->mm_sequence
> >>
> >>I have a patchset to do exactly this, btw.
> >
> >
> > /me too
> >
> > However, I decided against pushing it because when it does happen that a
> > task is not involved with a vma lookup for longer than it takes the seq
> > count to wrap we have a stale pointer...
> >
> > We could go and walk the tasks once in a while to reset the pointer, but
> > it all got a tad involved.
>
> Well here is my core patch (against I think 2.6.16 + a set of vma cache
> cleanups and abstractions). I didn't think the wrapping aspect was
> terribly involved.

Well, I believe this one is too expensive. I was thinking of a light one :

I am not deleting mmap_sem, but adding a sequence number to mm_struct, that is incremented each time a vma is added/deleted, not each time mmap_sem is taken (read or write)

Each thread has its own copy of the sequence, taken at the time find_vma() had to do a full lookup.

I believe some optimized paths could call check_vma_cache() without mmap_sem read lock taken, and if it fails, take the mmap_sem lock and do the slow path.


--- linux-2.6.21-rc5/include/linux/sched.h
+++ linux-2.6.21-rc5-ed/include/linux/sched.h
@@ -319,10 +319,14 @@ typedef unsigned long mm_counter_t;
(mm)->hiwater_vm = (mm)->total_vm; \
} while (0)

+struct vm_area_cache {
+ struct vm_area_struct * mmap_cache; /* last find_vma result */
+ unsigned int sequence;
+ };
+
struct mm_struct {
struct vm_area_struct * mmap; /* list of VMAs */
struct rb_root mm_rb;
- struct vm_area_struct * mmap_cache; /* last find_vma result */
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
@@ -336,6 +340,7 @@ struct mm_struct {
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
int map_count; /* number of VMAs */
struct rw_semaphore mmap_sem;
+ unsigned int mm_sequence;
spinlock_t page_table_lock; /* Protects page tables and some counters */

struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
@@ -875,7 +880,7 @@ struct task_struct {
struct list_head tasks;

struct mm_struct *mm, *active_mm;
-
+ struct vm_area_cache vmacache;
/* task state */
struct linux_binfmt *binfmt;
int exit_state;
--- linux-2.6.21-rc5/include/linux/mm.h
+++ linux-2.6.21-rc5-ed/include/linux/mm.h
@@ -1176,15 +1176,18 @@ extern int expand_upwards(struct vm_area
#endif

/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
+extern struct vm_area_struct * find_vma(struct mm_struct * mm,
+ unsigned long addr,
+ struct vm_area_cache *cache);
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
struct vm_area_struct **pprev);

/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
NULL if none. Assume start_addr < end_addr. */
-static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
+static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm,
+ unsigned long start_addr, unsigned long end_addr, struct vm_area_cache *cache)
{
- struct vm_area_struct * vma = find_vma(mm,start_addr);
+ struct vm_area_struct * vma = find_vma(mm,start_addr,cache);

if (vma && end_addr <= vma->vm_start)
vma = NULL;
--- linux-2.6.21-rc5/mm/mmap.c
+++ linux-2.6.21-rc5-ed/mm/mmap.c
@@ -267,7 +267,7 @@ asmlinkage unsigned long sys_brk(unsigne
}

/* Check against existing mmap mappings. */
- if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
+ if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE, &current->vmacache))
goto out;

/* Ok, looks good - let it rip. */
@@ -447,6 +447,7 @@ static void vma_link(struct mm_struct *m
spin_unlock(&mapping->i_mmap_lock);

mm->map_count++;
+ mm->mm_sequence++;
validate_mm(mm);
}

@@ -473,8 +474,7 @@ __vma_unlink(struct mm_struct *mm, struc
{
prev->vm_next = vma->vm_next;
rb_erase(&vma->vm_rb, &mm->mm_rb);
- if (mm->mmap_cache == vma)
- mm->mmap_cache = prev;
+ mm->mm_sequence++;
}

/*
@@ -1201,7 +1201,7 @@ arch_get_unmapped_area(struct file *filp

if (addr) {
addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
+ vma = find_vma(mm, addr, &current->vmacache);
if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
@@ -1214,7 +1214,7 @@ arch_get_unmapped_area(struct file *filp
}

full_search:
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr, &current->vmacache); ; vma = vma->vm_next) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr) {
/*
@@ -1275,7 +1275,7 @@ arch_get_unmapped_area_topdown(struct fi
/* requesting a specific address */
if (addr) {
addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
+ vma = find_vma(mm, addr, &current->vmacache);
if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
@@ -1292,7 +1292,7 @@ arch_get_unmapped_area_topdown(struct fi

/* make sure it can fit in the remaining address space */
if (addr > len) {
- vma = find_vma(mm, addr-len);
+ vma = find_vma(mm, addr-len, &current->vmacache);
if (!vma || addr <= vma->vm_start)
/* remember the address as a hint for next time */
return (mm->free_area_cache = addr-len);
@@ -1309,7 +1309,7 @@ arch_get_unmapped_area_topdown(struct fi
* else if new region fits below vma->vm_start,
* return with success:
*/
- vma = find_vma(mm, addr);
+ vma = find_vma(mm, addr, &current->vmacache);
if (!vma || addr+len <= vma->vm_start)
/* remember the address as a hint for next time */
return (mm->free_area_cache = addr);
@@ -1397,16 +1397,28 @@ get_unmapped_area(struct file *file, uns

EXPORT_SYMBOL(get_unmapped_area);

+struct vm_area_struct * check_vma_cache(struct mm_struct * mm, unsigned long addr, struct vm_area_cache *cache)
+{
+ struct vm_area_struct *vma = cache->mmap_cache;
+ unsigned int mmseq = mm->mm_sequence;
+ smp_rmb();
+ if (cache->sequence == mmseq &&
+ vma &&
+ addr < vma->vm_end && vma->vm_start <= addr)
+ return vma;
+ return NULL;
+}
+
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
+struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr, struct vm_area_cache *cache)
{
struct vm_area_struct *vma = NULL;

if (mm) {
/* Check the cache first. */
/* (Cache hit rate is typically around 35%.) */
- vma = mm->mmap_cache;
- if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+ vma = check_vma_cache(mm, addr, cache);
+ if (!vma) {
struct rb_node * rb_node;

rb_node = mm->mm_rb.rb_node;
@@ -1426,8 +1438,10 @@ struct vm_area_struct * find_vma(struct
} else
rb_node = rb_node->rb_right;
}
- if (vma)
- mm->mmap_cache = vma;
+ if (vma) {
+ cache->mmap_cache = vma;
+ cache->sequence = mm->mm_sequence;
+ }
}
}
return vma;
@@ -1638,7 +1652,7 @@ find_extend_vma(struct mm_struct * mm, u
unsigned long start;

addr &= PAGE_MASK;
- vma = find_vma(mm,addr);
+ vma = find_vma(mm,addr,&current->vmacache);
if (!vma)
return NULL;
if (vma->vm_start <= addr)
@@ -1726,7 +1740,7 @@ detach_vmas_to_be_unmapped(struct mm_str
else
addr = vma ? vma->vm_end : mm->mmap_base;
mm->unmap_area(mm, addr);
- mm->mmap_cache = NULL; /* Kill the cache. */
+ mm->mm_sequence++;
}

/*
@@ -1823,7 +1837,7 @@ int do_munmap(struct mm_struct *mm, unsi
}

/* Does it split the last one? */
- last = find_vma(mm, end);
+ last = find_vma(mm, end, &current->vmacache);
if (last && end > last->vm_start) {
int error = split_vma(mm, last, end, 1);
if (error)
--- linux-2.6.21-rc5/kernel/fork.c
+++ linux-2.6.21-rc5-ed/kernel/fork.c
@@ -213,7 +213,6 @@ static inline int dup_mmap(struct mm_str

mm->locked_vm = 0;
mm->mmap = NULL;
- mm->mmap_cache = NULL;
mm->free_area_cache = oldmm->mmap_base;
mm->cached_hole_size = ~0UL;
mm->map_count = 0;
@@ -564,6 +563,7 @@ good_mm:

tsk->mm = mm;
tsk->active_mm = mm;
+ tsk->vmacache.mmap_cache = NULL;
return 0;

fail_nomem:
--- linux-2.6.21-rc5/mm/mempolicy.c
+++ linux-2.6.21-rc5-ed/mm/mempolicy.c
@@ -532,7 +532,7 @@ long do_get_mempolicy(int *policy, nodem
return -EINVAL;
if (flags & MPOL_F_ADDR) {
down_read(&mm->mmap_sem);
- vma = find_vma_intersection(mm, addr, addr+1);
+ vma = find_vma_intersection(mm, addr, addr+1, &current->mmcache);
if (!vma) {
up_read(&mm->mmap_sem);
return -EFAULT;
--- linux-2.6.21-rc5/arch/i386/mm/fault.c
+++ linux-2.6.21-rc5-ed/arch/i386/mm/fault.c
@@ -374,7 +374,7 @@ fastcall void __kprobes do_page_fault(st
down_read(&mm->mmap_sem);
}

- vma = find_vma(mm, address);
+ vma = find_vma(mm, address, &tsk->vmacache);
if (!vma)
goto bad_area;
if (vma->vm_start <= address)
--- linux-2.6.21-rc5/kernel/futex.c
+++ linux-2.6.21-rc5-ed/kernel/futex.c
@@ -346,7 +346,7 @@ static int futex_handle_fault(unsigned l
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;

- if (attempt > 2 || !(vma = find_vma(mm, address)) ||
+ if (attempt > 2 || !(vma = find_vma(mm, address, &current->vmacache)) ||
vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
return -EFAULT;

--- linux-2.6.21-rc5/mm/fremap.c
+++ linux-2.6.21-rc5-ed/mm/fremap.c
@@ -146,7 +146,7 @@ asmlinkage long sys_remap_file_pages(uns
/* We need down_write() to change vma->vm_flags. */
down_read(&mm->mmap_sem);
retry:
- vma = find_vma(mm, start);
+ vma = find_vma(mm, start, &current->vmacache);

/*
* Make sure the vma is shared, that it supports prefaulting,
--- linux-2.6.21-rc5/mm/madvise.c
+++ linux-2.6.21-rc5-ed/mm/madvise.c
@@ -329,7 +329,7 @@ asmlinkage long sys_madvise(unsigned lon
if (prev)
vma = prev->vm_next;
else /* madvise_remove dropped mmap_sem */
- vma = find_vma(current->mm, start);
+ vma = find_vma(current->mm, start, &current->vmacache);
}
out:
up_write(&current->mm->mmap_sem);
--- linux-2.6.21-rc5/mm/memory.c
+++ linux-2.6.21-rc5-ed/mm/memory.c
@@ -2556,7 +2556,7 @@ int make_pages_present(unsigned long add
int ret, len, write;
struct vm_area_struct * vma;

- vma = find_vma(current->mm, addr);
+ vma = find_vma(current->mm, addr, &current->vmacache);
if (!vma)
return -1;
write = (vma->vm_flags & VM_WRITE) != 0;
--- linux-2.6.21-rc5/mm/mincore.c
+++ linux-2.6.21-rc5-ed/mm/mincore.c
@@ -63,7 +63,7 @@ static long do_mincore(unsigned long add
unsigned long nr;
int i;
pgoff_t pgoff;
- struct vm_area_struct *vma = find_vma(current->mm, addr);
+ struct vm_area_struct *vma = find_vma(current->mm, addr, &current->vmacache);

/*
* find_vma() didn't find anything above us, or we're
--- linux-2.6.21-rc5/mm/mremap.c
+++ linux-2.6.21-rc5-ed/mm/mremap.c
@@ -315,7 +315,7 @@ unsigned long do_mremap(unsigned long ad
* Ok, we need to grow.. or relocate.
*/
ret = -EFAULT;
- vma = find_vma(mm, addr);
+ vma = find_vma(mm, addr, &current->vmacache);
if (!vma || vma->vm_start > addr)
goto out;
if (is_vm_hugetlb_page(vma)) {
--- linux-2.6.21-rc5/mm/msync.c
+++ linux-2.6.21-rc5-ed/mm/msync.c
@@ -54,7 +54,7 @@ asmlinkage long sys_msync(unsigned long
* just ignore them, but return -ENOMEM at the end.
*/
down_read(&mm->mmap_sem);
- vma = find_vma(mm, start);
+ vma = find_vma(mm, start, &current->vmacache);
for (;;) {
struct file *file;

@@ -86,7 +86,7 @@ asmlinkage long sys_msync(unsigned long
if (error || start >= end)
goto out;
down_read(&mm->mmap_sem);
- vma = find_vma(mm, start);
+ vma = find_vma(mm, start, &current->vmacache);
} else {
if (start >= end) {
error = 0;
--- linux-2.6.21-rc5/fs/proc/task_mmu.c
+++ linux-2.6.21-rc5-ed/fs/proc/task_mmu.c
@@ -405,9 +405,15 @@ static void *m_start(struct seq_file *m,
down_read(&mm->mmap_sem);

/* Start with last addr hint */
- if (last_addr && (vma = find_vma(mm, last_addr))) {
- vma = vma->vm_next;
- goto out;
+ if (last_addr) {
+ struct vm_area_cache nocache = {
+ .sequence = mm->mm_sequence - 1,
+ };
+ vma = find_vma(mm, last_addr, &nocache);
+ if (vma) {
+ vma = vma->vm_next;
+ goto out;
+ }
}

/*
--- linux-2.6.21-rc5/drivers/char/mem.c
+++ linux-2.6.21-rc5-ed/drivers/char/mem.c
@@ -633,7 +633,7 @@ static inline size_t read_zero_pagealign
down_read(&mm->mmap_sem);

/* For private mappings, just map in zero pages. */
- for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
+ for (vma = find_vma(mm, addr, &current->vmacache); vma; vma = vma->vm_next) {
unsigned long count;

if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0)

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/