[patch 1/4] MAP_NOZERO v2 - implement a new VM_NOZERO/MAP_NOZERO page retirement policy

From: Davide Libenzi
Date: Thu Jun 28 2007 - 14:49:48 EST


This is the core implementation of the new VM_NOZERO page retirement
policy (and the associated MAP_NOZERO mmap flag).
A new field, owner_uid, is added to the mm_struct, and it is kept set to
the effective UID of the task that owns the mm_struct.
At the moment, the patch re-uses the _mapcount member of struct page to
store the last owner UID, which avoids growing struct page. With the
patch's PAGE_MAPCNT_UIDBIAS of 1000, for example, UID 500 is stored as
_mapcount = -1500, and page_mapcount() reports any such biased value as
a map count of zero.
When pages exit (are unmapped from) a vma, they are marked with the
owner_uid of the mm_struct that owns the vma.
When pages leave the allocator, their owner UID is cleared, unless the
new __GFP_UIDKEEP flag is passed down to __alloc_pages(). So every page
fetcher other than the new alloc_zeroed_page_vma() clears the owner UID,
which prevents any later reuse of the page without clearing it.
The new alloc_zeroed_page_vma() allocates the page with the __GFP_UIDKEEP
flag, then checks whether the VM_NOZERO flag is set in the vma and whether
the owner UID recorded in the page matches the owner_uid of the mm_struct
owning the vma. If either test fails, the page is cleared in the usual
way; otherwise it is handed back without being cleared.
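
For illustration only (not part of the patch), here is a minimal userspace
sketch of how MAP_NOZERO is meant to be used; the MAP_NOZERO value mirrors
the asm-generic/mman.h definition added below, and the #ifndef guard is
there only because installed headers do not carry the flag yet:

#include <sys/mman.h>
#include <stdio.h>

#ifndef MAP_NOZERO
#define MAP_NOZERO 0x04000000	/* value from asm-generic/mman.h below */
#endif

int main(void)
{
	size_t len = 1UL << 20;
	void *p;

	/* MAP_NOZERO is only valid for anonymous mappings; passing a
	 * file descriptor together with MAP_NOZERO fails with EINVAL. */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_NOZERO, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/*
	 * Pages faulted in here may contain stale data, but only data
	 * whose last owner was this same effective UID; anything else
	 * is still zeroed by alloc_zeroed_page_vma().
	 */
	munmap(p, len);
	return 0;
}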




Signed-off-by: Davide Libenzi <davidel@xxxxxxxxxxxxxxx>


- Davide



---
include/asm-alpha/page.h | 3 ++-
include/asm-cris/page.h | 3 ++-
include/asm-generic/mman.h | 1 +
include/asm-h8300/page.h | 3 ++-
include/asm-i386/page.h | 3 ++-
include/asm-ia64/page.h | 2 +-
include/asm-m32r/page.h | 3 ++-
include/asm-m68knommu/page.h | 3 ++-
include/asm-s390/page.h | 3 ++-
include/asm-x86_64/page.h | 3 ++-
include/linux/gfp.h | 5 +++++
include/linux/highmem.h | 7 +------
include/linux/mm.h | 29 ++++++++++++++++++++++++++++-
include/linux/mm_types.h | 3 +++
include/linux/mman.h | 3 ++-
include/linux/rmap.h | 1 +
include/linux/sched.h | 3 +++
kernel/fork.c | 1 +
kernel/sys.c | 3 +++
mm/mmap.c | 3 ++-
mm/page_alloc.c | 33 +++++++++++++++++++++++++++++++++
mm/rmap.c | 14 ++++++++++++++
22 files changed, 114 insertions(+), 18 deletions(-)

Index: linux-2.6.mod/include/linux/sched.h
===================================================================
--- linux-2.6.mod.orig/include/linux/sched.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/linux/sched.h 2007-06-28 11:45:06.000000000 -0700
@@ -386,6 +386,9 @@
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
+
+ /* Effective UID of the owner of this mm_struct */
+ uid_t owner_uid;
};

struct sighand_struct {
Index: linux-2.6.mod/mm/rmap.c
===================================================================
--- linux-2.6.mod.orig/mm/rmap.c 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/mm/rmap.c 2007-06-27 22:41:33.000000000 -0700
@@ -627,6 +627,16 @@
}
#endif

+void page_set_owner(struct page *page, uid_t owner_uid)
+{
+ if (unlikely(PageCompound(page))) {
+ unsigned int nrpages = 1U << compound_order(page);
+ for (; nrpages; nrpages--, page++)
+ page_set_owner_uid(page, owner_uid);
+ } else
+ page_set_owner_uid(page, owner_uid);
+}
+
/**
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
@@ -649,6 +659,10 @@
print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
BUG();
}
+ /*
+ * Record the last owner of the page.
+ */
+ page_set_owner(page, vma->vm_mm->owner_uid);

/*
* It would be tidy to reset the PageAnon mapping here,
Index: linux-2.6.mod/kernel/fork.c
===================================================================
--- linux-2.6.mod.orig/kernel/fork.c 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/kernel/fork.c 2007-06-27 22:41:33.000000000 -0700
@@ -342,6 +342,7 @@
mm->ioctx_list = NULL;
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
+ mm->owner_uid = current->euid;

if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
Index: linux-2.6.mod/include/linux/highmem.h
===================================================================
--- linux-2.6.mod.orig/include/linux/highmem.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/linux/highmem.h 2007-06-27 22:41:33.000000000 -0700
@@ -76,12 +76,7 @@
static inline struct page *
alloc_zeroed_user_highpage(struct vm_area_struct *vma, unsigned long vaddr)
{
- struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, vaddr);
-
- if (page)
- clear_user_highpage(page, vaddr);
-
- return page;
+ return alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr);
}
#endif

Index: linux-2.6.mod/include/linux/mm.h
===================================================================
--- linux-2.6.mod.orig/include/linux/mm.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/linux/mm.h 2007-06-28 11:45:10.000000000 -0700
@@ -169,6 +169,7 @@
#define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */
#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */
#define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */
+#define VM_NOZERO 0x08000000 /* Do not zero the page, if possible */

#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
@@ -596,6 +597,8 @@
*/
#define PAGE_MAPPING_ANON 1

+#define PAGE_MAPCNT_UIDBIAS 1000
+
extern struct address_space swapper_space;
static inline struct address_space *page_mapping(struct page *page)
{
@@ -640,7 +643,8 @@

static inline int page_mapcount(struct page *page)
{
- return atomic_read(&(page)->_mapcount) + 1;
+ int cnt = atomic_read(&(page)->_mapcount);
+ return cnt <= -PAGE_MAPCNT_UIDBIAS ? 0 : cnt + 1;
}

/*
@@ -652,6 +656,29 @@
}

/*
+ * Re-use the ->_mapcount member to store the last owner UID of the page,
+ * encoded as negative values starting at -PAGE_MAPCNT_UIDBIAS and going
+ * down. Since page_mapcount() is used for consistency checks even after
+ * the page has been unmapped, it is taught about PAGE_MAPCNT_UIDBIAS so
+ * that encoded UIDs read back as a zero map count.
+ */
+static inline void reset_owner_uid(struct page *page)
+{
+ reset_page_mapcount(page);
+}
+
+static inline uid_t page_get_owner_uid(struct page *page)
+{
+ int uid = atomic_read(&page->_mapcount);
+ return (uid_t) (-uid - PAGE_MAPCNT_UIDBIAS);
+}
+
+static inline void page_set_owner_uid(struct page *page, uid_t uid)
+{
+ atomic_set(&page->_mapcount, -(int) uid - PAGE_MAPCNT_UIDBIAS);
+}
+
+/*
* Error return values for the *_nopage functions
*/
#define NOPAGE_SIGBUS (NULL)
Index: linux-2.6.mod/include/asm-alpha/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-alpha/page.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/asm-alpha/page.h 2007-06-27 22:41:33.000000000 -0700
@@ -17,7 +17,8 @@
extern void clear_page(void *page);
#define clear_user_page(page, vaddr, pg) clear_page(page)

-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vmaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+ alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr)
#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE

extern void copy_page(void * _to, void * _from);
Index: linux-2.6.mod/include/asm-cris/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-cris/page.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/asm-cris/page.h 2007-06-27 22:41:33.000000000 -0700
@@ -20,7 +20,8 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+ alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr)
#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE

/*
Index: linux-2.6.mod/include/asm-h8300/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-h8300/page.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/asm-h8300/page.h 2007-06-27 22:41:33.000000000 -0700
@@ -22,7 +22,8 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+ alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr)
#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE

/*
Index: linux-2.6.mod/include/asm-i386/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-i386/page.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/asm-i386/page.h 2007-06-27 22:41:33.000000000 -0700
@@ -34,7 +34,8 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+ alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr)
#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE

/*
Index: linux-2.6.mod/include/asm-ia64/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-ia64/page.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/asm-ia64/page.h 2007-06-27 22:41:33.000000000 -0700
@@ -89,7 +89,7 @@

#define alloc_zeroed_user_highpage(vma, vaddr) \
({ \
- struct page *page = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr); \
+ struct page *page = alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr); \
if (page) \
flush_dcache_page(page); \
page; \
Index: linux-2.6.mod/include/asm-m32r/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-m32r/page.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/asm-m32r/page.h 2007-06-27 22:41:33.000000000 -0700
@@ -15,7 +15,8 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+ alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr)
#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE

/*
Index: linux-2.6.mod/include/asm-m68knommu/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-m68knommu/page.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/asm-m68knommu/page.h 2007-06-27 22:41:33.000000000 -0700
@@ -22,7 +22,8 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+ alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr)
#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE

/*
Index: linux-2.6.mod/include/asm-s390/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-s390/page.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/asm-s390/page.h 2007-06-27 22:41:33.000000000 -0700
@@ -64,7 +64,8 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+ alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr)
#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE

/*
Index: linux-2.6.mod/include/asm-x86_64/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-x86_64/page.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/asm-x86_64/page.h 2007-06-27 22:41:33.000000000 -0700
@@ -48,7 +48,8 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+ alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr)
#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
/*
* These are used to make use of C type-checking..
Index: linux-2.6.mod/include/asm-generic/mman.h
===================================================================
--- linux-2.6.mod.orig/include/asm-generic/mman.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/asm-generic/mman.h 2007-06-27 22:41:33.000000000 -0700
@@ -13,6 +13,7 @@
#define PROT_NONE 0x0 /* page can not be accessed */
#define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */
#define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */
+#define MAP_NOZERO 0x04000000 /* Do not zero the pages, if possible */

#define MAP_SHARED 0x01 /* Share changes */
#define MAP_PRIVATE 0x02 /* Changes are private */
Index: linux-2.6.mod/include/linux/mman.h
===================================================================
--- linux-2.6.mod.orig/include/linux/mman.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/linux/mman.h 2007-06-27 22:41:33.000000000 -0700
@@ -63,7 +63,8 @@
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
_calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
_calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
- _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
+ _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
+ _calc_vm_trans(flags, MAP_NOZERO, VM_NOZERO );
}
#endif /* __KERNEL__ */
#endif /* _LINUX_MMAN_H */
Index: linux-2.6.mod/mm/mmap.c
===================================================================
--- linux-2.6.mod.orig/mm/mmap.c 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/mm/mmap.c 2007-06-28 11:45:10.000000000 -0700
@@ -915,7 +915,8 @@

if (!len)
return -EINVAL;
-
+ if (file && (flags & MAP_NOZERO))
+ return -EINVAL;
error = arch_mmap_check(addr, len, flags);
if (error)
return error;
Index: linux-2.6.mod/mm/page_alloc.c
===================================================================
--- linux-2.6.mod.orig/mm/page_alloc.c 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/mm/page_alloc.c 2007-06-27 22:41:33.000000000 -0700
@@ -1370,11 +1370,44 @@
show_mem();
}
got_pg:
+ if (page && !(gfp_mask & __GFP_UIDKEEP)) {
+ unsigned int pgcount = 1U << order;
+ struct page *npage = page;
+
+ /*
+ * It'd be possible to remove the loop below by resetting
+ * page->owner_uid when the page is handed back to the buddy
+ * allocator. Here we would simply reset page->owner_uid only.
+ * This reduces the efficiency of page reuse though, since pages
+ * used by a user may be reset too early.
+ */
+ for (; pgcount; pgcount--, npage++)
+ reset_owner_uid(npage);
+ }
return page;
}

EXPORT_SYMBOL(__alloc_pages);

+static inline int page_need_clear(struct vm_area_struct *vma, struct page *page)
+{
+ return (vma->vm_flags & VM_NOZERO) == 0 ||
+ page_get_owner_uid(page) != vma->vm_mm->owner_uid;
+}
+
+struct page *alloc_zeroed_page_vma(struct vm_area_struct *vma, gfp_t gfp_mask,
+ unsigned long vaddr)
+{
+ struct page *page = alloc_page_vma(gfp_mask | __GFP_UIDKEEP, vma, vaddr);
+
+ if (page) {
+ if (page_need_clear(vma, page))
+ clear_user_highpage(page, vaddr);
+ reset_owner_uid(page);
+ }
+ return page;
+}
+
/*
* Common helper functions.
*/
Index: linux-2.6.mod/include/linux/gfp.h
===================================================================
--- linux-2.6.mod.orig/include/linux/gfp.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/linux/gfp.h 2007-06-27 22:41:33.000000000 -0700
@@ -45,6 +45,7 @@
#define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
#define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
#define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */
+#define __GFP_UIDKEEP ((__force gfp_t)0x80000u) /* Do not clear owner UID */

#define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
@@ -124,6 +125,10 @@
extern struct page *
FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));

+extern struct page *alloc_zeroed_page_vma(struct vm_area_struct *vma,
+ gfp_t gfp_mask,
+ unsigned long vaddr);
+
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
Index: linux-2.6.mod/include/linux/rmap.h
===================================================================
--- linux-2.6.mod.orig/include/linux/rmap.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/linux/rmap.h 2007-06-27 22:41:33.000000000 -0700
@@ -72,6 +72,7 @@
void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
void page_add_file_rmap(struct page *);
+void page_set_owner(struct page *page, uid_t owner_uid);
void page_remove_rmap(struct page *, struct vm_area_struct *);

#ifdef CONFIG_DEBUG_VM
Index: linux-2.6.mod/kernel/sys.c
===================================================================
--- linux-2.6.mod.orig/kernel/sys.c 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/kernel/sys.c 2007-06-27 22:41:33.000000000 -0700
@@ -1149,6 +1149,7 @@

if (new_euid != old_euid) {
current->mm->dumpable = suid_dumpable;
+ current->mm->owner_uid = new_euid;
smp_wmb();
}
current->fsuid = current->euid = new_euid;
@@ -1199,6 +1200,7 @@

if (old_euid != uid) {
current->mm->dumpable = suid_dumpable;
+ current->mm->owner_uid = uid;
smp_wmb();
}
current->fsuid = current->euid = uid;
@@ -1244,6 +1246,7 @@
if (euid != (uid_t) -1) {
if (euid != current->euid) {
current->mm->dumpable = suid_dumpable;
+ current->mm->owner_uid = euid;
smp_wmb();
}
current->euid = euid;
Index: linux-2.6.mod/include/linux/mm_types.h
===================================================================
--- linux-2.6.mod.orig/include/linux/mm_types.h 2007-06-27 22:41:31.000000000 -0700
+++ linux-2.6.mod/include/linux/mm_types.h 2007-06-27 22:41:33.000000000 -0700
@@ -23,6 +23,9 @@
atomic_t _mapcount; /* Count of ptes mapped in mms,
* to show when page is mapped
* & limit reverse map searches.
+ * This field is also used to store
+ * the last page owner UID. See
+ * include/linux/mm.h
*/
struct { /* SLUB uses */
short unsigned int inuse;
