[PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu

From: Arun Sharma
Date: Wed Jan 18 2012 - 13:53:41 EST


This enables malloc optimizations where we might
madvise(..,MADV_DONTNEED) a page only to fault it
back at a different virtual address.

To ensure that we don't leak sensitive data to
unprivileged processes, we enable this optimization
only for pages that are reused within a memory
cgroup.

The idea is to make this opt-in both at the mmap()
level and cgroup level so the default behavior is
unchanged after the patch.

TODO: Ask for a VM_UNINITIALIZED bit
TODO: Implement a cgroup level opt-in flag

To: linux-kernel@xxxxxxxxxxxxxxx
Cc: linux-mm@xxxxxxxxx
Cc: Balbir Singh <bsingharora@xxxxxxxxx>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
Cc: akpm@xxxxxxxxxxxxxxxxxxxx
Signed-off-by: Arun Sharma <asharma@xxxxxx>
---
include/asm-generic/mman-common.h | 6 +-----
include/linux/highmem.h | 6 ++++++
include/linux/mm.h | 2 ++
include/linux/mman.h | 1 +
include/linux/page_cgroup.h | 29 +++++++++++++++++++++++++++++
init/Kconfig | 2 +-
mm/mempolicy.c | 29 +++++++++++++++++++++++------
7 files changed, 63 insertions(+), 12 deletions(-)

diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h
index 787abbb..71e079f 100644
--- a/include/asm-generic/mman-common.h
+++ b/include/asm-generic/mman-common.h
@@ -19,11 +19,7 @@
#define MAP_TYPE 0x0f /* Mask for type of mapping */
#define MAP_FIXED 0x10 /* Interpret addr exactly */
#define MAP_ANONYMOUS 0x20 /* don't use a file */
-#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
-# define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be uninitialized */
-#else
-# define MAP_UNINITIALIZED 0x0 /* Don't support this flag */
-#endif
+#define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be uninitialized */

#define MS_ASYNC 1 /* sync memory asynchronously */
#define MS_INVALIDATE 2 /* invalidate the caches */
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 3a93f73..caae922 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -4,6 +4,7 @@
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/page_cgroup.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>

@@ -156,6 +157,11 @@ __alloc_zeroed_user_highpage(gfp_t movableflags,
struct page *page = alloc_page_vma(GFP_HIGHUSER | movableflags,
vma, vaddr);

+#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
+ if (!page_needs_clearing(page, vma))
+ return page;
+#endif
+
if (page)
clear_user_highpage(page, vaddr);

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4baadd1..c6bab01 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -118,6 +118,8 @@ extern unsigned int kobjsize(const void *objp);
#define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */
#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */
#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
+#define VM_UNINITIALIZED VM_SAO /* Steal a powerpc bit for now, since we're out
+ of bits for 32 bit archs */

/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ)
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 8b74e9b..9bef6c9 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -87,6 +87,7 @@ calc_vm_flag_bits(unsigned long flags)
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
_calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
_calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
+ _calc_vm_trans(flags, MAP_UNINITIALIZED, VM_UNINITIALIZED) |
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
}
#endif /* __KERNEL__ */
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 961ecc7..e959869 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -155,6 +155,17 @@ static inline unsigned long page_cgroup_array_id(struct page_cgroup *pc)
return (pc->flags >> PCG_ARRAYID_SHIFT) & PCG_ARRAYID_MASK;
}

+static int mm_match_cgroup(const struct mm_struct *mm,
+ const struct mem_cgroup *cgroup);
+static inline bool page_seen_by_cgroup(struct page *page,
+ const struct mm_struct *mm)
+{
+ struct page_cgroup *pcg = lookup_page_cgroup(page);
+ if (pcg == NULL)
+ return false;
+ return mm_match_cgroup(mm, pcg->mem_cgroup);
+}
+
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
struct page_cgroup;

@@ -175,8 +186,26 @@ static inline void __init page_cgroup_init_flatmem(void)
{
}

+static inline bool page_seen_by_cgroup(struct page *page,
+ const struct mm_struct *mm)
+{
+ return false;
+}
+
#endif /* CONFIG_CGROUP_MEM_RES_CTLR */

+static inline bool vma_requests_uninitialized(struct vm_area_struct *vma)
+{
+ return vma && !vma->vm_file && vma->vm_flags & VM_UNINITIALIZED;
+}
+
+static inline bool page_needs_clearing(struct page *page,
+ struct vm_area_struct *vma)
+{
+ return !(vma_requests_uninitialized(vma)
+ && page_seen_by_cgroup(page, vma->vm_mm));
+}
+
#include <linux/swap.h>

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
diff --git a/init/Kconfig b/init/Kconfig
index 43298f9..428e047 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1259,7 +1259,7 @@ endchoice

config MMAP_ALLOW_UNINITIALIZED
bool "Allow mmapped anonymous memory to be uninitialized"
- depends on EXPERT && !MMU
+ depends on EXPERT
default n
help
Normally, and according to the Linux spec, anonymous memory obtained
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c3fdbcb..7c9ab68 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -90,6 +90,7 @@
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
+#include <linux/page_cgroup.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>
@@ -1847,6 +1848,11 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
struct zonelist *zl;
struct page *page;

+#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
+ if (vma_requests_uninitialized(vma))
+ gfp &= ~__GFP_ZERO;
+#endif
+
get_mems_allowed();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
@@ -1854,25 +1860,36 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
- put_mems_allowed();
- return page;
+ goto out;
}
zl = policy_zonelist(gfp, pol, node);
if (unlikely(mpol_needs_cond_ref(pol))) {
/*
* slow path: ref counted shared policy
*/
- struct page *page = __alloc_pages_nodemask(gfp, order,
- zl, policy_nodemask(gfp, pol));
+ page = __alloc_pages_nodemask(gfp, order,
+ zl, policy_nodemask(gfp, pol));
__mpol_put(pol);
- put_mems_allowed();
- return page;
+ goto out;
}
+
/*
* fast path: default or task policy
*/
page = __alloc_pages_nodemask(gfp, order, zl,
policy_nodemask(gfp, pol));
+
+out:
+#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
+ if (page_needs_clearing(page, vma)) {
+ int i;
+ for (i = 0; i < (1 << order); i++) {
+ void *kaddr = kmap_atomic(page + i, KM_USER0);
+ clear_page(kaddr);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ }
+#endif
put_mems_allowed();
return page;
}
--
1.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/