Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings

From: David Hildenbrand
Date: Mon Oct 28 2019 - 10:55:38 EST


On 27.10.19 11:17, Mike Rapoport wrote:
From: Mike Rapoport <rppt@xxxxxxxxxxxxx>

The mappings created with MAP_EXCLUSIVE are visible only in the context of
the owning process and can be used by applications to store secret
information that will not be visible to other processes or even to the
kernel.

The pages in these mappings are removed from the kernel direct map and
marked with PG_user_exclusive flag. When the exclusive area is unmapped,
the pages are mapped back into the direct map.

The MAP_EXCLUSIVE flag implies MAP_POPULATE and MAP_LOCKED.

Signed-off-by: Mike Rapoport <rppt@xxxxxxxxxxxxx>
---
arch/x86/mm/fault.c | 14 ++++++++++
fs/proc/task_mmu.c | 1 +
include/linux/mm.h | 9 +++++++
include/linux/page-flags.h | 7 +++++
include/linux/page_excl.h | 49 ++++++++++++++++++++++++++++++++++
include/trace/events/mmflags.h | 9 ++++++-
include/uapi/asm-generic/mman-common.h | 1 +
kernel/fork.c | 3 ++-
mm/Kconfig | 3 +++
mm/gup.c | 8 ++++++
mm/memory.c | 3 +++
mm/mmap.c | 16 +++++++++++
mm/page_alloc.c | 5 ++++
13 files changed, 126 insertions(+), 2 deletions(-)
create mode 100644 include/linux/page_excl.h

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9ceacd1..8f73a75 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -17,6 +17,7 @@
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
#include <linux/efi.h> /* efi_recover_from_page_fault()*/
+#include <linux/page_excl.h> /* page_is_user_exclusive() */
#include <linux/mm_types.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
@@ -1218,6 +1219,13 @@ static int fault_in_kernel_space(unsigned long address)
return address >= TASK_SIZE_MAX;
}
+/*
+ * Return true if a kernel-space fault address hits a page that was
+ * removed from the direct map by MAP_EXCLUSIVE (PG_user_exclusive set).
+ * NOTE(review): assumes @address lies in the direct map, otherwise
+ * virt_to_page() is not valid — confirm callers guarantee this.
+ */
+static bool fault_in_user_exclusive_page(unsigned long address)
+{
+ struct page *page = virt_to_page(address);
+
+ return page_is_user_exclusive(page);
+}
+
/*
* Called for all faults where 'address' is part of the kernel address
* space. Might get called for faults that originate from *code* that
@@ -1261,6 +1269,12 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
if (spurious_kernel_fault(hw_error_code, address))
return;
+ /* FIXME: warn and handle gracefully */
+ if (unlikely(fault_in_user_exclusive_page(address))) {
+ pr_err("page fault in user exclusive page at %lx", address);
+ force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)address);
+ }
+
/* kprobes don't want to hook the spurious faults: */
if (kprobe_page_fault(regs, X86_TRAP_PF))
return;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9442631..99e14d1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -655,6 +655,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
#ifdef CONFIG_X86_INTEL_MPX
[ilog2(VM_MPX)] = "mp",
#endif
+ [ilog2(VM_EXCLUSIVE)] = "xl",
[ilog2(VM_LOCKED)] = "lo",
[ilog2(VM_IO)] = "io",
[ilog2(VM_SEQ_READ)] = "sr",
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cc29227..9c43375 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -298,11 +298,13 @@ extern unsigned int kobjsize(const void *objp);
#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
+#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
#ifdef CONFIG_ARCH_HAS_PKEYS
@@ -340,6 +342,12 @@ extern unsigned int kobjsize(const void *objp);
# define VM_MPX VM_NONE
#endif
+#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
+# define VM_EXCLUSIVE VM_HIGH_ARCH_5
+#else
+# define VM_EXCLUSIVE VM_NONE
+#endif
+
#ifndef VM_GROWSUP
# define VM_GROWSUP VM_NONE
#endif
@@ -2594,6 +2602,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_ANON 0x8000 /* don't do file mappings */
#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
+#define FOLL_EXCLUSIVE 0x40000 /* mapping is exclusive to owning mm */
/*
* NOTE on FOLL_LONGTERM:
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f91cb88..32d0aee 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -131,6 +131,9 @@ enum pageflags {
PG_young,
PG_idle,
#endif
+#if defined(CONFIG_EXCLUSIVE_USER_PAGES)
+ PG_user_exclusive,
+#endif

Last time I tried to introduce a new page flag I learned that this is very much frowned upon. Best you can usually do is reuse another flag - if valid in that context.

--

Thanks,

David / dhildenb