[PATCH 07/23] x86, kaiser: unmap kernel from userspace page tables (core patch)
From: Dave Hansen
Date: Tue Oct 31 2017 - 18:32:15 EST
These patches are based on work from a team at Graz University of
Technology: https://github.com/IAIK/KAISER . This work would not have
been possible without their work as a starting point.
KAISER is a countermeasure against side channel attacks against kernel
virtual memory. It leaves the existing page tables largely alone and
refers to them as the "kernel page tables. It adds a "shadow" pgd for
every process which is intended for use when we run userspace. The
shadow pgd maps all the same user memory as the "kernel" copy, but
only maps a minimal set of kernel memory.
Whenever we enter the kernel (syscalls, interrupts, exceptions), the
pgd is switched to the "kernel" copy. When the system switches back
to user mode, the shadow pgd is used.
The minimalistic kernel page tables try to map only what is needed to
enter/exit the kernel such as the entry/exit functions themselves and
the interrupt descriptors (IDT).
Changes from original KAISER patch:
* Gobs of coding style cleanups
* The original patch tried to allocate an order-2 page, then
8k-align the result. That's silly since order-2 is already
guaranteed to be 16k-aligned. Removed that gunk and just
allocate an order-1 page.
* Handle (or at least detect and warn on) allocation failures
* Use _KERNPG_TABLE, not _PAGE_TABLE when creating mappings for
the kernel in the shadow (user) page tables.
* BUG_ON() for !pte_none() case was totally insane: it checked
the physical address of the 'struct page' against the physical
address of the page being mapped.
* Added 5-level page table support
* Never free kaiser page tables. We don't have the locking to
keep them from getting used while we free them.
* Use a totally different scheme in the entry code. The
original code just fell apart in horrific ways in debug faults,
NMIs, or when iret faults. Big thanks to Andy Lutomirski for
reducing the number of places we had to patch. He made the
code a ton simpler.
Note: The original KAISER authors signed-off on their patch. Some of
their code has been broken out into other patches in this series, but
their SoB was only retained here.
Signed-off-by: Moritz Lipp <moritz.lipp@xxxxxxxxxxxxxx>
Signed-off-by: Daniel Gruss <daniel.gruss@xxxxxxxxxxxxxx>
Signed-off-by: Michael Schwarz <michael.schwarz@xxxxxxxxxxxxxx>
Signed-off-by: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Cc: Moritz Lipp <moritz.lipp@xxxxxxxxxxxxxx>
Cc: Daniel Gruss <daniel.gruss@xxxxxxxxxxxxxx>
Cc: Michael Schwarz <michael.schwarz@xxxxxxxxxxxxxx>
Cc: Andy Lutomirski <luto@xxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Kees Cook <keescook@xxxxxxxxxx>
Cc: Hugh Dickins <hughd@xxxxxxxxxx>
Cc: x86@xxxxxxxxxx
---
b/Documentation/x86/kaiser.txt | 128 ++++++++++++
b/arch/x86/entry/calling.h | 32 ++-
b/arch/x86/include/asm/kaiser.h | 59 +++++
b/arch/x86/include/asm/pgtable.h | 6
b/arch/x86/include/asm/pgtable_64.h | 93 ++++++++
b/arch/x86/kernel/espfix_64.c | 17 +
b/arch/x86/kernel/head_64.S | 14 +
b/arch/x86/mm/Makefile | 1
b/arch/x86/mm/kaiser.c | 380 ++++++++++++++++++++++++++++++++++++
b/arch/x86/mm/pageattr.c | 2
b/arch/x86/mm/pgtable.c | 16 +
b/include/linux/kaiser.h | 34 +++
b/init/main.c | 2
b/kernel/fork.c | 6
14 files changed, 781 insertions(+), 9 deletions(-)
diff -puN arch/x86/entry/calling.h~kaiser-base arch/x86/entry/calling.h
--- a/arch/x86/entry/calling.h~kaiser-base 2017-10-31 15:03:51.817182716 -0700
+++ b/arch/x86/entry/calling.h 2017-10-31 15:03:51.842183897 -0700
@@ -1,6 +1,7 @@
#include <linux/jump_label.h>
#include <asm/unwind_hints.h>
#include <asm/cpufeatures.h>
+#include <asm/page_types.h>
/*
@@ -218,10 +219,19 @@ For 32-bit we have the following convent
#endif
.endm
+#ifdef CONFIG_KAISER
+
+/* KAISER PGDs are 8k. We flip bit 12 to switch between the two halves: */
+#define KAISER_SWITCH_MASK (1<<PAGE_SHIFT)
+
.macro ADJUST_KERNEL_CR3 reg:req
+ /* Clear "KAISER bit", point CR3 at kernel pagetables: */
+ andq $(~KAISER_SWITCH_MASK), \reg
.endm
.macro ADJUST_USER_CR3 reg:req
+ /* Move CR3 up a page to the user page tables: */
+ orq $(KAISER_SWITCH_MASK), \reg
.endm
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
@@ -240,10 +250,10 @@ For 32-bit we have the following convent
movq %cr3, %r\scratch_reg
movq %r\scratch_reg, \save_reg
/*
- * Just stick a random bit in here that never gets set. Fixed
+ * Is the switch bit zero? This means the address is
* up in real KAISER patches in a moment.
*/
- bt $63, %r\scratch_reg
+ testq $(KAISER_SWITCH_MASK), %r\scratch_reg
jz .Ldone_\@
ADJUST_KERNEL_CR3 %r\scratch_reg
@@ -253,10 +263,26 @@ For 32-bit we have the following convent
.endm
.macro RESTORE_CR3 save_reg:req
- /* optimize this */
+ /*
+ * We could avoid the CR3 write if not changing its value,
+ * but that requires a CR3 read *and* a scratch register.
+ */
movq \save_reg, %cr3
.endm
+#else /* CONFIG_KAISER=n: */
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+.endm
+.macro SWITCH_TO_USER_CR3 scratch_reg:req
+.endm
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+.endm
+.macro RESTORE_CR3 save_reg:req
+.endm
+
+#endif
+
#endif /* CONFIG_X86_64 */
/*
diff -puN /dev/null arch/x86/include/asm/kaiser.h
--- /dev/null 2017-05-17 09:46:39.241182829 -0700
+++ b/arch/x86/include/asm/kaiser.h 2017-10-31 15:03:51.843183945 -0700
@@ -0,0 +1,59 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Based on work published here: https://github.com/IAIK/KAISER
+ * Modified by Dave Hansen <dave.hansen@xxxxxxxxx to actually work.
+ */
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_KAISER
+/**
+ * kaiser_add_mapping - map a kernel range into the user page tables
+ * @addr: the start address of the range
+ * @size: the size of the range
+ * @flags: The mapping flags of the pages
+ *
+ * Use this on all data and code that need to be mapped into both
+ * copies of the page tables. This includes the code that switches
+ * to/from userspace and all of the hardware structures that are
+ * virtually-addressed and needed in userspace like the interrupt
+ * table.
+ */
+extern int kaiser_add_mapping(unsigned long addr, unsigned long size,
+ unsigned long flags);
+
+extern int kaiser_map_stack(struct task_struct *tsk);
+
+/**
+ * kaiser_remove_mapping - remove a kernel mapping from the userpage tables
+ * @addr: the start address of the range
+ * @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ * kaiser_init - Initialize the shadow mapping
+ *
+ * Most parts of the shadow mapping can be mapped upon boot
+ * time. Only per-process things like the thread stacks
+ * or a new LDT have to be mapped at runtime. These boot-
+ * time mappings are permanent and never unmapped.
+ */
+extern void kaiser_init(void);
+
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_KAISER_H */
diff -puN arch/x86/include/asm/pgtable_64.h~kaiser-base arch/x86/include/asm/pgtable_64.h
--- a/arch/x86/include/asm/pgtable_64.h~kaiser-base 2017-10-31 15:03:51.819182810 -0700
+++ b/arch/x86/include/asm/pgtable_64.h 2017-10-31 15:03:51.843183945 -0700
@@ -130,9 +130,88 @@ static inline pud_t native_pudp_get_and_
#endif
}
+#ifdef CONFIG_KAISER
+/*
+ * All top-level KAISER page tables are order-1 pages (8k-aligned
+ * and 8k in size). The kernel one is at the beginning 4k and
+ * the user (shadow) one is in the last 4k. To switch between
+ * them, you just need to flip the 12th bit in their addresses.
+ */
+#define KAISER_PGTABLE_SWITCH_BIT PAGE_SHIFT
+
+/*
+ * This generates better code than the inline assembly in
+ * __set_bit().
+ */
+static inline void *ptr_set_bit(void *ptr, int bit)
+{
+ unsigned long __ptr = (unsigned long)ptr;
+ __ptr |= (1<<bit);
+ return (void *)__ptr;
+}
+static inline void *ptr_clear_bit(void *ptr, int bit)
+{
+ unsigned long __ptr = (unsigned long)ptr;
+ __ptr &= ~(1<<bit);
+ return (void *)__ptr;
+}
+
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+ return ptr_set_bit(pgdp, KAISER_PGTABLE_SWITCH_BIT);
+}
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+{
+ return ptr_clear_bit(pgdp, KAISER_PGTABLE_SWITCH_BIT);
+}
+static inline p4d_t *native_get_shadow_p4d(p4d_t *p4dp)
+{
+ return ptr_set_bit(p4dp, KAISER_PGTABLE_SWITCH_BIT);
+}
+static inline p4d_t *native_get_normal_p4d(p4d_t *p4dp)
+{
+ return ptr_clear_bit(p4dp, KAISER_PGTABLE_SWITCH_BIT);
+}
+#endif /* CONFIG_KAISER */
+
+/*
+ * Page table pages are page-aligned. The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ * This returns true for user pages that need to get copied into
+ * both the user and kernel copies of the page tables, and false
+ * for kernel pages that should only be in the kernel copy.
+ */
+static inline bool is_userspace_pgd(void *__ptr)
+{
+ unsigned long ptr = (unsigned long)__ptr;
+
+ return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
+}
+
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
+#if defined(CONFIG_KAISER) && !defined(CONFIG_X86_5LEVEL)
+ /*
+ * set_pgd() does not get called when we are running
+ * CONFIG_X86_5LEVEL=y. So, just hack around it. We
+ * know here that we have a p4d but that it is really at
+ * the top level of the page tables; it is really just a
+ * pgd.
+ */
+ /* Do we need to also populate the shadow p4d? */
+ if (is_userspace_pgd(p4dp))
+ native_get_shadow_p4d(p4dp)->pgd = p4d.pgd;
+ /*
+ * Even if the entry is *mapping* userspace, ensure
+ * that userspace can not use it. This way, if we
+ * get out to userspace with the wrong CR3 value,
+ * userspace will crash instead of running.
+ */
+ if (!p4d.pgd.pgd)
+ p4dp->pgd.pgd = p4d.pgd.pgd | _PAGE_NX;
+#else /* CONFIG_KAISER */
*p4dp = p4d;
+#endif
}
static inline void native_p4d_clear(p4d_t *p4d)
@@ -146,7 +225,21 @@ static inline void native_p4d_clear(p4d_
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
+#ifdef CONFIG_KAISER
+ /* Do we need to also populate the shadow pgd? */
+ if (is_userspace_pgd(pgdp))
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ /*
+ * Even if the entry is mapping userspace, ensure
+ * that it is unusable for userspace. This way,
+ * if we get out to userspace with the wrong CR3
+ * value, userspace will crash instead of running.
+ */
+ if (!pgd_none(pgd))
+ pgdp->pgd = pgd.pgd | _PAGE_NX;
+#else /* CONFIG_KAISER */
*pgdp = pgd;
+#endif
}
static inline void native_pgd_clear(pgd_t *pgd)
diff -puN arch/x86/include/asm/pgtable.h~kaiser-base arch/x86/include/asm/pgtable.h
--- a/arch/x86/include/asm/pgtable.h~kaiser-base 2017-10-31 15:03:51.821182905 -0700
+++ b/arch/x86/include/asm/pgtable.h 2017-10-31 15:03:51.844183992 -0700
@@ -1105,6 +1105,12 @@ static inline void pmdp_set_wrprotect(st
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+ /* Clone the shadow pgd part as well */
+ memcpy(native_get_shadow_pgd(dst),
+ native_get_shadow_pgd(src),
+ count * sizeof(pgd_t));
+#endif
}
#define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff -puN arch/x86/kernel/espfix_64.c~kaiser-base arch/x86/kernel/espfix_64.c
--- a/arch/x86/kernel/espfix_64.c~kaiser-base 2017-10-31 15:03:51.823182999 -0700
+++ b/arch/x86/kernel/espfix_64.c 2017-10-31 15:03:51.844183992 -0700
@@ -41,6 +41,7 @@
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/espfix.h>
+#include <asm/kaiser.h>
/*
* Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -128,6 +129,22 @@ void __init init_espfix_bsp(void)
pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
p4d_populate(&init_mm, p4d, espfix_pud_page);
+ /*
+ * Just copy the top-level PGD that is mapping the espfix
+ * area to ensure it is mapped into the shadow user page
+ * tables.
+ *
+ * For 5-level paging, we should have already populated
+ * the espfix pgd when kaiser_init() pre-populated all
+ * the pgd entries. The above p4d_alloc() would never do
+ * anything and the p4d_populate() would be done to a p4d
+ * already mapped in the userspace pgd.
+ */
+#ifdef CONFIG_KAISER
+ if (CONFIG_PGTABLE_LEVELS <= 4)
+ set_pgd(native_get_shadow_pgd(pgd),
+ __pgd(_KERNPG_TABLE | (p4d_pfn(*p4d) << PAGE_SHIFT)));
+#endif
/* Randomize the locations */
init_espfix_random();
diff -puN arch/x86/kernel/head_64.S~kaiser-base arch/x86/kernel/head_64.S
--- a/arch/x86/kernel/head_64.S~kaiser-base 2017-10-31 15:03:51.826183141 -0700
+++ b/arch/x86/kernel/head_64.S 2017-10-31 15:03:51.844183992 -0700
@@ -339,6 +339,14 @@ GLOBAL(early_recursion_flag)
.balign PAGE_SIZE; \
GLOBAL(name)
+#ifdef CONFIG_KAISER
+#define NEXT_PGD_PAGE(name) \
+ .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#endif
+
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
@@ -348,7 +356,7 @@ GLOBAL(name)
.endr
__INITDATA
-NEXT_PAGE(early_top_pgt)
+NEXT_PGD_PAGE(early_top_pgt)
.fill 511,8,0
#ifdef CONFIG_X86_5LEVEL
.quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
@@ -362,10 +370,10 @@ NEXT_PAGE(early_dynamic_pgts)
.data
#ifndef CONFIG_XEN
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
.fill 512,8,0
#else
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
diff -puN /dev/null arch/x86/mm/kaiser.c
--- /dev/null 2017-05-17 09:46:39.241182829 -0700
+++ b/arch/x86/mm/kaiser.c 2017-10-31 15:03:51.845184039 -0700
@@ -0,0 +1,380 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Based on work published here: https://github.com/IAIK/KAISER
+ * Modified by Dave Hansen <dave.hansen@xxxxxxxxx to actually work.
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+#include <asm/kaiser.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+
+/*
+ * At runtime, the only things we map are some things for CPU
+ * hotplug, and stacks for new processes. No two CPUs will ever
+ * be populating the same addresses, so we only need to ensure
+ * that we protect between two CPUs trying to allocate and
+ * populate the same page table page.
+ *
+ * Only take this lock when doing a set_p[4um]d(), but it is not
+ * needed for doing a set_pte(). We assume that only the *owner*
+ * of a given allocation will be doing this for _their_
+ * allocation.
+ *
+ * This ensures that once a system has been running for a while
+ * and there have been stacks all over and these page tables
+ * are fully populated, there will be no further acquisitions of
+ * this lock.
+ */
+static DEFINE_SPINLOCK(shadow_table_allocation_lock);
+
+/*
+ * Returns -1 on error.
+ */
+static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset_k(vaddr);
+ /*
+ * We made all the kernel PGDs present in kaiser_init().
+ * We expect them to stay that way.
+ */
+ if (pgd_none(*pgd)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+ /*
+ * PGDs are either 512GB or 128TB on all x86_64
+ * configurations. We don't handle these.
+ */
+ if (pgd_large(*pgd)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ p4d = p4d_offset(pgd, vaddr);
+ if (p4d_none(*p4d)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ pud = pud_offset(p4d, vaddr);
+ if (pud_none(*pud)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (pud_large(*pud))
+ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
+
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (pmd_large(*pmd))
+ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
+
+ pte = pte_offset_kernel(pmd, vaddr);
+ if (pte_none(*pte)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
+}
+
+/*
+ * This is a relatively normal page table walk, except that it
+ * also tries to allocate page tables pages along the way.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+#define KAISER_WALK_ATOMIC 0x1
+static pte_t *kaiser_pagetable_walk(unsigned long address, unsigned long flags)
+{
+ pmd_t *pmd;
+ pud_t *pud;
+ p4d_t *p4d;
+ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+ if (flags & KAISER_WALK_ATOMIC) {
+ gfp &= ~GFP_KERNEL;
+ gfp |= __GFP_HIGH | __GFP_ATOMIC;
+ }
+
+ if (pgd_none(*pgd)) {
+ WARN_ONCE(1, "All shadow pgds should have been populated");
+ return NULL;
+ }
+ BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+ p4d = p4d_offset(pgd, address);
+ BUILD_BUG_ON(p4d_large(*p4d) != 0);
+ if (p4d_none(*p4d)) {
+ unsigned long new_pud_page = __get_free_page(gfp);
+ if (!new_pud_page)
+ return NULL;
+
+ spin_lock(&shadow_table_allocation_lock);
+ if (p4d_none(*p4d))
+ set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
+ else
+ free_page(new_pud_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
+
+ pud = pud_offset(p4d, address);
+ /* The shadow page tables do not use large mappings: */
+ if (pud_large(*pud)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pud_none(*pud)) {
+ unsigned long new_pmd_page = __get_free_page(gfp);
+ if (!new_pmd_page)
+ return NULL;
+
+ spin_lock(&shadow_table_allocation_lock);
+ if (pud_none(*pud))
+ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+ else
+ free_page(new_pmd_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
+
+ pmd = pmd_offset(pud, address);
+ /* The shadow page tables do not use large mappings: */
+ if (pmd_large(*pmd)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pmd_none(*pmd)) {
+ unsigned long new_pte_page = __get_free_page(gfp);
+ if (!new_pte_page)
+ return NULL;
+
+ spin_lock(&shadow_table_allocation_lock);
+ if (pmd_none(*pmd))
+ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+ else
+ free_page(new_pte_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
+
+ return pte_offset_kernel(pmd, address);
+}
+
+/*
+ * Given a kernel address, @__start_addr, copy that mapping into
+ * the user (shadow) page tables. This may need to allocate page
+ * table pages.
+ */
+int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+ unsigned long flags)
+{
+ pte_t *pte;
+ unsigned long start_addr = (unsigned long)__start_addr;
+ unsigned long address = start_addr & PAGE_MASK;
+ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+ unsigned long target_address;
+
+ for (; address < end_addr; address += PAGE_SIZE) {
+ target_address = get_pa_from_mapping(address);
+ if (target_address == -1)
+ return -EIO;
+
+ pte = kaiser_pagetable_walk(address, false);
+ /*
+ * Errors come from either -ENOMEM for a page
+ * table page, or something screwy that did a
+ * WARN_ON(). Just return -ENOMEM.
+ */
+ if (!pte)
+ return -ENOMEM;
+ if (pte_none(*pte)) {
+ set_pte(pte, __pte(flags | target_address));
+ } else {
+ pte_t tmp;
+ set_pte(&tmp, __pte(flags | target_address));
+ WARN_ON_ONCE(!pte_same(*pte, tmp));
+ }
+ }
+ return 0;
+}
+
+/*
+ * The stack mapping is called in generic code and can't use
+ * __PAGE_KERNEL
+ */
+int kaiser_map_stack(struct task_struct *tsk)
+{
+ return kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE,
+ __PAGE_KERNEL);
+}
+
+int kaiser_add_user_map_ptrs(const void *__start_addr,
+ const void *__end_addr,
+ unsigned long flags)
+{
+ return kaiser_add_user_map(__start_addr,
+ __end_addr - __start_addr,
+ flags);
+}
+
+/*
+ * Ensure that the top level of the (shadow) page tables are
+ * entirely populated. This ensures that all processes that get
+ * forked have the same entries. This way, we do not have to
+ * ever go set up new entries in older processes.
+ *
+ * Note: we never free these, so there are no updates to them
+ * after this.
+ */
+static void __init kaiser_init_all_pgds(void)
+{
+ pgd_t *pgd;
+ int i = 0;
+
+ pgd = native_get_shadow_pgd(pgd_offset_k(0UL));
+ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+ unsigned long addr = PAGE_OFFSET + i * PGDIR_SIZE;
+#if CONFIG_PGTABLE_LEVELS > 4
+ p4d_t *p4d = p4d_alloc_one(&init_mm, addr);
+ if (!p4d) {
+ WARN_ON(1);
+ break;
+ }
+ set_pgd(pgd + i, __pgd(_KERNPG_TABLE | __pa(p4d)));
+#else /* CONFIG_PGTABLE_LEVELS <= 4 */
+ pud_t *pud = pud_alloc_one(&init_mm, addr);
+ if (!pud) {
+ WARN_ON(1);
+ break;
+ }
+ set_pgd(pgd + i, __pgd(_KERNPG_TABLE | __pa(pud)));
+#endif /* CONFIG_PGTABLE_LEVELS */
+ }
+}
+
+/*
+ * The page table allocations in here can theoretically fail, but
+ * we can not do much about it in early boot. Do the checking
+ * and warning in a macro to make it more readable.
+ */
+#define kaiser_add_user_map_early(start, size, flags) do { \
+ int __ret = kaiser_add_user_map(start, size, flags); \
+ WARN_ON(__ret); \
+} while (0)
+
+#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
+ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
+ WARN_ON(__ret); \
+} while (0)
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+/*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die. But, we
+ * will have most of the kernel up by then and should be able to
+ * get a clean warning out of it. If we BUG_ON() here, we run
+ * the risk of being before we have good console output.
+ */
+void __init kaiser_init(void)
+{
+ int cpu;
+
+ kaiser_init_all_pgds();
+
+ for_each_possible_cpu(cpu) {
+ void *percpu_vaddr = __per_cpu_user_mapped_start +
+ per_cpu_offset(cpu);
+ unsigned long percpu_sz = __per_cpu_user_mapped_end -
+ __per_cpu_user_mapped_start;
+ kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
+ __PAGE_KERNEL);
+ }
+
+ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
+ __PAGE_KERNEL_RX);
+
+ /* the fixed map address of the idt_table */
+ kaiser_add_user_map_early((void *)idt_descr.address,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL_RO);
+}
+
+int kaiser_add_mapping(unsigned long addr, unsigned long size,
+ unsigned long flags)
+{
+ return kaiser_add_user_map((const void *)addr, size, flags);
+}
+
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+ unsigned long addr;
+
+ /* The shadow page tables always use small pages: */
+ for (addr = start; addr < start + size; addr += PAGE_SIZE) {
+ /*
+ * Do an "atomic" walk in case this got called from an atomic
+ * context. This should not do any allocations because we
+ * should only be walking things that are known to be mapped.
+ */
+ pte_t *pte = kaiser_pagetable_walk(addr, KAISER_WALK_ATOMIC);
+
+ /*
+ * We are removing a mapping that shoud
+ * exist. WARN if it was not there:
+ */
+ if (!pte) {
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ pte_clear(&init_mm, addr, pte);
+ }
+ /*
+ * This ensures that the TLB entries used to map this data are
+ * no longer usable on *this* CPU. We theoretically want to
+ * flush the entries on all CPUs here, but that's too
+ * expensive right now: this is called to unmap process
+ * stacks in the exit() path path.
+ *
+ * This can change if we get to the point where this is not
+ * in a remotely hot path, like only called via write_ldt().
+ *
+ * Note: we could probably also just invalidate the individual
+ * addresses to take care of *this* PCID and then do a
+ * tlb_flush_shared_nonglobals() to ensure that all other
+ * PCIDs get flushed before being used again.
+ */
+ __native_flush_tlb_global();
+}
diff -puN arch/x86/mm/Makefile~kaiser-base arch/x86/mm/Makefile
--- a/arch/x86/mm/Makefile~kaiser-base 2017-10-31 15:03:51.828183236 -0700
+++ b/arch/x86/mm/Makefile 2017-10-31 15:03:51.845184039 -0700
@@ -45,6 +45,7 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulatio
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_KAISER) += kaiser.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff -puN arch/x86/mm/pageattr.c~kaiser-base arch/x86/mm/pageattr.c
--- a/arch/x86/mm/pageattr.c~kaiser-base 2017-10-31 15:03:51.830183330 -0700
+++ b/arch/x86/mm/pageattr.c 2017-10-31 15:03:51.847184134 -0700
@@ -859,7 +859,7 @@ static void unmap_pmd_range(pud_t *pud,
pud_clear(pud);
}
-static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
+void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
{
pud_t *pud = pud_offset(p4d, start);
diff -puN arch/x86/mm/pgtable.c~kaiser-base arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c~kaiser-base 2017-10-31 15:03:51.833183472 -0700
+++ b/arch/x86/mm/pgtable.c 2017-10-31 15:03:51.847184134 -0700
@@ -354,14 +354,26 @@ static inline void _pgd_free(pgd_t *pgd)
kmem_cache_free(pgd_cache, pgd);
}
#else
+
+#ifdef CONFIG_KAISER
+/*
+ * Instead of one pgd, we aquire two pgds. Being order-1, it is
+ * both 8k in size and 8k-aligned. That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
+
static inline pgd_t *_pgd_alloc(void)
{
- return (pgd_t *)__get_free_page(PGALLOC_GFP);
+ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
}
static inline void _pgd_free(pgd_t *pgd)
{
- free_page((unsigned long)pgd);
+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */
diff -puN /dev/null Documentation/x86/kaiser.txt
--- /dev/null 2017-05-17 09:46:39.241182829 -0700
+++ b/Documentation/x86/kaiser.txt 2017-10-31 15:03:51.848184181 -0700
@@ -0,0 +1,128 @@
+KAISER is a countermeasure against attacks on kernel address
+information. There are at least three existing, published,
+approaches using the shared user/kernel mapping and hardware features
+to defeat KASLR. One approach referenced in the paper locates the
+kernel by observing differences in page fault timing between
+present-but-inaccessable kernel pages and non-present pages.
+
+When we enter the kernel via syscalls, interrupts or exceptions,
+page tables are switched to the full "kernel" copy. When the
+system switches back to user mode, the user/shadow copy is used.
+
+The minimalistic kernel portion of the user page tables try to
+map only what is needed to enter/exit the kernel such as the
+entry/exit functions themselves and the interrupt descriptor
+table (IDT).
+
+This helps ensure that side-channel attacks that leverage the
+paging structures do not function when KAISER is enabled. It
+can be enabled by setting CONFIG_KAISER=y
+
+Protection against side-channel attacks is important. But,
+this protection comes at a cost:
+
+1. Increased Memory Use
+ a. Each process now needs an order-1 PGD instead of order-0.
+ (Consumes 4k per process).
+ b. The pre-allocated second-level (p4d or pud) kernel page
+ table pages cost ~1MB of additional memory at boot. This
+ is not totally wasted because some of these pages would
+ have been needed eventually for normal kernel page tables
+ and things in the vmalloc() area like vmemmap[].
+ c. Statically-allocated structures and entry/exit text must
+ be padded out to 4k (or 8k for PGDs) so they can be mapped
+ into the user page tables. This bloats the kernel image
+ by ~20-30k.
+ d. The shadow page tables eventually grow to map all of used
+ vmalloc() space. They can have roughly the same memory
+ consumption as the vmalloc() page tables.
+
+2. Runtime Cost
+ a. CR3 manipulation to switch between the page table copies
+ must be done at interrupt, syscall, and exception entry
+ and exit (it can be skipped when the kernel is interrupted,
+ though.) Moves to CR3 are on the order of a hundred
+ cycles, and we need one at entry and another at exit.
+ b. Task stacks must be mapped/unmapped. We need to walk
+ and modify the shadow page tables at fork() and exit().
+ c. Global pages are disabled. This feature of the MMU
+ allows different processes to share TLB entries mapping
+ the kernel. Losing the feature means potentially more
+ TLB misses after a context switch.
+ d. Process Context IDentifiers (PCID) is a CPU feature that
+ allows us to skip flushing the entire TLB when we switch
+ the page tables. This makes switching the page tables
+ (at context switch, or kernel entry/exit) cheaper. But,
+ on systems with PCID support, the context switch code
+ must flush both the user and kernel entries out of the
+ TLB, with an INVPCID in addition to the CR3 write. This
+ INVPCID is generally slower than a CR3 write, but still
+ on the order of a hundred cycles.
+ e. The shadow page tables must be populated for each new
+ process. Even without KAISER, since we share all of the
+ kernel mappings in all processes, we can do all this
+ population for kernel addresses at the top level of the
+ page tables (the PGD level). But, with KAISER, we now
+ have *two* kernel mappings: one in the kernel page tables
+ that maps everything and one in the user/shadow page
+ tables mapping the "minimal" kernel. At fork(), we
+ copy the portion of the shadow PGD that maps the minimal
+ kernel structures in addition to the normal kernel one.
+ f. In addition to the fork()-time copying, we must also
+ update the shadow PGD any time a set_pgd() is done on a
+ PGD used to map userspace. This ensures that the kernel
+ and user/shadow copies always map the same userspace
+ memory.
+ g. On systems without PCID support, each CR3 write flushes
+ the entire TLB. That means that each syscall, interrupt
+ or exception flushes the TLB.
+
+Possible Future Work:
+1. We can be more careful about not actually writing to CR3
+ unless we actually switch it.
+2. Try to have dedicated entry/exit kernel stacks so we do
+ not have to map/unmap the task/thread stacks.
+3. Compress the user/shadow-mapped data to be mapped together
+ underneath a single PGD entry.
+4. Re-enable global pages, but use them for mappings in the
+ user/shadow page tables. This would allow the kernel to
+ take advantage of TLB entries that were established from
+ the user page tables. This might speed up the entry/exit
+ code or userspace since it will not have to reload all of
+ its TLB entries. However, its upside is limited by PCID
+ being used.
+5. Allow KAISER to enabled/disabled at runtime so folks can
+ run a single kernel image.
+
+Debugging:
+
+Bugs in KAISER cause a few different signatures of crashes
+that are worth noting here.
+
+ * Crashes in early boot, especially around CPU bringup. Bugs
+ in the trampoline code or mappings cause these.
+ * Crashes at the first interrupt. Caused by bugs in entry_64.S,
+ like screwing up a page table switch. Also caused by
+ incorrectly mapping the IRQ handler entry code.
+ * Crashes at the first NMI. The NMI code is separate from main
+ interrupt handlers and can have bugs that do not affect
+ normal interrupts. Also caused by incorrectly mapping NMI
+ code. NMIs that interrupt the entry code must be very
+ careful and can be the cause of crashes that show up when
+ running perf.
+ * Kernel crashes at the first exit to userspace. entry_64.S
+ bugs, or failing to map some of the exit code.
+ * Crashes at first interrupt that interrupts userspace. The paths
+ in entry_64.S that return to userspace are sometimes separate
+ from the ones that return to the kernel.
+ * Double faults: overflowing the kernel stack because of page
+ faults upon page faults. Caused by touching non-kaiser-mapped
+ data in the entry code, or forgetting to switch to kernel
+ CR3 before calling into C functions which are not kaiser-mapped.
+ * Failures of the selftests/x86 code. Usually a bug in one of the
+ more obscure corners of entry_64.S
+ * Userspace segfaults early in boot, sometimes manifesting
+ as mount(8) failing to mount the rootfs. These have
+ tended to be TLB invalidation issues. Usually invalidating
+ the wrong PCID, or otherwise missing an invalidation.
+
diff -puN /dev/null include/linux/kaiser.h
--- /dev/null 2017-05-17 09:46:39.241182829 -0700
+++ b/include/linux/kaiser.h 2017-10-31 15:03:51.848184181 -0700
@@ -0,0 +1,34 @@
+#ifndef _INCLUDE_KAISER_H
+#define _INCLUDE_KAISER_H
+
+#ifdef CONFIG_KAISER
+#include <asm/kaiser.h>
+#else
+
+/*
+ * These stubs are used whenever CONFIG_KAISER is off, which
+ * includes architectures that support KAISER, but have it
+ * disabled.
+ */
+
+static inline int kaiser_map_stack(struct task_struct *tsk)
+{
+ return 0;
+}
+
+static inline void kaiser_init(void)
+{
+}
+
+static inline void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+}
+
+static inline int kaiser_add_mapping(unsigned long addr, unsigned long size,
+ unsigned long flags)
+{
+ return 0;
+}
+
+#endif /* !CONFIG_KAISER */
+#endif /* _INCLUDE_KAISER_H */
diff -puN init/main.c~kaiser-base init/main.c
--- a/init/main.c~kaiser-base 2017-10-31 15:03:51.836183614 -0700
+++ b/init/main.c 2017-10-31 15:03:51.848184181 -0700
@@ -75,6 +75,7 @@
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
+#include <linux/kaiser.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/sched_clock.h>
@@ -504,6 +505,7 @@ static void __init mm_init(void)
pgtable_init();
vmalloc_init();
ioremap_huge_init();
+ kaiser_init();
}
asmlinkage __visible void __init start_kernel(void)
diff -puN kernel/fork.c~kaiser-base kernel/fork.c
--- a/kernel/fork.c~kaiser-base 2017-10-31 15:03:51.838183708 -0700
+++ b/kernel/fork.c 2017-10-31 15:03:51.849184228 -0700
@@ -70,6 +70,7 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
+#include <linux/kaiser.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
@@ -247,6 +248,8 @@ static unsigned long *alloc_thread_stack
static inline void free_thread_stack(struct task_struct *tsk)
{
+ kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE);
+
#ifdef CONFIG_VMAP_STACK
if (task_stack_vm_area(tsk)) {
int i;
@@ -536,6 +539,9 @@ static struct task_struct *dup_task_stru
* functions again.
*/
tsk->stack = stack;
+ err = kaiser_map_stack(tsk);
+ if (err)
+ goto free_stack;
#ifdef CONFIG_VMAP_STACK
tsk->stack_vm_area = stack_vm_area;
#endif
_