kernel page table isolation for x86-32 was Re: Linux 4.15-rc7

From: Pavel Machek
Date: Sat Jan 13 2018 - 07:52:18 EST


Hi!

> > I'll try to do the right thing. OTOH... I don't like the fact that
> > kernel memory on my machine is currently readable, probably even from
> > javascript.
>
> Oh, absolutely. I'm just saying that it's probably best to try to
> start from the x86-64 KPTI model, and see how that works for x86-32.

Ok, it should not be too bad. Here's something... getting it to
compile should be easy, getting it to work might be trickier. Not sure
what needs to be done for the LDT.

Pavel

diff --git a/Documentation/x86/pti.txt b/Documentation/x86/pti.txt
index d11eff6..e13e1e5 100644
--- a/Documentation/x86/pti.txt
+++ b/Documentation/x86/pti.txt
@@ -124,7 +124,7 @@ Possible Future Work
boot-time switching.

Testing
-========
+=======

To test stability of PTI, the following test procedure is recommended,
ideally doing all of these in parallel:
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 45a63e0..b0485cc 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -332,6 +332,99 @@ For 32-bit we have the following conventions - kernel is built with

#endif

+#else
+
+/*
+ * x86-32 kernel page table isolation.
+ */
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+/*
+ * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
+ * halves:
+ */
+#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
+#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
+
+.macro ADJUST_KERNEL_CR3 reg:req
+ /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+ andl $(~PTI_SWITCH_MASK), \reg
+.endm
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ movl %cr3, \scratch_reg
+ ADJUST_KERNEL_CR3 \scratch_reg
+ movl \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ mov %cr3, \scratch_reg
+
+.Lwrcr3_\@:
+ /* Flip the PGD and ASID to the user version */
+ orl $(PTI_SWITCH_MASK), \scratch_reg
+ mov \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+ pushl %eax
+ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%eax
+ popl %eax
+.endm
+
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+ ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
+ movl %cr3, \scratch_reg
+ movl \scratch_reg, \save_reg
+ /*
+ * Is the "switch mask" all zero? That means that both of
+ * these are zero:
+ *
+ * 1. The user/kernel PCID bit, and
+ * 2. The user/kernel "bit" that points CR3 to the
+ * bottom half of the 8k PGD
+ *
+ * That indicates a kernel CR3 value, not a user CR3.
+ */
+ testl $(PTI_SWITCH_MASK), \scratch_reg
+ jz .Ldone_\@
+
+ ADJUST_KERNEL_CR3 \scratch_reg
+ movl \scratch_reg, %cr3
+
+.Ldone_\@:
+.endm
+
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+ /*
+ * The CR3 write could be avoided when not changing its value,
+ * but would require a CR3 read *and* a scratch register.
+ */
+ movl \save_reg, %cr3
+.Lend_\@:
+.endm
+
+#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+.endm
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+.endm
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+.endm
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+.endm
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+.endm
+
+#endif
+
#endif /* CONFIG_X86_64 */

/*
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index d2ef7f32..be8759b 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -46,6 +46,8 @@
#include <asm/frame.h>
#include <asm/nospec-branch.h>

+#include "calling.h"
+
.section .entry.text, "ax"

/*
@@ -428,6 +430,7 @@ ENTRY(entry_SYSENTER_32)
pushl $0 /* pt_regs->ip = 0 (placeholder) */
pushl %eax /* pt_regs->orig_ax */
SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
+ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%edx save_reg=%ecx

/*
* SYSENTER doesn't filter flags, so we need to clear NT, AC
@@ -464,6 +467,7 @@ ENTRY(entry_SYSENTER_32)
ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
"jmp .Lsyscall_32_done", X86_FEATURE_XENPV

+ RESTORE_CR3 scratch_reg=%edx save_reg=%ecx
/* Opportunistic SYSEXIT */
TRACE_IRQS_ON /* User mode traces as IRQs on. */
movl PT_EIP(%esp), %edx /* pt_regs->ip */
@@ -539,6 +543,7 @@ ENTRY(entry_INT80_32)
ASM_CLAC
pushl %eax /* pt_regs->orig_ax */
SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
+ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%edx save_reg=%ecx

/*
* User mode is traced as though IRQs are on, and the interrupt gate
@@ -552,6 +557,7 @@ ENTRY(entry_INT80_32)

restore_all:
TRACE_IRQS_IRET
+ RESTORE_CR3 scratch_reg=%eax save_reg=%ecx
.Lrestore_all_notrace:
#ifdef CONFIG_X86_ESPFIX32
ALTERNATIVE "jmp .Lrestore_nocheck", "", X86_BUG_ESPFIX
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e51b65a..a87fb89 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -4,7 +4,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
- * Copyright (C) 2000 Pavel Machek <pavel@xxxxxxx>
+ * Copyright (C) 2000 Pavel Machek SuSE Labs
*
* entry.S contains the system-call and fault low-level handling routines.
*
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index e67c062..1b36e56 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -107,4 +107,78 @@ do { \
*/
#define LOWMEM_PAGES ((((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT))

+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
+ * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
+ * the user one is in the last 4k. To switch between them, you
+ * just need to flip the 12th bit in their addresses.
+ */
+#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
+
+#ifndef __ASSEMBLY__
+/*
+ * This generates better code than the inline assembly in
+ * __set_bit().
+ */
+static inline void *ptr_set_bit(void *ptr, int bit)
+{
+ unsigned long __ptr = (unsigned long)ptr;
+
+ __ptr |= BIT(bit);
+ return (void *)__ptr;
+}
+static inline void *ptr_clear_bit(void *ptr, int bit)
+{
+ unsigned long __ptr = (unsigned long)ptr;
+
+ __ptr &= ~BIT(bit);
+ return (void *)__ptr;
+}
+
+static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
+{
+ return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
+{
+ return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
+{
+ return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
+{
+ return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+#endif
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
+
+/*
+ * Take a PGD location (pgdp) and a pgd value that needs to be set there.
+ * Populates the user and returns the resulting PGD that must be set in
+ * the kernel copy of the page tables.
+ */
+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return pgd;
+ return __pti_set_user_pgd(pgdp, pgd);
+}
+#else
+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ return pgd;
+}
+#endif
+#endif
+
#endif /* _ASM_X86_PGTABLE_32_H */
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index ce245b0..804fc33 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -62,4 +62,6 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */

#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)

+#define LDT_BASE_ADDR 0 /* FIXME */
+
#endif /* _ASM_X86_PGTABLE_32_DEFS_H */
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 6a60fea..8f1cf71 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -39,10 +39,6 @@
#define CR3_PCID_MASK 0xFFFull
#define CR3_NOFLUSH BIT_ULL(63)

-#ifdef CONFIG_PAGE_TABLE_ISOLATION
-# define X86_CR3_PTI_SWITCH_BIT 11
-#endif
-
#else
/*
* CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
@@ -53,4 +49,8 @@
#define CR3_NOFLUSH 0
#endif

+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define X86_CR3_PTI_SWITCH_BIT 11
+#endif
+
#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index dbaf14d..849e073 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,5 +1,7 @@
#define pr_fmt(fmt) "SMP alternatives: " fmt

+#define DEBUG
+
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mutex.h>
@@ -28,7 +30,7 @@ EXPORT_SYMBOL_GPL(alternatives_patched);

#define MAX_PATCH_LEN (255-1)

-static int __initdata_or_module debug_alternative;
+static int __initdata_or_module debug_alternative = 1;

static int __init debug_alt(char *str)
{
@@ -60,7 +62,7 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt);
#define DPRINTK(fmt, args...) \
do { \
if (debug_alternative) \
- printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
+ printk( "%s: " fmt "\n", __func__, ##args); \
} while (0)

#define DUMP_BYTES(buf, len, fmt, args...) \
@@ -71,7 +73,7 @@ do { \
if (!(len)) \
break; \
\
- printk(KERN_DEBUG fmt, ##args); \
+ printk( fmt, ##args); \
for (j = 0; j < (len) - 1; j++) \
printk(KERN_CONT "%02hhx ", buf[j]); \
printk(KERN_CONT "%02hhx\n", buf[j]); \
@@ -373,6 +375,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
u8 *instr, *replacement;
u8 insnbuf[MAX_PATCH_LEN];

+ printk("apply_alternatives: entering\n");
+
DPRINTK("alt table %p -> %p", start, end);
/*
* The scan order should be from start to end. A later scanned
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c290209..002ffaf 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -505,6 +505,31 @@ __INITDATA
GLOBAL(early_recursion_flag)
.long 0

+#define NEXT_PAGE(name) \
+ .balign PAGE_SIZE; \
+GLOBAL(name)
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Each PGD needs to be 8k long and 8k aligned. We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define PTI_USER_PGD_FILL 1024
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+ .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define PTI_USER_PGD_FILL 0
+#endif
+
__REFDATA
.align 4
ENTRY(initial_code)
@@ -516,24 +541,26 @@ ENTRY(setup_once_ref)
* BSS section
*/
__PAGE_ALIGNED_BSS
- .align PAGE_SIZE
#ifdef CONFIG_X86_PAE
-.globl initial_pg_pmd
+NEXT_PGD_PAGE(initial_pg_pmd)
initial_pg_pmd:
.fill 1024*KPMDS,4,0
+ .fill PTI_USER_PGD_FILL,4,0
#else
-.globl initial_page_table
+NEXT_PGD_PAGE(initial_page_table)
initial_page_table:
.fill 1024,4,0
+ .fill PTI_USER_PGD_FILL,4,0
#endif
initial_pg_fixmap:
.fill 1024,4,0
.globl empty_zero_page
empty_zero_page:
.fill 4096,1,0
-.globl swapper_pg_dir
+NEXT_PGD_PAGE(swapper_pg_dir)
swapper_pg_dir:
.fill 1024,4,0
+ .fill PTI_USER_PGD_FILL,4,0
EXPORT_SYMBOL(empty_zero_page)

/*
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 04a625f..57f5cd4 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -3,7 +3,7 @@
* linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit
*
* Copyright (C) 2000 Andrea Arcangeli <andrea@xxxxxxx> SuSE
- * Copyright (C) 2000 Pavel Machek <pavel@xxxxxxx>
+ * Copyright (C) 2000 Pavel Machek
* Copyright (C) 2000 Karsten Keil <kkeil@xxxxxxx>
* Copyright (C) 2001,2002 Andi Kleen <ak@xxxxxxx>
* Copyright (C) 2005 Eric Biederman <ebiederm@xxxxxxxxxxxx>
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2a4849e..896b53b 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -543,7 +543,11 @@ EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
static void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_X86_64
pgd_t *pgd = (pgd_t *) &init_top_pgt;
+#else
+ pgd_t *pgd = swapper_pg_dir;
+#endif

if (!static_cpu_has(X86_FEATURE_PTI))
return;
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index ce38f16..029b5c8 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -113,8 +113,11 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
* Top-level entries added to init_mm's usermode pgd after boot
* will not be automatically propagated to other mms.
*/
+#ifdef X86_64
+ /* FIXME? */
if (!pgdp_maps_userspace(pgdp))
return pgd;
+#endif

/*
* The user page tables get the full PGD, accessible from
@@ -166,7 +169,9 @@ static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)

set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
}
+#ifdef X86_64
BUILD_BUG_ON(pgd_large(*pgd) != 0);
+#endif

return p4d_offset(pgd, address);
}
diff --git a/security/Kconfig b/security/Kconfig
index 1f96e19..ad77de4 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -57,7 +57,7 @@ config SECURITY_NETWORK
config PAGE_TABLE_ISOLATION
bool "Remove the kernel mapping in user mode"
default y
- depends on X86_64 && !UML
+ depends on (X86_32 || X86_64) && !UML
help
This feature reduces the number of hardware side channels by
ensuring that the majority of kernel addresses are not mapped


--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

Attachment: signature.asc
Description: Digital signature