[PATCH 2/2] x86/kpti: Reference all cpu_entry_area pagetables in the usermode tables

From: Andy Lutomirski
Date: Sat Dec 02 2017 - 01:20:28 EST


We were manually mapping each piece of cpu_entry_area into the
usermode tables. This was error-prone and wasted memory (not much
memory, but still). Instead, just have the usermode tables reference
the same pagetables.

This avoids needing to keep the KPTI code and the normal
cpu_entry_area code in sync, since the KPTI code no longer cares
what's in cpu_entry_area.
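
Roughly, the clone step reduces to copying one PMD entry per 2MB of
cpu_entry_area, along the lines of the sketch below (simplified from
the kpti.c change in this patch; the per-cpu loop and error handling
are omitted, and share_cpu_entry_pmd() is just an illustrative name):

static void __init share_cpu_entry_pmd(unsigned long addr)
{
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d = p4d_offset(pgd, addr);
	pud_t *pud = pud_offset(p4d, addr);
	pmd_t *kernel_pmd = pmd_offset(pud, addr);

	/* Find the matching PMD slot in the usermode tables. */
	pmd_t *user_pmd = kpti_shadow_pagetable_walk_pmd(addr, 0);

	/*
	 * Copy the entry: both hierarchies now point at the same PTE
	 * page, so whatever the normal cpu_entry_area code maps there
	 * is automatically visible to the usermode tables.
	 */
	*user_pmd = *kernel_pmd;
}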

[This does *not* work on top of the current KPTI series. It requires
that all of the kernelmode cpu_entry_area page tables be
pre-allocated. That happens in the series as I submitted it, but
tglx changed it for reasons that I haven't figured out.]

Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx>
---
arch/x86/include/asm/fixmap.h | 14 +++++---
arch/x86/include/asm/kpti.h | 8 +++--
arch/x86/kernel/cpu/common.c | 3 --
arch/x86/mm/kpti.c | 82 ++++++++++++++++++++++++++-----------------
4 files changed, 64 insertions(+), 43 deletions(-)
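
One note on the masking idiom used in the kpti.c hunk below: PMD_MASK
clears the low PMD_SHIFT bits of an address, so addr & PMD_MASK is the
base of the 2MB region covered by a single PMD entry and
addr & ~PMD_MASK is the offset within it (zero iff the address is
PMD-aligned, which is what the WARN_ON checks). A standalone
illustration, using a made-up example address:

#include <stdio.h>

/* x86_64 values: one PMD entry covers 2MB worth of 4K pages. */
#define PMD_SHIFT	21
#define PMD_SIZE	(1UL << PMD_SHIFT)
#define PMD_MASK	(~(PMD_SIZE - 1))

int main(void)
{
	unsigned long addr = 0xffffffffff5ff000UL;	/* made-up address */

	printf("PMD base: %#lx\n", addr & PMD_MASK);	/* rounded down to 2MB */
	printf("offset:   %#lx\n", addr & ~PMD_MASK);	/* nonzero => unaligned */
	return 0;
}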

diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 839addd1eaec..a630cd2861f7 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -142,16 +142,20 @@ enum fixed_addresses {
#ifdef CONFIG_PARAVIRT
FIX_PARAVIRT_BOOTMAP,
#endif
- FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
- FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
#ifdef CONFIG_X86_INTEL_MID
FIX_LNW_VRTC,
#endif
- /* Fixmap entries to remap the GDTs, one per processor. */
- FIX_CPU_ENTRY_AREA_TOP,
+ FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
+ FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
+
+ /*
+ * Fixmap entries to remap the GDTs, one per processor. Align
+ * to a PMD boundary.
+ */
+ FIX_CPU_ENTRY_AREA_TOP = round_up(FIX_TEXT_POKE0 + 1, PTRS_PER_PMD),
FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,

- __end_of_permanent_fixed_addresses,
+ __end_of_permanent_fixed_addresses = round_up(FIX_CPU_ENTRY_AREA_BOTTOM + 1, PTRS_PER_PMD),

/*
* 512 temporary boot-time mappings, used by early_ioremap(),
diff --git a/arch/x86/include/asm/kpti.h b/arch/x86/include/asm/kpti.h
index 0c10e86ae3f8..df52cec2a53b 100644
--- a/arch/x86/include/asm/kpti.h
+++ b/arch/x86/include/asm/kpti.h
@@ -1,5 +1,8 @@
#ifndef _ASM_X86_KPTI_H
#define _ASM_X86_KPTI_H
+
+#include <linux/init.h>
+
/*
* Copyright(c) 2017 Intel Corporation. All rights reserved.
*
@@ -34,10 +37,9 @@ extern int kpti_add_mapping(unsigned long addr, unsigned long size,
unsigned long flags);

/**
- * kpti_add_mapping_cpu_entry - map the cpu entry area
- * @cpu: the CPU for which the entry area is being mapped
+ * kpti_clone_cpu_entry_areas - clone cpu_entry_areas to the usermode tables
*/
-extern void kpti_add_mapping_cpu_entry(int cpu);
+extern void __init kpti_clone_cpu_entry_areas(void);

/**
* kpti_remove_mapping - remove a kernel mapping from the userpage tables
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 00697119f983..3dc814519c92 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -606,9 +606,6 @@ void __init setup_cpu_entry_area(int cpu)
sizeof(struct debug_store) / PAGE_SIZE,
PAGE_KERNEL);
#endif
- /* CPU 0's mapping is done in kpti_init() */
- if (cpu)
- kpti_add_mapping_cpu_entry(cpu);
}

/* Load the original GDT from the per-cpu structure */
diff --git a/arch/x86/mm/kpti.c b/arch/x86/mm/kpti.c
index 52fd833845ba..cd81a7432f49 100644
--- a/arch/x86/mm/kpti.c
+++ b/arch/x86/mm/kpti.c
@@ -240,7 +240,7 @@ static pmd_t *kpti_shadow_pagetable_walk_pmd(unsigned long address,
* Returns a pointer to a PTE on success, or NULL on failure.
*/
static pte_t *kpti_shadow_pagetable_walk(unsigned long address,
- unsigned long flags)
+ unsigned long flags)
{
pmd_t *pmd = kpti_shadow_pagetable_walk_pmd(address, flags);
pte_t *pte;
@@ -401,28 +401,55 @@ static void __init kpti_init_all_pgds(void)
WARN_ON(__ret); \
} while (0)

-void kpti_add_mapping_cpu_entry(int cpu)
+void __init kpti_clone_cpu_entry_areas(void)
{
- kpti_add_user_map_early(get_cpu_gdt_ro(cpu), PAGE_SIZE,
- __PAGE_KERNEL_RO);
-
- kpti_add_user_map_early(&get_cpu_entry_area(cpu)->tss,
- sizeof(get_cpu_entry_area(cpu)->tss),
- __PAGE_KERNEL | _PAGE_GLOBAL);
-
- /* entry stack */
- kpti_add_user_map_early(&get_cpu_entry_area(cpu)->SYSENTER_stack_page,
- sizeof(get_cpu_entry_area(cpu)->SYSENTER_stack_page),
- __PAGE_KERNEL | _PAGE_GLOBAL);
-
- /* Entry code, so needs to be EXEC */
- kpti_add_user_map_early(&get_cpu_entry_area(cpu)->entry_trampoline,
- sizeof(get_cpu_entry_area(cpu)->entry_trampoline),
- __PAGE_KERNEL_RX | _PAGE_GLOBAL);
-
- kpti_add_user_map_early(&get_cpu_entry_area(cpu)->exception_stacks,
- sizeof(get_cpu_entry_area(cpu)->exception_stacks),
- __PAGE_KERNEL | _PAGE_GLOBAL);
+ int cpu;
+ unsigned long last_pmd_addr = 0;
+
+ /* The top of the cpu_entry_area block is meant to be PMD-aligned. */
+ WARN_ON((unsigned long)(get_cpu_entry_area(NR_CPUS-1) + 1) & ~PMD_MASK);
+
+ /*
+ * Iterate over possible CPUs, not addresses: it's possible that
+ * NR_CPUS is sufficiently larger than the actual number of possible
+ * CPUs that there are unpopulated PMDs in the cpu_entry_area range.
+ */
+ for_each_possible_cpu(cpu) {
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd, *target_pmd;
+ unsigned long addr =
+ (unsigned long)get_cpu_entry_area(cpu) & PMD_MASK;
+
+ if (addr == last_pmd_addr)
+ continue;
+ last_pmd_addr = addr;
+
+ pgd = pgd_offset_k(addr);
+ if (WARN_ON(pgd_none(*pgd)))
+ return;
+ p4d = p4d_offset(pgd, addr);
+ if (WARN_ON(p4d_none(*p4d)))
+ return;
+ pud = pud_offset(p4d, addr);
+ if (WARN_ON(pud_none(*pud)))
+ return;
+ pmd = pmd_offset(pud, addr);
+ if (WARN_ON(pmd_none(*pmd)))
+ return;
+
+ target_pmd = kpti_shadow_pagetable_walk_pmd(addr, 0);
+ if (WARN_ON(!target_pmd))
+ return;
+
+ /*
+ * Copy the PMD entry. That is, the kernelmode and usermode tables
+ * will share all last-level page tables containing
+ * cpu_entry_area mappings.
+ */
+ *target_pmd = *pmd;
+ }
}

/*
@@ -459,16 +486,7 @@ void __init kpti_init(void)
sizeof(gate_desc) * NR_VECTORS,
__PAGE_KERNEL_RO | _PAGE_GLOBAL);

- /*
- * We delay CPU 0's mappings because these structures are created
- * before the page allocator is up. Deferring it until here lets
- * us use the plain page allocator unconditionally in the page
- * table code above.
- *
- * This is OK because kpti_init() is called long before we ever run
- * userspace and need the KERNEL_PAGE_TABLE_ISOLATION mappings.
- */
- kpti_add_mapping_cpu_entry(0);
+ kpti_clone_cpu_entry_areas();
}

int kpti_add_mapping(unsigned long addr, unsigned long size,
--
2.13.6