[PATCH PTI v3 10/10] x86/pti: Put the LDT in its own PGD if PTI is on
From: Andy Lutomirski
Date: Tue Dec 12 2017 - 10:57:54 EST
With PTI on, we need the LDT to be in the usermode tables somewhere,
and the LDT is per-mm.
tglx had a hack to have a per-cpu LDT and context switch it, but it
was probably insanely slow due to the required TLB flushes.
Instead, take advantage of the fact that we have an address space
hole that gives us a completely unused pgd and make that pgd be
per-mm. We can put the LDT in it.
This has a down side: the LDT isn't (currently) randomized, and an
attack that can write the LDT is instant root due to call gates
(thanks, AMD, for leaving call gates in AMD64 but designing them
wrong so they're only useful for exploits). We could mitigate this
by making the LDT read-only or randomizing it, either of which is
strightforward on top of this patch.
This will significantly slow down LDT users, but that shouldn't
matter for important workloads -- the LDT is only used by DOSEMU(2),
Wine, and very old libc implementations.
Cc: "Kirill A. Shutemov" <kirill@xxxxxxxxxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxx>
Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx>
---
Documentation/x86/x86_64/mm.txt | 3 +-
arch/x86/include/asm/mmu_context.h | 48 +++++++++-
arch/x86/include/asm/pgtable_64_types.h | 4 +
arch/x86/include/asm/processor.h | 23 +++--
arch/x86/kernel/ldt.c | 155 ++++++++++++++++++++++++++++++--
arch/x86/mm/dump_pagetables.c | 12 +++
6 files changed, 225 insertions(+), 20 deletions(-)
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 6a28aeaccd53..0a9925881e37 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
@@ -28,7 +29,7 @@ Virtual memory map with 5 level page tables:
hole caused by [56:63] sign extension
ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff9fffffffffffff (=52 bits) hole
+ff90000000000000 - ff9fffffffffffff (=49 bits) LDT remap for PTI
ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5e1a1ecb65c6..6088f508b623 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -52,13 +52,33 @@ struct ldt_struct {
*/
struct desc_struct *entries;
unsigned int nr_entries;
+
+ /*
+ * If PTI is in use, then the entries array is not mapped while we're
+ * in user mode. The whole array will be aliased at the addressed
+ * given by ldt_slot_va(slot). We use two slots so that we can allocate
+ * and map, and enable a new LDT without invalidating the mapping
+ * of an older, still-in-use LDT.
+ *
+ * slot will be -1 if this LDT doesn't have an alias mapping.
+ */
+ int slot;
};
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static void *ldt_slot_va(int slot)
+{
+ return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+}
+
/*
* Used for LDT copy/destruction.
*/
int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */
static inline int init_new_context_ldt(struct task_struct *tsk,
struct mm_struct *mm)
@@ -90,10 +110,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
* that we can see.
*/
- if (unlikely(ldt))
- set_ldt(ldt->entries, ldt->nr_entries);
- else
+ if (unlikely(ldt)) {
+ if (static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI)) {
+ if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+ /*
+ * Whoops -- either the new LDT isn't mapped
+ * (if slot == -1) or is mapped into a bogus
+ * slot (if slot > 1).
+ */
+ clear_LDT();
+ return;
+ }
+
+ /*
+ * If page table isolation is enabled, ldt->entries
+ * will not be mapped in the userspace pagetables.
+ * Tell the CPU to access the LDT through the alias
+ * at ldt_slot_va(ldt->slot).
+ */
+ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+ } else {
+ set_ldt(ldt->entries, ldt->nr_entries);
+ }
+ } else {
clear_LDT();
+ }
#else
clear_LDT();
#endif
@@ -185,6 +226,7 @@ static inline void arch_dup_mmap(struct mm_struct *oldmm,
static inline void arch_exit_mmap(struct mm_struct *mm)
{
paravirt_arch_exit_mmap(mm);
+ ldt_arch_exit_mmap(mm);
}
#ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 6e2f50843c7d..10ea949df97f 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -81,10 +81,14 @@ typedef struct { pteval_t pte; } pte_t;
#define VMALLOC_SIZE_TB _AC(12800, UL)
#define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
+#define LDT_PGD_ENTRY _AC(-112, UL)
+#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#else
#define VMALLOC_SIZE_TB _AC(32, UL)
#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
+#define LDT_PGD_ENTRY _AC(-3, UL)
+#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#endif
#ifdef CONFIG_RANDOMIZE_MEMORY
#define VMALLOC_START vmalloc_base
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 9e482d8b0b97..9c18da64daa9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x)
#else
/*
- * User space process size. 47bits minus one guard page. The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously. We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
+ * User space process size. This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen. This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
*/
#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ae5615b03def..fe5e024b3a20 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -19,6 +19,7 @@
#include <linux/uaccess.h>
#include <asm/ldt.h>
+#include <asm/tlb.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
@@ -46,13 +47,11 @@ static void refresh_ldt_segments(void)
static void flush_ldt(void *__mm)
{
struct mm_struct *mm = __mm;
- mm_context_t *pc;
if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
return;
- pc = &mm->context;
- set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+ load_mm_ldt(mm);
refresh_ldt_segments();
}
@@ -89,10 +88,124 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
return NULL;
}
+ /* The new LDT isn't aliased for PTI yet. */
+ new_ldt->slot = -1;
+
new_ldt->nr_entries = num_entries;
return new_ldt;
}
+/*
+ * If PTI is enabled, this maps the LDT into the kernelmode and
+ * usermode tables for the given mm.
+ *
+ * There is no corresponding unmap function. Even if the LDT is freed, we
+ * leave the PTEs around until the slot is reused or the mm is destroyed.
+ * This is harmless: the LDT is always in ordinary memory, and no one will
+ * access the freed slot.
+ *
+ * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
+ * it useful, and the flush would slow down modify_ldt().
+ */
+static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ spinlock_t *ptl;
+ bool is_vmalloc;
+ bool had_top_level_entry;
+ pgd_t *pgd;
+ int i;
+
+ if (!static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI))
+ return 0;
+
+ /*
+ * Any given ldt_struct should have map_ldt_struct() called at most
+ * once.
+ */
+ WARN_ON(ldt->slot != -1);
+
+ /*
+ * Did we already have the top level entry allocated? We can't
+ * use pgd_none() for this because it doens't do anything on
+ * 4-level page table kernels.
+ */
+ pgd = pgd_offset(mm, LDT_BASE_ADDR);
+ had_top_level_entry = (pgd->pgd != 0);
+
+ is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+ for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+ unsigned long offset = i << PAGE_SHIFT;
+ unsigned long va = (unsigned long)ldt_slot_va(slot) + offset;
+ const void *src = (char *)ldt->entries + offset;
+ unsigned long pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+ page_to_pfn(virt_to_page(src));
+ pte_t pte, *ptep;
+
+ /*
+ * Treat the PTI LDT range as a *userspace* range.
+ * get_locked_pte() will allocate all needed pagetables
+ * and account for them in this mm.
+ */
+ ptep = get_locked_pte(mm, va, &ptl);
+ if (!ptep)
+ return -ENOMEM;
+ pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL));
+ set_pte_at(mm, va, ptep, pte);
+ pte_unmap_unlock(ptep, ptl);
+ }
+
+ if (mm->context.ldt) {
+ /*
+ * We already had an LDT. The top-level entry should already
+ * have been allocated and synchronized with the usermode
+ * tables.
+ */
+ WARN_ON(!had_top_level_entry);
+ if (static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI))
+ WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
+ } else {
+ /*
+ * This is the first time we're mapping an LDT for this process.
+ * Sync the pgd to the usermode tables.
+ */
+ WARN_ON(had_top_level_entry);
+ if (static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI)) {
+ WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
+ set_pgd(kernel_to_user_pgdp(pgd), *pgd);
+ }
+ }
+
+ flush_tlb_mm_range(mm,
+ (unsigned long)ldt_slot_va(slot),
+ (unsigned long)ldt_slot_va(slot) + LDT_SLOT_STRIDE,
+ 0);
+
+ ldt->slot = slot;
+
+ return 0;
+#else
+ return -EINVAL;
+#endif
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ struct mmu_gather tlb;
+ unsigned long start = LDT_BASE_ADDR;
+ unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+ if (!static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI))
+ return;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ free_pgd_range(&tlb, start, end, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
/* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
@@ -134,17 +247,15 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
int retval = 0;
mutex_init(&mm->context.lock);
+ mm->context.ldt = NULL;
+
old_mm = current->mm;
- if (!old_mm) {
- mm->context.ldt = NULL;
+ if (!old_mm)
return 0;
- }
mutex_lock(&old_mm->context.lock);
- if (!old_mm->context.ldt) {
- mm->context.ldt = NULL;
+ if (!old_mm->context.ldt)
goto out_unlock;
- }
new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
if (!new_ldt) {
@@ -155,8 +266,17 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
memcpy(new_ldt->entries, old_mm->context.ldt->entries,
new_ldt->nr_entries * LDT_ENTRY_SIZE);
finalize_ldt_struct(new_ldt);
+ retval = map_ldt_struct(mm, new_ldt, 0);
+ if (retval)
+ goto out_free;
mm->context.ldt = new_ldt;
+ goto out_unlock;
+
+out_free:
+ free_ldt_pgtables(mm);
+ free_ldt_struct(new_ldt);
+ return retval;
out_unlock:
mutex_unlock(&old_mm->context.lock);
@@ -174,6 +294,11 @@ void destroy_context_ldt(struct mm_struct *mm)
mm->context.ldt = NULL;
}
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+ free_ldt_pgtables(mm);
+}
+
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
struct mm_struct *mm = current->mm;
@@ -286,6 +411,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt);
+ /*
+ * If we are using PTI, map the new LDT into the userspace pagetables.
+ * If there is already an LDT, use the other slot so that other CPUs
+ * will continue to use the old LDT until install_ldt() switches
+ * them over to the new LDT.
+ */
+ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+ if (error) {
+ free_ldt_struct(old_ldt);
+ goto out_unlock;
+ }
+
install_ldt(mm, new_ldt);
free_ldt_struct(old_ldt);
error = 0;
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e5a2df886130..878b3fdc00a5 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -50,12 +50,18 @@ enum address_markers_idx {
#ifdef CONFIG_X86_64
KERNEL_SPACE_NR,
LOW_KERNEL_NR,
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
+ LDT_NR,
+#endif
VMALLOC_START_NR,
VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
KASAN_SHADOW_START_NR,
KASAN_SHADOW_END_NR,
#endif
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
+ LDT_NR,
+#endif
# ifdef CONFIG_X86_ESPFIX64
ESPFIX_START_NR,
# endif
@@ -79,12 +85,18 @@ static struct addr_marker address_markers[] = {
#ifdef CONFIG_X86_64
{ 0x8000000000000000UL, "Kernel Space" },
{ 0/* PAGE_OFFSET */, "Low Kernel Mapping" },
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
+ { LDT_BASE_ADDR, "LDT remap" },
+#endif
{ 0/* VMALLOC_START */, "vmalloc() Area" },
{ 0/* VMEMMAP_START */, "Vmemmap" },
#ifdef CONFIG_KASAN
{ KASAN_SHADOW_START, "KASAN shadow" },
{ KASAN_SHADOW_END, "KASAN shadow end" },
#endif
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
+ { LDT_BASE_ADDR, "LDT remap" },
+#endif
# ifdef CONFIG_X86_ESPFIX64
{ ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
# endif
--
2.13.6