[RFC PTI 3/3] x86/pti: Put the LDT in its own PGD if PTI is on
From: Andy Lutomirski
Date: Fri Dec 08 2017 - 14:59:32 EST
With PTI on, we need the LDT to be in the usermode tables somewhere,
and the LDT is per-mm.
tglx had a hack to have a per-cpu LDT and context switch it, but it
was probably insanely slow due to the required TLB flushes.
Instead, take advantage of the fact that we have an address space
hole that gives us a completely unused pgd and make that pgd be
per-mm. We can put the LDT in it.
This has a down side: the LDT isn't (currently) randomized, and an
attack that can write the LDT is instant root due to call gates
(thanks, AMD, for leaving call gates in AMD64 but designing them
wrong so they're only useful for exploits). We could mitigate this
by making the LDT read-only or randomizing it, either of which is
strightforward on top of this patch.
XXX: The 5-level case needs testing and docs updates
Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx>
---
Documentation/x86/x86_64/mm.txt | 11 ++-
arch/x86/include/asm/mmu_context.h | 33 +++++++-
arch/x86/include/asm/pgtable_64_types.h | 2 +
arch/x86/include/asm/processor.h | 23 ++++--
arch/x86/kernel/ldt.c | 139 +++++++++++++++++++++++++++++---
5 files changed, 185 insertions(+), 23 deletions(-)
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 2d7d6590ade8..bfa44e1cb293 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,13 +12,15 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) LDT range
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space (variable)
+[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
Virtual memory map with 5 level page tables:
@@ -39,8 +41,9 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space
+[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
Architecture defines a 64-bit virtual address. Implementations can support
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5e1a1ecb65c6..eb87bbeddacc 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -52,13 +52,29 @@ struct ldt_struct {
*/
struct desc_struct *entries;
unsigned int nr_entries;
+
+ /*
+ * If PTI is in use, then the entries array is not mapped while we're
+ * in user mode. The whole array will be aliased at the addressed
+ * given by ldt_slot_va(slot).
+ */
+ int slot;
};
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static void *ldt_slot_va(int slot)
+{
+ return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+}
+
/*
* Used for LDT copy/destruction.
*/
int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */
static inline int init_new_context_ldt(struct task_struct *tsk,
struct mm_struct *mm)
@@ -90,10 +106,20 @@ static inline void load_mm_ldt(struct mm_struct *mm)
* that we can see.
*/
- if (unlikely(ldt))
- set_ldt(ldt->entries, ldt->nr_entries);
- else
+ if (unlikely(ldt)) {
+ if (static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI)) {
+ if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+ clear_LDT();
+ return;
+ }
+
+ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+ } else {
+ set_ldt(ldt->entries, ldt->nr_entries);
+ }
+ } else {
clear_LDT();
+ }
#else
clear_LDT();
#endif
@@ -185,6 +211,7 @@ static inline void arch_dup_mmap(struct mm_struct *oldmm,
static inline void arch_exit_mmap(struct mm_struct *mm)
{
paravirt_arch_exit_mmap(mm);
+ ldt_arch_exit_mmap(mm);
}
#ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 6d5f45dcd4a1..130f575f8d1e 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -100,6 +100,8 @@ typedef struct { pteval_t pte; } pte_t;
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
#define ESPFIX_PGD_ENTRY _AC(-2, UL)
#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
+#define LDT_PGD_ENTRY _AC(-3, UL)
+#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
#define EFI_VA_END (-68 * (_AC(1, UL) << 30))
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 9e482d8b0b97..9c18da64daa9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x)
#else
/*
- * User space process size. 47bits minus one guard page. The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously. We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
+ * User space process size. This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen. This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
*/
#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ae5615b03def..8170b57fc498 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -19,6 +19,7 @@
#include <linux/uaccess.h>
#include <asm/ldt.h>
+#include <asm/tlb.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
@@ -46,13 +47,12 @@ static void refresh_ldt_segments(void)
static void flush_ldt(void *__mm)
{
struct mm_struct *mm = __mm;
- mm_context_t *pc;
if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
return;
- pc = &mm->context;
- set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+ __flush_tlb_all();
+ load_mm_ldt(mm);
refresh_ldt_segments();
}
@@ -90,9 +90,113 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
}
new_ldt->nr_entries = num_entries;
+ new_ldt->slot = -1;
return new_ldt;
}
+static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_X86_64
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ bool is_vmalloc;
+ bool had_top_level_entry;
+ int i;
+
+ if (!static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI))
+ return 0;
+
+ WARN_ON(ldt->slot != -1);
+
+ /*
+ * Both LDT slots are contained in a single PMD, so we can pass
+ * LDT_BASE_ADDR instead of the real address to all the _offset
+ * helpers.
+ */
+ pgd = pgd_offset(mm, LDT_BASE_ADDR);
+
+ /*
+ * Did we already have the top level entry allocated? We can't
+ * use pgd_none() for this because it doens't do anything on
+ * 4-level page table kernels.
+ */
+ had_top_level_entry = (pgd->pgd != 0);
+
+ p4d = p4d_alloc(mm, pgd, LDT_BASE_ADDR);
+ if (!p4d)
+ return -ENOMEM;
+
+ pud = pud_alloc(mm, p4d, LDT_BASE_ADDR);
+ if (!pud)
+ return -ENOMEM;
+ pmd = pmd_alloc(mm, pud, LDT_BASE_ADDR);
+ if (!pmd)
+ return -ENOMEM;
+ if (pte_alloc(mm, pmd, LDT_BASE_ADDR))
+ return -ENOMEM;
+
+ if (mm->context.ldt) {
+ /*
+ * We already had an LDT. The top-level entry should already
+ * have been allocated and synchronized with the usermode
+ * tables.
+ */
+ WARN_ON(!had_top_level_entry);
+ if (static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI))
+ WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
+ } else {
+ WARN_ON(had_top_level_entry);
+ if (static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI)) {
+ WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
+ set_pgd(kernel_to_user_pgdp(pgd), *pgd);
+ }
+ }
+
+ is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+ for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+ unsigned long offset = i << PAGE_SHIFT;
+ unsigned long va = (unsigned long)ldt_slot_va(slot) + offset;
+ const void *src = (char *)ldt->entries + offset;
+ unsigned long pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+ page_to_pfn(virt_to_page(src));
+
+ pte = pte_offset_kernel(pmd, va);
+ set_pte(pte, pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
+ }
+
+ flush_tlb_mm_range(mm,
+ (unsigned long)ldt_slot_va(slot),
+ (unsigned long)ldt_slot_va(slot) + LDT_SLOT_STRIDE,
+ 0);
+
+ ldt->slot = slot;
+
+ return 0;
+#else
+ return -EINVAL;
+#endif
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_X86_64
+ struct mmu_gather tlb;
+ unsigned long start = LDT_BASE_ADDR;
+ unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+ if (!static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI))
+ return;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ free_pgd_range(&tlb, start, end, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
/* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
@@ -134,17 +238,15 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
int retval = 0;
mutex_init(&mm->context.lock);
+ mm->context.ldt = NULL;
+
old_mm = current->mm;
- if (!old_mm) {
- mm->context.ldt = NULL;
+ if (!old_mm)
return 0;
- }
mutex_lock(&old_mm->context.lock);
- if (!old_mm->context.ldt) {
- mm->context.ldt = NULL;
+ if (!old_mm->context.ldt)
goto out_unlock;
- }
new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
if (!new_ldt) {
@@ -155,8 +257,17 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
memcpy(new_ldt->entries, old_mm->context.ldt->entries,
new_ldt->nr_entries * LDT_ENTRY_SIZE);
finalize_ldt_struct(new_ldt);
+ retval = map_ldt_struct(mm, new_ldt, 0);
+ if (retval)
+ goto out_free;
mm->context.ldt = new_ldt;
+ goto out_unlock;
+
+out_free:
+ free_ldt_pgtables(mm);
+ free_ldt_struct(new_ldt);
+ return retval;
out_unlock:
mutex_unlock(&old_mm->context.lock);
@@ -174,6 +285,11 @@ void destroy_context_ldt(struct mm_struct *mm)
mm->context.ldt = NULL;
}
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+ free_ldt_pgtables(mm);
+}
+
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
struct mm_struct *mm = current->mm;
@@ -285,6 +401,11 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt);
+ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+ if (error) {
+ free_ldt_struct(old_ldt);
+ goto out_unlock;
+ }
install_ldt(mm, new_ldt);
free_ldt_struct(old_ldt);
--
2.13.6