[tip: x86/urgent] x86/doublefault/32: Rewrite the x86_32 #DF handler and unify with 64-bit

From: tip-bot2 for Andy Lutomirski
Date: Wed Nov 27 2019 - 03:19:51 EST


The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 7d8d8cfdee9a7bd6f9682f253fa98efdd8048a9e
Gitweb: https://git.kernel.org/tip/7d8d8cfdee9a7bd6f9682f253fa98efdd8048a9e
Author: Andy Lutomirski <luto@xxxxxxxxxx>
AuthorDate: Wed, 20 Nov 2019 23:06:41 -08:00
Committer: Ingo Molnar <mingo@xxxxxxxxxx>
CommitterDate: Tue, 26 Nov 2019 22:00:04 +01:00

x86/doublefault/32: Rewrite the x86_32 #DF handler and unify with 64-bit

The old x86_32 doublefault_fn() was old and crufty, and it did not
even try to recover. do_double_fault() is much nicer. Rewrite the
32-bit double fault code to sanitize CPU state and call
do_double_fault(). This is mostly an exercise i386 archaeology.

With this patch applied, 32-bit double faults get a real stack trace,
just like 64-bit double faults.

[ mingo: merged the patch to a later kernel base. ]

Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
arch/x86/entry/entry_32.S | 42 ++++++++++++-
arch/x86/include/asm/traps.h | 3 +-
arch/x86/kernel/doublefault_32.c | 107 +++++++++++++++++++++---------
arch/x86/kernel/traps.c | 19 ++++-
4 files changed, 138 insertions(+), 33 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 5832b11..632432b 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1537,6 +1537,48 @@ SYM_CODE_START(debug)
jmp common_exception
SYM_CODE_END(debug)

+#ifdef CONFIG_DOUBLEFAULT
+SYM_CODE_START(double_fault)
+1:
+ /*
+ * This is a task gate handler, not an interrupt gate handler.
+ * The error code is on the stack, but the stack is otherwise
+ * empty. Interrupts are off. Our state is sane with the following
+ * exceptions:
+ *
+ * - CR0.TS is set. "TS" literally means "task switched".
+ * - EFLAGS.NT is set because we're a "nested task".
+ * - The doublefault TSS has back_link set and has been marked busy.
+ * - TR points to the doublefault TSS and the normal TSS is busy.
+ * - CR3 is the normal kernel PGD. This would be delightful, except
+ * that the CPU didn't bother to save the old CR3 anywhere. This
+ * would make it very awkward to return back to the context we came
+ * from.
+ *
+ * The rest of EFLAGS is sanitized for us, so we don't need to
+ * worry about AC or DF.
+ *
+ * Don't even bother popping the error code. It's always zero,
+ * and ignoring it makes us a bit more robust against buggy
+ * hypervisor task gate implementations.
+ *
+ * We will manually undo the task switch instead of doing a
+ * task-switching IRET.
+ */
+
+ clts /* clear CR0.TS */
+ pushl $X86_EFLAGS_FIXED
+ popfl /* clear EFLAGS.NT */
+
+ call doublefault_shim
+
+ /* We don't support returning, so we have no IRET here. */
+1:
+ hlt
+ jmp 1b
+SYM_CODE_END(double_fault)
+#endif
+
/*
* NMI is doubly nasty. It can happen on the first instruction of
* entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index b25e633..ffa0dc8 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -69,6 +69,9 @@ dotraplinkage void do_overflow(struct pt_regs *regs, long error_code);
dotraplinkage void do_bounds(struct pt_regs *regs, long error_code);
dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code);
dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code);
+#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2);
+#endif
dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code);
dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code);
dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code);
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index 4eecfe4..3793646 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -9,42 +9,83 @@
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/desc.h>
+#include <asm/traps.h>

+extern void double_fault(void);
#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)

-static void doublefault_fn(void)
-{
- struct desc_ptr gdt_desc = {0, 0};
- unsigned long gdt, tss;
+#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x)

- BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
+static void set_df_gdt_entry(unsigned int cpu);

- native_store_gdt(&gdt_desc);
- gdt = gdt_desc.address;
+/*
+ * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks
+ * we're running the doublefault task. Cannot return.
+ */
+asmlinkage notrace void __noreturn doublefault_shim(void)
+{
+ unsigned long cr2;
+ struct pt_regs regs;

- printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+ BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);

- if (ptr_ok(gdt)) {
- gdt += GDT_ENTRY_TSS << 3;
- tss = get_desc_base((struct desc_struct *)gdt);
- printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
+ cr2 = native_read_cr2();

- if (ptr_ok(tss)) {
- struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+ /* Reset back to the normal kernel task. */
+ force_reload_TR();
+ set_df_gdt_entry(smp_processor_id());

- printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
- t->ip, t->sp);
+ trace_hardirqs_off();

- printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
- t->ax, t->bx, t->cx, t->dx);
- printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
- t->si, t->di);
- }
- }
+ /*
+ * Fill in pt_regs. A downside of doing this in C is that the unwinder
+ * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump
+ * won't successfully unwind to the source of the double fault.
+ * The main dump from do_double_fault() is fine, though, since it
+ * uses these regs directly.
+ *
+ * If anyone ever cares, this could be moved to asm.
+ */
+ regs.ss = TSS(ss);
+ regs.__ssh = 0;
+ regs.sp = TSS(sp);
+ regs.flags = TSS(flags);
+ regs.cs = TSS(cs);
+ /* We won't go through the entry asm, so we can leave __csh as 0. */
+ regs.__csh = 0;
+ regs.ip = TSS(ip);
+ regs.orig_ax = 0;
+ regs.gs = TSS(gs);
+ regs.__gsh = 0;
+ regs.fs = TSS(fs);
+ regs.__fsh = 0;
+ regs.es = TSS(es);
+ regs.__esh = 0;
+ regs.ds = TSS(ds);
+ regs.__dsh = 0;
+ regs.ax = TSS(ax);
+ regs.bp = TSS(bp);
+ regs.di = TSS(di);
+ regs.si = TSS(si);
+ regs.dx = TSS(dx);
+ regs.cx = TSS(cx);
+ regs.bx = TSS(bx);
+
+ do_double_fault(&regs, 0, cr2);

- for (;;)
- cpu_relax();
+ /*
+ * x86_32 does not save the original CR3 anywhere on a task switch.
+ * This means that, even if we wanted to return, we would need to find
+ * some way to reconstruct CR3. We could make a credible guess based
+ * on cpu_tlbstate, but that would be racy and would not account for
+ * PTI.
+ *
+ * Instead, don't bother. We can return through
+ * rewind_stack_do_exit() instead.
+ */
+ panic("cannot return from double fault\n");
}
+NOKPROBE_SYMBOL(doublefault_shim);

DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
.tss = {
@@ -55,9 +96,8 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
.ldt = 0,
.io_bitmap_base = IO_BITMAP_OFFSET_INVALID,

- .ip = (unsigned long) doublefault_fn,
- /* 0x2 bit is always set */
- .flags = X86_EFLAGS_SF | 0x2,
+ .ip = (unsigned long) double_fault,
+ .flags = X86_EFLAGS_FIXED,
.es = __USER_DS,
.cs = __KERNEL_CS,
.ss = __KERNEL_DS,
@@ -71,6 +111,14 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
},
};

+static void set_df_gdt_entry(unsigned int cpu)
+{
+ /* Set up doublefault TSS pointer in the GDT */
+ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
+ &get_cpu_entry_area(cpu)->doublefault_stack.tss);
+
+}
+
void doublefault_init_cpu_tss(void)
{
unsigned int cpu = smp_processor_id();
@@ -84,8 +132,5 @@ void doublefault_init_cpu_tss(void)
(unsigned long)&cea->doublefault_stack.stack +
sizeof(doublefault_stack.stack));

- /* Set up doublefault TSS pointer in the GDT */
- __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
- &get_cpu_entry_area(cpu)->doublefault_stack.tss);
-
+ set_df_gdt_entry(cpu);
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 76381b0..a9b16c3 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -306,8 +306,23 @@ __visible void __noreturn handle_stack_overflow(const char *message,
}
#endif

-#ifdef CONFIG_X86_64
-/* Runs on IST stack */
+#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
+/*
+ * Runs on an IST stack for x86_64 and on a special task stack for x86_32.
+ *
+ * On x86_64, this is more or less a normal kernel entry. Notwithstanding the
+ * SDM's warnings about double faults being unrecoverable, returning works as
+ * expected. Presumably what the SDM actually means is that the CPU may get
+ * the register state wrong on entry, so returning could be a bad idea.
+ *
+ * Various CPU engineers have promised that double faults due to an IRET fault
+ * while the stack is read-only are, in fact, recoverable.
+ *
+ * On x86_32, this is entered through a task gate, and regs are synthesized
+ * from the TSS. Returning is, in principle, okay, but changes to regs will
+ * be lost. If, for some reason, we need to return to a context with modified
+ * regs, the shim code could be adjusted to synchronize the registers.
+ */
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
{
static const char str[] = "double fault";