Re: [PATCH 4.4 00/37] 4.4.110-stable review

From: Pavel Tatashin
Date: Thu Jan 11 2018 - 13:37:03 EST


I have root caused the memory corruption panics/hangs that I've been
experiencing during boot with the latest 4.4.110 kernel. The problem
as was suspected by Andy Lutomirski is with interaction between PTI
and EFI. It may affect any system that has EFI bios. I have not
verified if it can affect any other kernel beside 4.4.110

Attached is the fix for this issue with explanations that Steve
Sistare and I developed.
From 1189f3568a90ddd40e1418b9687def5d89153ee3 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@xxxxxxxxxx>
Date: Thu, 11 Jan 2018 06:50:25 -0800
Subject: [PATCH] x86/pti/efi: broken conversion from efi to kernel page table

In entry_64.S we have code like this:

/* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */
ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
pushq %rax
/* mask off "user" bit of pgd address and 12 PCID bits: */
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
movq %rax, %cr3
2:

/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
call do_nmi

With this instruction:
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax

We unconditionally switch from whatever our CR3 was to kernel page table.
But, in arch/x86/platform/efi/efi_64.c We temporarily set a different page
table, that does not have the kernel page table with 0x1000 offset from it.

Look in efi_thunk() and efi_thunk_set_virtual_address_map().

So, while CR3 points to the other page table, we get an NMI interrupt,
and clear 0x1000 from CR3, resulting in a bogus CR3 if the 0x1000 bit was
set.

The efi page table comes from realmode/rm/trampoline_64.S:

arch/x86/realmode/rm/trampoline_64.S

141 .bss
142 .balign PAGE_SIZE
143 GLOBAL(trampoline_pgd) .space PAGE_SIZE

Notice: alignment is PAGE_SIZE, so after applying KAISER_SHADOW_PGD_OFFSET
which equal to PAGE_SIZE, we can get a different page table.

But, even if we fix alignment, here the trampoline binary is later copied
into dynamically allocated memory in reserve_real_mode(), so we need to
fix that place as well.

Fixes: 8a43ddfb93a0 ("KAISER: Kernel Address Isolation")

Signed-off-by: Pavel Tatashin <pasha.tatashin@xxxxxxxxxx>
Reviewed-by: Steven Sistare <steven.sistare@xxxxxxxxxx>
---
arch/x86/include/asm/kaiser.h | 8 ++++++++
arch/x86/realmode/init.c | 4 +++-
arch/x86/realmode/rm/trampoline_64.S | 3 ++-
3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
index 802bbbdfe143..e087bd7a8d29 100644
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -19,6 +19,12 @@

#define KAISER_SHADOW_PGD_OFFSET 0x1000

+/*
+ * A page table address must have this alignment to stay the same when
+ * KAISER_SHADOW_PGD_OFFSET mask is applied
+ */
+#define KAISER_KERNEL_PGD_ALIGNMENT (KAISER_SHADOW_PGD_OFFSET << 1)
+
#ifdef __ASSEMBLY__
#ifdef CONFIG_PAGE_TABLE_ISOLATION

@@ -71,6 +77,8 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax

#else /* CONFIG_PAGE_TABLE_ISOLATION */

+#define KAISER_KERNEL_PGD_ALIGNMENT PAGE_SIZE
+
.macro SWITCH_KERNEL_CR3
.endm
.macro SWITCH_USER_CR3
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 0b7a63d98440..cfecb7d6c6a8 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -1,5 +1,6 @@
#include <linux/io.h>
#include <linux/memblock.h>
+#include <linux/kaiser.h>

#include <asm/cacheflush.h>
#include <asm/pgtable.h>
@@ -15,7 +16,8 @@ void __init reserve_real_mode(void)
size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);

/* Has to be under 1M so we can execute real-mode AP code. */
- mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
+ mem = memblock_find_in_range(0, 1 << 20, size,
+ KAISER_KERNEL_PGD_ALIGNMENT);
if (!mem)
panic("Cannot allocate trampoline\n");

diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index dac7b20d2f9d..781cca63f795 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -30,6 +30,7 @@
#include <asm/msr.h>
#include <asm/segment.h>
#include <asm/processor-flags.h>
+#include <asm/kaiser.h>
#include "realmode.h"

.text
@@ -139,7 +140,7 @@ tr_gdt:
tr_gdt_end:

.bss
- .balign PAGE_SIZE
+ .balign KAISER_KERNEL_PGD_ALIGNMENT
GLOBAL(trampoline_pgd) .space PAGE_SIZE

.balign 8
--
1.8.3.1