[GIT pull] x86/pti fixes for 4.15

From: Thomas Gleixner
Date: Wed Jan 03 2018 - 19:24:20 EST


Linus,

please pull the latest x86-pti-for-linus git tree from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86-pti-for-linus

A couple of urgent fixes for PTI:

- Fix a PTE mismatch between user and kernel visible mapping of the cpu
entry area (differs vs. the GLB bit) and causes a TLB mismatch MCE on
older AMD K8 machines

- Fix the misplaced CR3 switch in the SYSCALL compat entry code which
causes access to unmapped kernel memory resulting in double faults.

- Fix the section mismatch of the cpu_tss_rw percpu storage caused by
using a different mechanism for declaration and definition.

- Two fixes for dumpstack which help to decode entry stack issues better

- Enable PTI by default in Kconfig. We should have done that earlier, but
it slipped through the cracks.

- Exclude AMD from the PTI enforcement. Not necessarily a fix, but if AMD
is so confident that they are not affected, then we should not burden
users with the overhead.

Thanks,

tglx

------------------>
Josh Poimboeuf (2):
x86/dumpstack: Fix partial register dumps
x86/dumpstack: Print registers for first stack frame

Nick Desaulniers (1):
x86/process: Define cpu_tss_rw in same section as declaration

Thomas Gleixner (3):
x86/pti: Enable PTI by default
x86/pti: Make sure the user/kernel PTEs match
x86/pti: Switch to kernel CR3 at early in entry_SYSCALL_compat()

Tom Lendacky (1):
x86/cpu, x86/pti: Do not enable PTI on AMD processors


arch/x86/entry/entry_64_compat.S | 13 ++++++-------
arch/x86/include/asm/unwind.h | 17 +++++++++++++----
arch/x86/kernel/cpu/common.c | 4 ++--
arch/x86/kernel/dumpstack.c | 31 ++++++++++++++++++++++---------
arch/x86/kernel/process.c | 2 +-
arch/x86/kernel/stacktrace.c | 2 +-
arch/x86/mm/pti.c | 3 ++-
security/Kconfig | 1 +
8 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 40f17009ec20..98d5358e4041 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -190,8 +190,13 @@ ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
swapgs

- /* Stash user ESP and switch to the kernel stack. */
+ /* Stash user ESP */
movl %esp, %r8d
+
+ /* Use %rsp as scratch reg. User ESP is stashed in r8 */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
+ /* Switch to the kernel stack */
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp

/* Construct struct pt_regs on stack */
@@ -220,12 +225,6 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
pushq $0 /* pt_regs->r15 = 0 */

/*
- * We just saved %rdi so it is safe to clobber. It is not
- * preserved during the C calls inside TRACE_IRQS_OFF anyway.
- */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
-
- /*
* User mode is traced as though IRQs are on, and SYSENTER
* turned them off.
*/
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index c1688c2d0a12..1f86e1b0a5cd 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -56,18 +56,27 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,

#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
/*
- * WARNING: The entire pt_regs may not be safe to dereference. In some cases,
- * only the iret frame registers are accessible. Use with caution!
+ * If 'partial' returns true, only the iret frame registers are valid.
*/
-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
+ bool *partial)
{
if (unwind_done(state))
return NULL;

+ if (partial) {
+#ifdef CONFIG_UNWINDER_ORC
+ *partial = !state->full_regs;
+#else
+ *partial = false;
+#endif
+ }
+
return state->regs;
}
#else
-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
+ bool *partial)
{
return NULL;
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f2a94dfb434e..b1be494ab4e8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -899,8 +899,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)

setup_force_cpu_cap(X86_FEATURE_ALWAYS);

- /* Assume for now that ALL x86 CPUs are insecure */
- setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
+ if (c->x86_vendor != X86_VENDOR_AMD)
+ setup_force_cpu_bug(X86_BUG_CPU_INSECURE);

fpu__init_system(c);

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 5fa110699ed2..afbecff161d1 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -76,12 +76,23 @@ void show_iret_regs(struct pt_regs *regs)
regs->sp, regs->flags);
}

-static void show_regs_safe(struct stack_info *info, struct pt_regs *regs)
+static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
+ bool partial)
{
- if (on_stack(info, regs, sizeof(*regs)))
+ /*
+ * These on_stack() checks aren't strictly necessary: the unwind code
+ * has already validated the 'regs' pointer. The checks are done for
+ * ordering reasons: if the registers are on the next stack, we don't
+ * want to print them out yet. Otherwise they'll be shown as part of
+ * the wrong stack. Later, when show_trace_log_lvl() switches to the
+ * next stack, this function will be called again with the same regs so
+ * they can be printed in the right context.
+ */
+ if (!partial && on_stack(info, regs, sizeof(*regs))) {
__show_regs(regs, 0);
- else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
- IRET_FRAME_SIZE)) {
+
+ } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
+ IRET_FRAME_SIZE)) {
/*
* When an interrupt or exception occurs in entry code, the
* full pt_regs might not have been saved yet. In that case
@@ -98,11 +109,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
struct stack_info stack_info = {0};
unsigned long visit_mask = 0;
int graph_idx = 0;
+ bool partial;

printk("%sCall Trace:\n", log_lvl);

unwind_start(&state, task, regs, stack);
stack = stack ? : get_stack_pointer(task, regs);
+ regs = unwind_get_entry_regs(&state, &partial);

/*
* Iterate through the stacks, starting with the current stack pointer.
@@ -120,7 +133,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - hardirq stack
* - entry stack
*/
- for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+ for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
const char *stack_name;

if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
@@ -140,7 +153,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk("%s <%s>\n", log_lvl, stack_name);

if (regs)
- show_regs_safe(&stack_info, regs);
+ show_regs_if_on_stack(&stack_info, regs, partial);

/*
* Scan the stack, printing any text addresses we find. At the
@@ -164,7 +177,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,

/*
* Don't print regs->ip again if it was already printed
- * by show_regs_safe() below.
+ * by show_regs_if_on_stack().
*/
if (regs && stack == &regs->ip)
goto next;
@@ -199,9 +212,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unwind_next_frame(&state);

/* if the frame has entry regs, print them */
- regs = unwind_get_entry_regs(&state);
+ regs = unwind_get_entry_regs(&state, &partial);
if (regs)
- show_regs_safe(&stack_info, regs);
+ show_regs_if_on_stack(&stack_info, regs, partial);
}

if (stack_name)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 517415978409..3cb2486c47e4 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -47,7 +47,7 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
+__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
.x86_tss = {
/*
* .sp0 is only used when entering ring 0 from a lower
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 8dabd7bf1673..60244bfaf88f 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -98,7 +98,7 @@ static int __save_stack_trace_reliable(struct stack_trace *trace,
for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state);
unwind_next_frame(&state)) {

- regs = unwind_get_entry_regs(&state);
+ regs = unwind_get_entry_regs(&state, NULL);
if (regs) {
/*
* Kernel mode registers on the stack indicate an
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index bce8aea65606..2da28ba97508 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -367,7 +367,8 @@ static void __init pti_setup_espfix64(void)
static void __init pti_clone_entry_text(void)
{
pti_clone_pmds((unsigned long) __entry_text_start,
- (unsigned long) __irqentry_text_end, _PAGE_RW);
+ (unsigned long) __irqentry_text_end,
+ _PAGE_RW | _PAGE_GLOBAL);
}

/*
diff --git a/security/Kconfig b/security/Kconfig
index a623d13bf288..3d4debd0257e 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -56,6 +56,7 @@ config SECURITY_NETWORK

config PAGE_TABLE_ISOLATION
bool "Remove the kernel mapping in user mode"
+ default y
depends on X86_64 && !UML
help
This feature reduces the number of hardware side channels by