[RFC] x86: Avoid CR3 load on compatibility mode with PTI
From: Nadav Amit
Date: Sun Jan 14 2018 - 22:41:10 EST
Currently, when page-table isolation (PTI) is enabled to mitigate the
Meltdown bug (CVE-2017-5754), CR3 is reloaded on every system call and
interrupt. However, it appears that this is an unnecessary measure when
programs run in compatibility mode. In this mode only 32-bit registers
are available, which means that there *should* be no way for the CPU to
access, even speculatively, memory that belongs to the kernel, which
sits at high addresses.
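As an illustration of this assumption (a userspace sketch, not part of
the patch): code built for compatibility mode, e.g. with "gcc -m32",
cannot even express a kernel address, since pointers are 32 bits wide:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* A typical kernel text address; truncated to 32 bits on i386. */
	uintptr_t addr = (uintptr_t)0xffffffff81000000ULL;

	printf("sizeof(void *) = %zu\n", sizeof(void *));	/* 4 */
	printf("addr = 0x%jx\n", (uintmax_t)addr);		/* 0x81000000 */
	return 0;
}

Whether the same holds for every speculative address computation the
CPU can perform is exactly the question this RFC asks for feedback on.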
While this may seem like an "uninteresting" case, it opens the
possibility of running I/O-intensive applications in compatibility mode
with improved performance relative to their 64-bit counterparts. These
applications are arguably also the ones that are least affected by the
less efficient 32-bit code.
For example, on a Dell R630 (Intel E5-2670 v3):
Redis (redis-benchmark)
======================
                   x86_64        i386   improvement
             --------------------------------------
PING_INLINE:    103519.66   122249.39       18.09%
PING_BULK:      104493.20   120918.98       15.72%
SET:             86580.09   126103.41       45.65%
GET:             87719.30   124533.00       41.97%
INCR:            88573.96   123915.73       39.90%
LPUSH:           88731.15    99502.48       12.14%
RPUSH:           88417.33    99108.03       12.09%
LPOP:            88261.25    99304.87       12.51%
RPOP:            88183.43    98716.68       11.94%
SADD:            88028.16    98911.97       12.36%
SPOP:            88261.25    97465.89       10.43%
LPUSH:           87873.46    99108.03       12.78%
LRANGE_100:      51572.98    47326.08       -8.23%
LRANGE_300:      20383.20    16528.93      -18.91%
LRANGE_500:      13259.08    10617.97      -19.92%
LRANGE_600:      10246.95     8389.26      -18.13%
MSET:            89847.26    93370.68        3.92%
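The Redis numbers are requests per second as reported by
redis-benchmark, and the improvement column is the relative change
between the two runs. As a quick check of the arithmetic for the SET
row (illustrative only):

#include <stdio.h>

int main(void)
{
	double rps_64 = 86580.09, rps_32 = 126103.41;	/* SET row above */

	/* (126103.41 - 86580.09) / 86580.09 = 45.65% */
	printf("%.2f%%\n", (rps_32 - rps_64) / rps_64 * 100.0);
	return 0;
}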
Apache (wrk -c 8 -t 4 --latency -d 30s http://localhost)
========================================================
              x86_64      i386   improvement
Latency:    137.11us  312.18us          -56%
Req/Sec:      17.19k    17.02k           -1%
This patch provides a rough implementation. It is mainly intended to
present the idea and to gather feedback on whether compatibility mode
can be considered safe. In this implementation a compatibility-mode
task simply keeps running on the kernel page tables; as a safety net,
the 64-bit user code-segment descriptor is cleared from the GDT while
such a task runs, so an erroneous return to 64-bit user mode faults
instead of executing user code with the kernel CR3. The NX marking of
user mappings in the kernel page tables is dropped (last hunk), since
compatibility-mode user code now legitimately runs with the kernel CR3
loaded. This overlaps with Willy Tarreau's work, and indeed the NX-bit
handling should be resolved more cleanly.
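For reviewers, the entry-side test in the calling.h hunk below depends
on the PTI CR3 layout. A minimal C restatement of that test (a sketch,
assuming the current layout in which the user page-table bit is bit 12
of CR3 and the user-PCID bit is bit 11):

#include <stdbool.h>

/*
 * Sketch only: under PTI the kernel and user PGDs share one 8KB
 * allocation, so bit 12 of CR3 selects the user half, and bit 11
 * selects the user PCID. If both bits are clear, the entry happened
 * with the kernel CR3 already loaded -- which is what this patch
 * arranges for compatibility-mode tasks -- and the CR3 write can be
 * skipped.
 */
#define PTI_SWITCH_MASK	((1UL << 12) | (1UL << 11))

static bool entered_on_kernel_cr3(unsigned long cr3)
{
	return (cr3 & PTI_SWITCH_MASK) == 0;
}

int main(void)
{
	/* A kernel CR3 value (bits 11 and 12 clear) skips the switch. */
	return entered_on_kernel_cr3(0x1000000UL) ? 0 : 1;
}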
Signed-off-by: Nadav Amit <namit@xxxxxxxxxx>
---
arch/x86/entry/calling.h | 29 +++++++++++++++++++++++++++++
arch/x86/entry/entry_64_compat.S | 4 ++--
arch/x86/include/asm/desc.h | 17 +++++++++++++++++
arch/x86/include/asm/tlbflush.h | 9 ++++++++-
arch/x86/kernel/process_64.c | 11 +++++++++++
arch/x86/mm/pti.c | 17 -----------------
6 files changed, 67 insertions(+), 20 deletions(-)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 45a63e00a6af..9b0190f4a96b 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -213,7 +213,14 @@ For 32-bit we have the following conventions - kernel is built with
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+ /*
+ * Do not switch if the kernel CR3 is already loaded (compatibility mode).
+ */
mov %cr3, \scratch_reg
+ testq $PTI_SWITCH_MASK, \scratch_reg
+ jz .Lend_\@
+
ADJUST_KERNEL_CR3 \scratch_reg
mov \scratch_reg, %cr3
.Lend_\@:
@@ -224,6 +231,16 @@ For 32-bit we have the following conventions - kernel is built with
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+ /*
+ * Do not switch on compatibility mode. Since the serializing CR3
+ * write is skipped, run lfence to prevent a speculative return to
+ * user mode with the wrong CR3.
+ */
+ movq PER_CPU_VAR(current_task), \scratch_reg
+ testl $_TIF_IA32, TASK_TI_flags(\scratch_reg)
+ jnz .Lno_spec_\@
+
mov %cr3, \scratch_reg
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
@@ -241,6 +258,10 @@ For 32-bit we have the following conventions - kernel is built with
movq \scratch_reg2, \scratch_reg
jmp .Lwrcr3_\@
+.Lno_spec_\@:
+ lfence
+ jmp .Lend_\@
+
.Lnoflush_\@:
movq \scratch_reg2, \scratch_reg
SET_NOFLUSH_BIT \scratch_reg
@@ -286,6 +307,10 @@ For 32-bit we have the following conventions - kernel is built with
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+ movq PER_CPU_VAR(current_task), \scratch_reg
+ testl $_TIF_IA32, TASK_TI_flags(\scratch_reg)
+ jnz .Lno_spec_\@
+
/*
* KERNEL pages can always resume with NOFLUSH as we do
* explicit flushes.
@@ -305,6 +330,10 @@ For 32-bit we have the following conventions - kernel is built with
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
jmp .Lwrcr3_\@
+.Lno_spec_\@:
+ lfence
+ jmp .Lend_\@
+
.Lnoflush_\@:
SET_NOFLUSH_BIT \save_reg
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 98d5358e4041..0d520c80ef36 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -4,7 +4,7 @@
*
* Copyright 2000-2002 Andi Kleen, SuSE Labs.
*/
-#include "calling.h"
+#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/current.h>
#include <asm/errno.h>
@@ -14,8 +14,8 @@
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
-#include <linux/linkage.h>
#include <linux/err.h>
+#include "calling.h"
.section .entry.text, "ax"
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 13c5ee878a47..5d139f915b2e 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -431,6 +431,23 @@ static inline void load_current_idt(void)
load_idt((const struct desc_ptr *)&idt_descr);
}
+static inline void remove_user_cs64(void)
+{
+ struct desc_struct *d = get_cpu_gdt_rw(smp_processor_id());
+ struct desc_struct user_cs = {0};
+
+ write_gdt_entry(d, GDT_ENTRY_DEFAULT_USER_CS, &user_cs, DESCTYPE_S);
+}
+
+static inline void restore_user_cs64(void)
+{
+ struct desc_struct *d = get_cpu_gdt_rw(smp_processor_id());
+ struct desc_struct user_cs = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff);
+
+ write_gdt_entry(d, GDT_ENTRY_DEFAULT_USER_CS, &user_cs, DESCTYPE_S);
+}
+
+
extern void idt_setup_early_handler(void);
extern void idt_setup_early_traps(void);
extern void idt_setup_traps(void);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4a08dd2ab32a..d11480130ef1 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -355,7 +355,8 @@ static inline void __native_flush_tlb(void)
*/
WARN_ON_ONCE(preemptible());
- invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+ if (!(current->mm && test_thread_flag(TIF_IA32)))
+ invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* If current->mm == NULL then the read_cr3() "borrows" an mm */
native_write_cr3(__native_read_cr3());
@@ -407,6 +408,9 @@ static inline void __native_flush_tlb_single(unsigned long addr)
if (!static_cpu_has(X86_FEATURE_PTI))
return;
+ if (current->mm && test_thread_flag(TIF_IA32))
+ return;
+
/*
* Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
* Just use invalidate_user_asid() in case we are called early.
@@ -443,6 +447,9 @@ static inline void __flush_tlb_one(unsigned long addr)
if (!static_cpu_has(X86_FEATURE_PTI))
return;
+ if (current->mm && test_thread_flag(TIF_IA32))
+ return;
+
/*
* __flush_tlb_single() will have cleared the TLB entry for this ASID,
* but since kernel space is replicated across all, we must also
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c75466232016..01235402cbbe 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -473,6 +473,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev_p, next_p, tss);
+#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_IA32_EMULATION)
+ if (unlikely(static_cpu_has(X86_FEATURE_PTI) &&
+ (task_thread_info(next_p)->flags ^
+ task_thread_info(prev_p)->flags) & _TIF_IA32)) {
+ if (task_thread_info(next_p)->flags & _TIF_IA32)
+ remove_user_cs64();
+ else
+ restore_user_cs64();
+ }
+#endif
+
#ifdef CONFIG_XEN_PV
/*
* On Xen PV, IOPL bits in pt_regs->flags have no effect, and
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 43d4a4a29037..c7d7f5a35d8c 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -122,23 +122,6 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
*/
kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
- /*
- * If this is normal user memory, make it NX in the kernel
- * pagetables so that, if we somehow screw up and return to
- * usermode with the kernel CR3 loaded, we'll get a page fault
- * instead of allowing user code to execute with the wrong CR3.
- *
- * As exceptions, we don't set NX if:
- * - _PAGE_USER is not set. This could be an executable
- * EFI runtime mapping or something similar, and the kernel
- * may execute from it
- * - we don't have NX support
- * - we're clearing the PGD (i.e. the new pgd is not present).
- */
- if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
- (__supported_pte_mask & _PAGE_NX))
- pgd.pgd |= _PAGE_NX;
-
/* return the copy of the PGD we want the kernel to use: */
return pgd;
}
--
2.14.1