If a task has the TIF_NOPTI flag set, it doesn't want to experience
page table isolation. In this case, returns from kernel to user will
not switch CR3, leaving it pointing to the kernel PGD, which already
maps both user and kernel pages. Upon entry into the kernel, we can't
check this flag, so we simply check whether CR3 is already pointing to
the kernel's PGD, which indicates that no switch was done on the
earlier return to user, and in this case we don't change it.
Thanks to these changes, haproxy running under KVM went back from
12400 conn/s to 21000 conn/s once loaded after calling prctl().
Signed-off-by: Willy Tarreau <w@xxxxxx>
---
arch/x86/entry/calling.h | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 45a63e0..054b8b7 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,5 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/jump_label.h>
+#include <asm/thread_info.h>
#include <asm/unwind_hints.h>
#include <asm/cpufeatures.h>
#include <asm/page_types.h>
@@ -214,6 +215,11 @@
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
mov %cr3, \scratch_reg
+
+ /* if we're already on the kernel PGD, we don't switch */
+ testq $(PTI_SWITCH_PGTABLES_MASK), \scratch_reg
+ jz .Lend_\@
+
ADJUST_KERNEL_CR3 \scratch_reg
mov \scratch_reg, %cr3
.Lend_\@:
@@ -224,6 +230,12 @@
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+ /* "NOPTI" taskflag avoids the switch */
+ movq PER_CPU_VAR(current_task), \scratch_reg
+ btq $TIF_NOPTI, TASK_TI_flags(\scratch_reg)
+ jc .Lend_\@
+
mov %cr3, \scratch_reg
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
@@ -262,6 +274,13 @@
ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
movq %cr3, \scratch_reg
movq \scratch_reg, \save_reg
+
+ /* if we're already on the kernel PGD, we don't switch,
+ * we just save the current cr3.
+ */
+ testq $(PTI_SWITCH_PGTABLES_MASK), \scratch_reg
+ jz .Ldone_\@
+
/*
* Is the "switch mask" all zero? That means that both of
* these are zero:
@@ -284,6 +303,10 @@
.macro RESTORE_CR3 scratch_reg:req save_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ /* if we saved a kernel context, we didn't switch so we don't switch */
+ testq $(PTI_SWITCH_PGTABLES_MASK), \save_reg
+ jz .Lend_\@
+