[RFC v2-fix-v3 1/1] x86/boot: Avoid #VE during boot for TDX platforms
From: Kuppuswamy Sathyanarayanan
Date: Mon May 24 2021 - 19:28:14 EST
From: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
In TDX guests, Virtualization Exceptions (#VE) are delivered
in response to specific guest actions such as MSR writes,
CPUID leaf accesses or I/O accesses. But in early boot code, #VE
cannot be allowed because the required exception handler setup
support code is missing. If a #VE is triggered without proper
handler support, it would lead to a triple fault or a kernel hang.
So, avoid operations which will inject a #VE during the boot process.
They're easy to avoid, and avoiding them is less complex than
handling the exceptions.
There are a few MSRs and control register bits which the kernel
normally needs to modify during boot. But, TDX disallows
modification of these registers to help provide consistent
security guarantees. Fortunately, TDX ensures that these are all
in the correct state before the kernel loads, which means the
kernel has no need to modify them.
The conditions to avoid are:
* Any writes to the EFER MSR
* Clearing CR0.NE
* Clearing CR4.MCE
This theoretically makes guest boot more fragile. If, for
instance, EFER was set up incorrectly and a WRMSR was performed,
the resulting (unhandled) #VE would triple fault. However, this
is likely to trip up the guest BIOS long before control reaches
the kernel. In any case, these kinds of problems are unlikely to
occur in production environments, and developers have good debug
tools to fix them quickly.
Signed-off-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
Reviewed-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@xxxxxxxxxxxxxxx>
---
Changes since RFC v2-fix-v2:
* Fixed commit log as per review comments.
Changes since RFC v2-fix:
* Fixed commit and comments as per Dave and Dan's suggestions.
* Merged CR0.NE related change in pa_trampoline_compat() from patch
titled "x86/boot: Add a trampoline for APs booting in 64-bit mode"
to this patch. It belongs in this patch.
* Merged TRAMPOLINE_32BIT_CODE_SIZE related change from patch titled
"x86/boot: Add a trampoline for APs booting in 64-bit mode" to this
patch (since it was wrongly merged to that patch during patch split).
arch/x86/boot/compressed/head_64.S | 16 ++++++++++++----
arch/x86/boot/compressed/pgtable.h | 2 +-
arch/x86/kernel/head_64.S | 20 ++++++++++++++++++--
arch/x86/realmode/rm/trampoline_64.S | 23 +++++++++++++++++++----
4 files changed, 50 insertions(+), 11 deletions(-)
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index e94874f4bbc1..f848569e3fb0 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -616,12 +616,20 @@ SYM_CODE_START(trampoline_32bit_src)
movl $MSR_EFER, %ecx
rdmsr
btsl $_EFER_LME, %eax
+ /* Avoid writing EFER if no change was made (for TDX guest) */
+ jc 1f
wrmsr
- popl %edx
+1: popl %edx
popl %ecx
/* Enable PAE and LA57 (if required) paging modes */
- movl $X86_CR4_PAE, %eax
+ movl %cr4, %eax
+ /*
+ * Clear all bits except CR4.MCE, which is preserved.
+ * Clearing CR4.MCE will #VE in TDX guests.
+ */
+ andl $X86_CR4_MCE, %eax
+ orl $X86_CR4_PAE, %eax
testl %edx, %edx
jz 1f
orl $X86_CR4_LA57, %eax
@@ -635,8 +643,8 @@ SYM_CODE_START(trampoline_32bit_src)
pushl $__KERNEL_CS
pushl %eax
- /* Enable paging again */
- movl $(X86_CR0_PG | X86_CR0_PE), %eax
+ /* Enable paging again. Avoid clearing X86_CR0_NE for TDX */
+ movl $(X86_CR0_PG | X86_CR0_NE | X86_CR0_PE), %eax
movl %eax, %cr0
lret
diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h
index 6ff7e81b5628..cc9b2529a086 100644
--- a/arch/x86/boot/compressed/pgtable.h
+++ b/arch/x86/boot/compressed/pgtable.h
@@ -6,7 +6,7 @@
#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0
#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
-#define TRAMPOLINE_32BIT_CODE_SIZE 0x70
+#define TRAMPOLINE_32BIT_CODE_SIZE 0x80
#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 04bddaaba8e2..6cf8d126b80a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -141,7 +141,13 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
1:
/* Enable PAE mode, PGE and LA57 */
- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+ movq %cr4, %rcx
+ /*
+ * Clear all bits except CR4.MCE, which is preserved.
+ * Clearing CR4.MCE will #VE in TDX guests.
+ */
+ andl $X86_CR4_MCE, %ecx
+ orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
testl $1, __pgtable_l5_enabled(%rip)
jz 1f
@@ -229,13 +235,23 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
/* Setup EFER (Extended Feature Enable Register) */
movl $MSR_EFER, %ecx
rdmsr
+ /*
+ * Preserve current value of EFER for comparison and to skip
+ * EFER writes if no change was made (for TDX guest)
+ */
+ movl %eax, %edx
btsl $_EFER_SCE, %eax /* Enable System Call */
btl $20,%edi /* No Execute supported? */
jnc 1f
btsl $_EFER_NX, %eax
btsq $_PAGE_BIT_NX,early_pmd_flags(%rip)
-1: wrmsr /* Make changes effective */
+ /* Avoid writing EFER if no change was made (for TDX guest) */
+1: cmpl %edx, %eax
+ je 1f
+ xor %edx, %edx
+ wrmsr /* Make changes effective */
+1:
/* Setup cr0 */
movl $CR0_STATE, %eax
/* Make changes effective */
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index 957bb21ce105..cf14d0326a48 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -143,13 +143,27 @@ SYM_CODE_START(startup_32)
movl %eax, %cr3
# Set up EFER
+ movl $MSR_EFER, %ecx
+ rdmsr
+ /*
+ * Skip writing to EFER if the register already has the desired
+ * value (to avoid #VE for TDX guest).
+ */
+ cmp pa_tr_efer, %eax
+ jne .Lwrite_efer
+ cmp pa_tr_efer + 4, %edx
+ je .Ldone_efer
+.Lwrite_efer:
movl pa_tr_efer, %eax
movl pa_tr_efer + 4, %edx
- movl $MSR_EFER, %ecx
wrmsr
- # Enable paging and in turn activate Long Mode
- movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_PE), %eax
+.Ldone_efer:
+ /*
+ * Enable paging and in turn activate Long Mode. Avoid clearing
+ * X86_CR0_NE for TDX.
+ */
+ movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_NE | X86_CR0_PE), %eax
movl %eax, %cr0
/*
@@ -169,7 +183,8 @@ SYM_CODE_START(pa_trampoline_compat)
movl $rm_stack_end, %esp
movw $__KERNEL_DS, %dx
- movl $X86_CR0_PE, %eax
+ /* Avoid clearing X86_CR0_NE for TDX */
+ movl $(X86_CR0_NE | X86_CR0_PE), %eax
movl %eax, %cr0
ljmpl $__KERNEL32_CS, $pa_startup_32
SYM_CODE_END(pa_trampoline_compat)
--
2.25.1