[RFC PATCH 04/73] x86/entry: Implement direct switching for the switcher

From: Lai Jiangshan
Date: Mon Feb 26 2024 - 09:37:23 EST


From: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>

While a VM is running, every VM exit taken in the switcher is forwarded
to the hypervisor, which handles it and then returns to the switcher to
re-enter the VM. In some situations, the switcher can handle the VM exit
directly without involving the hypervisor. This is referred to as direct
switching, and it can reduce the overhead of guest/host state switching.
Currently, for simplicity, only the syscall event from user mode and the
ERETU synthetic instruction are eligible for direct switching.

Signed-off-by: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>
Signed-off-by: Hou Wenlong <houwenlong.hwl@xxxxxxxxxxxx>
---
arch/x86/entry/entry_64_switcher.S | 145 ++++++++++++++++++++++++++++-
arch/x86/include/asm/ptrace.h | 2 +
arch/x86/include/asm/switcher.h | 60 ++++++++++++
arch/x86/kernel/asm-offsets_64.c | 23 +++++
4 files changed, 229 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64_switcher.S b/arch/x86/entry/entry_64_switcher.S
index 2b99a46421cc..6f166d15635c 100644
--- a/arch/x86/entry/entry_64_switcher.S
+++ b/arch/x86/entry/entry_64_switcher.S
@@ -75,7 +75,7 @@ SYM_FUNC_START(switcher_enter_guest)

/* Switch to guest GSBASE and return to guest */
swapgs
- jmp native_irq_return_iret
+ jmp .L_switcher_return_to_guest

SYM_INNER_LABEL(switcher_return_from_guest, SYM_L_GLOBAL)
/* switch back to host cr3 when still on sp0/ist stack */
@@ -99,6 +99,23 @@ SYM_INNER_LABEL(switcher_return_from_guest, SYM_L_GLOBAL)
SYM_FUNC_END(switcher_enter_guest)
EXPORT_SYMBOL_GPL(switcher_enter_guest)

+.macro canonical_rcx
+ /*
+ * Make %rcx canonical: shift left, then arithmetic-shift right, so the
+ * top bits match the most significant address bit (47th or 56th bit
+ * depending on paging mode). If width of "canonical tail" ever becomes
+ * variable, this will need to be updated to remain correct on both old
+ * and new CPUs.
+ */
+#ifdef CONFIG_X86_5LEVEL
+ ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
+ "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
+#else
+ shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+ sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+#endif
+.endm
+
SYM_CODE_START(entry_SYSCALL_64_switcher)
UNWIND_HINT_ENTRY
ENDBR
@@ -117,7 +134,133 @@ SYM_INNER_LABEL(entry_SYSCALL_64_switcher_safe_stack, SYM_L_GLOBAL)
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
+ pushq %rdi /* put rdi on ORIG_RAX */
+
+ /* check if it can do direct switch from umod to smod */
+ testq $SWITCH_FLAGS_NO_DS_TO_SMOD, TSS_extra(switch_flags)
+ jnz .L_switcher_check_return_umod_instruction
+
+ /* Now it must be umod, start to do direct switch from umod to smod */
+ movq TSS_extra(pvcs), %rdi
+ movl %r11d, PVCS_eflags(%rdi)
+ movq %rcx, PVCS_rip(%rdi)
+ movq %rcx, PVCS_rcx(%rdi)
+ movq %r11, PVCS_r11(%rdi)
+ movq RSP-ORIG_RAX(%rsp), %rcx
+ movq %rcx, PVCS_rsp(%rdi)
+
+ /* switch umod to smod (switch_flags & cr3) */
+ xorb $SWITCH_FLAGS_MOD_TOGGLE, TSS_extra(switch_flags)
+ movq TSS_extra(smod_cr3), %rcx
+ movq %rcx, %cr3
+
+ /* load smod registers from TSS_extra to sp0 stack or %r11 */
+ movq TSS_extra(smod_rsp), %rcx
+ movq %rcx, RSP-ORIG_RAX(%rsp)
+ movq TSS_extra(smod_entry), %rcx
+ movq %rcx, RIP-ORIG_RAX(%rsp)
+ movq TSS_extra(smod_gsbase), %r11
+
+ /* switch host gsbase to guest gsbase, TSS_extra can't be used afterward */
+ swapgs
+
+ /* save guest gsbase as user_gsbase and switch to smod_gsbase */
+ rdgsbase %rcx
+ movq %rcx, PVCS_user_gsbase(%rdi)
+ wrgsbase %r11
+
+ /* restore umod rdi and smod rflags/r11, rip/rcx and rsp for sysretq */
+ popq %rdi
+ movq $SWITCH_ENTER_EFLAGS_FIXED, %r11
+ movq RIP-RIP(%rsp), %rcx
+
+.L_switcher_sysretq:
+ UNWIND_HINT_IRET_REGS
+ /* now everything is ready for sysretq except for %rsp */
+ movq RSP-RIP(%rsp), %rsp
+ /* No instruction can be added between setting the guest %rsp and doing sysretq */
+SYM_INNER_LABEL(entry_SYSRETQ_switcher_unsafe_stack, SYM_L_GLOBAL)
+ sysretq
+
+.L_switcher_check_return_umod_instruction:
+ UNWIND_HINT_IRET_REGS offset=8
+
+ /* check if it can do direct switch from smod to umod */
+ testq $SWITCH_FLAGS_NO_DS_TO_UMOD, TSS_extra(switch_flags)
+ jnz .L_switcher_return_to_hypervisor
+
+ /*
+ * Now it must be smod, check if it is the return-umod instruction.
+ * Switcher and the PVM specification define a SYSCALL instruction
+ * at TSS_extra(retu_rip) - 2 in smod as the return-umod instruction.
+ */
+ cmpq %rcx, TSS_extra(retu_rip)
+ jne .L_switcher_return_to_hypervisor
+
+ /* only handle for the most common cs/ss */
+ movq TSS_extra(pvcs), %rdi
+ cmpl $((__USER_DS << 16) | __USER_CS), PVCS_user_cs(%rdi)
+ jne .L_switcher_return_to_hypervisor
+
+ /* Switcher and the PVM specification require the smod RSP to be saved */
+ movq RSP-ORIG_RAX(%rsp), %rcx
+ movq %rcx, TSS_extra(smod_rsp)
+
+ /* switch smod to umod (switch_flags & cr3) */
+ xorb $SWITCH_FLAGS_MOD_TOGGLE, TSS_extra(switch_flags)
+ movq TSS_extra(umod_cr3), %rcx
+ movq %rcx, %cr3
+
+ /* switch host gsbase to guest gsbase, TSS_extra can't be used afterward */
+ swapgs
+
+ /* write umod gsbase */
+ movq PVCS_user_gsbase(%rdi), %rcx
+ canonical_rcx
+ wrgsbase %rcx
+
+ /* load sp, flags, ip to sp0 stack and cx, r11, rdi to registers */
+ movq PVCS_rsp(%rdi), %rcx
+ movq %rcx, RSP-ORIG_RAX(%rsp)
+ movl PVCS_eflags(%rdi), %r11d
+ movq %r11, EFLAGS-ORIG_RAX(%rsp)
+ movq PVCS_rip(%rdi), %rcx
+ movq %rcx, RIP-ORIG_RAX(%rsp)
+ movq PVCS_rcx(%rdi), %rcx
+ movq PVCS_r11(%rdi), %r11
+ popq %rdi // saved rdi (on ORIG_RAX)
+
+.L_switcher_return_to_guest:
+ /*
+ * Now the RSP points to an IRET frame with guest state on the
+ * top of the sp0 stack. Check if it can do sysretq.
+ */
+ UNWIND_HINT_IRET_REGS
+
+ andq $SWITCH_ENTER_EFLAGS_ALLOWED, EFLAGS-RIP(%rsp)
+ orq $SWITCH_ENTER_EFLAGS_FIXED, EFLAGS-RIP(%rsp)
+ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), EFLAGS-RIP(%rsp)
+ jnz native_irq_return_iret
+ cmpq %r11, EFLAGS-RIP(%rsp)
+ jne native_irq_return_iret
+
+ cmpq %rcx, RIP-RIP(%rsp)
+ jne native_irq_return_iret
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the guest take over
+ * the host, since guest controls RSP.
+ */
+ canonical_rcx
+ cmpq %rcx, RIP-RIP(%rsp)
+ je .L_switcher_sysretq
+
+ /* RCX matches for RIP only before RCX is canonicalized, restore RCX and do IRET. */
+ movq RIP-RIP(%rsp), %rcx
+ jmp native_irq_return_iret

+.L_switcher_return_to_hypervisor:
+ popq %rdi /* saved rdi */
pushq $0 /* pt_regs->orig_ax */
movl $SWITCH_EXIT_REASONS_SYSCALL, 4(%rsp)

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 9eeeb5fdd387..322697877a2d 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -198,6 +198,8 @@ static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs)
ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_64_switcher &&
regs->ip < (unsigned long)entry_SYSCALL_64_switcher_safe_stack);

+ ret = ret || (regs->ip == (unsigned long)entry_SYSRETQ_switcher_unsafe_stack);
+
return ret;
}
#endif
diff --git a/arch/x86/include/asm/switcher.h b/arch/x86/include/asm/switcher.h
index dbf1970ca62f..35a60f4044c4 100644
--- a/arch/x86/include/asm/switcher.h
+++ b/arch/x86/include/asm/switcher.h
@@ -8,6 +8,40 @@
#define SWITCH_EXIT_REASONS_SYSCALL 1024
#define SWITCH_EXIT_REASONS_FAILED_VMETNRY 1025

+/*
+ * SWITCH_FLAGS control how the switcher code works; they mostly
+ * dictate whether it should do the guest ring switch directly
+ * or just go back to the hypervisor.
+ *
+ * SMOD and UMOD
+ * Current vcpu mode. Use two parity bits to simplify direct-switch
+ * flags checking.
+ *
+ * NO_DS_CR3
+ * Do not direct switch because smod_cr3 or umod_cr3 has not been
+ * prepared.
+ */
+#define SWITCH_FLAGS_SMOD _BITULL(0)
+#define SWITCH_FLAGS_UMOD _BITULL(1)
+#define SWITCH_FLAGS_NO_DS_CR3 _BITULL(2)
+
+#define SWITCH_FLAGS_MOD_TOGGLE (SWITCH_FLAGS_SMOD | SWITCH_FLAGS_UMOD)
+
+/*
+ * Direct switching disabling bits are all the bits other than
+ * SWITCH_FLAGS_SMOD or SWITCH_FLAGS_UMOD. Bits 8-63 are defined by the driver
+ * using the switcher. Direct switching is enabled if all the disabling bits
+ * are cleared.
+ *
+ * SWITCH_FLAGS_NO_DS_TO_SMOD: not to direct switch to smod due to any
+ * disabling bit or smod bit being set.
+ *
+ * SWITCH_FLAGS_NO_DS_TO_UMOD: not to direct switch to umod due to any
+ * disabling bit or umod bit being set.
+ */
+#define SWITCH_FLAGS_NO_DS_TO_SMOD (~SWITCH_FLAGS_UMOD)
+#define SWITCH_FLAGS_NO_DS_TO_UMOD (~SWITCH_FLAGS_SMOD)
+
/* Bits allowed to be set in the underlying eflags */
#define SWITCH_ENTER_EFLAGS_ALLOWED (X86_EFLAGS_FIXED | X86_EFLAGS_IF |\
X86_EFLAGS_TF | X86_EFLAGS_RF |\
@@ -24,6 +58,7 @@
#include <linux/cache.h>

struct pt_regs;
+struct pvm_vcpu_struct;

/*
* Extra per CPU control structure lives in the struct tss_struct.
@@ -46,6 +81,31 @@ struct tss_extra {
unsigned long host_rsp;
/* Prepared guest CR3 to be loaded before VM enter. */
unsigned long enter_cr3;
+
+ /*
+ * Direct switching flag indicates whether direct switching
+ * is allowed.
+ */
+ unsigned long switch_flags ____cacheline_aligned;
+ /*
+ * Guest supervisor mode hardware CR3 for direct switching of guest
+ * user mode syscall.
+ */
+ unsigned long smod_cr3;
+ /*
+ * Guest user mode hardware CR3 for direct switching of guest ERETU
+ * synthetic instruction.
+ */
+ unsigned long umod_cr3;
+ /*
+ * The current PVCS for saving and restoring guest user mode context
+ * in direct switching.
+ */
+ struct pvm_vcpu_struct *pvcs;
+ unsigned long retu_rip;
+ unsigned long smod_entry;
+ unsigned long smod_gsbase;
+ unsigned long smod_rsp;
} ____cacheline_aligned;

extern struct pt_regs *switcher_enter_guest(void);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 1485cbda6dc4..8230bd27f0b3 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -4,6 +4,7 @@
#endif

#include <asm/ia32.h>
+#include <asm/pvm_para.h>

#if defined(CONFIG_KVM_GUEST)
#include <asm/kvm_para.h>
@@ -65,6 +66,28 @@ int main(void)
ENTRY(host_cr3);
ENTRY(host_rsp);
ENTRY(enter_cr3);
+ ENTRY(switch_flags);
+ ENTRY(smod_cr3);
+ ENTRY(umod_cr3);
+ ENTRY(pvcs);
+ ENTRY(retu_rip);
+ ENTRY(smod_entry);
+ ENTRY(smod_gsbase);
+ ENTRY(smod_rsp);
+ BLANK();
+#undef ENTRY
+
+#define ENTRY(entry) OFFSET(PVCS_ ## entry, pvm_vcpu_struct, entry)
+ ENTRY(event_flags);
+ ENTRY(event_errcode);
+ ENTRY(user_cs);
+ ENTRY(user_ss);
+ ENTRY(user_gsbase);
+ ENTRY(rsp);
+ ENTRY(eflags);
+ ENTRY(rip);
+ ENTRY(rcx);
+ ENTRY(r11);
BLANK();
#undef ENTRY

--
2.19.1.6.gb485710b