[RFC PATCH 64/73] x86/pvm: Enable PVM event delivery

From: Lai Jiangshan
Date: Mon Feb 26 2024 - 10:00:44 EST


From: Hou Wenlong <houwenlong.hwl@xxxxxxxxxxxx>

Invoke pvm_early_setup() after idt_setup_early_handler() to enable
event delivery during early boot, and make cpu_init_exception_handling()
call pvm_setup_event_handling() to enable event delivery for the
current CPU. For the syscall event, point MSR_LSTAR at the PVM-specific
entry. Since a PVM guest must return to user mode via the eretu
synthetic instruction rather than SYSRET or IRET, also divert the
syscall fast-return check and the return-to-usermode path in
entry_64.S to the PVM-specific code.
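
The resulting setup order is roughly (abridged from the hunks below):

    x86_64_start_kernel()
      idt_setup_early_handler()
      pvm_early_setup()             <- boot-time event delivery on

    cpu_init_exception_handling()
      load_current_idt()
      pvm_setup_event_handling()    <- per-CPU event delivery on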

Signed-off-by: Hou Wenlong <houwenlong.hwl@xxxxxxxxxxxx>
Signed-off-by: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>
---
arch/x86/entry/entry_64.S | 9 ++++++--
arch/x86/include/asm/pvm_para.h | 5 +++++
arch/x86/kernel/cpu/common.c | 11 ++++++++++
arch/x86/kernel/head64.c | 3 +++
arch/x86/kernel/idt.c | 2 ++
arch/x86/kernel/pvm.c | 37 +++++++++++++++++++++++++++++++++
6 files changed, 65 insertions(+), 2 deletions(-)
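
Note (illustration only, not part of the patch): pvm_early_setup()
below points MSR_PVM_EVENT_ENTRY at pvm_early_kernel_event_entry - 256,
while pvm_setup_event_handling() later points it at
pvm_user_event_entry. A minimal sketch of the entry layout this assumes,
with hypothetical constant names; only the 256-byte offset is taken
from the patch itself:

/*
 * Assumed MSR_PVM_EVENT_ENTRY layout:
 *   base + 0   : entry used for events arriving from user mode
 *   base + 256 : entry used for events arriving from supervisor mode
 *
 * Early in boot there is no user mode yet, so programming the MSR
 * with pvm_early_kernel_event_entry - 256 makes supervisor-mode
 * events land directly on the early kernel handler.
 */
#define PVM_EVENT_ENTRY_USER_OFF	0	/* hypothetical name */
#define PVM_EVENT_ENTRY_KERNEL_OFF	256	/* hypothetical name */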

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 5b25ea4a16ae..fe12605b3c05 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -124,10 +124,12 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
* a completely clean 64-bit userspace context. If we're not,
* go to the slow exit path.
* In the Xen PV case we must use iret anyway.
+ * In the PVM guest case we must use the eretu synthetic instruction.
*/

- ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
- "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+ ALTERNATIVE_2 "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV, \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_KVM_PVM_GUEST

/*
* We win! This label is here just for ease of understanding
@@ -597,6 +599,9 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
#ifdef CONFIG_XEN_PV
ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
#endif
+#ifdef CONFIG_PVM_GUEST
+ ALTERNATIVE "", "jmp pvm_restore_regs_and_return_to_usermode", X86_FEATURE_KVM_PVM_GUEST
+#endif

POP_REGS pop_rdi=0

diff --git a/arch/x86/include/asm/pvm_para.h b/arch/x86/include/asm/pvm_para.h
index 72c74545dba6..f5d40a57c423 100644
--- a/arch/x86/include/asm/pvm_para.h
+++ b/arch/x86/include/asm/pvm_para.h
@@ -15,6 +15,7 @@ typedef void (*idtentry_t)(struct pt_regs *regs);
void __init pvm_early_setup(void);
void __init pvm_setup_early_traps(void);
void __init pvm_install_sysvec(unsigned int sysvec, idtentry_t handler);
+void pvm_setup_event_handling(void);
bool __init pvm_kernel_layout_relocate(void);

static inline void pvm_cpuid(unsigned int *eax, unsigned int *ebx,
@@ -79,6 +80,10 @@ static inline void pvm_install_sysvec(unsigned int sysvec, idtentry_t handler)
{
}

+static inline void pvm_setup_event_handling(void)
+{
+}
+
static inline bool pvm_kernel_layout_relocate(void)
{
return false;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 45f214e41a9a..89874559dbc2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -66,6 +66,7 @@
#include <asm/set_memory.h>
#include <asm/traps.h>
#include <asm/sev.h>
+#include <asm/pvm_para.h>

#include "cpu.h"

@@ -2066,7 +2067,15 @@ static void wrmsrl_cstar(unsigned long val)
void syscall_init(void)
{
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
+
+#ifdef CONFIG_PVM_GUEST
+ if (boot_cpu_has(X86_FEATURE_KVM_PVM_GUEST))
+ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64_pvm);
+ else
+ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+#else
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+#endif

if (ia32_enabled()) {
wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
@@ -2217,6 +2226,8 @@ void cpu_init_exception_handling(void)

/* Finally load the IDT */
load_current_idt();
+
+ pvm_setup_event_handling();
}

/*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index d0e8d648bd38..17cd11dd1f03 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -42,6 +42,7 @@
#include <asm/sev.h>
#include <asm/tdx.h>
#include <asm/init.h>
+#include <asm/pvm_para.h>

/*
* Manage page tables very early on.
@@ -286,6 +287,8 @@ asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode

idt_setup_early_handler();

+ pvm_early_setup();
+
/* Needed before cc_platform_has() can be used for TDX */
tdx_early_init();

diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 660b601f1d6c..0dc3ded6da01 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -12,6 +12,7 @@
#include <asm/hw_irq.h>
#include <asm/ia32.h>
#include <asm/idtentry.h>
+#include <asm/pvm_para.h>

#define DPL0 0x0
#define DPL3 0x3
@@ -259,6 +260,7 @@ void __init idt_setup_early_pf(void)
{
idt_setup_from_table(idt_table, early_pf_idts,
ARRAY_SIZE(early_pf_idts), true);
+ pvm_setup_early_traps();
}
#endif

diff --git a/arch/x86/kernel/pvm.c b/arch/x86/kernel/pvm.c
index 352d74394c4a..c38e46a96ad3 100644
--- a/arch/x86/kernel/pvm.c
+++ b/arch/x86/kernel/pvm.c
@@ -286,12 +286,49 @@ __visible noinstr void pvm_event(struct pt_regs *regs)
common_interrupt(regs, vector);
}

+extern void pvm_early_kernel_event_entry(void);
+
+/*
+ * Reserve a fixed-size area on the current stack when delivering an
+ * event from supervisor mode, so the int3 handler can use it to
+ * emulate a call instruction.
+ */
+#define PVM_SUPERVISOR_REDZONE_SIZE (2*8UL)
+
void __init pvm_early_setup(void)
{
if (!pvm_range_end)
return;

setup_force_cpu_cap(X86_FEATURE_KVM_PVM_GUEST);
+
+ wrmsrl(MSR_PVM_VCPU_STRUCT, __pa(this_cpu_ptr(&pvm_vcpu_struct)));
+ wrmsrl(MSR_PVM_EVENT_ENTRY, (unsigned long)(void *)pvm_early_kernel_event_entry - 256);
+ wrmsrl(MSR_PVM_SUPERVISOR_REDZONE, PVM_SUPERVISOR_REDZONE_SIZE);
+ wrmsrl(MSR_PVM_RETS_RIP, (unsigned long)(void *)pvm_rets_rip);
+}
+
+void pvm_setup_event_handling(void)
+{
+ if (boot_cpu_has(X86_FEATURE_KVM_PVM_GUEST)) {
+ u64 xpa = slow_virt_to_phys(this_cpu_ptr(&pvm_vcpu_struct));
+
+ wrmsrl(MSR_PVM_VCPU_STRUCT, xpa);
+ wrmsrl(MSR_PVM_EVENT_ENTRY, (unsigned long)(void *)pvm_user_event_entry);
+ wrmsrl(MSR_PVM_SUPERVISOR_REDZONE, PVM_SUPERVISOR_REDZONE_SIZE);
+ wrmsrl(MSR_PVM_RETU_RIP, (unsigned long)(void *)pvm_retu_rip);
+ wrmsrl(MSR_PVM_RETS_RIP, (unsigned long)(void *)pvm_rets_rip);
+
+ /*
+ * The PVM spec requires the hypervisor-maintained
+ * MSR_KERNEL_GS_BASE to match the kernel GSBASE for event
+ * delivery from user mode. wrmsrl(MSR_KERNEL_GS_BASE) would
+ * only update the user GSBASE in the PVCS via pvm_write_msr(),
+ * without involving the hypervisor, so use the PVM_HC_WRMSR
+ * hypercall instead.
+ */
+ pvm_hypercall2(PVM_HC_WRMSR, MSR_KERNEL_GS_BASE,
+ cpu_kernelmode_gs_base(smp_processor_id()));
+ }
}

#define TB_SHIFT 40
--
2.19.1.6.gb485710b