[RFC PATCH 43/73] KVM: x86/PVM: Enable direct switching

From: Lai Jiangshan
Date: Mon Feb 26 2024 - 09:53:03 EST


From: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>

To enable direct switching, the switcher needs certain information to be
prepared in the TSS. Since only syscall and the RETU hypercall are
supported for now, the CPL-switching information must be set up before
VM entry. Additionally, after VM exit, the hypervisor's state must be
updated if direct switching has occurred.

Signed-off-by: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>
Signed-off-by: Hou Wenlong <houwenlong.hwl@xxxxxxxxxxxx>
---
arch/x86/kvm/pvm/pvm.c | 87 +++++++++++++++++++++++++++++++++++++++++-
arch/x86/kvm/pvm/pvm.h | 15 ++++++++
2 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/pvm/pvm.c b/arch/x86/kvm/pvm/pvm.c
index 6ac599587567..138d0c255cb8 100644
--- a/arch/x86/kvm/pvm/pvm.c
+++ b/arch/x86/kvm/pvm/pvm.c
@@ -559,23 +559,70 @@ static void pvm_flush_hwtlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
put_cpu();
}

+static bool check_switch_cr3(struct vcpu_pvm *pvm, u64 switch_host_cr3)
+{
+ u64 root = pvm->vcpu.arch.mmu->prev_roots[0].hpa;
+
+ if (pvm->vcpu.arch.mmu->prev_roots[0].pgd != pvm->msr_switch_cr3)
+ return false;
+ if (!VALID_PAGE(root))
+ return false;
+ if (host_pcid_owner(switch_host_cr3 & X86_CR3_PCID_MASK) != pvm)
+ return false;
+ if (host_pcid_root(switch_host_cr3 & X86_CR3_PCID_MASK) != root)
+ return false;
+ if (root != (switch_host_cr3 & CR3_ADDR_MASK))
+ return false;
+
+ return true;
+}
+
static void pvm_set_host_cr3_for_guest_with_host_pcid(struct vcpu_pvm *pvm)
{
u64 root_hpa = pvm->vcpu.arch.mmu->root.hpa;
bool flush = false;
u32 host_pcid = host_pcid_get(pvm, root_hpa, &flush);
u64 hw_cr3 = root_hpa | host_pcid;
+ u64 switch_host_cr3;

if (!flush)
hw_cr3 |= CR3_NOFLUSH;
this_cpu_write(cpu_tss_rw.tss_ex.enter_cr3, hw_cr3);
+
+ if (is_smod(pvm)) {
+ this_cpu_write(cpu_tss_rw.tss_ex.smod_cr3, hw_cr3 | CR3_NOFLUSH);
+ switch_host_cr3 = this_cpu_read(cpu_tss_rw.tss_ex.umod_cr3);
+ } else {
+ this_cpu_write(cpu_tss_rw.tss_ex.umod_cr3, hw_cr3 | CR3_NOFLUSH);
+ switch_host_cr3 = this_cpu_read(cpu_tss_rw.tss_ex.smod_cr3);
+ }
+
+ if (check_switch_cr3(pvm, switch_host_cr3))
+ pvm->switch_flags &= ~SWITCH_FLAGS_NO_DS_CR3;
+ else
+ pvm->switch_flags |= SWITCH_FLAGS_NO_DS_CR3;
}

static void pvm_set_host_cr3_for_guest_without_host_pcid(struct vcpu_pvm *pvm)
{
u64 root_hpa = pvm->vcpu.arch.mmu->root.hpa;
+ u64 switch_root = 0;
+
+ if (pvm->vcpu.arch.mmu->prev_roots[0].pgd == pvm->msr_switch_cr3) {
+ switch_root = pvm->vcpu.arch.mmu->prev_roots[0].hpa;
+ pvm->switch_flags &= ~SWITCH_FLAGS_NO_DS_CR3;
+ } else {
+ pvm->switch_flags |= SWITCH_FLAGS_NO_DS_CR3;
+ }

this_cpu_write(cpu_tss_rw.tss_ex.enter_cr3, root_hpa);
+ if (is_smod(pvm)) {
+ this_cpu_write(cpu_tss_rw.tss_ex.smod_cr3, root_hpa);
+ this_cpu_write(cpu_tss_rw.tss_ex.umod_cr3, switch_root);
+ } else {
+ this_cpu_write(cpu_tss_rw.tss_ex.umod_cr3, root_hpa);
+ this_cpu_write(cpu_tss_rw.tss_ex.smod_cr3, switch_root);
+ }
}

static void pvm_set_host_cr3_for_hypervisor(struct vcpu_pvm *pvm)
@@ -591,6 +638,8 @@ static void pvm_set_host_cr3_for_hypervisor(struct vcpu_pvm *pvm)

// Set tss_ex.host_cr3 for VMExit.
// Set tss_ex.enter_cr3 for VMEnter.
+// Set tss_ex.smod_cr3 and tss_ex.umod_cr3 and set or clear
+// SWITCH_FLAGS_NO_DS_CR3 for direct switching.
static void pvm_set_host_cr3(struct vcpu_pvm *pvm)
{
pvm_set_host_cr3_for_hypervisor(pvm);
@@ -1058,6 +1107,11 @@ static bool pvm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)

static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
+ /* Disable direct switching while single-step debugging. */
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ to_pvm(vcpu)->switch_flags |= SWITCH_FLAGS_SINGLE_STEP;
+ else
+ to_pvm(vcpu)->switch_flags &= ~SWITCH_FLAGS_SINGLE_STEP;
}

static struct pvm_vcpu_struct *pvm_get_vcpu_struct(struct vcpu_pvm *pvm)
@@ -1288,10 +1342,12 @@ static void pvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
if (!need_update || !is_smod(pvm))
return;

- if (rflags & X86_EFLAGS_IF)
+ if (rflags & X86_EFLAGS_IF) {
+ pvm->switch_flags &= ~SWITCH_FLAGS_IRQ_WIN;
pvm_event_flags_update(vcpu, X86_EFLAGS_IF, PVM_EVENT_FLAGS_IP);
- else
+ } else {
pvm_event_flags_update(vcpu, 0, X86_EFLAGS_IF);
+ }
}

static bool pvm_get_if_flag(struct kvm_vcpu *vcpu)
@@ -1311,6 +1367,7 @@ static void pvm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)

static void enable_irq_window(struct kvm_vcpu *vcpu)
{
+ to_pvm(vcpu)->switch_flags |= SWITCH_FLAGS_IRQ_WIN;
pvm_event_flags_update(vcpu, PVM_EVENT_FLAGS_IP, 0);
}

@@ -1332,6 +1389,7 @@ static void pvm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)

static void enable_nmi_window(struct kvm_vcpu *vcpu)
{
+ to_pvm(vcpu)->switch_flags |= SWITCH_FLAGS_NMI_WIN;
}

static int pvm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
@@ -1361,6 +1419,8 @@ static void pvm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)

trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, false);

+ to_pvm(vcpu)->switch_flags &= ~SWITCH_FLAGS_IRQ_WIN;
+
if (do_pvm_event(vcpu, irq, false, 0))
kvm_clear_interrupt_queue(vcpu);

@@ -1397,6 +1457,7 @@ static int handle_synthetic_instruction_return_user(struct kvm_vcpu *vcpu)

// instruction to return user means nmi allowed.
pvm->nmi_mask = false;
+ pvm->switch_flags &= ~(SWITCH_FLAGS_IRQ_WIN | SWITCH_FLAGS_NMI_WIN);

/*
* switch to user mode before kvm_set_rflags() to avoid PVM_EVENT_FLAGS_IF
@@ -1448,6 +1509,7 @@ static int handle_synthetic_instruction_return_supervisor(struct kvm_vcpu *vcpu)

// instruction to return supervisor means nmi allowed.
pvm->nmi_mask = false;
+ pvm->switch_flags &= ~SWITCH_FLAGS_NMI_WIN;

kvm_set_rflags(vcpu, frame.rflags);
kvm_rip_write(vcpu, frame.rip);
@@ -1461,6 +1523,7 @@ static int handle_synthetic_instruction_return_supervisor(struct kvm_vcpu *vcpu)
static int handle_hc_interrupt_window(struct kvm_vcpu *vcpu)
{
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ to_pvm(vcpu)->switch_flags &= ~SWITCH_FLAGS_IRQ_WIN;
pvm_event_flags_update(vcpu, 0, PVM_EVENT_FLAGS_IP);

++vcpu->stat.irq_window_exits;
@@ -2199,6 +2262,7 @@ static __always_inline void load_regs(struct kvm_vcpu *vcpu, struct pt_regs *gue

static noinstr void pvm_vcpu_run_noinstr(struct kvm_vcpu *vcpu)
{
+ struct tss_extra *tss_ex = this_cpu_ptr(&cpu_tss_rw.tss_ex);
struct vcpu_pvm *pvm = to_pvm(vcpu);
struct pt_regs *sp0_regs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
struct pt_regs *ret_regs;
@@ -2208,12 +2272,25 @@ static noinstr void pvm_vcpu_run_noinstr(struct kvm_vcpu *vcpu)
// Load guest registers into the host sp0 stack for switcher.
load_regs(vcpu, sp0_regs);

+ // Prepare context for direct switching.
+ tss_ex->switch_flags = pvm->switch_flags;
+ tss_ex->pvcs = pvm->pvcs_gpc.khva;
+ tss_ex->retu_rip = pvm->msr_retu_rip_plus2;
+ tss_ex->smod_entry = pvm->msr_lstar;
+ tss_ex->smod_gsbase = pvm->msr_kernel_gs_base;
+ tss_ex->smod_rsp = pvm->msr_supervisor_rsp;
+
if (unlikely(pvm->guest_dr7 & DR7_BP_EN_MASK))
set_debugreg(pvm_eff_dr7(vcpu), 7);

// Call into switcher and enter guest.
ret_regs = switcher_enter_guest();

+ // Get the resulting mode and PVM MSRs, which might have changed
+ // during direct switching.
+ pvm->switch_flags = tss_ex->switch_flags;
+ pvm->msr_supervisor_rsp = tss_ex->smod_rsp;
+
// Get the guest registers from the host sp0 stack.
save_regs(vcpu, ret_regs);
pvm->exit_vector = (ret_regs->orig_ax >> 32);
@@ -2293,6 +2370,7 @@ static inline void pvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
static fastpath_t pvm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_pvm *pvm = to_pvm(vcpu);
+ bool is_smod_befor_run = is_smod(pvm);

trace_kvm_entry(vcpu);

@@ -2307,6 +2385,11 @@ static fastpath_t pvm_vcpu_run(struct kvm_vcpu *vcpu)

pvm_vcpu_run_noinstr(vcpu);

+ if (is_smod_befor_run != is_smod(pvm)) {
+ swap(pvm->vcpu.arch.mmu->root, pvm->vcpu.arch.mmu->prev_roots[0]);
+ swap(pvm->msr_switch_cr3, pvm->vcpu.arch.cr3);
+ }
+
/* MSR_IA32_DEBUGCTLMSR is zeroed before vmenter. Restore it if needed */
if (pvm->host_debugctlmsr)
update_debugctlmsr(pvm->host_debugctlmsr);
diff --git a/arch/x86/kvm/pvm/pvm.h b/arch/x86/kvm/pvm/pvm.h
index 2f8fdb0ae3df..e49d9dc70a94 100644
--- a/arch/x86/kvm/pvm/pvm.h
+++ b/arch/x86/kvm/pvm/pvm.h
@@ -5,6 +5,21 @@
#include <linux/kvm_host.h>
#include <asm/switcher.h>

+/*
+ * Extra switch flags:
+ *
+ * IRQ_WIN:
+ * There is an irq window request, and the vcpu should not directly
+ * switch to a context with IRQs enabled, e.g. user mode.
+ * NMI_WIN:
+ * There is an NMI window request.
+ * SINGLE_STEP:
+ * KVM_GUESTDBG_SINGLESTEP is set.
+ */
+#define SWITCH_FLAGS_IRQ_WIN _BITULL(8)
+#define SWITCH_FLAGS_NMI_WIN _BITULL(9)
+#define SWITCH_FLAGS_SINGLE_STEP _BITULL(10)
+
#define SWITCH_FLAGS_INIT (SWITCH_FLAGS_SMOD)

#define PVM_SYSCALL_VECTOR SWITCH_EXIT_REASONS_SYSCALL
--
2.19.1.6.gb485710b