[RFC PATCH 49/73] KVM: x86/PVM: Implement emulation for non-PVM mode

From: Lai Jiangshan
Date: Mon Feb 26 2024 - 09:55:14 EST


From: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>

The PVM hypervisor supports a modified long mode that follows the PVM
ABI, known as PVM mode. PVM mode consists of a 64-bit supervisor mode,
a 64-bit user mode, and a 32-bit compatibility user mode. The 32-bit
supervisor mode and all other operating modes are considered non-PVM
modes. In PVM mode, the system register state is well-defined and the
guest is allowed to run directly on the hardware. For now, non-PVM mode
is still required for booting the guest and bringing up vCPUs, so only
basic support for non-PVM mode is provided through instruction
emulation.
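
As an illustrative sketch (not part of the patch), the classification
performed by try_to_convert_to_pvm_mode() below boils down to the
following check; is_long_mode(), is_smod() and the segment cache are
assumed to behave as in the patch:

	/* Illustrative only: is the current vCPU state a PVM mode? */
	static bool vcpu_state_is_pvm_mode(struct kvm_vcpu *vcpu)
	{
		struct vcpu_pvm *pvm = to_pvm(vcpu);
		struct kvm_segment *cs = &pvm->segments[VCPU_SREG_CS];

		/* Any mode outside long mode is a non-PVM mode. */
		if (!is_long_mode(vcpu))
			return false;
		/* 64-bit CS: 64-bit supervisor or 64-bit user mode. */
		if (cs->l)
			return true;
		/* 32-bit CS: PVM only as compatibility user mode with D/B set. */
		return !is_smod(pvm) && cs->db;
	}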

Signed-off-by: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>
Signed-off-by: Hou Wenlong <houwenlong.hwl@xxxxxxxxxxxx>
---
arch/x86/kvm/pvm/pvm.c | 145 +++++++++++++++++++++++++++++++++++++++--
arch/x86/kvm/pvm/pvm.h | 1 +
2 files changed, 139 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/pvm/pvm.c b/arch/x86/kvm/pvm/pvm.c
index b261309fc946..e4b8f0108c31 100644
--- a/arch/x86/kvm/pvm/pvm.c
+++ b/arch/x86/kvm/pvm/pvm.c
@@ -12,6 +12,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
+#include <linux/entry-kvm.h>

#include <asm/gsseg.h>
#include <asm/io_bitmap.h>
@@ -218,6 +219,104 @@ static void pvm_update_guest_cpuid_faulting(struct kvm_vcpu *vcpu, u64 data)
preempt_enable();
}

+/*
+ * Non-PVM mode is not part of PVM; it is only supported via emulation.
+ * So far, non-PVM mode is required for booting the guest and bringing up vCPUs.
+ *
+ * In the future, when the VMM can boot the guest and bring up vCPUs directly
+ * in 64-bit mode without any help from non-64-bit mode, support for non-PVM
+ * mode will be removed.
+ */
+#define CONVERT_TO_PVM_CR0_OFF (X86_CR0_NW | X86_CR0_CD)
+#define CONVERT_TO_PVM_CR0_ON (X86_CR0_NE | X86_CR0_AM | X86_CR0_WP | \
+ X86_CR0_PG | X86_CR0_PE)
+
+static bool try_to_convert_to_pvm_mode(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_pvm *pvm = to_pvm(vcpu);
+ unsigned long cr0 = vcpu->arch.cr0;
+
+ if (!is_long_mode(vcpu))
+ return false;
+ if (!pvm->segments[VCPU_SREG_CS].l) {
+ if (is_smod(pvm))
+ return false;
+ if (!pvm->segments[VCPU_SREG_CS].db)
+ return false;
+ }
+
+ /* Automatically set EFER_SCE when converting to PVM mode. */
+ if ((vcpu->arch.efer | EFER_SCE) != vcpu->arch.efer)
+ vcpu->arch.efer |= EFER_SCE;
+
+ /* Adjust CR0 when converting to PVM mode. */
+ cr0 &= ~CONVERT_TO_PVM_CR0_OFF;
+ cr0 |= CONVERT_TO_PVM_CR0_ON;
+ if (cr0 != vcpu->arch.cr0)
+ kvm_set_cr0(vcpu, cr0);
+
+ /* Automatically set MSR_STAR when converting to PVM mode. */
+ if (!kernel_cs_by_msr(pvm->msr_star))
+ pvm->msr_star = ((u64)pvm->segments[VCPU_SREG_CS].selector << 32) |
+ ((u64)__USER32_CS << 48);
+
+ pvm->non_pvm_mode = false;
+
+ return true;
+}
+
+static int handle_non_pvm_mode(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_pvm *pvm = to_pvm(vcpu);
+ int ret = 1;
+ unsigned int count = 130;
+
+ if (try_to_convert_to_pvm_mode(vcpu))
+ return 1;
+
+ while (pvm->non_pvm_mode && count-- != 0) {
+ if (kvm_test_request(KVM_REQ_EVENT, vcpu))
+ return 1;
+
+ if (try_to_convert_to_pvm_mode(vcpu))
+ return 1;
+
+ ret = kvm_emulate_instruction(vcpu, 0);
+
+ if (!ret)
+ goto out;
+
+ /* Don't allow a mode switch during emulation. */
+ if (!is_smod(pvm))
+ goto emulation_error;
+
+ if (vcpu->arch.exception.pending)
+ goto emulation_error;
+
+ if (vcpu->arch.halt_request) {
+ vcpu->arch.halt_request = 0;
+ ret = kvm_emulate_halt_noskip(vcpu);
+ goto out;
+ }
+ /*
+ * Note: return 1, not 0; vcpu_run() will invoke
+ * xfer_to_guest_mode(), which will create a proper return
+ * code.
+ */
+ if (__xfer_to_guest_mode_work_pending())
+ return 1;
+ }
+
+out:
+ return ret;
+
+emulation_error:
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
+}
+
// switch_to_smod() and switch_to_umod() switch the mode (smod/umod) and
// the CR3. No vTLB flushing when switching the CR3 per PVM Spec.
static inline void switch_to_smod(struct kvm_vcpu *vcpu)
@@ -359,6 +458,10 @@ static void pvm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
if (pvm->loaded_cpu_state)
return;

+ // We can't load guest state into hardware when the guest is not in PVM mode.
+ if (unlikely(pvm->non_pvm_mode))
+ return;
+
pvm->loaded_cpu_state = 1;

#ifdef CONFIG_X86_IOPL_IOPERM
@@ -1138,6 +1241,11 @@ static void pvm_get_segment(struct kvm_vcpu *vcpu,
{
struct vcpu_pvm *pvm = to_pvm(vcpu);

+ if (pvm->non_pvm_mode) {
+ *var = pvm->segments[seg];
+ return;
+ }
+
// Update CS or SS to reflect the current mode.
if (seg == VCPU_SREG_CS) {
if (is_smod(pvm)) {
@@ -1209,7 +1317,7 @@ static void pvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int
if (cpl != var->dpl)
goto invalid_change;
if (cpl == 0 && !var->l)
- goto invalid_change;
+ pvm->non_pvm_mode = true;
}
break;
case VCPU_SREG_LDTR:
@@ -1231,12 +1339,17 @@ static void pvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
struct vcpu_pvm *pvm = to_pvm(vcpu);

- if (pvm->hw_cs == __USER_CS) {
- *db = 0;
- *l = 1;
+ if (pvm->non_pvm_mode) {
+ *db = pvm->segments[VCPU_SREG_CS].db;
+ *l = pvm->segments[VCPU_SREG_CS].l;
} else {
- *db = 1;
- *l = 0;
+ if (pvm->hw_cs == __USER_CS) {
+ *db = 0;
+ *l = 1;
+ } else {
+ *db = 1;
+ *l = 0;
+ }
}
}

@@ -1513,7 +1626,7 @@ static void pvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
* user mode, so that when the guest switches back to supervisor mode,
* the X86_EFLAGS_IF is already cleared.
*/
- if (!need_update || !is_smod(pvm))
+ if (unlikely(pvm->non_pvm_mode) || !need_update || !is_smod(pvm))
return;

if (rflags & X86_EFLAGS_IF) {
@@ -1536,7 +1649,11 @@ static u32 pvm_get_interrupt_shadow(struct kvm_vcpu *vcpu)

static void pvm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
+ struct vcpu_pvm *pvm = to_pvm(vcpu);
+
/* PVM spec: ignore interrupt shadow when in PVM mode. */
+ if (pvm->non_pvm_mode)
+ pvm->int_shadow = mask;
}

static void enable_irq_window(struct kvm_vcpu *vcpu)
@@ -2212,6 +2329,9 @@ static int pvm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
struct vcpu_pvm *pvm = to_pvm(vcpu);
u32 exit_reason = pvm->exit_vector;

+ if (unlikely(pvm->non_pvm_mode))
+ return handle_non_pvm_mode(vcpu);
+
if (exit_reason == PVM_SYSCALL_VECTOR)
return handle_exit_syscall(vcpu);
else if (exit_reason >= 0 && exit_reason < FIRST_EXTERNAL_VECTOR)
@@ -2546,6 +2666,13 @@ static fastpath_t pvm_vcpu_run(struct kvm_vcpu *vcpu)
struct vcpu_pvm *pvm = to_pvm(vcpu);
bool is_smod_befor_run = is_smod(pvm);

+ /*
+ * Don't enter the guest if the guest state is invalid; let the exit
+ * handler start emulation until we arrive back at a valid state.
+ */
+ if (pvm->non_pvm_mode)
+ return EXIT_FASTPATH_NONE;
+
trace_kvm_entry(vcpu);

pvm_load_guest_xsave_state(vcpu);
@@ -2657,6 +2784,10 @@ static void pvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT))
vcpu->arch.msr_platform_info &= ~MSR_PLATFORM_INFO_CPUID_FAULT;

+ // Non-PVM mode resets
+ pvm->non_pvm_mode = true;
+ pvm->msr_star = 0;
+
// X86 resets
for (i = 0; i < ARRAY_SIZE(pvm->segments); i++)
reset_segment(&pvm->segments[i], i);
diff --git a/arch/x86/kvm/pvm/pvm.h b/arch/x86/kvm/pvm/pvm.h
index e49d9dc70a94..1a4feddb13b3 100644
--- a/arch/x86/kvm/pvm/pvm.h
+++ b/arch/x86/kvm/pvm/pvm.h
@@ -106,6 +106,7 @@ struct vcpu_pvm {

int loaded_cpu_state;
int int_shadow;
+ bool non_pvm_mode;
bool nmi_mask;

unsigned long guest_dr7;
--
2.19.1.6.gb485710b