Re: [PATCH 4/7] KVM: nVMX: move inject_page_fault tweak to .complete_mmu_init

From: Sean Christopherson
Date: Wed Feb 17 2021 - 12:30:35 EST


On Wed, Feb 17, 2021, Maxim Levitsky wrote:
> This fixes a (mostly theoretical) bug which can happen if ept=0
> on host and we run a nested guest which triggers a mmu context
> reset while running nested.
> In this case the .inject_page_fault callback will be lost.
>
> Signed-off-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>
> ---
> arch/x86/kvm/vmx/nested.c | 8 +-------
> arch/x86/kvm/vmx/nested.h | 1 +
> arch/x86/kvm/vmx/vmx.c | 5 ++++-
> 3 files changed, 6 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 0b6dab6915a3..f9de729dbea6 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -419,7 +419,7 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
> }
>
>
> -static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
> +void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
> struct x86_exception *fault)
> {
> struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> @@ -2620,9 +2620,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
> vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
> }
>
> - if (!enable_ept)
> - vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
> -
> if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
> WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
> vmcs12->guest_ia32_perf_global_ctrl)))
> @@ -4224,9 +4221,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
> if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
> nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
>
> - if (!enable_ept)
> - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;

Oof, please explicitly call out these types of side effects in the changelog,
it took me a while to piece together that this can be dropped because a MMU
reset is guaranteed and is also guaranteed to restore inject_page_fault.

I would even go so far as to say this particular line of code should be removed
in a separate commit. Unless I'm overlooking something, this code is
effectively a nop, which means it doesn't need to be removed to make the bug fix
functionally correct.

All that being said, I'm pretty sure we can eliminate setting inject_page_fault
dynamically. I think that would yield more maintainable code. Following these
flows is a nightmare. The change itself will be scarier, but I'm pretty sure
the end result will be a lot cleaner.

And I believe there's also a second bug that would be fixed by such an approach.
Doesn't vmx_inject_page_fault_nested() need to be used for the nested_mmu when
ept=1? E.g. if the emulator injects a #PF to L2, L1 should still be able to
intercept the #PF even if L1 is using EPT. This likely hasn't been noticed
because hypervisors typically don't intercept #PF when EPT is enabled.

Something like this (very incomplete):

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 30e9b0cb9abd..f957514a4d65 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4497,7 +4497,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->direct_map = true;
context->get_guest_pgd = get_cr3;
context->get_pdptr = kvm_pdptr_read;
- context->inject_page_fault = kvm_inject_page_fault;

if (!is_paging(vcpu)) {
context->nx = false;
@@ -4687,7 +4686,6 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu)

context->get_guest_pgd = get_cr3;
context->get_pdptr = kvm_pdptr_read;
- context->inject_page_fault = kvm_inject_page_fault;
}

static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
@@ -4701,7 +4699,6 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
g_context->mmu_role.as_u64 = new_role.as_u64;
g_context->get_guest_pgd = get_cr3;
g_context->get_pdptr = kvm_pdptr_read;
- g_context->inject_page_fault = kvm_inject_page_fault;

/*
* L2 page tables are never shadowed, so there is no need to sync
@@ -5272,6 +5269,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
if (ret)
goto fail_allocate_root;

+ static_call(kvm_x86_mmu_create)(vcpu);
+
return ret;
fail_allocate_root:
free_mmu_pages(&vcpu->arch.guest_mmu);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index a63da447ede9..aa6c48295117 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -425,15 +425,14 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
}


-static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
+static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

- WARN_ON(!is_guest_mode(vcpu));
-
- if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
- !to_vmx(vcpu)->nested.nested_run_pending) {
+ if (guest_mode(vcpu) &&
+ nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
+ !to_vmx(vcpu)->nested.nested_run_pending) {
vmcs12->vm_exit_intr_error_code = fault->error_code;
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
@@ -2594,9 +2593,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
}

- if (!enable_ept)
- vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
-
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
vmcs12->guest_ia32_perf_global_ctrl)))
@@ -4198,9 +4194,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);

- if (!enable_ept)
- vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-
nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);

vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1204e5f0fe67..0e5ee22eea77 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3081,6 +3081,13 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmx->emulation_required = emulation_required(vcpu);
}

+static void vmx_mmu_create(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.root_mmu.inject_page_fault = vmx_inject_page_fault;
+ vcpu->arch.guest_mmu.inject_page_fault = nested_ept_inject_page_fault;
+ vcpu->arch.nested_mmu.inject_page_fault = vmx_inject_page_fault;
+}
+
static int vmx_get_max_tdp_level(void)
{
if (cpu_has_vmx_ept_5levels())
@@ -7721,6 +7728,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {

.write_l1_tsc_offset = vmx_write_l1_tsc_offset,

+ .mmu_create = vmx_mmu_create,
.load_mmu_pgd = vmx_load_mmu_pgd,

.check_intercept = vmx_check_intercept,

> -
> nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
>
> vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
> diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
> index 197148d76b8f..2ab279744d38 100644
> --- a/arch/x86/kvm/vmx/nested.h
> +++ b/arch/x86/kvm/vmx/nested.h
> @@ -36,6 +36,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
> void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
> bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
> int size);
> +void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,struct x86_exception *fault);
>
> static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
> {
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index bf6ef674d688..c43324df4877 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -3254,7 +3254,10 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
>
> static void vmx_complete_mmu_init(struct kvm_vcpu *vcpu)
> {
> -
> + if (!enable_ept && is_guest_mode(vcpu)) {
> + WARN_ON(mmu_is_nested(vcpu));
> + vcpu->arch.mmu->inject_page_fault = vmx_inject_page_fault_nested;
> + }
> }
>
> static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
> --
> 2.26.2
>