Re: [RFC/RFT PATCH] KVM: nVMX: fixes to nested virt interrupt injection

From: Wanpeng Li
Date: Wed Jul 26 2017 - 22:45:17 EST


2017-07-24 22:20 GMT+08:00 Paolo Bonzini <pbonzini@xxxxxxxxxx>:
> There are three issues in nested_vmx_check_exception:
>
> 1) it is not taking PFEC_MATCH/PFEC_MASK into account, as reported
> by Wanpeng Li;
>
> 2) it should rebuild the interruption info and exit qualification fields
> from scratch, as reported by Jim Mattson, because the values from the
> L2->L0 vmexit may be invalid (e.g. if an emulated instruction causes
> a page fault, the EPT misconfig's exit qualification is incorrect).
>
> 3) CR2 and DR6 should not be written for exception intercept vmexits
> (CR2 only for AMD).
>
> This patch fixes the first two and adds a comment about the last,
> outlining the fix.
>
> Cc: Jim Mattson <jmattson@xxxxxxxxxx>
> Cc: Wanpeng Li <wanpeng.li@xxxxxxxxxxx>
> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> ---
> Wanpeng, can you test this on the testcases you had for commit

I didn't observe any issue when testing the latest version of your
patch in kvm/queue.

Regards,
Wanpeng Li

> d4912215d103 ("KVM: nVMX: Fix exception injection", 2017-06-05)?
> Also, do you have a testcase for PFEC matching?
>
> arch/x86/kvm/svm.c | 10 ++++++++
> arch/x86/kvm/vmx.c | 71 +++++++++++++++++++++++++++++++++++++++++++-----------
> 2 files changed, 67 insertions(+), 14 deletions(-)
>
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 4d8141e533c3..1107626938cc 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -2430,6 +2430,16 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
> svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
> svm->vmcb->control.exit_code_hi = 0;
> svm->vmcb->control.exit_info_1 = error_code;
> +
> + /*
> + * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
> + * The fix is to add the ancillary datum (CR2 or DR6) to structs
> + * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
> + * written only when inject_pending_event runs (DR6 would be written here
> + * too). This should be conditional on a new capability---if the
> + * capability is disabled, kvm_multiple_exception would write the
> + * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
> + */
> if (svm->vcpu.arch.exception.nested_apf)
> svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
> else
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index d04092f821b6..b520614f9d46 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -927,6 +927,10 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
> static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
> static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
> static int alloc_identity_pagetable(struct kvm *kvm);
> +static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
> +static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
> +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
> + u16 error_code);
>
> static DEFINE_PER_CPU(struct vmcs *, vmxarea);
> static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
> @@ -2432,28 +2436,67 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
> * KVM wants to inject page-faults which it got to the guest. This function
> * checks whether in a nested guest, we need to inject them to L1 or L2.
> */
> +static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
> + unsigned long exit_qual)
> +{
> + struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> + unsigned int nr = vcpu->arch.exception.nr;
> + u32 intr_info = nr | INTR_INFO_VALID_MASK;
> +
> + if (vcpu->arch.exception.has_error_code) {
> + vmcs_write32(VM_EXIT_INTR_ERROR_CODE, vcpu->arch.exception.error_code);
> + intr_info |= INTR_INFO_DELIVER_CODE_MASK;
> + }
> +
> + if (kvm_exception_is_soft(nr))
> + intr_info |= INTR_TYPE_SOFT_EXCEPTION;
> + else
> + intr_info |= INTR_TYPE_HARD_EXCEPTION;
> +
> + if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
> + vmx_get_nmi_mask(vcpu))
> + intr_info |= INTR_INFO_UNBLOCK_NMI;
> +
> + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
> +}
> +
> static int nested_vmx_check_exception(struct kvm_vcpu *vcpu)
> {
> struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> unsigned int nr = vcpu->arch.exception.nr;
>
> - if (!((vmcs12->exception_bitmap & (1u << nr)) ||
> - (nr == PF_VECTOR && vcpu->arch.exception.nested_apf)))
> - return 0;
> + if (nr == PF_VECTOR) {
> + if (vcpu->arch.exception.nested_apf) {
> + nested_vmx_inject_exception_vmexit(vcpu,
> + vcpu->arch.apf.nested_apf_token);
> + return 1;
> + }
> + /*
> + * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
> + * The fix is to add the ancillary datum (CR2 or DR6) to structs
> + * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
> + * can be written only when inject_pending_event runs. This should be
> + * conditional on a new capability---if the capability is disabled,
> + * kvm_multiple_exception would write the ancillary information to
> + * CR2 or DR6, for backwards ABI-compatibility.
> + */
> + if (nested_vmx_is_page_fault_vmexit(vmcs12,
> + vcpu->arch.exception.error_code)) {
> + nested_vmx_inject_exception_vmexit(vcpu, vcpu->arch.cr2);
> + return 1;
> + }
> + } else {
> + unsigned long exit_qual = 0;
> + if (nr == DB_VECTOR)
> + exit_qual = vcpu->arch.dr6;
>
> - if (vcpu->arch.exception.nested_apf) {
> - vmcs_write32(VM_EXIT_INTR_ERROR_CODE, vcpu->arch.exception.error_code);
> - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
> - PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
> - INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
> - vcpu->arch.apf.nested_apf_token);
> - return 1;
> + if (vmcs12->exception_bitmap & (1u << nr)) {
> + nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
> + return 1;
> + }
> }
>
> - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
> - vmcs_read32(VM_EXIT_INTR_INFO),
> - vmcs_readl(EXIT_QUALIFICATION));
> - return 1;
> + return 0;
> }
>
> static void vmx_queue_exception(struct kvm_vcpu *vcpu)
> --
> 1.8.3.1
>