Re: [PATCH v6 05/13] KVM: nVMX: implement enlightened VMPTRLD and VMCLEAR

From: Jim Mattson
Date: Wed Dec 12 2018 - 18:20:09 EST


On Tue, Oct 16, 2018 at 9:50 AM Vitaly Kuznetsov <vkuznets@xxxxxxxxxx> wrote:
>
> Per Hyper-V TLFS 5.0b:
>
> "The L1 hypervisor may choose to use enlightened VMCSs by writing 1 to
> the corresponding field in the VP assist page (see section 7.8.7).
> Another field in the VP assist page controls the currently active
> enlightened VMCS. Each enlightened VMCS is exactly one page (4 KB) in
> size and must be initially zeroed. No VMPTRLD instruction must be
> executed to make an enlightened VMCS active or current.
>
> After the L1 hypervisor performs a VM entry with an enlightened VMCS,
> the VMCS is considered active on the processor. An enlightened VMCS
> can only be active on a single processor at the same time. The L1
> hypervisor can execute a VMCLEAR instruction to transition an
> enlightened VMCS from the active to the non-active state. Any VMREAD
> or VMWRITE instructions while an enlightened VMCS is active is
> unsupported and can result in unexpected behavior."
>
> Keep Enlightened VMCS structure for the current L2 guest permanently mapped
> from struct nested_vmx instead of mapping it every time.
>
> Suggested-by: Ladi Prosek <lprosek@xxxxxxxxxx>
> Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
> ---
> arch/x86/kvm/vmx.c | 115 ++++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 108 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index aebd008ccccb..cfb44acd4291 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -20,6 +20,7 @@
> #include "mmu.h"
> #include "cpuid.h"
> #include "lapic.h"
> +#include "hyperv.h"
>
> #include <linux/kvm_host.h>
> #include <linux/module.h>
> @@ -889,6 +890,8 @@ struct nested_vmx {
> bool guest_mode;
> } smm;
>
> + gpa_t hv_evmcs_vmptr;
> + struct page *hv_evmcs_page;
> struct hv_enlightened_vmcs *hv_evmcs;
> };
>
> @@ -8111,11 +8114,13 @@ static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
> static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
> u32 vm_instruction_error)
> {
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> /*
> * failValid writes the error number to the current VMCS, which
> * can't be done if there isn't a current VMCS.
> */
> - if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
> + if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
> return nested_vmx_failInvalid(vcpu);
>
> vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
> @@ -8441,6 +8446,20 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
> vmcs_write64(VMCS_LINK_POINTER, -1ull);
> }
>
> +static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> + if (!vmx->nested.hv_evmcs)
> + return;
> +
> + kunmap(vmx->nested.hv_evmcs_page);
> + kvm_release_page_dirty(vmx->nested.hv_evmcs_page);
> + vmx->nested.hv_evmcs_vmptr = -1ull;
> + vmx->nested.hv_evmcs_page = NULL;
> + vmx->nested.hv_evmcs = NULL;
> +}
> +
> static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
> {
> struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -8509,6 +8528,8 @@ static void free_nested(struct kvm_vcpu *vcpu)
>
> kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
>
> + nested_release_evmcs(vcpu);
> +
> free_loaded_vmcs(&vmx->nested.vmcs02);
> }
>
> @@ -8542,12 +8563,18 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
> return nested_vmx_failValid(vcpu,
> VMXERR_VMCLEAR_VMXON_POINTER);
>
> - if (vmptr == vmx->nested.current_vmptr)
> - nested_release_vmcs12(vcpu);
> + if (vmx->nested.hv_evmcs_page) {
> + if (vmptr == vmx->nested.hv_evmcs_vmptr)
> + nested_release_evmcs(vcpu);
> + } else {
> + if (vmptr == vmx->nested.current_vmptr)
> + nested_release_vmcs12(vcpu);
>
> - kvm_vcpu_write_guest(vcpu,
> - vmptr + offsetof(struct vmcs12, launch_state),
> - &zero, sizeof(zero));
> + kvm_vcpu_write_guest(vcpu,
> + vmptr + offsetof(struct vmcs12,
> + launch_state),
> + &zero, sizeof(zero));
> + }
>
> return nested_vmx_succeed(vcpu);
> }
> @@ -8637,6 +8664,8 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
> struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
> struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
>
> + vmcs12->hdr.revision_id = evmcs->revision_id;
> +
> /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
> vmcs12->tpr_threshold = evmcs->tpr_threshold;
> vmcs12->guest_rip = evmcs->guest_rip;
> @@ -9268,6 +9297,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
> return nested_vmx_failValid(vcpu,
> VMXERR_VMPTRLD_VMXON_POINTER);
>
> + /* Forbid normal VMPTRLD if Enlightened version was used */
> + if (vmx->nested.hv_evmcs)
> + return 1;
> +
> if (vmx->nested.current_vmptr != vmptr) {
> struct vmcs12 *new_vmcs12;
> struct page *page;
> @@ -9301,6 +9334,68 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
> return nested_vmx_succeed(vcpu);
> }
>
> +/*
> + * This is an equivalent of the nested hypervisor executing the vmptrld
> + * instruction.
> + */
> +static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
> + struct hv_vp_assist_page assist_page;
> +
> + if (likely(!vmx->nested.enlightened_vmcs_enabled))
> + return 1;
> +
> + if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
> + return 1;
> +
> + if (unlikely(!assist_page.enlighten_vmentry))
> + return 1;
> +
> + if (unlikely(assist_page.current_nested_vmcs !=
> + vmx->nested.hv_evmcs_vmptr)) {
> +
> + if (!vmx->nested.hv_evmcs)
> + vmx->nested.current_vmptr = -1ull;
> +
> + nested_release_evmcs(vcpu);
> +
> + vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page(
> + vcpu, assist_page.current_nested_vmcs);
> +
> + if (unlikely(is_error_page(vmx->nested.hv_evmcs_page)))
> + return 0;
> +
> + vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);

Are you sure that directly mapping guest memory isn't going to lead to
time-of-check vs. time-of-use bugs? This is a very hard programming
model to get right.