Re: [PATCH] KVM: SVM: Disable TDP MMU when running on Hyper-V

From: Jeremi Piotrowski
Date: Wed Mar 08 2023 - 10:48:32 EST


On 08/03/2023 01:00, Paolo Bonzini wrote:
> On Tue, Mar 7, 2023 at 6:36 PM Sean Christopherson <seanjc@xxxxxxxxxx> wrote:
>> Thinking about this more, I would rather revert commit 1e0c7d40758b ("KVM: SVM:
>> hyper-v: Remote TLB flush for SVM") or fix the thing properly straitaway. KVM
>> doesn't magically handle the flushes correctly for the shadow/legacy MMU, KVM just
>> happens to get lucky and not run afoul of the underlying bugs.
>
> I don't think it's about luck---the legacy MMU's zapping/invalidation
> seems to invoke the flush hypercall correctly:
>
> Jeremi, did you ever track the call stack where
> hyperv_nested_flush_guest_mapping is triggered?
>
> Paolo
>

Yes, here's all the stacktraces I get for hyperv_nested_flush_guest_mapping with legacy MMU:

1
entry_SYSCALL_64_after_hwframe
do_syscall_64
__x64_sys_ioctl
kvm_vcpu_ioctl
kvm_arch_vcpu_ioctl_run
svm_handle_exit
svm_invoke_exit_handler
npf_interception
kvm_mmu_page_fault
kvm_mmu_do_page_fault
kvm_tdp_page_fault
direct_page_fault
kvm_faultin_pfn
__gfn_to_pfn_memslot
hva_to_pfn
get_user_pages_unlocked
__gup_longterm_locked
__get_user_pages
handle_mm_fault
__handle_mm_fault
do_huge_pmd_wp_page
__split_huge_pmd
__mmu_notifier_invalidate_range_start
kvm_mmu_notifier_invalidate_range_start
kvm_flush_remote_tlbs
hv_remote_flush_tlb
hv_remote_flush_tlb_with_range
hyperv_flush_guest_mapping
1
entry_SYSCALL_64_after_hwframe
do_syscall_64
syscall_exit_to_user_mode
exit_to_user_mode_prepare
arch_do_signal_or_restart
get_signal
do_group_exit
do_exit
mmput
__mmput
exit_mmap
__mmu_notifier_release
kvm_mmu_notifier_release
kvm_arch_flush_shadow_all
kvm_mmu_zap_all
kvm_mmu_commit_zap_page.part.0
kvm_flush_remote_tlbs
hv_remote_flush_tlb
hv_remote_flush_tlb_with_range
hyperv_flush_guest_mapping
1
entry_SYSCALL_64_after_hwframe
do_syscall_64
syscall_exit_to_user_mode
exit_to_user_mode_prepare
arch_do_signal_or_restart
get_signal
do_group_exit
do_exit
task_work_run
____fput
__fput
kvm_vm_release
kvm_put_kvm
kvm_destroy_vm
kvm_arch_destroy_vm
kvm_mmu_unload
kvm_mmu_free_roots
kvm_mmu_commit_zap_page.part.0
kvm_flush_remote_tlbs
hv_remote_flush_tlb
hv_remote_flush_tlb_with_range
hyperv_flush_guest_mapping
6
entry_SYSCALL_64_after_hwframe
do_syscall_64
__x64_sys_ioctl
kvm_vcpu_ioctl
kvm_arch_vcpu_ioctl_run
svm_handle_exit
svm_invoke_exit_handler
npf_interception
kvm_mmu_page_fault
kvm_mmu_do_page_fault
kvm_tdp_page_fault
direct_page_fault
kvm_faultin_pfn
__gfn_to_pfn_memslot
hva_to_pfn
get_user_pages_unlocked
__gup_longterm_locked
__get_user_pages
handle_mm_fault
__handle_mm_fault
do_wp_page
__mmu_notifier_invalidate_range_start
kvm_mmu_notifier_invalidate_range_start
kvm_flush_remote_tlbs
hv_remote_flush_tlb
hv_remote_flush_tlb_with_range
hyperv_flush_guest_mapping
8
entry_SYSCALL_64_after_hwframe
do_syscall_64
__x64_sys_ioctl
kvm_vcpu_ioctl
kvm_arch_vcpu_ioctl_run
kvm_apic_accept_events
kvm_vcpu_reset
kvm_mmu_reset_context
kvm_mmu_free_roots
kvm_mmu_commit_zap_page.part.0
kvm_flush_remote_tlbs
hv_remote_flush_tlb
hv_remote_flush_tlb_with_range
hyperv_flush_guest_mapping
20
entry_SYSCALL_64_after_hwframe
do_syscall_64
__x64_sys_ioctl
kvm_vcpu_ioctl
kvm_arch_vcpu_ioctl_run
svm_handle_exit
svm_invoke_exit_handler
npf_interception
kvm_mmu_page_fault
kvm_mmu_do_page_fault
kvm_tdp_page_fault
direct_page_fault
mmu_set_spte
hv_remote_flush_tlb_with_range
hyperv_flush_guest_mapping_range
406
entry_SYSCALL_64_after_hwframe
do_syscall_64
__x64_sys_ioctl
kvm_vm_ioctl
__kvm_set_memory_region
kvm_set_memslot
kvm_arch_flush_shadow_memslot
kvm_page_track_flush_slot
kvm_mmu_invalidate_zap_pages_in_memslot
kvm_mmu_zap_all_fast
kvm_mmu_commit_zap_page.part.0
kvm_flush_remote_tlbs
hv_remote_flush_tlb
hv_remote_flush_tlb_with_range
hyperv_flush_guest_mapping
406
entry_SYSCALL_64_after_hwframe
do_syscall_64
__x64_sys_ioctl
kvm_vcpu_ioctl
kvm_arch_vcpu_ioctl_run
kvm_mmu_free_obsolete_roots
__kvm_mmu_free_obsolete_roots
kvm_mmu_free_roots
kvm_mmu_commit_zap_page.part.0
kvm_flush_remote_tlbs
hv_remote_flush_tlb
hv_remote_flush_tlb_with_range
hyperv_flush_guest_mapping


and here's the stacks for TDP MMU:

1
entry_SYSCALL_64_after_hwframe
do_syscall_64
__x64_sys_ioctl
kvm_vcpu_ioctl
kvm_arch_vcpu_ioctl_run
svm_handle_exit
svm_invoke_exit_handler
npf_interception
kvm_mmu_page_fault
kvm_mmu_do_page_fault
kvm_tdp_page_fault
kvm_faultin_pfn
__gfn_to_pfn_memslot
hva_to_pfn
get_user_pages_unlocked
__gup_longterm_locked
__get_user_pages
handle_mm_fault
__handle_mm_fault
do_huge_pmd_wp_page
__split_huge_pmd
__mmu_notifier_invalidate_range_start
kvm_mmu_notifier_invalidate_range_start
kvm_flush_remote_tlbs
hv_remote_flush_tlb
hv_remote_flush_tlb_with_range
hyperv_flush_guest_mapping
6
entry_SYSCALL_64_after_hwframe
do_syscall_64
__x64_sys_ioctl
kvm_vcpu_ioctl
kvm_arch_vcpu_ioctl_run
svm_handle_exit
svm_invoke_exit_handler
npf_interception
kvm_mmu_page_fault
kvm_mmu_do_page_fault
kvm_tdp_page_fault
kvm_faultin_pfn
__gfn_to_pfn_memslot
hva_to_pfn
get_user_pages_unlocked
__gup_longterm_locked
__get_user_pages
handle_mm_fault
__handle_mm_fault
do_wp_page
__mmu_notifier_invalidate_range_start
kvm_mmu_notifier_invalidate_range_start
kvm_flush_remote_tlbs
hv_remote_flush_tlb
hv_remote_flush_tlb_with_range
hyperv_flush_guest_mapping

>> The revert appears
>> to be reasonably straightforward (see bottom).
>>
>> And _if_ we want to hack-a-fix it, then I would strongly prefer a very isolated,
>> obviously hacky fix, e.g.
>>
>> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
>> index 36e4561554ca..a9ba4ae14fda 100644
>> --- a/arch/x86/kvm/mmu/mmu.c
>> +++ b/arch/x86/kvm/mmu/mmu.c
>> @@ -5779,8 +5779,13 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
>> tdp_root_level = tdp_forced_root_level;
>> max_tdp_level = tdp_max_root_level;
>>
>> + /*
>> + * FIXME: Remove the enlightened TLB restriction when KVM properly
>> + * handles TLB flushes for said enlightenment.
>> + */.
>> #ifdef CONFIG_X86_64
>> - tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled;
>> + tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled &&
>> + !(ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB);
>> #endif
>> /*
>> * max_huge_page_level reflects KVM's MMU capabilities irrespective
>>
>>
>>
>>
>> The revert...
>>
>> ---
>> arch/x86/kvm/svm/svm.c | 3 ---
>> arch/x86/kvm/svm/svm_onhyperv.h | 27 ---------------------------
>> 2 files changed, 30 deletions(-)
>>
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index 11068e8eb969..292650dc85a0 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -1320,7 +1320,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
>> if (sev_guest(vcpu->kvm))
>> sev_init_vmcb(svm);
>>
>> - svm_hv_init_vmcb(vmcb);
>> init_vmcb_after_set_cpuid(vcpu);
>>
>> vmcb_mark_all_dirty(vmcb);
>> @@ -4075,8 +4074,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
>> svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
>> vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
>>
>> - hv_track_root_tdp(vcpu, root_hpa);
>> -
>> cr3 = vcpu->arch.cr3;
>> } else if (root_level >= PT64_ROOT_4LEVEL) {
>> cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
>> diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
>> index 6981c1e9a809..5118fd273e73 100644
>> --- a/arch/x86/kvm/svm/svm_onhyperv.h
>> +++ b/arch/x86/kvm/svm/svm_onhyperv.h
>> @@ -15,31 +15,8 @@ static struct kvm_x86_ops svm_x86_ops;
>>
>> int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu);
>>
>> -static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
>> -{
>> - struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
>> -
>> - BUILD_BUG_ON(sizeof(vmcb->control.hv_enlightenments) !=
>> - sizeof(vmcb->control.reserved_sw));
>> -
>> - if (npt_enabled &&
>> - ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB)
>> - hve->hv_enlightenments_control.enlightened_npt_tlb = 1;
>> -
>> - if (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)
>> - hve->hv_enlightenments_control.msr_bitmap = 1;
>> -}
>> -
>> static inline void svm_hv_hardware_setup(void)
>> {
>> - if (npt_enabled &&
>> - ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) {
>> - pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n");
>> - svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
>> - svm_x86_ops.tlb_remote_flush_with_range =
>> - hv_remote_flush_tlb_with_range;
>> - }
>> -
>> if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) {
>> int cpu;
>>
>> @@ -80,10 +57,6 @@ static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu)
>> }
>> #else
>>
>> -static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
>> -{
>> -}
>> -
>> static inline void svm_hv_hardware_setup(void)
>> {
>> }
>>
>> base-commit: cb8748a781fe983e451f616ce4861a1c49ce79dd
>> --
>>