Re: [PATCH v19 059/130] KVM: x86/tdp_mmu: Don't zap private pages for unsupported cases

From: Chao Gao
Date: Tue Apr 16 2024 - 22:21:46 EST


On Mon, Feb 26, 2024 at 12:26:01AM -0800, isaku.yamahata@xxxxxxxxx wrote:
>@@ -779,6 +780,10 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
>
> lockdep_assert_held_write(&kvm->mmu_lock);
>
>+ WARN_ON_ONCE(zap_private && !is_private_sp(root));
>+ if (!zap_private && is_private_sp(root))
>+ return false;

Should be "return flush;".

Fengwei and I spent one week chasing a bug where virtio-net in the TD guest may
stop working at some point after bootup if the host enables numad. We finally
found that the bug was introduced by the 'return false' statement, which left
some stale EPT entries unflushed.

I am wondering if we can refactor the related functions slightly to make such
mistakes both harder to introduce and easier to spot. e.g., we could make
"@flush" an in/out parameter of tdp_mmu_zap_leafs(), kvm_tdp_mmu_zap_leafs()
and kvm_tdp_mmu_unmap_gfn_range(). It would then be more apparent that the
"*flush = false" below is problematic if the changes were something like:

if (!zap_private && is_private_sp(root)) {
*flush = false;
return;
}


>+
> rcu_read_lock();
>
> for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
>@@ -810,13 +815,15 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
> * more SPTEs were zapped since the MMU lock was last acquired.
> */
>-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
>+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush,
>+ bool zap_private)
> {
> struct kvm_mmu_page *root;
>
> lockdep_assert_held_write(&kvm->mmu_lock);
> for_each_tdp_mmu_root_yield_safe(kvm, root)
>- flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
>+ flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush,
>+ zap_private && is_private_sp(root));
>
> return flush;
> }
>@@ -891,7 +898,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
> * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
> * See kvm_tdp_mmu_get_vcpu_root_hpa().
> */
>-void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
>+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm, bool skip_private)
> {
> struct kvm_mmu_page *root;
>
>@@ -916,6 +923,12 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
> * or get/put references to roots.
> */
> list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
>+ /*
>+ * Skip private root since private page table
>+ * is only torn down when VM is destroyed.
>+ */
>+ if (skip_private && is_private_sp(root))
>+ continue;
> /*
> * Note, invalid roots can outlive a memslot update! Invalid
> * roots must be *zapped* before the memslot update completes,
>@@ -1104,14 +1117,26 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> return ret;
> }
>
>+/* Used by mmu notifier via kvm_unmap_gfn_range() */
> bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
> bool flush)
> {
> struct kvm_mmu_page *root;
>+ bool zap_private = false;
>+
>+ if (kvm_gfn_shared_mask(kvm)) {
>+ if (!range->only_private && !range->only_shared)
>+ /* attributes change */
>+ zap_private = !(range->arg.attributes &
>+ KVM_MEMORY_ATTRIBUTE_PRIVATE);
>+ else
>+ zap_private = range->only_private;
>+ }
>
> __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
> flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
>- range->may_block, flush);
>+ range->may_block, flush,
>+ zap_private && is_private_sp(root));
>
> return flush;
> }
>diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
>index 20d97aa46c49..b3cf58a50357 100644
>--- a/arch/x86/kvm/mmu/tdp_mmu.h
>+++ b/arch/x86/kvm/mmu/tdp_mmu.h
>@@ -19,10 +19,11 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
>
> void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
>
>-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
>+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush,
>+ bool zap_private);
> bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
> void kvm_tdp_mmu_zap_all(struct kvm *kvm);
>-void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
>+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm, bool skip_private);
> void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
>
> int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
>--
>2.25.1
>
>