[PATCH V2 02/10] KVM: X86: Synchronize the shadow pagetable before linking it

From: Lai Jiangshan
Date: Fri Sep 17 2021 - 20:56:58 EST


From: Lai Jiangshan <laijs@xxxxxxxxxxxxxxxxx>

If a gpte is changed from non-present to present, the guest doesn't
need to flush the TLB per the SDM.  So the host must synchronize the
sp before linking it.  Otherwise the guest might use a wrong mapping.

For example: the guest first changes a level-1 pagetable, and then
links its parent to a new place where the original gpte is
non-present.  Finally the guest can access the remapped area without
flushing the TLB.  The guest's behavior should be allowed per the
SDM, but the host KVM MMU gets it wrong.
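
To make the ordering concrete, here is a rough user-space sketch of
that guest sequence (illustration only, not part of the patch): plain
arrays stand in for guest page tables, and the names level1/level2,
the constants and the slot index are hypothetical.

/* Illustrative model only: arrays stand in for guest page tables. */
#include <stdint.h>
#include <stdio.h>

#define PTE_P   0x1ULL			/* present bit */
#define ENTRIES 512

static uint64_t level1[ENTRIES];	/* a guest level-1 page table */
static uint64_t level2[ENTRIES];	/* its parent (level-2) table */

int main(void)
{
	/* 1) Rewrite the level-1 table while nothing points to it. */
	level1[0] = 0x123000ULL | PTE_P;

	/*
	 * 2) Link the level-1 table into a parent slot whose old entry
	 *    was non-present.  Per the SDM, a non-present -> present
	 *    change requires no INVLPG/TLB flush, so none is issued.
	 */
	level2[5] = (uint64_t)(uintptr_t)level1 | PTE_P;

	/*
	 * 3) Access through the new mapping.  If KVM linked a stale
	 *    (unsync) shadow page for level1 without synchronizing it
	 *    first, the guest would see the old, wrong translation.
	 */
	printf("pde=%#llx pte=%#llx\n",
	       (unsigned long long)level2[5],
	       (unsigned long long)level1[0]);
	return 0;
}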

Fixes: 4731d4c7a077 ("KVM: MMU: out of sync shadow core")
Signed-off-by: Lai Jiangshan <laijs@xxxxxxxxxxxxxxxxx>
---
Changed from V1:
Don't loop, but just return when it needs to break.

 arch/x86/kvm/mmu/mmu.c         | 15 ++++++++-------
 arch/x86/kvm/mmu/paging_tmpl.h | 22 ++++++++++++++++++++++
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 26f6bd238a77..3c1b069a7bcf 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2024,8 +2024,8 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
 	} while (!sp->unsync_children);
 }

-static void mmu_sync_children(struct kvm_vcpu *vcpu,
-			      struct kvm_mmu_page *parent)
+static int mmu_sync_children(struct kvm_vcpu *vcpu,
+			     struct kvm_mmu_page *parent, bool can_yield)
 {
 	int i;
 	struct kvm_mmu_page *sp;
@@ -2052,12 +2052,16 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 		}
 		if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
 			kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+			if (!can_yield)
+				return -EINTR;
+
 			cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
 			flush = false;
 		}
 	}

 	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+	return 0;
 }

 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
@@ -2143,9 +2147,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 		}

-		if (sp->unsync_children)
-			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
-
 		__clear_sp_write_flooding_count(sp);

 trace_get_page:
@@ -3642,7 +3643,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 		write_lock(&vcpu->kvm->mmu_lock);
 		kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);

-		mmu_sync_children(vcpu, sp);
+		mmu_sync_children(vcpu, sp, true);

 		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 		write_unlock(&vcpu->kvm->mmu_lock);
@@ -3658,7 +3659,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 		if (IS_VALID_PAE_ROOT(root)) {
 			root &= PT64_BASE_ADDR_MASK;
 			sp = to_shadow_page(root);
-			mmu_sync_children(vcpu, sp, true);
 		}
 	}

diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 5962d4f8a72e..87374cfd82be 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -704,6 +704,28 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 			access = gw->pt_access[it.level - 2];
 			sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
 					      it.level-1, false, access);
+			/*
+			 * We must synchronize the pagetable before linking it
+			 * because the guest doesn't need to flush the TLB
+			 * when a gpte is changed from non-present to present.
+			 * Otherwise, the guest may use a wrong mapping.
+			 *
+			 * For PG_LEVEL_4K, kvm_mmu_get_page() has already
+			 * synchronized it transiently via kvm_sync_page().
+			 *
+			 * For higher-level pagetables, we synchronize them
+			 * via the slower mmu_sync_children().  If it needs to
+			 * break, it returns RET_PF_RETRY and will retry on
+			 * the next #PF.  The progress already made is kept.
+			 *
+			 * It also makes a KVM_REQ_MMU_SYNC request to expedite
+			 * it in case @sp is linked at a different address.
+			 */
+			if (sp->unsync_children &&
+			    mmu_sync_children(vcpu, sp, false)) {
+				kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+				return RET_PF_RETRY;
+			}
 		}

 		/*
--
2.19.1.6.gb485710b