Re: [PATCH 1/5] KVM: arm64: Grab KVM MMU write lock in kvm_arch_flush_shadow_all()
From: Bibo Mao
Date: Tue May 05 2026 - 22:31:13 EST
On 2026/5/5 6:42 AM, James Houghton wrote:
> kvm_arch_flush_shadow_all() may sometimes be called on the same `kvm`
> concurrently in the event that the KVM's `mm` is __mmput() at the
> same time that the last reference to the KVM is being dropped.
>
>     T1                      T2
>     KVM_CREATE_VM
>                             Get VM file from T1
>                             close VM
>     exit_mm()               close VM
>
> T1: exit_mm() -> kvm_mmu_notifier_release() -> kvm_flush_shadow_all(),
> with only the KVM srcu read lock held.
> T2: kvm_vm_release() ---> mmu_notifier_unregister() ->
> kvm_mmu_notifier_release() -> kvm_flush_shadow_all(),
> again, with only the KVM srcu read lock held.

Looking through the code, kvm_arch_destroy_vm() only frees the PGD page; the
page table walk itself is done when a memslot is deleted or in exit_mm().
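For context, both T1 and T2 funnel into the same notifier hook, and that hook
takes only the SRCU read lock. A condensed sketch, paraphrased from
virt/kvm/kvm_main.c as I read it (not verbatim):

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
                                     struct mm_struct *mm)
{
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
        int idx;

        idx = srcu_read_lock(&kvm->srcu);
        kvm_flush_shadow_all(kvm);      /* -> kvm_arch_flush_shadow_all() */
        srcu_read_unlock(&kvm->srcu, idx);
}

/*
 * T1: exit_mm() -> __mmput() -> ... -> kvm_mmu_notifier_release()
 * T2: kvm_vm_release() -> kvm_put_kvm() -> kvm_destroy_vm()
 *       -> mmu_notifier_unregister() -> ops->release()
 *          == kvm_mmu_notifier_release()
 *
 * Nothing on either path takes kvm->mmu_lock, so the two calls can race.
 */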
With normal code, the life cycle of a VM is something like this:

    KVM_CREATE_VM
    Create vCPUs
    Create memslots
    Destroy vCPUs
    Destroy memslots
    close VM
    exit_mm()

There are kvm_get_kvm()/kvm_put_kvm() calls around vCPU creation and
destruction, but no such reference counting for memslot operations. Is it
possible for a VM to be destroyed without its memslots ever being removed, as
in the following sequence (see the sketch after it)?
    KVM_CREATE_VM
    Create memslots
    close VM
    exit_mm()
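To make the asymmetry concrete, this is roughly what the relevant code in
virt/kvm/kvm_main.c looks like (heavily condensed from memory, not verbatim;
please check the real source):

static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
{
        ...
        kvm_get_kvm(kvm);       /* the vCPU fd pins kvm->users_count */
        r = create_vcpu_fd(vcpu);
        ...
}

static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
        struct kvm_vcpu *vcpu = filp->private_data;

        kvm_put_kvm(vcpu->kvm); /* dropped when the vCPU fd closes */
        return 0;
}

By contrast, KVM_SET_USER_MEMORY_REGION creates and deletes memslots without
touching kvm->users_count at all, so nothing forces memslot teardown to happen
before the last kvm_put_kvm().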
Regards
Bibo Mao
> This leads to a potential double-free of
> kvm->arch.kvm_mmu_free_memory_cache and now with NV
> kvm->arch.nested_mmus.
>
> Cc: stable@xxxxxxxxxxxxxxx
> Fixes: e7bf7a490c68 ("KVM: arm64: Split huge pages when dirty logging is enabled")
> Signed-off-by: James Houghton <jthoughton@xxxxxxxxxx>
> ---
>  arch/arm64/include/asm/kvm_mmu.h |  1 +
>  arch/arm64/kvm/mmu.c             | 23 +++++++++++++++++++----
>  arch/arm64/kvm/nested.c          |  4 +++-
>  3 files changed, 23 insertions(+), 5 deletions(-)
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index 01e9c72d6aa7..30d5c24fcebb 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -178,6 +178,7 @@ void stage2_unmap_vm(struct kvm *kvm);
>  int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
>  void kvm_uninit_stage2_mmu(struct kvm *kvm);
>  void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
> +void kvm_free_stage2_pgd_locked(struct kvm_s2_mmu *mmu);
>
>  int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>                            phys_addr_t pa, unsigned long size, bool writable);
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index d089c107d9b7..4bab407d43bb 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1021,7 +1021,9 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
>
>  void kvm_uninit_stage2_mmu(struct kvm *kvm)
>  {
> -        kvm_free_stage2_pgd(&kvm->arch.mmu);
> +        lockdep_assert_held_write(&kvm->mmu_lock);
> +
> +        kvm_free_stage2_pgd_locked(&kvm->arch.mmu);
>          kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
>  }
>
> @@ -1095,12 +1097,14 @@ void stage2_unmap_vm(struct kvm *kvm)
>          srcu_read_unlock(&kvm->srcu, idx);
>  }
>
> -void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
> +static void __kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu, bool locked)
>  {
>          struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
>          struct kvm_pgtable *pgt = NULL;
>
> -        write_lock(&kvm->mmu_lock);
> +        if (!locked)
> +                write_lock(&kvm->mmu_lock);
> +
>          pgt = mmu->pgt;
>          if (pgt) {
>                  mmu->pgd_phys = 0;
> @@ -1111,7 +1115,8 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>          if (kvm_is_nested_s2_mmu(kvm, mmu))
>                  kvm_init_nested_s2_mmu(mmu);
>
> -        write_unlock(&kvm->mmu_lock);
> +        if (!locked)
> +                write_unlock(&kvm->mmu_lock);
>
>          if (pgt) {
>                  kvm_stage2_destroy(pgt);
> @@ -1119,6 +1124,16 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>          }
>  }
>
> +void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
> +{
> +        __kvm_free_stage2_pgd(mmu, false);
> +}
> +
> +void kvm_free_stage2_pgd_locked(struct kvm_s2_mmu *mmu)
> +{
> +        __kvm_free_stage2_pgd(mmu, true);
> +}
> +
>  static void hyp_mc_free_fn(void *addr, void *mc)
>  {
>          struct kvm_hyp_memcache *memcache = mc;
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> index 883b6c1008fb..977598bff5e6 100644
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
> @@ -1190,11 +1190,13 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
>  {
>          int i;
>
> +        guard(write_lock)(&kvm->mmu_lock);
> +
>          for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
>                  struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
>
>                  if (!WARN_ON(atomic_read(&mmu->refcnt)))
> -                        kvm_free_stage2_pgd(mmu);
> +                        kvm_free_stage2_pgd_locked(mmu);
>          }
>          kvfree(kvm->arch.nested_mmus);
>          kvm->arch.nested_mmus = NULL;
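A note on guard(write_lock): it is one of the scope-based cleanup helpers
from <linux/cleanup.h>, so the write lock is dropped automatically on every
path out of kvm_arch_flush_shadow_all(), with no explicit write_unlock()
needed. For anyone unfamiliar with the pattern, here is a minimal userspace
analogue built on __attribute__((cleanup)); the names wr_guard_t and WR_GUARD
are invented for this sketch, with a pthread rwlock standing in for
kvm->mmu_lock:

#include <pthread.h>
#include <stdio.h>

typedef struct { pthread_rwlock_t *lock; } wr_guard_t;

/* Called automatically when a wr_guard_t goes out of scope. */
static void wr_guard_release(wr_guard_t *g)
{
        pthread_rwlock_unlock(g->lock);
}

#define WR_GUARD(name, l)                                               \
        wr_guard_t name __attribute__((cleanup(wr_guard_release))) =   \
                { .lock = (l) };                                        \
        pthread_rwlock_wrlock((l))

static pthread_rwlock_t mmu_lock = PTHREAD_RWLOCK_INITIALIZER;

static void flush_all(void)
{
        WR_GUARD(g, &mmu_lock); /* like guard(write_lock)(&kvm->mmu_lock) */
        puts("write lock held");
        /* any return from here on unlocks automatically */
}

int main(void)
{
        flush_all();
        return 0;
}

The kernel macro works the same way: the guard object takes the lock when it
is declared and releases it when it leaves scope.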