Re: [PATCH] KVM: mmu_notifier: make mn_invalidate_lock non-sleeping for non-blocking invalidations

From: Paolo Bonzini

Date: Mon Mar 30 2026 - 07:32:24 EST

On Sat, Mar 28, 2026 at 3:50 PM shaikh kamaluddin
<shaikhkamal2012@xxxxxxxxx> wrote:
> +void __mmu_notifier_oom_enter(struct mm_struct *mm)
> +{
> + struct mmu_notifier *subscription;
> + int id;
> + pr_info("Entering :func:%s\n", __func__);
> + if (!mm->notifier_subscriptions)
> + return;
> +
> + id = srcu_read_lock(&srcu);
> + hlist_for_each_entry_rcu(subscription,
> + &mm->notifier_subscriptions->list, hlist,
> + rcu_read_lock_held(&srcu)) {
> + if(subscription->ops->oom_enter)
> + subscription->ops->oom_enter(subscription, mm);
> +
> + }
> + srcu_read_unlock(&srcu, id);
> + pr_info("Done:%s\n", __func__);

Yeah, calling mmu_notifier_unregister() won't work from within this function.

One possibility is for the new method to be something like this:

    void (*after_oom_unregister)(struct mmu_notifier *subscription,
                                 struct mm_struct *mm);

So it only has to do

    kvm->mn_registered = false; /* or xchg, it's the same */
    WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
    if (kvm->mn_active_invalidate_count)
        kvm->mn_active_invalidate_count = 0;
    else
        WARN_ON(kvm->mmu_invalidate_in_progress);

or something like that. See the attached sketch, feel free to reuse it
as you see fit.

Paolo
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 3b670ee4eb26..7b14d8099cc1 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1973,12 +1973,15 @@ static gpa_t svm_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
struct x86_exception *exception,
u64 pte_access)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_mmu *mmu = vcpu->arch.mmu;

BUG_ON(!mmu_is_nested(vcpu));

- /* NPT walks are always user-walks */
- access |= PFERR_USER_MASK;
+ /* Non-GMET walks are always user-walks */
+ if (!(svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_GMET_ENABLE))
+ access |= PFERR_USER_MASK;
+
return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
}

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index e4cb317807ab..4a1c1f5297c4 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -7444,6 +7444,15 @@ static gpa_t vmx_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
struct kvm_mmu *mmu = vcpu->arch.mmu;

BUG_ON(!mmu_is_nested(vcpu));
+
+ /*
+ * MBEC differentiates based on the effective U/S bit of
+ * the guest page tables; not the processor CPL.
+ */
+ access &= ~PFERR_USER_MASK;
+ if ((pte_access & ACC_USER_MASK) && (access & PFERR_GUEST_FINAL_MASK))
+ access |= PFERR_USER_MASK;
+
return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
}

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 8450e18a87c2..3c67ec15c09c 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -212,6 +212,14 @@ struct mmu_notifier_ops {
*/
struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
void (*free_notifier)(struct mmu_notifier *subscription);
+
+ /*
+ * Any mmu notifier that defines this is automatically unregistered
+ * when its mm is the subject of an OOM kill. after_oom_unregister()
+ * is invoked after all other outstanding callbacks have terminated.
+ */
+ void (*after_oom_unregister)(struct mmu_notifier *subscription,
+ struct mm_struct *mm);
};

/*
@@ -287,6 +295,7 @@ mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
}
void mmu_notifier_put(struct mmu_notifier *subscription);
void mmu_notifier_synchronize(void);
+void mmu_notifier_oom_enter(struct mm_struct *mm);

extern int mmu_notifier_register(struct mmu_notifier *subscription,
struct mm_struct *mm);
@@ -661,6 +670,10 @@ static inline void mmu_notifier_synchronize(void)
{
}

+static inline void mmu_notifier_oom_enter(struct mm_struct *mm)
+{ /* Intentionally empty stub; presumably the !CONFIG_MMU_NOTIFIER build. */
+}
+
#endif /* CONFIG_MMU_NOTIFIER */

#endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index a6cdf3674bdc..deba056468b1 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -835,6 +835,88 @@ void mmu_notifier_unregister(struct mmu_notifier *subscription,
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister);

+/**
+ * mmu_notifier_oom_enter - detach OOM-aware notifiers from an mm
+ * @mm: the mm whose notifiers are examined
+ *
+ * Every subscription whose ops define after_oom_unregister() is unhooked
+ * from @mm's notifier list and gets its release() callback, if any.  If
+ * at least one subscription was unhooked, the function then waits for an
+ * SRCU grace period, invokes after_oom_unregister() on each of them and
+ * drops one mm_count reference per subscription via mmdrop().
+ *
+ * Because of synchronize_srcu(), the caller must be allowed to sleep.
+ */
+void mmu_notifier_oom_enter(struct mm_struct *mm)
+{
+	struct mmu_notifier_subscriptions *subscriptions = mm->notifier_subscriptions;
+	struct mmu_notifier *subscription;
+	struct hlist_node *tmp;
+	HLIST_HEAD(oom_list);
+	int id;
+
+	/*
+	 * mm->notifier_subscriptions may be NULL when nothing ever
+	 * registered; bail out before taking subscriptions->lock.
+	 */
+	if (!subscriptions)
+		return;
+
+	id = srcu_read_lock(&srcu);
+
+	/*
+	 * Prevent further calls to the MMU notifier, except for
+	 * release and after_oom_unregister.
+	 */
+	spin_lock(&subscriptions->lock);
+	hlist_for_each_entry_safe(subscription, tmp, &subscriptions->list, hlist) {
+		if (!subscription->ops->after_oom_unregister)
+			continue;
+
+		/*
+		 * after_oom_unregister and alloc_notifier are incompatible,
+		 * because there could be other references to allocated
+		 * notifiers.
+		 */
+		if (WARN_ON(subscription->ops->alloc_notifier))
+			continue;
+
+		/*
+		 * NOTE(review): hlist_add_head() overwrites ->next right
+		 * after hlist_del_init_rcu().  An SRCU reader still standing
+		 * on this node would continue into oom_list rather than the
+		 * rest of subscriptions->list; confirm this is acceptable or
+		 * re-link only after synchronize_srcu().
+		 */
+		hlist_del_init_rcu(&subscription->hlist);
+		hlist_add_head(&subscription->hlist, &oom_list);
+	}
+	spin_unlock(&subscriptions->lock);
+
+	hlist_for_each_entry(subscription, &oom_list, hlist)
+		if (subscription->ops->release)
+			subscription->ops->release(subscription, mm);
+	srcu_read_unlock(&srcu, id);
+
+	if (hlist_empty(&oom_list))
+		return;
+
+	synchronize_srcu(&srcu);
+
+	hlist_for_each_entry_safe(subscription, tmp, &oom_list, hlist) {
+		/*
+		 * oom_list is on this stack frame; unlink the node so that
+		 * ->hlist is left unhashed rather than pointing into it.  Do
+		 * this before the callback in case it disposes of the object.
+		 */
+		hlist_del_init(&subscription->hlist);
+		subscription->ops->after_oom_unregister(subscription, mm);
+
+		BUG_ON(atomic_read(&mm->mm_count) <= 0);
+		mmdrop(mm);
+	}
+}
+
static void mmu_notifier_free_rcu(struct rcu_head *rcu)
{
struct mmu_notifier *subscription =