[PATCH 1/3] mm/mmu_notifier: let interval notifiers block THP
From: Yitao Jiang
Date: Thu Jun 25 2026 - 07:00:47 EST
Some secondary MMUs cannot safely tolerate a user VMA becoming backed
by transparent huge pages after the range has been registered with an
interval notifier. Drivers can observe the page-table layout change
through invalidations, but devices without replayable faults, or ranges
that must stay mapped, cannot necessarily re-establish coherent device
mappings before later device access.
Add MMU_INTERVAL_NOTIFIER_BLOCK_THP so a driver can declare this
property when registering an interval notifier. The MM core then marks
the covered VMA range VM_NOHUGEPAGE and clears VM_HUGEPAGE; the caller
must hold mmap_lock for write. VMAs that already carry a
VM_NO_KHUGEPAGED flag are left untouched. A later MADV_HUGEPAGE on a
VM_NOHUGEPAGE range that overlaps an active BLOCK_THP notifier is
accepted but ignored, leaving the MM-owned nohuge policy intact.
MADV_COLLAPSE already rejects VM_NOHUGEPAGE VMAs.
This keeps the policy in MM code instead of requiring device drivers
to edit VMA THP flags directly, and it only affects opt-in notifier
ranges at registration or flag-transition time.
Assisted-by: OpenAI-Codex:GPT-5.5
Signed-off-by: Yitao Jiang <jytscientist@xxxxxxxxxxx>
---
include/linux/huge_mm.h | 5 +-
include/linux/mmu_notifier.h | 28 +++++
mm/khugepaged.c | 9 +-
mm/madvise.c | 3 +-
mm/mmu_notifier.c | 204 +++++++++++++++++++++++++++++++++--
5 files changed, 237 insertions(+), 12 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ad20f7f8c..3dae515ff 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -489,8 +489,8 @@ change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
__split_huge_pud(__vma, __pud, __address); \
} while (0)
-int hugepage_madvise(struct vm_area_struct *vma, vm_flags_t *vm_flags,
- int advice);
+int hugepage_madvise(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, vm_flags_t *vm_flags, int advice);
int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
unsigned long end, bool *lock_dropped);
void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
@@ -694,6 +694,7 @@ static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma,
do { } while (0)
static inline int hugepage_madvise(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
vm_flags_t *vm_flags, int advice)
{
return -EINVAL;
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index a11a44eef..4accfb65f 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -293,8 +293,16 @@ struct mmu_interval_notifier {
struct mm_struct *mm;
struct hlist_node deferred_item;
unsigned long invalidate_seq;
+ unsigned int flags;
};
+/*
+ * The interval range cannot safely be backed by transparent huge pages while
+ * the notifier is active. The MM core owns the VMA policy change so drivers
+ * do not have to manipulate VM_HUGEPAGE/VM_NOHUGEPAGE directly.
+ */
+#define MMU_INTERVAL_NOTIFIER_BLOCK_THP BIT(0)
+
#ifdef CONFIG_MMU_NOTIFIER
#ifdef CONFIG_LOCKDEP
@@ -347,7 +355,20 @@ int mmu_interval_notifier_insert_locked(
struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
unsigned long start, unsigned long length,
const struct mmu_interval_notifier_ops *ops);
+int
+mmu_interval_notifier_insert_locked_flags(struct mmu_interval_notifier *interval_sub,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long length,
+ const struct mmu_interval_notifier_ops *ops,
+ unsigned int flags);
+int
+mmu_interval_notifier_set_flags_locked(struct mmu_interval_notifier *interval_sub,
+ unsigned int flags);
void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub);
+bool mmu_interval_notifier_range_block_thp(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end);
/**
* mmu_interval_set_seq - Save the invalidation sequence
@@ -637,6 +658,13 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
}
+static inline bool mmu_interval_notifier_range_block_thp(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ return false; /* stub variant: never reports a THP block */
+}
+
#define mmu_notifier_range_update_to_read_only(r) false
static inline void mmu_notifier_synchronize(void)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 617bca76d..a9b05e716 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -445,11 +445,16 @@ static unsigned int collapse_max_ptes_swap(struct collapse_control *cc,
return khugepaged_max_ptes_swap;
}
-int hugepage_madvise(struct vm_area_struct *vma,
- vm_flags_t *vm_flags, int advice)
+int hugepage_madvise(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, vm_flags_t *vm_flags, int advice)
{
switch (advice) {
case MADV_HUGEPAGE:
+ if ((*vm_flags & VM_NOHUGEPAGE) &&
+ mmu_interval_notifier_range_block_thp(vma->vm_mm,
+ start, end))
+ return 0;
+
*vm_flags &= ~VM_NOHUGEPAGE;
*vm_flags |= VM_HUGEPAGE;
/*
diff --git a/mm/madvise.c b/mm/madvise.c
index cd9bb0770..c7cee4fcf 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1416,7 +1416,8 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
break;
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
- error = hugepage_madvise(vma, &new_flags, behavior);
+ error = hugepage_madvise(vma, range->start, range->end,
+ &new_flags, behavior);
if (error)
goto out;
break;
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 245b74f39..852a5682b 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -581,6 +581,49 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
return 0;
}
+/**
+ * mmu_interval_notifier_range_block_thp - check if a range must not use THP
+ * @mm: mm_struct to check
+ * @start: first address of the range (inclusive)
+ * @end: end address of the range (exclusive)
+ *
+ * Return: true if any interval notifier in the tree that overlaps
+ * [@start, @end) has MMU_INTERVAL_NOTIFIER_BLOCK_THP set, false otherwise.
+ */
+bool mmu_interval_notifier_range_block_thp(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct mmu_notifier_subscriptions *subscriptions;
+ struct mmu_interval_notifier *interval_sub;
+ struct interval_tree_node *node;
+ bool block_thp = false;
+
+ if (start >= end) /* empty or inverted range never blocks */
+ return false;
+
+ /* Pairs with the store in mmu_notifier_register(). */
+ subscriptions = smp_load_acquire(&mm->notifier_subscriptions);
+ if (!subscriptions || !subscriptions->has_itree)
+ return false;
+
+ spin_lock(&subscriptions->lock); /* flags are also written under this lock */
+ for (node = interval_tree_iter_first(&subscriptions->itree, start,
+ end - 1); /* the tree is queried with an inclusive last address */
+ node;
+ node = interval_tree_iter_next(node, start, end - 1)) {
+ interval_sub = container_of(node, struct mmu_interval_notifier,
+ interval_tree);
+ if (interval_sub->flags & MMU_INTERVAL_NOTIFIER_BLOCK_THP) {
+ block_thp = true; /* one overlapping BLOCK_THP notifier is enough */
+ break;
+ }
+ }
+ spin_unlock(&subscriptions->lock);
+
+ return block_thp;
+}
+
static void
mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
struct mmu_notifier_range *range)
@@ -933,13 +976,69 @@ void mmu_notifier_put(struct mmu_notifier *subscription)
}
EXPORT_SYMBOL_GPL(mmu_notifier_put);
+#define MMU_INTERVAL_NOTIFIER_KNOWN_FLAGS \
+ (MMU_INTERVAL_NOTIFIER_BLOCK_THP) /* every flag bit a caller may pass */
+
+static int mmu_interval_notifier_check_flags(unsigned int flags)
+{
+ if (flags & ~MMU_INTERVAL_NOTIFIER_KNOWN_FLAGS) /* a bit outside the known mask is set */
+ return -EINVAL;
+ return 0; /* only known bits (or none) were passed */
+}
+
+static int
+mmu_interval_notifier_block_thp_locked(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct vm_area_struct *vma, *prev;
+ struct vma_iterator vmi;
+
+ mmap_assert_write_locked(mm); /* VMA flags are rewritten below */
+
+ vma_iter_init(&vmi, mm, start);
+ vma = vma_iter_load(&vmi);
+ prev = vma_prev(&vmi);
+ if (vma && start > vma->vm_start) /* start falls inside the first VMA */
+ prev = vma;
+
+ for_each_vma_range(vmi, vma, end) {
+ const unsigned long curr_start = max(vma->vm_start, start); /* clamp to [start, end) */
+ const unsigned long curr_end = min(vma->vm_end, end);
+ vma_flags_t new_flags;
+
+ if (vma->vm_flags & VM_NO_KHUGEPAGED) /* skip: this VMA is left untouched */
+ goto next;
+
+ new_flags = vma->flags;
+ vma_flags_set(&new_flags, VMA_NOHUGEPAGE_BIT);
+ vma_flags_clear(&new_flags, VMA_HUGEPAGE_BIT);
+ if (vma_flags_same_pair(&new_flags, &vma->flags)) /* presumably: nothing to change -- confirm */
+ goto next;
+
+ vma = vma_modify_flags(&vmi, prev, vma, curr_start,
+ curr_end, &new_flags);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma); /* VMAs updated so far are not rolled back */
+
+ vma_start_write(vma);
+ vma->flags = new_flags;
+next:
+ prev = vma;
+ }
+
+ return 0;
+}
+
static int __mmu_interval_notifier_insert(
struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
struct mmu_notifier_subscriptions *subscriptions, unsigned long start,
- unsigned long length, const struct mmu_interval_notifier_ops *ops)
+ unsigned long length, const struct mmu_interval_notifier_ops *ops,
+ unsigned int flags)
{
interval_sub->mm = mm;
interval_sub->ops = ops;
+ interval_sub->flags = flags;
RB_CLEAR_NODE(&interval_sub->interval_tree.rb);
interval_sub->interval_tree.start = start;
/*
@@ -1034,32 +1133,123 @@ int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
subscriptions = mm->notifier_subscriptions;
}
return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions,
- start, length, ops);
+ start, length, ops, 0);
}
EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert);
-int mmu_interval_notifier_insert_locked(
- struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
- unsigned long start, unsigned long length,
- const struct mmu_interval_notifier_ops *ops)
+/**
+ * mmu_interval_notifier_insert_locked_flags - Insert an interval notifier
+ * @interval_sub: Interval subscription to register
+ * @mm: mm_struct to attach to
+ * @start: Starting virtual address to monitor
+ * @length: Length of the range to monitor
+ * @ops: Interval notifier operations to be called on matching events
+ * @flags: MMU_INTERVAL_NOTIFIER_* flags
+ *
+ * Like mmu_interval_notifier_insert_locked(), but also applies the policy
+ * selected by @flags. The caller must hold mmap_lock for write. Unknown
+ * @flags yield -EINVAL; helper errors are returned unchanged.
+ */
+int
+mmu_interval_notifier_insert_locked_flags(struct mmu_interval_notifier *interval_sub,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long length,
+ const struct mmu_interval_notifier_ops *ops,
+ unsigned int flags)
{
struct mmu_notifier_subscriptions *subscriptions =
mm->notifier_subscriptions;
+ unsigned long end;
int ret;
mmap_assert_write_locked(mm);
+ ret = mmu_interval_notifier_check_flags(flags);
+ if (ret)
+ return ret;
+
+ if (flags & MMU_INTERVAL_NOTIFIER_BLOCK_THP) {
+ if (length == 0 || check_add_overflow(start, length, &end))
+ return -EOVERFLOW; /* empty range, or start + length wraps */
+ }
+
if (!subscriptions || !subscriptions->has_itree) {
ret = __mmu_notifier_register(NULL, mm);
if (ret)
return ret;
subscriptions = mm->notifier_subscriptions;
}
+
+ if (flags & MMU_INTERVAL_NOTIFIER_BLOCK_THP) { /* VMA flags change before the insert */
+ ret = mmu_interval_notifier_block_thp_locked(mm, start, end);
+ if (ret)
+ return ret;
+ }
+
return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions,
- start, length, ops);
+ start, length, ops, flags);
+}
+EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked_flags);
+
+int mmu_interval_notifier_insert_locked(struct mmu_interval_notifier *interval_sub,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long length,
+ const struct mmu_interval_notifier_ops *ops)
+{
+ return mmu_interval_notifier_insert_locked_flags(interval_sub, mm,
+ start, length,
+ ops, 0); /* flags == 0: no extra policy requested */
}
EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
+/**
+ * mmu_interval_notifier_set_flags_locked - update an interval notifier's flags
+ * @interval_sub: Interval subscription to update
+ * @flags: new MMU_INTERVAL_NOTIFIER_* flag set (replaces the current one)
+ *
+ * Replace the notifier's flags; the caller must hold mmap_lock for write. On a
+ * clear->set transition of MMU_INTERVAL_NOTIFIER_BLOCK_THP the VMA THP policy
+ * of the notifier's range is updated first. Clearing the bit leaves VMAs as is.
+ */
+int
+mmu_interval_notifier_set_flags_locked(struct mmu_interval_notifier *interval_sub,
+ unsigned int flags)
+{
+ struct mm_struct *mm = interval_sub->mm;
+ unsigned long start = interval_sub->interval_tree.start;
+ unsigned long end;
+ int ret;
+
+ ret = mmu_interval_notifier_check_flags(flags);
+ if (ret)
+ return ret;
+
+ if (WARN_ON_ONCE(!mm))
+ return -EINVAL;
+
+ mmap_assert_write_locked(mm);
+
+ if ((flags & MMU_INTERVAL_NOTIFIER_BLOCK_THP) &&
+ !(interval_sub->flags & MMU_INTERVAL_NOTIFIER_BLOCK_THP)) {
+ if (interval_sub->interval_tree.last == ULONG_MAX)
+ return -EOVERFLOW; /* last + 1 would wrap */
+ end = interval_sub->interval_tree.last + 1; /* inclusive last -> exclusive end */
+
+ ret = mmu_interval_notifier_block_thp_locked(mm, start, end);
+ if (ret)
+ return ret; /* flags stay unchanged on failure */
+ }
+
+ spin_lock(&mm->notifier_subscriptions->lock); /* range_block_thp() reads flags under this lock */
+ interval_sub->flags = flags;
+ spin_unlock(&mm->notifier_subscriptions->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mmu_interval_notifier_set_flags_locked);
+
static bool
mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
unsigned long seq)
--
2.53.0