[RFC, PATCH 03/12] userfaultfd: implement UFFDIO_DEACTIVATE ioctl

From: Kiryl Shutsemau (Meta)

Date: Tue Apr 14 2026 - 10:30:22 EST


UFFDIO_DEACTIVATE marks pages as deactivated within a VM_UFFD_MINOR
range:

- Anonymous memory: set protnone via change_protection(MM_CP_UFFD_DEACTIVATE).
Pages stay resident with their PFNs preserved; only access permissions are
removed. MM_CP_UFFD_DEACTIVATE is handled independently of MM_CP_PROT_NUMA,
bypassing the folio_can_map_prot_numa() and CONFIG_NUMA_BALANCING guards.

- Shared shmem/hugetlbfs: zap PTEs via zap_page_range_single().
Pages stay in the page cache.

- Private hugetlb: rejected with -EINVAL (zapping would destroy content,
as there is no page cache to fall back on).

The mfill_atomic() path is also relaxed to accept UFFDIO_CONTINUE on
anonymous VMAs, so that minor faults on deactivated anonymous memory can
be resolved with UFFDIO_CONTINUE.

Cleanup on unregister/close: restore protnone PTEs to normal permissions
in userfaultfd_clear_vma(), preventing permanently inaccessible pages.

Signed-off-by: Kiryl Shutsemau (Meta) <kas@xxxxxxxxxx>
Assisted-by: Claude:claude-opus-4-6
---
fs/userfaultfd.c | 35 ++++++++++++++++
include/linux/mm.h | 2 +
include/linux/userfaultfd_k.h | 2 +
mm/huge_memory.c | 9 ++--
mm/mprotect.c | 9 +++-
mm/userfaultfd.c | 78 +++++++++++++++++++++++++++++++++--
6 files changed, 127 insertions(+), 8 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 8d508ad19e89..b317c9854b86 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1441,6 +1441,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);

+ /* DEACTIVATE is only supported for MINOR ranges. */
+ if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
+ ioctls_out &= ~((__u64)1 << _UFFDIO_DEACTIVATE);
+
/*
* Now that we scanned all vmas we can already tell
* userland which ioctls methods are guaranteed to
@@ -1788,6 +1792,34 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
return ret;
}

+static int userfaultfd_deactivate(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ int ret;
+ struct uffdio_range uffdio_range;
+
+ if (atomic_read(&ctx->mmap_changing))
+ return -EAGAIN;
+
+ if (copy_from_user(&uffdio_range, (void __user *)arg,
+ sizeof(uffdio_range)))
+ return -EFAULT;
+
+ ret = validate_range(ctx->mm, uffdio_range.start, uffdio_range.len);
+ if (ret)
+ return ret;
+
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mdeactivate_range(ctx, uffdio_range.start,
+ uffdio_range.len);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+
+ return ret;
+}
+
static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
{
__s64 ret;
@@ -2108,6 +2140,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
case UFFDIO_POISON:
ret = userfaultfd_poison(ctx, arg);
break;
+ case UFFDIO_DEACTIVATE:
+ ret = userfaultfd_deactivate(ctx, arg);
+ break;
}
return ret;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index abb4963c1f06..fc2841264d56 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3036,6 +3036,8 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);
#define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */
#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \
MM_CP_UFFD_WP_RESOLVE)
+/* Whether this change is for uffd deactivation */
+#define MM_CP_UFFD_DEACTIVATE (1UL << 4)

bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index d1d4ed4a08b0..c94b5c5b5f24 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -130,6 +130,8 @@ extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len, bool enable_wp);
extern long uffd_wp_range(struct vm_area_struct *vma,
unsigned long start, unsigned long len, bool enable_wp);
+extern int mdeactivate_range(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len);

/* move_pages */
void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b298cba853ab..2ad736ff007c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2563,6 +2563,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
spinlock_t *ptl;
pmd_t oldpmd, entry;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+ bool uffd_deactivate = cp_flags & MM_CP_UFFD_DEACTIVATE;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
int ret = 1;
@@ -2582,8 +2583,11 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
goto unlock;
}

- if (prot_numa) {
+ /* Already protnone — nothing to do for either NUMA or uffd */
+ if ((prot_numa || uffd_deactivate) && pmd_protnone(*pmd))
+ goto unlock;

+ if (prot_numa) {
/*
* Avoid trapping faults against the zero page. The read-only
* data is likely to be read-cached on the local CPU and
@@ -2592,9 +2596,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (is_huge_zero_pmd(*pmd))
goto unlock;

- if (pmd_protnone(*pmd))
- goto unlock;
-
if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma,
vma_is_single_threaded_private(vma)))
goto unlock;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c0571445bef7..7c612a680014 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -220,6 +220,7 @@ static long change_pte_range(struct mmu_gather *tlb,
long pages = 0;
bool is_private_single_threaded;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+ bool uffd_deactivate = cp_flags & MM_CP_UFFD_DEACTIVATE;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
int nr_ptes;
@@ -245,7 +246,8 @@ static long change_pte_range(struct mmu_gather *tlb,
pte_t ptent;

/* Already in the desired state. */
- if (prot_numa && pte_protnone(oldpte))
+ if ((prot_numa || uffd_deactivate) &&
+ pte_protnone(oldpte))
continue;

page = vm_normal_page(vma, addr, oldpte);
@@ -255,6 +257,8 @@ static long change_pte_range(struct mmu_gather *tlb,
/*
* Avoid trapping faults against the zero or KSM
* pages. See similar comment in change_huge_pmd.
+ * Skip this filter for uffd deactivation which
+ * must set protnone regardless of NUMA placement.
*/
if (prot_numa &&
!folio_can_map_prot_numa(folio, vma,
@@ -651,6 +655,9 @@ long change_protection(struct mmu_gather *tlb,
WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA);
#endif

+ if (cp_flags & MM_CP_UFFD_DEACTIVATE)
+ newprot = PAGE_NONE;
+
if (is_vm_hugetlb_page(vma))
pages = hugetlb_change_protection(vma, start, end, newprot,
cp_flags);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index dba1ea26fdfe..3373b11b9d83 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -775,7 +775,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,

if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
- if (!vma_is_shmem(dst_vma) &&
+ if (!vma_is_shmem(dst_vma) && !vma_is_anonymous(dst_vma) &&
uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
goto out_unlock;

@@ -797,13 +797,16 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
break;
}
dst_pmdval = pmdp_get_lockless(dst_pmd);
+ if (unlikely(!pmd_present(dst_pmdval))) {
+ err = -EEXIST;
+ break;
+ }
/*
* If the dst_pmd is THP don't override it and just be strict.
* (This includes the case where the PMD used to be THP and
* changed back to none after __pte_alloc().)
*/
- if (unlikely(!pmd_present(dst_pmdval) ||
- pmd_trans_huge(dst_pmdval))) {
+ if (unlikely(pmd_trans_huge(dst_pmdval))) {
err = -EEXIST;
break;
}
@@ -996,6 +999,65 @@ int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
return err;
}

+int mdeactivate_range(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len)
+{
+ struct mm_struct *dst_mm = ctx->mm;
+ unsigned long end = start + len;
+ struct vm_area_struct *dst_vma;
+ long err;
+ VMA_ITERATOR(vmi, dst_mm, start);
+
+ VM_WARN_ON_ONCE(start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(len & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(start + len <= start);
+
+ guard(mmap_read_lock)(dst_mm);
+ guard(rwsem_read)(&ctx->map_changing_lock);
+
+ if (atomic_read(&ctx->mmap_changing))
+ return -EAGAIN;
+
+ err = -ENOENT;
+ for_each_vma_range(vmi, dst_vma, end) {
+ unsigned long vma_start = max(dst_vma->vm_start, start);
+ unsigned long vma_end = min(dst_vma->vm_end, end);
+
+ if (!userfaultfd_minor(dst_vma)) {
+ err = -ENOENT;
+ break;
+ }
+
+ /*
+ * Private hugetlb has no page cache to fall back on —
+ * zapping PTEs would destroy page content.
+ */
+ if (is_vm_hugetlb_page(dst_vma) &&
+ !(dst_vma->vm_flags & VM_SHARED)) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (vma_is_anonymous(dst_vma)) {
+ /* Anonymous: set protnone, pages stay resident */
+ struct mmu_gather tlb;
+
+ tlb_gather_mmu(&tlb, dst_mm);
+ err = change_protection(&tlb, dst_vma, vma_start,
+ vma_end,
+ MM_CP_UFFD_DEACTIVATE);
+ tlb_finish_mmu(&tlb);
+ if (err < 0)
+ break;
+ } else {
+ /* Shared shmem/hugetlb: zap PTEs, pages stay in page cache */
+ zap_page_range_single(dst_vma, vma_start,
+ vma_end - vma_start, NULL);
+ }
+ err = 0;
+ }
+ return err;
+}

void double_pt_lock(spinlock_t *ptl1,
spinlock_t *ptl2)
@@ -1988,6 +2050,16 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
if (userfaultfd_wp(vma))
uffd_wp_range(vma, start, end - start, false);

+ /* Restore protnone PTEs to normal permissions */
+ if (userfaultfd_minor(vma) && vma_is_anonymous(vma)) {
+ struct mmu_gather tlb;
+
+ tlb_gather_mmu(&tlb, vma->vm_mm);
+ change_protection(&tlb, vma, start, end,
+ MM_CP_TRY_CHANGE_WRITABLE);
+ tlb_finish_mmu(&tlb);
+ }
+
ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
vma->vm_flags & ~__VM_UFFD_FLAGS,
NULL_VM_UFFD_CTX, give_up_on_oom);
--
2.51.2