[PATCH 11/14] userfaultfd: add UFFD_FEATURE_RWP_ASYNC for async fault resolution
From: Kiryl Shutsemau (Meta)
Date: Mon Apr 27 2026 - 07:51:05 EST
Sync RWP delivers a message and blocks the faulting thread until the
handler resolves the fault. For working-set tracking the VMM does not
need the message: it just needs to know, at scan time, which pages
were touched. Async RWP serves that use case — the kernel restores
access in-place and the faulting thread continues without blocking.
The VMM reconstructs the access pattern after the fact via
PAGEMAP_SCAN: pages whose uffd bit is still set (inverted
PAGE_IS_ACCESSED) were not re-accessed since the last RWP cycle.
Worth calling out: async resolution upgrades writable private anon
PTEs via pte_mkwrite() when can_change_pte_writable() allows, mirroring
do_numa_page(). Without it, every re-access of an RWP'd writable page
would COW-fault a second time.
UFFD_FEATURE_RWP_ASYNC requires UFFD_FEATURE_RWP.
Signed-off-by: Kiryl Shutsemau <kas@xxxxxxxxxx>
Assisted-by: Claude:claude-opus-4-6
---
fs/userfaultfd.c | 19 ++++++++++++++++++-
include/linux/userfaultfd_k.h | 6 ++++++
include/uapi/linux/userfaultfd.h | 11 ++++++++++-
mm/huge_memory.c | 25 ++++++++++++++++++++++++-
mm/hugetlb.c | 32 +++++++++++++++++++++++++++++++-
mm/memory.c | 27 +++++++++++++++++++++++++--
6 files changed, 114 insertions(+), 6 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 6e577c4ac4dd..4a701ac830f4 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -89,6 +89,11 @@ static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
}
+static bool userfaultfd_rwp_async_ctx(struct userfaultfd_ctx *ctx)
+{
+ return ctx && (ctx->features & UFFD_FEATURE_RWP_ASYNC);
+}
+
/*
* Whether WP_UNPOPULATED is enabled on the uffd context. It is only
* meaningful when userfaultfd_wp()==true on the vma and when it's
@@ -1989,6 +1994,11 @@ bool userfaultfd_wp_async(struct vm_area_struct *vma)
return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
}
+bool userfaultfd_rwp_async(struct vm_area_struct *vma)
+{
+ return userfaultfd_rwp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
+}
+
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
@@ -2092,6 +2102,12 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
if (features & UFFD_FEATURE_WP_ASYNC)
features |= UFFD_FEATURE_WP_UNPOPULATED;
+ ret = -EINVAL;
+ /* RWP_ASYNC requires RWP */
+ if ((features & UFFD_FEATURE_RWP_ASYNC) &&
+ !(features & UFFD_FEATURE_RWP))
+ goto err_out;
+
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
@@ -2114,7 +2130,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
* but not actually usable.
*/
if (VM_UFFD_RWP == VM_NONE || !pgtable_supports_uffd())
- uffdio_api.features &= ~UFFD_FEATURE_RWP;
+ uffdio_api.features &=
+ ~(UFFD_FEATURE_RWP | UFFD_FEATURE_RWP_ASYNC);
ret = -EINVAL;
if (features & ~uffdio_api.features)
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 37e8d0d29353..777e332edeff 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -295,6 +295,7 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm,
struct list_head *uf);
extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
extern bool userfaultfd_wp_async(struct vm_area_struct *vma);
+extern bool userfaultfd_rwp_async(struct vm_area_struct *vma);
void userfaultfd_reset_ctx(struct vm_area_struct *vma);
@@ -492,6 +493,11 @@ static inline bool userfaultfd_wp_async(struct vm_area_struct *vma)
return false;
}
+static inline bool userfaultfd_rwp_async(struct vm_area_struct *vma)
+{
+ return false;
+}
+
static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
{
return false;
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index d803e76d47ad..c10f08f8a618 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -44,7 +44,8 @@
UFFD_FEATURE_POISON | \
UFFD_FEATURE_WP_ASYNC | \
UFFD_FEATURE_MOVE | \
- UFFD_FEATURE_RWP)
+ UFFD_FEATURE_RWP | \
+ UFFD_FEATURE_RWP_ASYNC)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
@@ -243,6 +244,13 @@ struct uffdio_api {
* UFFDIO_REGISTER_MODE_RWP for read-write protection tracking.
* Pages are made inaccessible via UFFDIO_RWPROTECT and faults
* are delivered when the pages are re-accessed.
+ *
+ * UFFD_FEATURE_RWP_ASYNC indicates asynchronous mode for
+ * UFFDIO_REGISTER_MODE_RWP. When set, faults on read-write
+ * protected pages are auto-resolved by the kernel (PTE
+ * permissions restored immediately) without delivering a message
+ * to the userfaultfd handler. Use PAGEMAP_SCAN with inverted
+ * PAGE_IS_ACCESSED to find pages that were not re-accessed.
*/
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1)
@@ -262,6 +270,7 @@ struct uffdio_api {
#define UFFD_FEATURE_WP_ASYNC (1<<15)
#define UFFD_FEATURE_MOVE (1<<16)
#define UFFD_FEATURE_RWP (1<<17)
+#define UFFD_FEATURE_RWP_ASYNC (1<<18)
__u64 features;
__u64 ioctls;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 631e0355919f..d49facfdb16b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2266,7 +2266,30 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
vm_fault_t do_huge_pmd_uffd_rwp(struct vm_fault *vmf)
{
- return handle_userfault(vmf, VM_UFFD_RWP);
+ struct vm_area_struct *vma = vmf->vma;
+ pmd_t pmd;
+
+ if (!userfaultfd_rwp_async(vma))
+ return handle_userfault(vmf, VM_UFFD_RWP);
+
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
+ spin_unlock(vmf->ptl);
+ return 0;
+ }
+ pmd = pmd_modify(vmf->orig_pmd, vma->vm_page_prot);
+ /* pmd_modify() preserves _PAGE_UFFD; drop it on resolution */
+ pmd = pmd_clear_uffd(pmd);
+ pmd = pmd_mkyoung(pmd);
+ if (!pmd_write(pmd) &&
+ vma_wants_manual_pte_write_upgrade(vma) &&
+ can_change_pmd_writable(vma, vmf->address, pmd))
+ pmd = pmd_mkwrite(pmd, vma);
+ set_pmd_at(vma->vm_mm, vmf->address & HPAGE_PMD_MASK,
+ vmf->pmd, pmd);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+ spin_unlock(vmf->ptl);
+ return 0;
}
/* NUMA hinting page fault entry point for trans huge pmds */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bac9aa852f6b..dc581adcb0ab 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6075,7 +6075,37 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (pte_protnone(vmf.orig_pte) && vma_is_accessible(vma) &&
userfaultfd_rwp(vma) && huge_pte_uffd(vmf.orig_pte)) {
- return hugetlb_handle_userfault(&vmf, mapping, VM_UFFD_RWP);
+ spinlock_t *ptl;
+ pte_t pte;
+
+ /* Sync: drop hugetlb locks before blocking in handle_userfault() */
+ if (!userfaultfd_rwp_async(vma))
+ return hugetlb_handle_userfault(&vmf, mapping, VM_UFFD_RWP);
+
+ ptl = huge_pte_lock(h, mm, vmf.pte);
+ pte = huge_ptep_get(mm, vmf.address, vmf.pte);
+ if (pte_protnone(pte) && huge_pte_uffd(pte)) {
+ unsigned int shift = huge_page_shift(h);
+
+ pte = huge_pte_modify(pte, vma->vm_page_prot);
+ pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
+ /* huge_pte_modify() preserves _PAGE_UFFD; drop it on resolution */
+ pte = huge_pte_clear_uffd(pte);
+ pte = pte_mkyoung(pte);
+ /*
+ * Unlike do_uffd_rwp(), do not upgrade to writable
+ * here. Hugetlb lacks a can_change_huge_pte_writable()
+ * equivalent, so a write access will take a separate
+ * COW fault — acceptable for the rare private hugetlb
+ * case.
+ */
+ set_huge_pte_at(mm, vmf.address, vmf.pte, pte,
+ huge_page_size(h));
+ update_mmu_cache(vma, vmf.address, vmf.pte);
+ }
+ spin_unlock(ptl);
+ ret = 0;
+ goto out_mutex;
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index e0dcf2c28d9d..bfe6f218fb16 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6174,8 +6174,31 @@ static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_stru
static vm_fault_t do_uffd_rwp(struct vm_fault *vmf)
{
- pte_unmap(vmf->pte);
- return handle_userfault(vmf, VM_UFFD_RWP);
+ pte_t pte;
+
+ if (!userfaultfd_rwp_async(vmf->vma)) {
+ /* Sync mode: unmap PTE and deliver to userfaultfd handler */
+ pte_unmap(vmf->pte);
+ return handle_userfault(vmf, VM_UFFD_RWP);
+ }
+
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
+ }
+ pte = pte_modify(vmf->orig_pte, vmf->vma->vm_page_prot);
+ /* pte_modify() preserves _PAGE_UFFD; drop it on resolution */
+ pte = pte_clear_uffd(pte);
+ pte = pte_mkyoung(pte);
+ if (!pte_write(pte) &&
+ vma_wants_manual_pte_write_upgrade(vmf->vma) &&
+ can_change_pte_writable(vmf->vma, vmf->address, pte))
+ pte = pte_mkwrite(pte, vmf->vma);
+ set_pte_at(vmf->vma->vm_mm, vmf->address, vmf->pte, pte);
+ update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
}
static vm_fault_t do_numa_page(struct vm_fault *vmf)
--
2.51.2