[PATCH v3 1/6] mm: userfaultfd: generic continue for non-hugetlbfs

From: Nikita Kalyazin
Date: Fri Apr 04 2025 - 11:44:52 EST


Remove the shmem-specific code from the UFFDIO_CONTINUE implementation
for non-huge pages by calling vm_ops->fault() instead. A new fault
flag, FAULT_FLAG_USERFAULT_CONTINUE, is introduced to avoid a recursive
call to handle_userfault().
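
For background, a userspace VMM resolves such a minor fault with the
UFFDIO_CONTINUE ioctl, whose in-kernel path this patch generalizes
beyond shmem. The sketch below is illustrative only and not part of
this patch: it assumes a userfaultfd `uffd' already registered with
UFFDIO_REGISTER_MODE_MINOR, and `addr'/`len' are placeholders for a
page-aligned faulting range.

    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>

    /*
     * Ask the kernel to install the page cache pages backing
     * [addr, addr + len) into the faulting process's page tables
     * and wake the faulting thread.
     */
    static int uffd_continue(int uffd, unsigned long addr, unsigned long len)
    {
            struct uffdio_continue cont = {
                    .range = { .start = addr, .len = len },
                    .mode = 0,      /* no UFFDIO_CONTINUE_MODE_DONTWAKE */
            };

            if (ioctl(uffd, UFFDIO_CONTINUE, &cont) == -1)
                    return -1;      /* errno describes the failure */
            return 0;
    }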

Suggested-by: James Houghton <jthoughton@xxxxxxxxxx>
Signed-off-by: Nikita Kalyazin <kalyazin@xxxxxxxxxx>
---
 include/linux/mm_types.h |  4 ++++
 mm/hugetlb.c             |  2 +-
 mm/shmem.c               |  9 ++++++---
 mm/userfaultfd.c         | 37 +++++++++++++++++++++++++++----------
 4 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0234f14f2aa6..2f26ee9742bf 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1429,6 +1429,9 @@ enum tlb_flush_reason {
* @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached.
* We should only access orig_pte if this flag set.
* @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock.
+ * @FAULT_FLAG_USERFAULT_CONTINUE: The fault handler must not call the
+ * userfaultfd minor handler, as it is being called by the userfaultfd
+ * code itself.
*
* About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
* whether we would allow page faults to retry by specifying these two
@@ -1467,6 +1470,7 @@ enum fault_flag {
FAULT_FLAG_UNSHARE = 1 << 10,
FAULT_FLAG_ORIG_PTE_VALID = 1 << 11,
FAULT_FLAG_VMA_LOCK = 1 << 12,
+ FAULT_FLAG_USERFAULT_CONTINUE = 1 << 13,
};

typedef unsigned int __bitwise zap_flags_t;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 97930d44d460..c004cfdcd4e2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6228,7 +6228,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
}

/* Check for page in userfault range. */
- if (userfaultfd_minor(vma)) {
+ if (userfaultfd_minor(vma) && !(vmf->flags & FAULT_FLAG_USERFAULT_CONTINUE)) {
folio_unlock(folio);
folio_put(folio);
/* See comment in userfaultfd_missing() block above */
diff --git a/mm/shmem.c b/mm/shmem.c
index 1ede0800e846..b4159303fe59 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2467,7 +2467,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
fault_mm = vma ? vma->vm_mm : NULL;

folio = filemap_get_entry(inode->i_mapping, index);
- if (folio && vma && userfaultfd_minor(vma)) {
+ if (folio && vma && userfaultfd_minor(vma) &&
+ !(vmf->flags & FAULT_FLAG_USERFAULT_CONTINUE)) {
if (!xa_is_value(folio))
folio_put(folio);
*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
@@ -2727,6 +2728,8 @@ static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
static vm_fault_t shmem_fault(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
+ enum sgp_type sgp = vmf->flags & FAULT_FLAG_USERFAULT_CONTINUE ?
+ SGP_NOALLOC : SGP_CACHE;
gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
struct folio *folio = NULL;
vm_fault_t ret = 0;
@@ -2743,8 +2746,8 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
}

WARN_ON_ONCE(vmf->page != NULL);
- err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
- gfp, vmf, &ret);
+ err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, sgp, gfp, vmf,
+ &ret);
if (err)
return vmf_error(err);
if (folio) {
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index d06453fa8aba..4b3dbc7dac64 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -380,30 +380,47 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
return ret;
}

-/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
+/* Handles UFFDIO_CONTINUE for all VMAs. */
static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
uffd_flags_t flags)
{
- struct inode *inode = file_inode(dst_vma->vm_file);
- pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
struct folio *folio;
struct page *page;
int ret;
+ struct vm_fault vmf = {
+ .vma = dst_vma,
+ .address = dst_addr,
+ .flags = FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE |
+ FAULT_FLAG_USERFAULT_CONTINUE,
+ .pte = NULL,
+ .page = NULL,
+ .pgoff = linear_page_index(dst_vma, dst_addr),
+ };
+
+ if (!dst_vma->vm_ops || !dst_vma->vm_ops->fault)
+ return -EINVAL;

- ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
- /* Our caller expects us to return -EFAULT if we failed to find folio */
- if (ret == -ENOENT)
+retry:
+ ret = dst_vma->vm_ops->fault(&vmf);
+ if (ret & VM_FAULT_ERROR) {
ret = -EFAULT;
- if (ret)
goto out;
- if (!folio) {
- ret = -EFAULT;
+ }
+
+ if (ret & VM_FAULT_NOPAGE) {
+ ret = -EAGAIN;
goto out;
}

- page = folio_file_page(folio, pgoff);
+ if (ret & VM_FAULT_RETRY)
+ goto retry;
+
+ page = vmf.page;
+ folio = page_folio(page);
+ BUG_ON(!folio);
+
if (PageHWPoison(page)) {
ret = -EIO;
goto out_release;
--
2.47.1