[RFC PATCH v3] mm: retry page faults under per-VMA lock when mmap_lock is not required
From: Hongru Zhang
Date: Fri Jun 26 2026 - 03:51:51 EST
From: Hongru Zhang <zhanghongru@xxxxxxxxxx>
Currently, when a page fault handled under the per-VMA lock returns
VM_FAULT_RETRY because folio_lock() fails (mainly because the folio is
under I/O), the architecture fault handler unconditionally falls back
to retrying with mmap_lock. This leads to mmap_lock contention.
Introduce VM_FAULT_RETRY_HARD to mark the retry paths that require
mmap_lock. Retries now stay under the per-VMA lock by default, and
only paths that return VM_FAULT_RETRY_HARD fall back to retrying with
mmap_lock.
Based on the stress model from Kunwu Chan and Wang Lian in v2, we
adapted a benchmark for a desktop environment (Intel i7-12700, 20
logical CPUs) by reducing the thread count and adjusting the memcg
limits. The benchmark issues concurrent page faults under memcg
pressure (forcing reclaim and VM_FAULT_RETRY) with parallel munmap to
amplify mmap_lock read-write contention.
Throughput (higher is better):
+---------+------------+------------+-------------+
| Threads | Vanilla | Patched | Improvement |
+---------+------------+------------+-------------+
| 40 | 1071.11 /s | 1301.11 /s | +21.5% |
+---------+------------+------------+-------------+
| 60 | 1043.15 /s | 1472.36 /s | +41.1% |
+---------+------------+------------+-------------+
| 80 | 1049.39 /s | 1665.65 /s | +58.7% |
+---------+------------+------------+-------------+
mmap_lock contention count (lower is better):
+---------+-----------+---------+-----------+
| Threads | Vanilla | Patched | Reduction |
+---------+-----------+---------+-----------+
| 40 | 3,217,904 | 51,201 | -98.4% |
+---------+-----------+---------+-----------+
| 60 | 4,419,149 | 55,711 | -98.7% |
+---------+-----------+---------+-----------+
| 80 | 5,395,730 | 66,184 | -98.8% |
+---------+-----------+---------+-----------+
Benchmark and test scripts:
https://gist.github.com/zhr250/c36c2c54d9351df37e12fd072d4926ef
Suggested-by: Barry Song <baohua@xxxxxxxxxx>
Suggested-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>
Signed-off-by: Hongru Zhang <zhanghongru@xxxxxxxxxx>
---
Changes since v2:
- v2 required each retry path to explicitly opt in to per-VMA lock
  retry (VM_FAULT_RETRY_VMA). This patch inverts the logic: retries
  default to the per-VMA lock, and only paths requiring mmap_lock opt
  out (VM_FAULT_RETRY_HARD).
- This patch corresponds to v2 patch 1/5; the remaining optimizations
will be submitted separately
- Rebased on mm-unstable
Link to v2:
https://lore.kernel.org/all/20260430040427.4672-1-baohua@xxxxxxxxxx/
Link to v1:
https://lore.kernel.org/all/20251127011438.6918-1-21cnbao@xxxxxxxxx/
arch/arm/mm/fault.c | 4 ++++
arch/arm64/mm/fault.c | 4 ++++
arch/loongarch/mm/fault.c | 4 ++++
arch/powerpc/mm/fault.c | 4 ++++
arch/riscv/mm/fault.c | 4 ++++
arch/s390/mm/fault.c | 4 ++++
arch/x86/mm/fault.c | 4 ++++
include/linux/mm_types.h | 9 +++++----
mm/huge_memory.c | 2 +-
mm/memory.c | 6 +++---
10 files changed, 37 insertions(+), 8 deletions(-)
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index e62cc4be5adf..2f909fb4f7db 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -391,6 +391,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;
+retry_vma:
vma = lock_vma_under_rcu(mm, addr);
if (!vma)
goto lock_mmap;
@@ -420,6 +421,9 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
goto no_context;
return 0;
}
+ if (!(fault & VM_FAULT_RETRY_HARD))
+ goto retry_vma;
+
lock_mmap:
retry:
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 739800835920..695a09486795 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -673,6 +673,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
if (!(mm_flags & FAULT_FLAG_USER))
goto lock_mmap;
+retry_vma:
vma = lock_vma_under_rcu(mm, addr);
if (!vma)
goto lock_mmap;
@@ -719,6 +720,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
goto no_context;
return 0;
}
+ if (!(fault & VM_FAULT_RETRY_HARD))
+ goto retry_vma;
+
lock_mmap:
retry:
diff --git a/arch/loongarch/mm/fault.c b/arch/loongarch/mm/fault.c
index 2c93d33356e5..d7dd55722e52 100644
--- a/arch/loongarch/mm/fault.c
+++ b/arch/loongarch/mm/fault.c
@@ -219,6 +219,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;
+retry_vma:
vma = lock_vma_under_rcu(mm, address);
if (!vma)
goto lock_mmap;
@@ -265,6 +266,9 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
no_context(regs, write, address);
return;
}
+ if (!(fault & VM_FAULT_RETRY_HARD))
+ goto retry_vma;
+
lock_mmap:
retry:
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 806c74e0d5ab..65ec9a8252e7 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -487,6 +487,7 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;
+retry_vma:
vma = lock_vma_under_rcu(mm, address);
if (!vma)
goto lock_mmap;
@@ -517,6 +518,9 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
if (fault_signal_pending(fault, regs))
return user_mode(regs) ? 0 : SIGBUS;
+ if (!(fault & VM_FAULT_RETRY_HARD))
+ goto retry_vma;
+
lock_mmap:
/* When running in the kernel we expect faults to occur only to
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 04ed6f8acae4..6c68414a1224 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -347,6 +347,7 @@ void handle_page_fault(struct pt_regs *regs)
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;
+retry_vma:
vma = lock_vma_under_rcu(mm, addr);
if (!vma)
goto lock_mmap;
@@ -376,6 +377,9 @@ void handle_page_fault(struct pt_regs *regs)
no_context(regs, addr);
return;
}
+ if (!(fault & VM_FAULT_RETRY_HARD))
+ goto retry_vma;
+
lock_mmap:
retry:
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 028aeb9c48d6..45a6c35044cc 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -294,6 +294,7 @@ static void do_exception(struct pt_regs *regs, int access)
flags |= FAULT_FLAG_WRITE;
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;
+retry_vma:
vma = lock_vma_under_rcu(mm, address);
if (!vma)
goto lock_mmap;
@@ -318,6 +319,9 @@ static void do_exception(struct pt_regs *regs, int access)
handle_fault_error_nolock(regs, 0);
return;
}
+ if (!(fault & VM_FAULT_RETRY_HARD))
+ goto retry_vma;
+
lock_mmap:
retry:
vma = lock_mm_and_find_vma(mm, address, regs);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 63de8e8684f2..cb80070271d0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1322,6 +1322,7 @@ void do_user_addr_fault(struct pt_regs *regs,
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;
+retry_vma:
vma = lock_vma_under_rcu(mm, address);
if (!vma)
goto lock_mmap;
@@ -1351,6 +1352,9 @@ void do_user_addr_fault(struct pt_regs *regs,
ARCH_DEFAULT_PKEY);
return;
}
+ if (!(fault & VM_FAULT_RETRY_HARD))
+ goto retry_vma;
+
lock_mmap:
retry:
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5ef78617ce93..58db6f0af6fd 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1665,10 +1665,11 @@ enum vm_fault_reason {
VM_FAULT_NOPAGE = (__force vm_fault_t)0x000100,
VM_FAULT_LOCKED = (__force vm_fault_t)0x000200,
VM_FAULT_RETRY = (__force vm_fault_t)0x000400,
- VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800,
- VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000,
- VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000,
- VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000,
+ VM_FAULT_RETRY_HARD = (__force vm_fault_t)0x000800,
+ VM_FAULT_FALLBACK = (__force vm_fault_t)0x001000,
+ VM_FAULT_DONE_COW = (__force vm_fault_t)0x002000,
+ VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x004000,
+ VM_FAULT_COMPLETED = (__force vm_fault_t)0x008000,
VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000,
};
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 94bd656eeaf8..ad0129d579bc 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1398,7 +1398,7 @@ vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf)
if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
vma_end_read(vma);
- return VM_FAULT_RETRY;
+ return VM_FAULT_RETRY | VM_FAULT_RETRY_HARD;
}
ptl = pmd_lock(vma->vm_mm, vmf->pmd);
diff --git a/mm/memory.c b/mm/memory.c
index ff338c2abe92..7f2a30b5efca 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3797,7 +3797,7 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
return 0;
vma_end_read(vma);
- return VM_FAULT_RETRY;
+ return VM_FAULT_RETRY | VM_FAULT_RETRY_HARD;
}
/**
@@ -3824,7 +3824,7 @@ vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf)
return 0;
if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
if (!mmap_read_trylock(vma->vm_mm))
- return VM_FAULT_RETRY;
+ return VM_FAULT_RETRY | VM_FAULT_RETRY_HARD;
}
if (__anon_vma_prepare(vma))
ret = VM_FAULT_OOM;
@@ -4778,7 +4778,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* under VMA lock.
*/
vma_end_read(vma);
- ret = VM_FAULT_RETRY;
+ ret = VM_FAULT_RETRY | VM_FAULT_RETRY_HARD;
goto out;
}
base-commit: 81652c5a65d4ae28e9b18c16ef917a40025c3653
--
2.43.0