[PATCH 2/2] mm: Always sanity check anon_vma first for per-vma locks

From: Peter Xu
Date: Thu Apr 04 2024 - 16:05:55 EST


anon_vma is a tricky object in the context of per-vma locks: sometimes
it is required later for mapping the pages, yet it is not safe to
prepare anon_vma while holding only the per-vma lock, because the
preparation may access the anon_vma of adjacent VMAs, on which we hold
no lock at all.
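
As a heavily simplified sketch of the problematic path (names match
mm/rmap.c, but error handling and locking details are elided):

	int __anon_vma_prepare(struct vm_area_struct *vma)
	{
		struct anon_vma *anon_vma;

		/*
		 * Peek at the neighbouring VMAs and try to reuse their
		 * anon_vma.  With only vma's own per-vma lock held,
		 * those neighbours may be modified or freed under us.
		 */
		anon_vma = find_mergeable_anon_vma(vma);
		if (!anon_vma)
			anon_vma = anon_vma_alloc();
		/* ... link vma into the anon_vma, or fail with -ENOMEM ... */
		return 0;
	}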

So far there are three places where the per-vma lock path checks for
the existence of anon_vma:

1) lock_vma_under_rcu(): this is the major entrance of the per-vma
lock, where anonymous memory is already taken care of.

2) lock_vma(): even though it looks like a generic API, it is only
used in the userfaultfd context to leverage per-vma locks. It does an
extra check on MAP_PRIVATE file mappings for the same anon_vma issue.
Note that this applies only to the dst vma, not the src vma: we never
ask for the src vma's anon_vma, even though it should be present
regardless.

3) vmf_anon_prepare(): this covers private file mapping faults, just
like what lock_vma() does above, except that the check happens later,
deeper in the fault stack. (All three checks are consolidated below
for reference.)
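
For reference, these are the three pre-patch checks, consolidated here
(each one is visible in the diff below):

	/* 1) lock_vma_under_rcu(), mm/memory.c */
	if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
		goto inval_end_read;

	/* 2) lock_vma(), mm/userfaultfd.c */
	if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
		vma_end_read(vma);

	/* 3) vmf_anon_prepare(), mm/memory.c */
	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
		vma_end_read(vma);
		return VM_FAULT_RETRY;
	}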

It is unclear whether this complexity is intentional. For example, the
anon_vma check is done early for anonymous memory but late for file
memory, and AFAICT there is nothing special about file memory in this
regard.

To unify the anon_vma checks, always do the check right after taking
the per-vma lock. We already have enough information at that point,
and there is no reason to postpone the check for file memory only. The
only missing piece of information is whether the caller of the per-vma
RCU lock will install a writable mapping or not; that is now passed
into lock_vma_under_rcu() as a boolean.
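
Said differently, the new early check boils down to the condition
below (a condensed form of the vma_needs_stable_anon_vma() helper
added to mm/memory.c by this patch):

	/* A stable anon_vma is needed up front iff: */
	writable && (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))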

The trivial (and all-good) side effects of this patch are:

- We may do slightly better on the first WRITE to a private file
mapping, because we can retry earlier (in lock_vma_under_rcu(), rather
than later in vmf_anon_prepare()).

- We may start to allow per-vma page faults on the zero page even for
anonymous memory. Prior to this patch that was not allowed, even
though installing the zero page does not require anon_vma to exist.
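
For context, the read-fault zero-page path (slightly simplified from
do_anonymous_page() in mm/memory.c) installs a special pte without
ever consulting anon_vma:

	/* Use the zero-page for reads */
	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
	    !mm_forbids_zeropage(vma->vm_mm)) {
		entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
					      vma->vm_page_prot));
		/* ... install entry; anon_vma is never touched ... */
	}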

I still left a WARN_ON_ONCE() in vmf_anon_prepare() to double check we're
all good.

Cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
Cc: Suren Baghdasaryan <surenb@xxxxxxxxxx>
Cc: Lokesh Gidra <lokeshgidra@xxxxxxxxxx>
Cc: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx>
Cc: Alistair Popple <apopple@xxxxxxxxxx>
Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
---
arch/arm/mm/fault.c | 2 +-
arch/arm64/mm/fault.c | 2 +-
arch/powerpc/mm/fault.c | 2 +-
arch/riscv/mm/fault.c | 2 +-
arch/s390/mm/fault.c | 2 +-
arch/x86/mm/fault.c | 2 +-
include/linux/mm.h | 7 ++++---
mm/memory.c | 46 +++++++++++++++++++++++++++++++----------
mm/userfaultfd.c | 21 +++++++------------
net/ipv4/tcp.c | 3 ++-
10 files changed, 55 insertions(+), 34 deletions(-)

diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 5c4b417e24f9..20c21ecec25c 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -288,7 +288,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;

- vma = lock_vma_under_rcu(mm, addr);
+ vma = lock_vma_under_rcu(mm, addr, flags & FAULT_FLAG_WRITE);
if (!vma)
goto lock_mmap;

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 405f9aa831bd..4be946ca80bd 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -566,7 +566,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
if (!(mm_flags & FAULT_FLAG_USER))
goto lock_mmap;

- vma = lock_vma_under_rcu(mm, addr);
+ vma = lock_vma_under_rcu(mm, addr, mm_flags & FAULT_FLAG_WRITE);
if (!vma)
goto lock_mmap;

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 215690452495..7a2adf40d65d 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -480,7 +480,7 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;

- vma = lock_vma_under_rcu(mm, address);
+ vma = lock_vma_under_rcu(mm, address, is_write);
if (!vma)
goto lock_mmap;

diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 5224f3733802..17d8b011105e 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -286,7 +286,7 @@ void handle_page_fault(struct pt_regs *regs)
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;

- vma = lock_vma_under_rcu(mm, addr);
+ vma = lock_vma_under_rcu(mm, addr, flags & FAULT_FLAG_WRITE);
if (!vma)
goto lock_mmap;

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 162ca2576fd4..8b53eefb947a 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -320,7 +320,7 @@ static void do_exception(struct pt_regs *regs, int access)
flags |= FAULT_FLAG_WRITE;
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;
- vma = lock_vma_under_rcu(mm, address);
+ vma = lock_vma_under_rcu(mm, address, is_write);
if (!vma)
goto lock_mmap;
if (!(vma->vm_flags & access)) {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 67b18adc75dd..2cf38befaf45 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1356,7 +1356,7 @@ void do_user_addr_fault(struct pt_regs *regs,
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;

- vma = lock_vma_under_rcu(mm, address);
+ vma = lock_vma_under_rcu(mm, address, flags & FAULT_FLAG_WRITE);
if (!vma)
goto lock_mmap;

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 07c73451d42f..b3a088ce584d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -777,7 +777,8 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
}

struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
- unsigned long address);
+ unsigned long address,
+ bool writable);

#else /* CONFIG_PER_VMA_LOCK */

@@ -790,8 +791,8 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
static inline void vma_mark_detached(struct vm_area_struct *vma,
bool detached) {}

-static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
- unsigned long address)
+static inline struct vm_area_struct *lock_vma_under_rcu(
+ struct mm_struct *mm, unsigned long address, bool writable)
{
return NULL;
}
diff --git a/mm/memory.c b/mm/memory.c
index b6fa5146b260..e8168ebc8095 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3218,10 +3218,8 @@ vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)

if (likely(vma->anon_vma))
return 0;
- if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
- vma_end_read(vma);
- return VM_FAULT_RETRY;
- }
+ /* We shouldn't try a per-vma fault at all if anon_vma isn't solid */
+ WARN_ON_ONCE(vmf->flags & FAULT_FLAG_VMA_LOCK);
if (__anon_vma_prepare(vma))
return VM_FAULT_OOM;
return 0;
@@ -5842,13 +5840,38 @@ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
#endif

#ifdef CONFIG_PER_VMA_LOCK
-/*
- * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
- * stable and not isolated. If the VMA is not found or is being modified the
- * function returns NULL.
+
+static bool vma_needs_stable_anon_vma(struct vm_area_struct *vma, bool writable)
+{
+ /*
+ * If we are to install a read-only mapping, we should never need
+ * an anon_vma later. For anon, it will be the zero page. For
+ * file, it should normally be a page cache page, except for special
+ * mappings (e.g. tcp zerocopy provides its own read-only pages).
+ */
+ if (!writable)
+ return false;
+
+ /* Anonymous writable mappings will always ask for an anon_vma */
+ if (vma_is_anonymous(vma))
+ return true;
+
+ /* For file memory, only need anon_vma if it's private */
+ return !(vma->vm_flags & VM_SHARED);
+}
+
+/**
+ * lock_vma_under_rcu() - Lookup and lock a VMA under RCU protection.
+ * @mm: target mm struct
+ * @address: target virtual address
+ * @writable: whether the caller may install a writable mapping into the VMA
+ *
+ * Returned VMA is guaranteed to be stable and not isolated. If the VMA is
+ * not found or is being modified the function returns NULL.
*/
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
- unsigned long address)
+ unsigned long address,
+ bool writable)
{
MA_STATE(mas, &mm->mm_mt, address, address);
struct vm_area_struct *vma;
@@ -5866,9 +5889,10 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
* find_mergeable_anon_vma uses adjacent vmas which are not locked.
* This check must happen after vma_start_read(); otherwise, a
* concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA
- * from its anon_vma.
+ * from its anon_vma. This applies to both anon and private file maps.
*/
- if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
+ if (unlikely(vma_needs_stable_anon_vma(vma, writable) &&
+ !vma->anon_vma))
goto inval_end_read;

/* Check since vm_start/vm_end might change before we lock the VMA */
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index f6267afe65d1..b518c111e37a 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -71,18 +71,10 @@ static struct vm_area_struct *lock_vma(struct mm_struct *mm,
{
struct vm_area_struct *vma;

- vma = lock_vma_under_rcu(mm, address);
- if (vma) {
- /*
- * lock_vma_under_rcu() only checks anon_vma for private
- * anonymous mappings. But we need to ensure it is assigned in
- * private file-backed vmas as well.
- */
- if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
- vma_end_read(vma);
- else
- return vma;
- }
+ /* This is the dst vma; we will always need its anon_vma */
+ vma = lock_vma_under_rcu(mm, address, true);
+ if (vma)
+ return vma;

mmap_read_lock(mm);
vma = find_vma_and_prepare_anon(mm, address);
@@ -1462,8 +1454,11 @@ static int uffd_move_lock(struct mm_struct *mm,
* vma_start_read(src_vma)
* mmap_read_lock(mm)
* vma_start_write(dst_vma)
+ *
+ * Note that we won't touch the src vma's anon_vma here. It should
+ * be present, but we don't need to ask for it.
*/
- *src_vmap = lock_vma_under_rcu(mm, src_start);
+ *src_vmap = lock_vma_under_rcu(mm, src_start, false);
if (likely(*src_vmap))
return 0;

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e767721b3a58..5940847b616c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2053,7 +2053,8 @@ static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm,
unsigned long address,
bool *mmap_locked)
{
- struct vm_area_struct *vma = lock_vma_under_rcu(mm, address);
+ /* TCP zerocopy pages are always read-only, see tcp_mmap() */
+ struct vm_area_struct *vma = lock_vma_under_rcu(mm, address, false);

if (vma) {
if (vma->vm_ops != &tcp_vm_ops) {
--
2.44.0


--
Peter Xu