Re: [PATCH 3/3] mm: read remote memory without the mmap lock where possible

From: Suren Baghdasaryan

Date: Wed Jun 17 2026 - 02:19:39 EST

On Tue, Jun 16, 2026 at 12:04 PM Rik van Riel <riel@xxxxxxxxxxx> wrote:
>
> __access_remote_vm() takes mmap_read_lock() for the entire transfer and
> uses get_user_pages_remote(), which faults pages in. For the common
> case of reading memory that is already resident -- /proc/PID/cmdline,
> /proc/PID/environ, ptrace PEEK of resident pages -- the mmap lock is
> unnecessary and is badly contended on large machines.
>
> Add an opportunistic, read-only fast path that transfers what it can
> without the mmap lock. For each address it takes the per-VMA lock with
> lock_vma_under_rcu(), re-checks the read-side VMA permissions, and uses
> folio_walk_start(..., FW_VMA_LOCKED) to grab a short-lived reference to
> a present page before copying it out. Anything non-trivial -- a not-
> present page (needs faulting), a hugetlb or VM_IO/VM_PFNMAP mapping, or
> a race with a VMA writer -- falls back to the existing mmap_lock path
> for the remainder.

I don't think we should be using per-VMA locks if the read spans
multiple VMAs. Doing that would risk a possibility of reading
inconsistent data since we are locking one VMA at a time. While we
load and read VMA, its neighboring VMA can be unmapped and another one
can be mapped in its place. So, our read spanning both VMAs will
return inconsistent data. access_remote_vm_fast() can check if the
entire read is contained within one VMA and if not, fall back to
mmap_lock.

>
> untagged_addr_remote() asserts the mmap lock, so add an unlocked variant
> for the fast path; the untag mask is a stable per-mm value.
>
> Only reads are handled here; writes keep using the slow path.
>
> Assisted-by: Claude:claude-opus-4-8
> Signed-off-by: Rik van Riel <riel@xxxxxxxxxxx>
> ---
> arch/x86/include/asm/uaccess_64.h | 12 +++
> include/linux/uaccess.h | 11 ++
> mm/memory.c | 166 +++++++++++++++++++++++++++++-
> 3 files changed, 188 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
> index 4a52497ba6a1..c6fac900a747 100644
> --- a/arch/x86/include/asm/uaccess_64.h
> +++ b/arch/x86/include/asm/uaccess_64.h
> @@ -51,6 +51,18 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
> (__force __typeof__(addr))__untagged_addr_remote(mm, __addr); \
> })
>
> +/* Same as __untagged_addr_remote(), but usable without the mmap lock held. */
> +static inline unsigned long __untagged_addr_remote_unlocked(struct mm_struct *mm,
> + unsigned long addr)
> +{
> + return addr & READ_ONCE((mm)->context.untag_mask);
> +}
> +
> +#define untagged_addr_remote_unlocked(mm, addr) ({ \
> + unsigned long __addr = (__force unsigned long)(addr); \
> + (__force __typeof__(addr))__untagged_addr_remote_unlocked(mm, __addr); \
> +})
> +
> #endif
>
> #define valid_user_address(x) \
> diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
> index 8a264662b242..c8c83372c9d8 100644
> --- a/include/linux/uaccess.h
> +++ b/include/linux/uaccess.h
> @@ -34,6 +34,17 @@
> })
> #endif
>
> +/*
> + * Like untagged_addr_remote(), but for callers that stabilize @mm by other
> + * means (e.g. a per-VMA lock) and must not assert the mmap lock.
> + */
> +#ifndef untagged_addr_remote_unlocked
> +#define untagged_addr_remote_unlocked(mm, addr) ({ \
> + (void)(mm); \
> + untagged_addr(addr); \
> +})
> +#endif
> +
> #ifdef masked_user_access_begin
> #define can_do_masked_user_access() 1
> # ifndef masked_user_write_access_begin
> diff --git a/mm/memory.c b/mm/memory.c
> index 86a973119bd4..0b23b82eaa18 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -42,6 +42,8 @@
> #include <linux/kernel_stat.h>
> #include <linux/mm.h>
> #include <linux/mm_inline.h>
> +#include <linux/secretmem.h>
> +#include <linux/pagewalk.h>
> #include <linux/sched/mm.h>
> #include <linux/sched/numa_balancing.h>
> #include <linux/sched/task.h>
> @@ -7062,6 +7064,153 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
> EXPORT_SYMBOL_GPL(generic_access_phys);
> #endif
>
> +/*
> + * The fast path uses folio_walk_start(FW_VMA_LOCKED), which needs the per-VMA
> + * lock and RCU-freed page tables to walk page tables without the mmap lock.
> + */
> +#if defined(CONFIG_PER_VMA_LOCK) && defined(CONFIG_MMU_GATHER_RCU_TABLE_FREE)

Just FYI, Dave Hansen is working on a patchset [1] that removes
CONFIG_PER_VMA_LOCK and makes per-VMA locks always available. This
would simplify this patchset too.

[1] https://lore.kernel.org/all/20260610230409.A44D29FA@xxxxxxxxxxxxxxxxxxxxxxxxxxxxx/

> +/*
> + * Opportunistic lockless fast path for __access_remote_vm() reads.
> + *
> + * Memory already resident in @mm can be read without taking the heavily
> + * contended mmap_lock: a per-VMA lock stabilizes the VMA, and folio_walk_start()
> + * with FW_VMA_LOCKED grabs a short-lived reference to a present page via an
> + * RCU/PTL protected page table walk (relying on MMU_GATHER_RCU_TABLE_FREE).
> + *
> + * Anything that would require faulting a page in, touching a hugetlb or
> + * VM_IO/VM_PFNMAP mapping, or that races a VMA writer is left to the mmap_lock
> + * path in __access_remote_vm(). Only reads are handled here.
> + *
> + * Returns the number of bytes transferred via the fast path.
> + */
> +static int access_remote_vm_fast(struct mm_struct *mm, unsigned long addr,
> + void *buf, int len, unsigned int gup_flags)
> +{
> + void *old_buf = buf;
> +
> + addr = untagged_addr_remote_unlocked(mm, addr);
> +
> + while (len) {
> + struct vm_area_struct *vma;
> + vm_flags_t vm_flags;
> +
> + vma = lock_vma_under_rcu(mm, addr);
> + if (!vma)
> + break;
> +
> + /*
> + * Mirror the read-side permission checks of check_vma_flags(),
> + * and exclude what FW_VMA_LOCKED cannot handle (hugetlb) or what
> + * needs the ->access() handler (VM_IO/VM_PFNMAP). Checked once
> + * per VMA; anything not positively allowed falls back to the
> + * slow path, which re-validates everything.
> + */
> + vm_flags = vma->vm_flags;
> + if ((vm_flags & (VM_IO | VM_PFNMAP)) ||
> + is_vm_hugetlb_page(vma) || vma_is_secretmem(vma) ||
> + (!(vm_flags & VM_READ) &&
> + (!(gup_flags & FOLL_FORCE) || !(vm_flags & VM_MAYREAD)))) {
> + vma_end_read(vma);
> + break;
> + }
> +
> + /*
> + * Copy as much of this VMA as we can without re-acquiring the
> + * per-VMA lock; re-lock only when @addr leaves the VMA.
> + */
> + while (len && addr < vma->vm_end) {
> + struct folio_walk fw;
> + struct folio *folio;
> + struct page *page;
> + unsigned long entry_size, entry_left, folio_left, span;
> + unsigned long copied, idx0;
> + int offset;
> +
> + folio = folio_walk_start(&fw, vma, addr, FW_VMA_LOCKED);
> + if (!folio) {
> + vma_end_read(vma);
> + goto out;
> + }
> + page = fw.page;
> + if (!page) {
> + folio_walk_end(&fw, vma);
> + vma_end_read(vma);
> + goto out;
> + }
> + /* Pin the folio so it stays valid after the PTL is dropped. */
> + folio_get(folio);
> + folio_walk_end(&fw, vma);
> +
> + /*
> + * folio_walk_start() validated exactly one mapping entry,
> + * which covers a contiguous, present run of this folio:
> + * PAGE_SIZE for a pte, PMD_SIZE for a pmd leaf, PUD_SIZE
> + * for a pud leaf. Copy up to the end of that entry,
> + * bounded by the folio, the VMA and len, so a huge mapping
> + * is handled in one walk instead of per page.
> + */
> + offset = offset_in_page(addr);
> + switch (fw.level) {
> + case FW_LEVEL_PUD:
> + entry_size = PUD_SIZE;
> + break;
> + case FW_LEVEL_PMD:
> + entry_size = PMD_SIZE;
> + break;
> + default:
> + entry_size = PAGE_SIZE;
> + break;
> + }
> + entry_left = entry_size - (addr & (entry_size - 1));
> + idx0 = folio_page_idx(folio, page);
> + folio_left = ((folio_nr_pages(folio) - idx0) << PAGE_SHIFT) -
> + offset;
> + span = min3((unsigned long)len, entry_left, folio_left);
> + span = min(span, vma->vm_end - addr);
> +
> + /*
> + * Copy the span page-by-page: kmap_local_folio() maps one
> + * page on HIGHMEM and copy_from_user_page() flushes per
> + * page on aliasing caches, but the page tables are not
> + * re-walked. The span borrows the single folio reference
> + * taken above, so each mapping is dropped with
> + * kunmap_local() (not folio_release_kmap(), which would
> + * also drop a folio reference per page).
> + */
> + for (copied = 0; copied < span; ) {
> + unsigned long foff = offset + copied;
> + unsigned long pidx = idx0 + (foff >> PAGE_SHIFT);
> + int poff = foff & ~PAGE_MASK;
> + int chunk = min_t(unsigned long, span - copied,
> + PAGE_SIZE - poff);
> + void *maddr = kmap_local_folio(folio,
> + pidx << PAGE_SHIFT);
> +
> + copy_from_user_page(vma, folio_page(folio, pidx),
> + addr + copied, buf + copied,
> + maddr + poff, chunk);
> + kunmap_local(maddr);
> + copied += chunk;
> + }
> +
> + folio_put(folio);
> + len -= span;
> + buf += span;
> + addr += span;
> + }
> + vma_end_read(vma);
> + }
> +out:
> + return buf - old_buf;
> +}
> +#else
> +static int access_remote_vm_fast(struct mm_struct *mm, unsigned long addr,
> + void *buf, int len, unsigned int gup_flags)
> +{
> + return 0;
> +}
> +#endif /* CONFIG_PER_VMA_LOCK && CONFIG_MMU_GATHER_RCU_TABLE_FREE */
> +
> /*
> * Access another process' address space as given in mm.
> */
> @@ -7071,8 +7220,23 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
> void *old_buf = buf;
> int write = gup_flags & FOLL_WRITE;
>
> + /*
> + * Try the lockless fast path for reads first; it transfers what it can
> + * from resident memory without taking mmap_lock, and leaves the
> + * remainder (if any) to the slow path below.
> + */
> + if (!write) {
> + int done = access_remote_vm_fast(mm, addr, buf, len, gup_flags);
> +
> + addr += done;
> + buf += done;
> + len -= done;
> + if (!len)
> + return buf - old_buf;
> + }
> +
> if (mmap_read_lock_killable(mm))
> - return 0;
> + return buf - old_buf;
>
> /* Untag the address before looking up the VMA */
> addr = untagged_addr_remote(mm, addr);
> --
> 2.53.0-Meta
>