Re: [PATCH 2/3] userfaultfd: UFFDIO_REMAP uABI

From: Suren Baghdasaryan
Date: Tue Sep 19 2023 - 19:08:29 EST


On Thu, Sep 14, 2023 at 7:28 PM Jann Horn <jannh@xxxxxxxxxx> wrote:
>
> On Thu, Sep 14, 2023 at 5:26 PM Suren Baghdasaryan <surenb@xxxxxxxxxx> wrote:
> >
> > From: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> >
> > This implements the uABI of UFFDIO_REMAP.
> >
> > Notably one mode bitflag is also forwarded (and in turn known) by the
> > lowlevel remap_pages method.
> >
> > Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> > Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>
> > ---
> > fs/userfaultfd.c | 49 +++
> > include/linux/rmap.h | 5 +
> > include/linux/userfaultfd_k.h | 17 +
> > include/uapi/linux/userfaultfd.h | 22 ++
> > mm/huge_memory.c | 118 +++++++
> > mm/khugepaged.c | 3 +
> > mm/userfaultfd.c | 586 +++++++++++++++++++++++++++++++
> > 7 files changed, 800 insertions(+)
> >
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index 56eaae9dac1a..7bf64e7541c1 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -2027,6 +2027,52 @@ static inline unsigned int uffd_ctx_features(__u64 user_features)
> > return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
> > }
> >
> > +static int userfaultfd_remap(struct userfaultfd_ctx *ctx,
> > + unsigned long arg)
> > +{
> > + __s64 ret;
> > + struct uffdio_remap uffdio_remap;
> > + struct uffdio_remap __user *user_uffdio_remap;
> > + struct userfaultfd_wake_range range;
> > +
> > + user_uffdio_remap = (struct uffdio_remap __user *) arg;
> > +
> > + ret = -EFAULT;
> > + if (copy_from_user(&uffdio_remap, user_uffdio_remap,
> > + /* don't copy "remap" last field */
> > + sizeof(uffdio_remap)-sizeof(__s64)))
> > + goto out;
> > +
> > + ret = validate_range(ctx->mm, uffdio_remap.dst, uffdio_remap.len);
> > + if (ret)
> > + goto out;
> > + ret = validate_range(current->mm, uffdio_remap.src, uffdio_remap.len);
> > + if (ret)
> > + goto out;
> > + ret = -EINVAL;
> > + if (uffdio_remap.mode & ~(UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES|
> > + UFFDIO_REMAP_MODE_DONTWAKE))
> > + goto out;
>

Sorry for the delay in answering...

> Do you not need mmget_not_zero(ctx->mm) to make sure the MM can't be
> concurrently torn down while remap_pages() is running, similar to what
> the other userfaultfd ioctl handlers do?

Yes, I do need that. Will add.
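
Something like what the other ioctl handlers do (untested sketch):

	if (mmget_not_zero(ctx->mm)) {
		ret = remap_pages(ctx->mm, current->mm,
				  uffdio_remap.dst, uffdio_remap.src,
				  uffdio_remap.len, uffdio_remap.mode);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}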


>
> > + ret = remap_pages(ctx->mm, current->mm,
> > + uffdio_remap.dst, uffdio_remap.src,
> > + uffdio_remap.len, uffdio_remap.mode);
> > + if (unlikely(put_user(ret, &user_uffdio_remap->remap)))
> > + return -EFAULT;
> > + if (ret < 0)
> > + goto out;
> > + /* len == 0 would wake all */
> > + BUG_ON(!ret);
> > + range.len = ret;
> > + if (!(uffdio_remap.mode & UFFDIO_REMAP_MODE_DONTWAKE)) {
> > + range.start = uffdio_remap.dst;
> > + wake_userfault(ctx, &range);
> > + }
> > + ret = range.len == uffdio_remap.len ? 0 : -EAGAIN;
> > +out:
> > + return ret;
> > +}
> [...]
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 064fbd90822b..c7a9880a1f6a 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -1932,6 +1932,124 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> > return ret;
> > }
> >
> > +#ifdef CONFIG_USERFAULTFD
> > +/*
> > + * The PT lock for src_pmd and the mmap_lock for reading are held by
> > + * the caller, but it must return after releasing the
> > + * page_table_lock. We're guaranteed the src_pmd is a pmd_trans_huge
> > + * until the PT lock of the src_pmd is released. Just move the page
> > + * from src_pmd to dst_pmd if possible. Return zero if succeeded in
> > + * moving the page, -EAGAIN if it needs to be repeated by the caller,
> > + * or other errors in case of failure.
> > + */
> > +int remap_pages_huge_pmd(struct mm_struct *dst_mm,
> > + struct mm_struct *src_mm,
> > + pmd_t *dst_pmd, pmd_t *src_pmd,
> > + pmd_t dst_pmdval,
> > + struct vm_area_struct *dst_vma,
> > + struct vm_area_struct *src_vma,
> > + unsigned long dst_addr,
> > + unsigned long src_addr)
> > +{
> > + pmd_t _dst_pmd, src_pmdval;
> > + struct page *src_page;
> > + struct anon_vma *src_anon_vma, *dst_anon_vma;
> > + spinlock_t *src_ptl, *dst_ptl;
> > + pgtable_t pgtable;
> > + struct mmu_notifier_range range;
> > +
> > + src_pmdval = *src_pmd;
> > + src_ptl = pmd_lockptr(src_mm, src_pmd);
> > +
> > + BUG_ON(!pmd_trans_huge(src_pmdval));
> > + BUG_ON(!pmd_none(dst_pmdval));
>
> Why can we assert that pmd_none(dst_pmdval) is true here? Can we not
> have concurrent faults (or userfaultfd operations) populating that
> PMD?

IIUC, dst_pmdval is a local copy of the value from dst_pmd, so that
copy can't change even if some concurrent operation changes dst_pmd.
We can assert that it's pmd_none because we checked for that before
calling remap_pages_huge_pmd(). Later on we check whether dst_pmd
changed from under us (see the pmd_same(*dst_pmd, dst_pmdval) check)
and retry if that happened.
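
In other words, the pattern (as already in the patch) is roughly:

	double_pt_lock(src_ptl, dst_ptl);
	if (unlikely(!pmd_same(*dst_pmd, dst_pmdval))) {
		/* dst_pmd changed from under us, let the caller retry */
		double_pt_unlock(src_ptl, dst_ptl);
		return -EAGAIN;
	}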

>
> > + BUG_ON(!spin_is_locked(src_ptl));
> > + mmap_assert_locked(src_mm);
> > + mmap_assert_locked(dst_mm);
> > + BUG_ON(src_addr & ~HPAGE_PMD_MASK);
> > + BUG_ON(dst_addr & ~HPAGE_PMD_MASK);
> > +
> > + src_page = pmd_page(src_pmdval);
> > + BUG_ON(!PageHead(src_page));
> > + BUG_ON(!PageAnon(src_page));
> > + if (unlikely(page_mapcount(src_page) != 1)) {
> > + spin_unlock(src_ptl);
> > + return -EBUSY;
> > + }
> > +
> > + get_page(src_page);
> > + spin_unlock(src_ptl);
> > +
> > + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_mm, src_addr,
> > + src_addr + HPAGE_PMD_SIZE);
> > + mmu_notifier_invalidate_range_start(&range);
> > +
> > + /* block all concurrent rmap walks */
> > + lock_page(src_page);
> > +
> > + /*
> > + * split_huge_page walks the anon_vma chain without the page
> > + * lock. Serialize against it with the anon_vma lock, the page
> > + * lock is not enough.
> > + */
> > + src_anon_vma = folio_get_anon_vma(page_folio(src_page));
> > + if (!src_anon_vma) {
> > + unlock_page(src_page);
> > + put_page(src_page);
> > + mmu_notifier_invalidate_range_end(&range);
> > + return -EAGAIN;
> > + }
> > + anon_vma_lock_write(src_anon_vma);
> > +
> > + dst_ptl = pmd_lockptr(dst_mm, dst_pmd);
> > + double_pt_lock(src_ptl, dst_ptl);
> > + if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
> > + !pmd_same(*dst_pmd, dst_pmdval) ||
> > + page_mapcount(src_page) != 1)) {
> > + double_pt_unlock(src_ptl, dst_ptl);
> > + anon_vma_unlock_write(src_anon_vma);
> > + put_anon_vma(src_anon_vma);
> > + unlock_page(src_page);
> > + put_page(src_page);
> > + mmu_notifier_invalidate_range_end(&range);
> > + return -EAGAIN;
> > + }
> > +
> > + BUG_ON(!PageHead(src_page));
> > + BUG_ON(!PageAnon(src_page));
> > + /* the PT lock is enough to keep the page pinned now */
> > + put_page(src_page);
> > +
> > + dst_anon_vma = (void *) dst_vma->anon_vma + PAGE_MAPPING_ANON;
> > + WRITE_ONCE(src_page->mapping, (struct address_space *) dst_anon_vma);
> > + WRITE_ONCE(src_page->index, linear_page_index(dst_vma, dst_addr));
> > +
> > + if (!pmd_same(pmdp_huge_clear_flush(src_vma, src_addr, src_pmd),
> > + src_pmdval))
> > + BUG_ON(1);
>
> I'm not sure we can assert that the PMDs are exactly equal; the CPU
> might have changed the A/D bits under us?

Yes. I wonder if I can simply remove the BUG_ON here, like this:

src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);

Technically we don't use src_pmdval after this point, but reassigning
it keeps things correct for possible future use. If the A/D bits
changed from under us, we will still copy the correct values into
dst_pmd.

>
> > + _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
> > + _dst_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
> > + set_pmd_at(dst_mm, dst_addr, dst_pmd, _dst_pmd);
> > +
> > + pgtable = pgtable_trans_huge_withdraw(src_mm, src_pmd);
> > + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
>
> Are we allowed to move page tables between mm_structs on all
> architectures? The first example I found that looks a bit dodgy,
> looking through various architectures' pte_alloc_one(), is s390's
> page_table_alloc() which looks like page tables are tied to per-MM
> lists sometimes.
> If that's not allowed, we might have to allocate a new deposit table
> and free the old one or something like that.

Hmm, yeah, it looks like in the !CONFIG_PGSTE case the table can be
linked into mm->context.pgtable_list, so it can't be moved to another
mm. I guess I'll have to keep a pgtable allocated and ready to be
deposited, and free the old one. Maybe it's worth having an
arch-specific function indicating whether moving a pgtable between
MMs is supported? Or should we do that separately, as an
optimization? WDYT?
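
The non-moving variant could look something like this (untested
sketch; the replacement table has to be allocated before taking the
PT locks, since we can't sleep under a spinlock):

	/* before taking the PT locks: pre-allocate a table for dst_mm */
	pgtable = pte_alloc_one(dst_mm);
	if (unlikely(!pgtable))
		return -ENOMEM;
	...
	/*
	 * Under the PT locks: deposit the new table into dst_mm and
	 * free the one withdrawn from src_mm instead of moving it
	 * between MMs.
	 */
	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
	pte_free(src_mm, pgtable_trans_huge_withdraw(src_mm, src_pmd));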

>
> > + if (dst_mm != src_mm) {
> > + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> > + add_mm_counter(src_mm, MM_ANONPAGES, -HPAGE_PMD_NR);
> > + }
> > + double_pt_unlock(src_ptl, dst_ptl);
> > +
> > + anon_vma_unlock_write(src_anon_vma);
> > + put_anon_vma(src_anon_vma);
> > +
> > + /* unblock rmap walks */
> > + unlock_page(src_page);
> > +
> > + mmu_notifier_invalidate_range_end(&range);
> > + return 0;
> > +}
> > +#endif /* CONFIG_USERFAULTFD */
> > +
> > /*
> > * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
> > *
> [...]
> > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > index 96d9eae5c7cc..0cca60dfa8f8 100644
> > --- a/mm/userfaultfd.c
> > +++ b/mm/userfaultfd.c
> [...]
> > +ssize_t remap_pages(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> > + unsigned long dst_start, unsigned long src_start,
> > + unsigned long len, __u64 mode)
> > +{
> [...]
> > +
> > + if (pgprot_val(src_vma->vm_page_prot) !=
> > + pgprot_val(dst_vma->vm_page_prot))
> > + goto out;
>
> Does this check intentionally allow moving pages from a
> PROT_READ|PROT_WRITE anonymous private VMA into a PROT_READ anonymous
> private VMA (on architectures like x86 and arm64 where CoW memory has
> the same protection flags as read-only memory), but forbid moving them
> from a PROT_READ|PROT_EXEC VMA into a PROT_READ VMA? I think this
> check needs at least a comment to explain what's going on here.

The check is simply there to ensure the VMAs have the same access
permissions, so that a page doesn't end up mapped with permissions it
shouldn't have after the move. The fact that x86 and arm64 use the
same protection bits for R/O and COW memory is a "side-effect" IMHO.
I'm not sure what comment would be good here, but I'm open to
suggestions.
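
Would something along these lines work as the comment?

	/*
	 * Make sure the src and dst VMAs have identical page
	 * protections: otherwise the move could change the effective
	 * permissions of the moved page. Note that on architectures
	 * where R/O and COW mappings share the same protection bits,
	 * this also allows moving a page from a R/W private VMA into
	 * a R/O private one.
	 */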

>
> > + /* only allow remapping if both are mlocked or both aren't */
> > + if ((src_vma->vm_flags & VM_LOCKED) ^ (dst_vma->vm_flags & VM_LOCKED))
> > + goto out;
> > +
> > + /*
> > + * Be strict and only allow remap_pages if either the src or
> > + * dst range is registered in the userfaultfd to prevent
> > + * userland errors going unnoticed. As far as the VM
> > + * consistency is concerned, it would be perfectly safe to
> > + * remove this check, but there's no useful usage for
> > + * remap_pages ouside of userfaultfd registered ranges. This
> > + * is after all why it is an ioctl belonging to the
> > + * userfaultfd and not a syscall.
> > + *
> > + * Allow both vmas to be registered in the userfaultfd, just
> > + * in case somebody finds a way to make such a case useful.
> > + * Normally only one of the two vmas would be registered in
> > + * the userfaultfd.
> > + */
> > + if (!dst_vma->vm_userfaultfd_ctx.ctx &&
> > + !src_vma->vm_userfaultfd_ctx.ctx)
> > + goto out;
> > +
> > + /*
> > + * FIXME: only allow remapping across anonymous vmas,
> > + * tmpfs should be added.
> > + */
> > + if (src_vma->vm_ops || dst_vma->vm_ops)
> > + goto out;
>
> I don't think it's okay to check for anonymous VMAs by checking
> ->vm_ops. There are some weird drivers whose ->mmap helpers don't set
> ->vm_ops and instead just shove all the necessary PTEs into the VMA
> right on ->mmap, so I think they end up with ->vm_ops==NULL. For
> example, kcov_mmap() looks that way. I'm not sure how common this is.
>
> Though, uuuuuh, I guess if that's true, the existing
> vma_is_anonymous() is broken, since that also just checks ->vm_ops?
> I'm not sure what the consequences of that would be... Either way,
> vma_is_anonymous() might be the better way to check for anonymous VMAs
> here, and someone should figure out whether vma_is_anonymous() needs
> to be fixed.

Absolutely, vma_is_anonymous() should have been used here. Will fix it.
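
I.e. the check becomes:

	if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
		goto out;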

>
> > + /*
> > + * Ensure the dst_vma has a anon_vma or this page
> > + * would get a NULL anon_vma when moved in the
> > + * dst_vma.
> > + */
> > + err = -ENOMEM;
> > + if (unlikely(anon_vma_prepare(dst_vma)))
> > + goto out;
> > +
> > + for (src_addr = src_start, dst_addr = dst_start;
> > + src_addr < src_start + len;) {
> > + spinlock_t *ptl;
> > + pmd_t dst_pmdval;
> > +
> > + BUG_ON(dst_addr >= dst_start + len);
> > + src_pmd = mm_find_pmd(src_mm, src_addr);
>
> (this would blow up pretty badly if we could have transparent huge PUD
> in the region but I think that's limited to file VMAs so it's fine as
> it currently is)

Should I add a warning comment here in case we decide to implement
support for file-backed pages in the future?
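
Something like:

	/*
	 * Note: mm_find_pmd() only walks down to the PMD level. If
	 * file-backed VMAs (and with them transparent huge PUDs) are
	 * ever supported here, this walk will have to learn to handle
	 * huge PUDs as well.
	 */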

>
> > + if (unlikely(!src_pmd)) {
> > + if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES)) {
> > + err = -ENOENT;
> > + break;
> > + }
> > + src_pmd = mm_alloc_pmd(src_mm, src_addr);
> > + if (unlikely(!src_pmd)) {
> > + err = -ENOMEM;
> > + break;
> > + }
> > + }
> > + dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
> > + if (unlikely(!dst_pmd)) {
> > + err = -ENOMEM;
> > + break;
> > + }
> > +
> > + dst_pmdval = pmdp_get_lockless(dst_pmd);
> > + /*
> > + * If the dst_pmd is mapped as THP don't
> > + * override it and just be strict.
> > + */
> > + if (unlikely(pmd_trans_huge(dst_pmdval))) {
> > + err = -EEXIST;
> > + break;
> > + }
>
> This check is racy because the dst_pmd can still change at this point,
> from previously pointing to a zeroed PMD to now pointing to a
> hugepage, right? And we rely on remap_pages_pte() and
> remap_pages_huge_pmd() to recheck for that?
> If yes, maybe add a comment noting this and explaining why we want this check.

Yes, you are right. remap_pages_huge_pmd() does check
pmd_same(*dst_pmd, dst_pmdval) after taking double_pt_lock(), so it
will return -EAGAIN and we will retry if dst_pmd got mapped as a THP
from under us. remap_pages_pte() OTOH simply does
BUG_ON(pmd_trans_huge(*dst_pmd)), which is wrong. Instead it should
also check whether the PMD value changed from under us and retry in
such a case. I'll fix that and will add a comment here about this
racy check and the retry logic.
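
I.e. with dst_pmdval passed down from remap_pages(),
remap_pages_pte() would do something like this under the dst PT lock
instead of the BUG_ON (sketch):

	if (unlikely(!pmd_same(*dst_pmd, dst_pmdval))) {
		/* dst_pmd got changed, e.g. mapped as a THP; retry */
		err = -EAGAIN;
		goto out;
	}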

>
> > + ptl = pmd_trans_huge_lock(src_pmd, src_vma);
> > + if (ptl) {
> > + /*
> > + * Check if we can move the pmd without
> > + * splitting it. First check the address
> > + * alignment to be the same in src/dst. These
> > + * checks don't actually need the PT lock but
> > + * it's good to do it here to optimize this
> > + * block away at build time if
> > + * CONFIG_TRANSPARENT_HUGEPAGE is not set.
> > + */
> > + if (thp_aligned == -1)
> > + thp_aligned = ((src_addr & ~HPAGE_PMD_MASK) ==
> > + (dst_addr & ~HPAGE_PMD_MASK));
> > + if (!thp_aligned || (src_addr & ~HPAGE_PMD_MASK) ||
>
> This seems overly complicated, the only case when you can move a huge
> PMD is if both addresses are hugepage-aligned and you have enough
> length for one hugepage:
>
> (src_addr & ~HPAGE_PMD_MASK) == 0 && (dst_addr & ~HPAGE_PMD_MASK) == 0
> && (src_start + len - src_addr >= HPAGE_PMD_SIZE).

Ack.
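
So the check collapses into something like this, and thp_aligned goes
away entirely:

	if ((src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
	    !pmd_none(dst_pmdval) ||
	    src_start + len - src_addr < HPAGE_PMD_SIZE) {
		spin_unlock(ptl);
		split_huge_pmd(src_vma, src_pmd, src_addr);
	} else {
		...
	}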

>
> > + !pmd_none(dst_pmdval) ||
> > + src_start + len - src_addr < HPAGE_PMD_SIZE) {
> > + spin_unlock(ptl);
> > + /* Fall through */
> > + split_huge_pmd(src_vma, src_pmd, src_addr);
> > + } else {
> > + err = remap_pages_huge_pmd(dst_mm,
> > + src_mm,
> > + dst_pmd,
> > + src_pmd,
> > + dst_pmdval,
> > + dst_vma,
> > + src_vma,
> > + dst_addr,
> > + src_addr);
> > + cond_resched();
> > +
> > + if (!err) {
> > + dst_addr += HPAGE_PMD_SIZE;
> > + src_addr += HPAGE_PMD_SIZE;
> > + moved += HPAGE_PMD_SIZE;
> > + }
> > +
> > + if ((!err || err == -EAGAIN) &&
> > + fatal_signal_pending(current))
> > + err = -EINTR;
> > +
> > + if (err && err != -EAGAIN)
> > + break;
> > +
> > + continue;
> > + }
> > + }
> > +
> > + if (pmd_none(*src_pmd)) {
> > + if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES)) {
> > + err = -ENOENT;
> > + break;
> > + }
> > + if (unlikely(__pte_alloc(src_mm, src_pmd))) {
> > + err = -ENOMEM;
> > + break;
> > + }
> > + }
> > +
> > + if (unlikely(pmd_none(dst_pmdval)) &&
> > + unlikely(__pte_alloc(dst_mm, dst_pmd))) {
>
> Maybe just use pte_alloc() here?

Ack.
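
I.e.:

	if (pte_alloc(dst_mm, dst_pmd)) {
		err = -ENOMEM;
		break;
	}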

>
> > + err = -ENOMEM;
> > + break;
> > + }
> > +
> > + err = remap_pages_pte(dst_mm, src_mm,
> > + dst_pmd, src_pmd,
> > + dst_vma, src_vma,
> > + dst_addr, src_addr,
> > + mode);
> > +
> > + cond_resched();
> > +
> > + if (!err) {
> > + dst_addr += PAGE_SIZE;
> > + src_addr += PAGE_SIZE;
> > + moved += PAGE_SIZE;
> > + }
> > +
> > + if ((!err || err == -EAGAIN) &&
> > + fatal_signal_pending(current))
> > + err = -EINTR;
> > +
> > + if (err && err != -EAGAIN)
> > + break;
> > + }
> > +
> > +out:
> > + mmap_read_unlock(dst_mm);
> > + if (dst_mm != src_mm)
> > + mmap_read_unlock(src_mm);
> > + BUG_ON(moved < 0);
> > + BUG_ON(err > 0);
> > + BUG_ON(!moved && !err);
> > + return moved ? moved : err;
> > +}
>
> Maybe you could try whether this function would look simpler with a
> shape roughly like:
>
> for (src_addr = ...; src_addr < ...;) {
> unsigned long step_size;
>
> if (hugepage case) {
> if (have to split) {
> split it;
> continue;
> }
> step_size = HPAGE_PMD_SIZE;
> ...
> } else {
> ... 4k case ...
> step_size = PAGE_SIZE;
> }
> ...
> cond_resched();
> if (!err) {
> dst_addr += step_size;
> src_addr += step_size;
> moved += step_size;
> }

Yeah, that looks simpler. I'll try that, thanks!

> ...
> }