Re: [PATCH 19/23] hugetlb/userfaultfd: Handle uffd-wp special pte in hugetlb pf handler
From: Mike Kravetz
Date: Thu Apr 22 2021 - 18:46:30 EST
On 3/22/21 5:50 PM, Peter Xu wrote:
> Teach the hugetlb page fault code to understand the uffd-wp special pte.
> For example, when seeing such a pte we need to convert any write fault
> into a read one (which is fake - we'll retry the write later if needed).
> Meanwhile, for handle_userfault() we need to make sure we wait on the
> special swap pte too, just like on a none pte.
>
> Note that we also need to teach UFFDIO_COPY about this special pte across
> the code path so that we can safely install a new page at this special
> pte as long as we know it's a stale entry.
>
> Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
> ---
> fs/userfaultfd.c | 5 ++++-
> mm/hugetlb.c | 34 +++++++++++++++++++++++++++-------
> mm/userfaultfd.c | 5 ++++-
> 3 files changed, 35 insertions(+), 9 deletions(-)
>
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 72956f9cc892..f6fa34f58c37 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -245,8 +245,11 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
> /*
> * Lockless access: we're in a wait_event so it's ok if it
> * changes under us.
> + *
> + * Regarding uffd-wp special case, please refer to comments in
> + * userfaultfd_must_wait().
> */
> - if (huge_pte_none(pte))
> + if (huge_pte_none(pte) || pte_swp_uffd_wp_special(pte))
> ret = true;
> if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
> ret = true;
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 64e424b03774..448ef745d5ee 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4369,7 +4369,8 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
> static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
> struct vm_area_struct *vma,
> struct address_space *mapping, pgoff_t idx,
> - unsigned long address, pte_t *ptep, unsigned int flags)
> + unsigned long address, pte_t *ptep,
> + pte_t old_pte, unsigned int flags)
> {
> struct hstate *h = hstate_vma(vma);
> vm_fault_t ret = VM_FAULT_SIGBUS;
> @@ -4493,7 +4494,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
>
> ptl = huge_pte_lock(h, mm, ptep);
> ret = 0;
> - if (!huge_pte_none(huge_ptep_get(ptep)))
> + if (!pte_same(huge_ptep_get(ptep), old_pte))
> goto backout;
>
> if (anon_rmap) {
> @@ -4503,6 +4504,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
> page_dup_rmap(page, true);
> new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
> && (vma->vm_flags & VM_SHARED)));
> + if (unlikely(flags & FAULT_FLAG_UFFD_WP)) {
> + WARN_ON_ONCE(flags & FAULT_FLAG_WRITE);
> + /* We should have the write bit cleared already, but be safe */
> + new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
> + }
> set_huge_pte_at(mm, haddr, ptep, new_pte);
>
> hugetlb_count_add(pages_per_huge_page(h), mm);
> @@ -4584,9 +4590,16 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> if (unlikely(is_hugetlb_entry_migration(entry))) {
> migration_entry_wait_huge(vma, mm, ptep);
> return 0;
> - } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
> + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
> return VM_FAULT_HWPOISON_LARGE |
> VM_FAULT_SET_HINDEX(hstate_index(h));
> + } else if (unlikely(is_swap_special_pte(entry))) {
> + /* Must be a uffd-wp special swap pte */
> + WARN_ON_ONCE(!pte_swp_uffd_wp_special(entry));
> + flags |= FAULT_FLAG_UFFD_WP;
> + /* Emulate a read fault */
> + flags &= ~FAULT_FLAG_WRITE;
> + }
The comment above this if/else block points out that we hold no locks
here and are only checking conditions that would cause a quick return.
Yet, this new code potentially modifies flags. Pretty sure we can race
and have the entry change.

I am not sure of all the side effects if the entry changes out from
under us: if the changed entry is no longer a uffd-wp special swap pte,
we would emulate a read fault when we should not.

Perhaps we should just move this check and the modification of flags to
after taking the fault mutex, and before the change below?
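Something like the following (completely untested, just to sketch the
idea - names are as in this patch) is what I have in mind. The
migration and hwpoison checks would stay where they are; only the
uffd-wp handling moves under the mutex:

	mutex_lock(&hugetlb_fault_mutex_table[hash]);

	entry = huge_ptep_get(ptep);
	if (unlikely(pte_swp_uffd_wp_special(entry))) {
		/*
		 * The entry is stable now that we hold the fault mutex,
		 * so it is safe to base the flags on it.
		 */
		flags |= FAULT_FLAG_UFFD_WP;
		/* Emulate a read fault */
		flags &= ~FAULT_FLAG_WRITE;
	}

	if (huge_pte_none(entry) || pte_swp_uffd_wp_special(entry)) {
		ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
				      entry, flags);
		goto out_mutex;
	}

That way the special pte is only acted upon once it can no longer
change underneath us, and a racing change simply falls through to the
normal fault paths.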
--
Mike Kravetz
> }
>
> /*
> @@ -4618,8 +4631,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> mutex_lock(&hugetlb_fault_mutex_table[hash]);
>
> entry = huge_ptep_get(ptep);
> - if (huge_pte_none(entry)) {
> - ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
> + /*
> + * FAULT_FLAG_UFFD_WP should be handled merely the same as pte none
> + * because it's basically a none pte with a special marker
> + */
> + if (huge_pte_none(entry) || pte_swp_uffd_wp_special(entry)) {
> + ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
> + entry, flags);
> goto out_mutex;
> }
>
> @@ -4753,7 +4771,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> unsigned long size;
> int vm_shared = dst_vma->vm_flags & VM_SHARED;
> struct hstate *h = hstate_vma(dst_vma);
> - pte_t _dst_pte;
> + pte_t _dst_pte, cur_pte;
> spinlock_t *ptl;
> int ret;
> struct page *page;
> @@ -4831,8 +4849,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> if (idx >= size)
> goto out_release_unlock;
>
> + cur_pte = huge_ptep_get(dst_pte);
> ret = -EEXIST;
> - if (!huge_pte_none(huge_ptep_get(dst_pte)))
> + /* Please refer to shmem_mfill_atomic_pte() for uffd-wp special case */
> + if (!huge_pte_none(cur_pte) && !pte_swp_uffd_wp_special(cur_pte))
> goto out_release_unlock;
>
> if (vm_shared) {
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 01170197a3d7..a2b0dcc80a19 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -274,6 +274,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
> }
>
> while (src_addr < src_start + len) {
> + pte_t pteval;
> +
> BUG_ON(dst_addr >= dst_start + len);
>
> /*
> @@ -296,8 +298,9 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
> goto out_unlock;
> }
>
> + pteval = huge_ptep_get(dst_pte);
> if (mode != MCOPY_ATOMIC_CONTINUE &&
> - !huge_pte_none(huge_ptep_get(dst_pte))) {
> + !huge_pte_none(pteval) && !pte_swp_uffd_wp_special(pteval)) {
> err = -EEXIST;
> mutex_unlock(&hugetlb_fault_mutex_table[hash]);
> i_mmap_unlock_read(mapping);
>