Re: [PATCH v2 04/15] mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()

From: Yang Shi
Date: Wed Mar 16 2022 - 16:02:58 EST


On Tue, Mar 15, 2022 at 3:50 AM David Hildenbrand <david@xxxxxxxxxx> wrote:
>
> ... and move the special check for pinned pages into
> page_try_dup_anon_rmap() to prepare for tracking exclusive anonymous
> pages via a new pageflag, clearing it only after making sure that there
> are no GUP pins on the anonymous page.
>
> We really only care about pins on anonymous pages, because they are
> prone to getting replaced in the COW handler once mapped R/O. For !anon
> pages in cow-mappings (!VM_SHARED && VM_MAYWRITE) we shouldn't really
> care about that; at least I couldn't come up with an example.
>
> Let's drop the is_cow_mapping() check from page_needs_cow_for_dma(), as we
> know we're dealing with anonymous pages. Also, drop the handling of
> pinned pages from copy_huge_pud() and add a comment if ever supporting
> anonymous pages on the PUD level.
>
> This is a preparation for tracking exclusivity of anonymous pages in
> the rmap code, and disallowing marking a page shared (-> failing to
> duplicate) if there are GUP pins on a page.
>
> RFC notes: if I'm missing something important for !anon pages, we could
> similarly handle it via page_try_dup_file_rmap().
>
> Signed-off-by: David Hildenbrand <david@xxxxxxxxxx>
> ---
> include/linux/mm.h | 5 +----
> include/linux/rmap.h | 48 +++++++++++++++++++++++++++++++++++++++++++-
> mm/huge_memory.c | 27 ++++++++-----------------
> mm/hugetlb.c | 16 ++++++++-------
> mm/memory.c | 17 +++++++++++-----
> mm/migrate.c | 2 +-
> 6 files changed, 78 insertions(+), 37 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 391b950e919d..63ee06001189 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1322,16 +1322,13 @@ static inline bool is_cow_mapping(vm_flags_t flags)
>
> /*
> * This should most likely only be called during fork() to see whether we
> - * should break the cow immediately for a page on the src mm.
> + * should break the cow immediately for an anon page on the src mm.
> *
> * The caller has to hold the PT lock and the vma->vm_mm->write_protect_seq.
> */
> static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
> struct page *page)
> {
> - if (!is_cow_mapping(vma->vm_flags))
> - return false;
> -
> VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));
>
> if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index e704b1a4c06c..92c3585b8c6a 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -180,11 +180,57 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
> void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
> unsigned long);
>
> -static inline void page_dup_rmap(struct page *page, bool compound)
> +static inline void __page_dup_rmap(struct page *page, bool compound)
> {
> atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
> }
>
> +static inline void page_dup_file_rmap(struct page *page, bool compound)
> +{
> + __page_dup_rmap(page, compound);
> +}
> +
> +/**
> + * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped
> + * anonymous page
> + * @page: the page to duplicate the mapping for
> + * @compound: the page is mapped as compound or as a small page
> + * @vma: the source vma
> + *
> + * The caller needs to hold the PT lock and the vma->vm_mm->write_protect_seq.
> + *
> + * Duplicating the mapping can only fail if the page may be pinned; device
> + * private pages cannot get pinned and consequently this function cannot fail.
> + *
> + * If duplicating the mapping succeeds, the page has to be mapped R/O into
> + * the parent and the child. It must *not* get mapped writable after this call.
> + *
> + * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
> + */
> +static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
> + struct vm_area_struct *vma)
> +{
> + VM_BUG_ON_PAGE(!PageAnon(page), page);
> +
> + /*
> + * If this page may have been pinned by the parent process,
> + * don't allow duplicating the mapping but instead require, e.g.,
> + * copying the page immediately for the child so that we'll always
> + * guarantee the pinned page won't be randomly replaced in the
> + * future on write faults.
> + */
> + if (likely(!is_device_private_page(page) &&
> + unlikely(page_needs_cow_for_dma(vma, page))))
> + return -EBUSY;
> +
> + /*
> + * It's okay to share the anon page between both processes, mapping
> + * the page R/O into both processes.
> + */
> + __page_dup_rmap(page, compound);
> + return 0;
> +}
> +
> /*
> * Called from mm/vmscan.c to handle paging out
> */
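
Just to double-check my reading of the intended calling convention: at
fork time, a caller would follow roughly the pattern below (a sketch
distilled from the copy_present_pte() hunk further down, not a
complete or tested snippet):

	if (!PageAnon(page)) {
		/* We only care about pins on anonymous pages. */
		get_page(page);
		page_dup_file_rmap(page, false);
	} else {
		get_page(page);
		if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
			/* Page may be pinned: copy it for the child. */
			put_page(page);
			return copy_present_page(dst_vma, src_vma, dst_pte,
						 src_pte, addr, rss, prealloc,
						 page);
		}
		/* Shared now: must stay mapped R/O in parent and child. */
	}
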
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index cda88d8ac1bd..c126d728b8de 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1097,23 +1097,16 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> src_page = pmd_page(pmd);
> VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
>
> - /*
> - * If this page is a potentially pinned page, split and retry the fault
> - * with smaller page size. Normally this should not happen because the
> - * userspace should use MADV_DONTFORK upon pinned regions. This is a
> - * best effort that the pinned pages won't be replaced by another
> - * random page during the coming copy-on-write.
> - */
> - if (unlikely(page_needs_cow_for_dma(src_vma, src_page))) {
> + get_page(src_page);
> + if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) {
> + /* Page maybe pinned: split and retry the fault on PTEs. */
> + put_page(src_page);

Do we have to do the get_page()/put_page() sequence? It seems we don't
have to get the page before calling page_try_dup_anon_rmap(), right?
We just need to bump the page refcount once duplicating the rmap
succeeds. So we could do something like:

	if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) {
		/* Page maybe pinned: split and retry the fault on PTEs. */
		pte_free(dst_mm, pgtable);
		spin_unlock(src_ptl);
		spin_unlock(dst_ptl);
		__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
		return -EAGAIN;
	}

	get_page(src_page);
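
If that works, the same get_page()/put_page() dance in the
copy_present_pte() hunk further down could presumably be simplified
the same way.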

> pte_free(dst_mm, pgtable);
> spin_unlock(src_ptl);
> spin_unlock(dst_ptl);
> __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
> return -EAGAIN;
> }
> -
> - get_page(src_page);
> - page_dup_rmap(src_page, true);
> add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> out_zero_page:
> mm_inc_nr_ptes(dst_mm);
> @@ -1217,14 +1210,10 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> /* No huge zero pud yet */
> }
>
> - /* Please refer to comments in copy_huge_pmd() */
> - if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) {
> - spin_unlock(src_ptl);
> - spin_unlock(dst_ptl);
> - __split_huge_pud(vma, src_pud, addr);
> - return -EAGAIN;
> - }
> -
> + /*
> + * TODO: once we support anonymous pages, use page_try_dup_anon_rmap()
> + * and split if duplicating fails.
> + */
> pudp_set_wrprotect(src_mm, addr, src_pud);
> pud = pud_mkold(pud_wrprotect(pud));
> set_pud_at(dst_mm, addr, dst_pud, pud);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d3ce89697855..9fb990d95dab 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4781,15 +4781,18 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> get_page(ptepage);
>
> /*
> - * This is a rare case where we see pinned hugetlb
> - * pages while they're prone to COW. We need to do the
> - * COW earlier during fork.
> + * Failing to duplicate the anon rmap is a rare case
> + * where we see pinned hugetlb pages while they're
> + * prone to COW. We need to do the COW earlier during
> + * fork.
> *
> * When pre-allocating the page or copying data, we
> * need to be without the pgtable locks since we could
> * sleep during the process.
> */
> - if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
> + if (!PageAnon(ptepage)) {
> + page_dup_file_rmap(ptepage, true);
> + } else if (page_try_dup_anon_rmap(ptepage, true, vma)) {
> pte_t src_pte_old = entry;
> struct page *new;
>
> @@ -4836,7 +4839,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> entry = huge_pte_wrprotect(entry);
> }
>
> - page_dup_rmap(ptepage, true);
> set_huge_pte_at(dst, addr, dst_pte, entry);
> hugetlb_count_add(npages, dst);
> }
> @@ -5514,7 +5516,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
> ClearHPageRestoreReserve(page);
> hugepage_add_new_anon_rmap(page, vma, haddr);
> } else
> - page_dup_rmap(page, true);
> + page_dup_file_rmap(page, true);
> new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
> && (vma->vm_flags & VM_SHARED)));
> set_huge_pte_at(mm, haddr, ptep, new_pte);
> @@ -5874,7 +5876,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> goto out_release_unlock;
>
> if (vm_shared) {
> - page_dup_rmap(page, true);
> + page_dup_file_rmap(page, true);
> } else {
> ClearHPageRestoreReserve(page);
> hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
> diff --git a/mm/memory.c b/mm/memory.c
> index accb72a3343d..b9602d41d907 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -828,7 +828,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> */
> get_page(page);
> rss[mm_counter(page)]++;
> - page_dup_rmap(page, false);
> + /* Cannot fail as these pages cannot get pinned. */
> + BUG_ON(page_try_dup_anon_rmap(page, false, src_vma));
>
> /*
> * We do not preserve soft-dirty information, because so
> @@ -924,18 +925,24 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> struct page *page;
>
> page = vm_normal_page(src_vma, addr, pte);
> - if (page && unlikely(page_needs_cow_for_dma(src_vma, page))) {
> + if (page && PageAnon(page)) {
> /*
> * If this page may have been pinned by the parent process,
> * copy the page immediately for the child so that we'll always
> * guarantee the pinned page won't be randomly replaced in the
> * future.
> */
> - return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
> - addr, rss, prealloc, page);
> + get_page(page);
> + if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
> + /* Page maybe pinned, we have to copy. */
> + put_page(page);
> + return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
> + addr, rss, prealloc, page);
> + }
> + rss[mm_counter(page)]++;
> } else if (page) {
> get_page(page);
> - page_dup_rmap(page, false);
> + page_dup_file_rmap(page, false);
> rss[mm_counter(page)]++;
> }
>
> diff --git a/mm/migrate.c b/mm/migrate.c
> index c7da064b4781..524c2648ab36 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -240,7 +240,7 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
> if (PageAnon(new))
> hugepage_add_anon_rmap(new, vma, pvmw.address);
> else
> - page_dup_rmap(new, true);
> + page_dup_file_rmap(new, true);
> set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
> } else
> #endif
> --
> 2.35.1
>