Re: [PATCH v3 01/15] mm: Batch-copy PTE ranges during fork()

From: Alistair Popple
Date: Thu Dec 07 2023 - 19:38:26 EST



Ryan Roberts <ryan.roberts@xxxxxxx> writes:

<snip>

> /*
> * On some architectures hardware does not set page access bit when accessing
> * memory page, it is responsibility of software setting this bit. It brings
> diff --git a/mm/memory.c b/mm/memory.c
> index 1f18ed4a5497..8a87a488950c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -924,68 +924,162 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
> return 0;
> }
>
> +static int folio_nr_pages_cont_mapped(struct folio *folio,
> + struct page *page, pte_t *pte,
> + unsigned long addr, unsigned long end,
> + pte_t ptent, bool enforce_uffd_wp,
> + int *dirty_nr, int *writable_nr)
> +{
> + int floops;
> + int i;
> + unsigned long pfn;
> + bool prot_none;
> + bool uffd_wp;
> +
> + if (!folio_test_large(folio))
> + return 1;
> +
> + /*
> + * Loop either to `end` or to the end of the folio if it's contiguously
> + * mapped, whichever is smaller.
> + */
> + floops = (end - addr) >> PAGE_SHIFT;
> + floops = min_t(int, floops,
> + folio_pfn(folio_next(folio)) - page_to_pfn(page));

Much better, thanks for addressing my comments here.
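
For anyone else following along, here is a minimal userspace sketch of what
that cap computes (PAGE_SHIFT, max_batch() and the pfn values are invented
for illustration; this is not kernel code, just the min() logic above
restated on plain integers):

#include <stdio.h>

#define PAGE_SHIFT 12

/*
 * The candidate batch is the smaller of the number of ptes left in
 * [addr, end) and the number of pages left in the folio, i.e. from the
 * first mapped page's pfn up to the pfn just past the end of the folio.
 */
static int max_batch(unsigned long addr, unsigned long end,
		     unsigned long page_pfn, unsigned long folio_end_pfn)
{
	unsigned long range_ptes = (end - addr) >> PAGE_SHIFT;
	unsigned long folio_pages = folio_end_pfn - page_pfn;

	return (int)(range_ptes < folio_pages ? range_ptes : folio_pages);
}

int main(void)
{
	/*
	 * Folio ending at pfn 0x110, first mapped page at pfn 0x103
	 * (13 pages left in the folio), 8 ptes left until end:
	 * the range is the limiting factor, so the cap is 8.
	 */
	printf("%d\n", max_batch(0x1000, 0x9000, 0x103, 0x110));

	/* Only 3 pages left in the folio: the folio is the limit, cap 3. */
	printf("%d\n", max_batch(0x1000, 0x9000, 0x10d, 0x110));

	return 0;
}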

> +
> + pfn = page_to_pfn(page);
> + prot_none = pte_protnone(ptent);
> + uffd_wp = pte_uffd_wp(ptent);
> +
> + *dirty_nr = !!pte_dirty(ptent);
> + *writable_nr = !!pte_write(ptent);
> +
> + pfn++;
> + pte++;
> +
> + for (i = 1; i < floops; i++) {
> + ptent = ptep_get(pte);
> +
> + if (!pte_present(ptent) || pte_pfn(ptent) != pfn ||
> + prot_none != pte_protnone(ptent) ||
> + (enforce_uffd_wp && uffd_wp != pte_uffd_wp(ptent)))
> + break;
> +
> + if (pte_dirty(ptent))
> + (*dirty_nr)++;
> + if (pte_write(ptent))
> + (*writable_nr)++;
> +
> + pfn++;
> + pte++;
> + }
> +
> + return i;
> +}
> +
> /*
> - * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
> - * is required to copy this pte.
> + * Copy a set of contiguous ptes. Returns the number of ptes copied if
> + * successful (always >= 1), or -EAGAIN if one preallocated page is required
> + * to copy the first pte.
> */
> static inline int
> -copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> - pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
> - struct folio **prealloc)
> +copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> + pte_t *dst_pte, pte_t *src_pte,
> + unsigned long addr, unsigned long end,
> + int *rss, struct folio **prealloc)
> {
> struct mm_struct *src_mm = src_vma->vm_mm;
> unsigned long vm_flags = src_vma->vm_flags;
> pte_t pte = ptep_get(src_pte);
> struct page *page;
> struct folio *folio;
> + int nr = 1;
> + bool anon = false;
> + bool enforce_uffd_wp = userfaultfd_wp(dst_vma);
> + int nr_dirty = !!pte_dirty(pte);
> + int nr_writable = !!pte_write(pte);
> + int i, ret;
>
> page = vm_normal_page(src_vma, addr, pte);
> - if (page)
> + if (page) {
> folio = page_folio(page);
> - if (page && folio_test_anon(folio)) {
> - /*
> - * If this page may have been pinned by the parent process,
> - * copy the page immediately for the child so that we'll always
> - * guarantee the pinned page won't be randomly replaced in the
> - * future.
> - */
> - folio_get(folio);
> - if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
> - /* Page may be pinned, we have to copy. */
> - folio_put(folio);
> - return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
> - addr, rss, prealloc, page);
> + anon = folio_test_anon(folio);
> + nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr, end,
> + pte, enforce_uffd_wp, &nr_dirty,
> + &nr_writable);
> + folio_ref_add(folio, nr);
> +
> + for (i = 0; i < nr; i++, page++) {
> + if (anon) {
> + /*
> + * If this page may have been pinned by the
> + * parent process, copy the page immediately for
> + * the child so that we'll always guarantee the
> + * pinned page won't be randomly replaced in the
> + * future.
> + */
> + if (unlikely(page_try_dup_anon_rmap(
> + page, false, src_vma))) {
> + if (i != 0)
> + break;
> + /* Page may be pinned, we have to copy. */
> + folio_ref_sub(folio, nr);
> + ret = copy_present_page(
> + dst_vma, src_vma, dst_pte,
> + src_pte, addr, rss, prealloc,
> + page);
> + return ret == 0 ? 1 : ret;
> + }
> + rss[MM_ANONPAGES]++;
> + VM_BUG_ON(PageAnonExclusive(page));
> + } else {
> + page_dup_file_rmap(page, false);
> + rss[mm_counter_file(page)]++;
> + }
> }
> - rss[MM_ANONPAGES]++;
> - } else if (page) {
> - folio_get(folio);
> - page_dup_file_rmap(page, false);
> - rss[mm_counter_file(page)]++;
> - }
>
> - /*
> - * If it's a COW mapping, write protect it both
> - * in the parent and the child
> - */
> - if (is_cow_mapping(vm_flags) && pte_write(pte)) {
> - ptep_set_wrprotect(src_mm, addr, src_pte);
> - pte = pte_wrprotect(pte);
> + if (i < nr) {
> + folio_ref_sub(folio, nr - i);
> + nr = i;
> + }
> }
> - VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));
>
> /*
> - * If it's a shared mapping, mark it clean in
> - * the child
> + * If it's a shared mapping, mark it clean and write protected in the
> + * child, and rely on a write fault to fix up the permissions. This
> + * allows determining batch size without having to consider RO/RW
> + * permissions. As an optimization, skip wrprotect if all ptes in the
> + * batch have the same permissions.
> + *
> + * If it's a private (CoW) mapping, mark it dirty in the child if _any_
> + * of the parent mappings in the block were marked dirty. The contiguous
> + * block of mappings is all backed by the same folio, so if any are
> + * dirty then the whole folio is dirty. This allows determining batch
> + * size without having to consider the dirty bit. Further, write protect
> + * it both in the parent and the child so that a future write will cause
> + * a CoW. As an optimization, skip the wrprotect if all the ptes in
> + * the batch are already readonly.
> */
> - if (vm_flags & VM_SHARED)
> + if (vm_flags & VM_SHARED) {
> pte = pte_mkclean(pte);
> - pte = pte_mkold(pte);
> + if (nr_writable > 0 && nr_writable < nr)
> + pte = pte_wrprotect(pte);
> + } else {
> + if (nr_dirty)
> + pte = pte_mkdirty(pte);
> + if (nr_writable) {
> + ptep_set_wrprotects(src_mm, addr, src_pte, nr);
> + pte = pte_wrprotect(pte);
> + }
> + }
>
> - if (!userfaultfd_wp(dst_vma))
> + pte = pte_mkold(pte);
> + pte = pte_clear_soft_dirty(pte);
> + if (!enforce_uffd_wp)
> pte = pte_clear_uffd_wp(pte);
>
> - set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
> - return 0;
> + set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
> + return nr;

I don't have any further comments, and you have addressed my previous
ones, so feel free to add:

Reviewed-by: Alistair Popple <apopple@xxxxxxxxxx>

However, whilst I think the above CoW sequence looks correct, it would be
nice if someone else could take a look as well.
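
In case it helps whoever takes that second look, here is a tiny userspace
model of just the decision logic in that sequence (struct model_pte,
child_pte() and the counts are invented for illustration; it is not the
kernel code, only the shared vs private branches restated):

#include <stdbool.h>
#include <stdio.h>

/* Only the two pte bits the sequence above actually changes. */
struct model_pte {
	bool write;
	bool dirty;
};

/*
 * nr ptes were batched; nr_writable/nr_dirty count how many of them were
 * writable/dirty. *wrprotect_parent models ptep_set_wrprotects() being
 * called on the source ptes.
 */
static struct model_pte child_pte(struct model_pte pte, bool shared,
				  int nr, int nr_writable, int nr_dirty,
				  bool *wrprotect_parent)
{
	*wrprotect_parent = false;

	if (shared) {
		/* Shared: child starts clean; drop write only when the
		 * batch has mixed writability. */
		pte.dirty = false;
		if (nr_writable > 0 && nr_writable < nr)
			pte.write = false;
	} else {
		/* Private (CoW): dirty if any pte in the batch was dirty;
		 * wrprotect both sides if any was writable, so a later
		 * write takes a CoW fault. */
		if (nr_dirty)
			pte.dirty = true;
		if (nr_writable) {
			*wrprotect_parent = true;
			pte.write = false;
		}
	}

	return pte;
}

int main(void)
{
	bool wp_parent;
	struct model_pte first = { .write = true, .dirty = false };

	/* Private batch of 4 ptes, 2 writable, 1 dirty: the child gets a
	 * read-only dirty pte and the parent ptes get write-protected. */
	struct model_pte c = child_pte(first, false, 4, 2, 1, &wp_parent);

	printf("write=%d dirty=%d wrprotect_parent=%d\n",
	       c.write, c.dirty, wp_parent);

	return 0;
}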

> }
>
> static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm,
> @@ -1021,6 +1115,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> int rss[NR_MM_COUNTERS];
> swp_entry_t entry = (swp_entry_t){0};
> struct folio *prealloc = NULL;
> + int nr_ptes;
>
> again:
> progress = 0;
> @@ -1051,6 +1146,8 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> arch_enter_lazy_mmu_mode();
>
> do {
> + nr_ptes = 1;
> +
> /*
> * We are holding two locks at this point - either of them
> * could generate latencies in another task on another CPU.
> @@ -1086,16 +1183,21 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> * the now present pte.
> */
> WARN_ON_ONCE(ret != -ENOENT);
> + ret = 0;
> }
> - /* copy_present_pte() will clear `*prealloc' if consumed */
> - ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
> - addr, rss, &prealloc);
> + /* copy_present_ptes() will clear `*prealloc' if consumed */
> + nr_ptes = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
> + addr, end, rss, &prealloc);
> +
> /*
> * If we need a pre-allocated page for this pte, drop the
> * locks, allocate, and try again.
> */
> - if (unlikely(ret == -EAGAIN))
> + if (unlikely(nr_ptes == -EAGAIN)) {
> + ret = -EAGAIN;
> break;
> + }
> +
> if (unlikely(prealloc)) {
> /*
> * pre-alloc page cannot be reused by next time so as
> @@ -1106,8 +1208,9 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> folio_put(prealloc);
> prealloc = NULL;
> }
> - progress += 8;
> - } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
> + progress += 8 * nr_ptes;
> + } while (dst_pte += nr_ptes, src_pte += nr_ptes,
> + addr += PAGE_SIZE * nr_ptes, addr != end);
>
> arch_leave_lazy_mmu_mode();
> pte_unmap_unlock(orig_src_pte, src_ptl);