Re: [PATCH v2] mm: make folio_pte_batch available outside of mm/memory.c

From: Ryan Roberts
Date: Tue Feb 27 2024 - 05:48:39 EST


On 27/02/2024 10:42, Barry Song wrote:
> From: Barry Song <v-songbaohua@xxxxxxxx>
>
> madvise, mprotect and some others might need folio_pte_batch to check
> whether a range of PTEs is completely mapped to a large folio with
> contiguous physical addresses. Let's make it available in mm/internal.h.
>
> Suggested-by: David Hildenbrand <david@xxxxxxxxxx>
> Cc: Lance Yang <ioworker0@xxxxxxxxx>
> Cc: Ryan Roberts <ryan.roberts@xxxxxxx>
> Cc: Yin Fengwei <fengwei.yin@xxxxxxxxx>
> [david@xxxxxxxxxx: improve the doc for the exported func]
> Signed-off-by: David Hildenbrand <david@xxxxxxxxxx>
> Signed-off-by: Barry Song <v-songbaohua@xxxxxxxx>

Reviewed-by: Ryan Roberts <ryan.roberts@xxxxxxx>
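
As a data point for the users mentioned in the commit message (madvise,
mprotect and friends), below is a rough sketch of how a caller inside mm/
might consume the exported helper once this lands. The function name, the
surrounding walk and the way max_nr is derived are purely illustrative and
not part of this patch:

/*
 * Illustrative only: batch over a PTE-mapped large folio from within a
 * page table walk. ptep/addr come from the caller's pte walk, and end
 * must not cross the current page table, so max_nr stays within a single
 * page table as folio_pte_batch() requires.
 */
static int example_pte_batch_user(struct vm_area_struct *vma, pte_t *ptep,
		unsigned long addr, unsigned long end)
{
	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
	pte_t pte = ptep_get(ptep);
	struct folio *folio;
	int max_nr;

	if (!pte_present(pte))
		return 1;

	folio = vm_normal_folio(vma, addr, pte);
	if (!folio || !folio_test_large(folio))
		return 1;

	max_nr = (end - addr) >> PAGE_SHIFT;
	/* Number of consecutive PTEs mapping consecutive pages of folio. */
	return folio_pte_batch(folio, addr, ptep, pte, max_nr, flags, NULL);
}

The caller would then advance ptep/addr by the returned number of entries
and handle the whole batch as a single unit.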

> ---
> -v2:
> * inline folio_pte_batch, as suggested by Ryan and David;
> * improve the doc, thanks to David's work on this;
> * fix David's tags and add his Signed-off-by;
> -v1:
> https://lore.kernel.org/all/20240227024050.244567-1-21cnbao@xxxxxxxxx/
>
> mm/internal.h | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++
> mm/memory.c | 76 -------------------------------------------
> 2 files changed, 90 insertions(+), 76 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index 13b59d384845..fa9e2f7db506 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -83,6 +83,96 @@ static inline void *folio_raw_mapping(struct folio *folio)
> return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
> }
>
> +/* Flags for folio_pte_batch(). */
> +typedef int __bitwise fpb_t;
> +
> +/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */
> +#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0))
> +
> +/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */
> +#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1))
> +
> +static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
> +{
> + if (flags & FPB_IGNORE_DIRTY)
> + pte = pte_mkclean(pte);
> + if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
> + pte = pte_clear_soft_dirty(pte);
> + return pte_wrprotect(pte_mkold(pte));
> +}
> +
> +/**
> + * folio_pte_batch - detect a PTE batch for a large folio
> + * @folio: The large folio to detect a PTE batch for.
> + * @addr: The user virtual address the first page is mapped at.
> + * @start_ptep: Page table pointer for the first entry.
> + * @pte: Page table entry for the first page.
> + * @max_nr: The maximum number of table entries to consider.
> + * @flags: Flags to modify the PTE batch semantics.
> + * @any_writable: Optional pointer to indicate whether any entry except the
> + * first one is writable.
> + *
> + * Detect a PTE batch: consecutive (present) PTEs that map consecutive
> + * pages of the same large folio.
> + *
> + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
> + * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and
> + * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
> + *
> + * start_ptep must map any page of the folio. max_nr must be at least one and
> + * must be limited by the caller so scanning cannot exceed a single page table.
> + *
> + * Return: the number of table entries in the batch.
> + */
> +static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
> + pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
> + bool *any_writable)
> +{
> + unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
> + const pte_t *end_ptep = start_ptep + max_nr;
> + pte_t expected_pte, *ptep;
> + bool writable;
> + int nr;
> +
> + if (any_writable)
> + *any_writable = false;
> +
> + VM_WARN_ON_FOLIO(!pte_present(pte), folio);
> + VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
> + VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
> +
> + nr = pte_batch_hint(start_ptep, pte);
> + expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
> + ptep = start_ptep + nr;
> +
> + while (ptep < end_ptep) {
> + pte = ptep_get(ptep);
> + if (any_writable)
> + writable = !!pte_write(pte);
> + pte = __pte_batch_clear_ignored(pte, flags);
> +
> + if (!pte_same(pte, expected_pte))
> + break;
> +
> + /*
> + * Stop immediately once we reached the end of the folio. In
> + * corner cases the next PFN might fall into a different
> + * folio.
> + */
> + if (pte_pfn(pte) >= folio_end_pfn)
> + break;
> +
> + if (any_writable)
> + *any_writable |= writable;
> +
> + nr = pte_batch_hint(ptep, pte);
> + expected_pte = pte_advance_pfn(expected_pte, nr);
> + ptep += nr;
> + }
> +
> + return min(ptep - start_ptep, max_nr);
> +}
> +
> void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
> int nr_throttled);
> static inline void acct_reclaim_writeback(struct folio *folio)
> diff --git a/mm/memory.c b/mm/memory.c
> index 1c45b6a42a1b..a7bcc39de56b 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -953,82 +953,6 @@ static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
> set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
> }
>
> -/* Flags for folio_pte_batch(). */
> -typedef int __bitwise fpb_t;
> -
> -/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */
> -#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0))
> -
> -/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */
> -#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1))
> -
> -static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
> -{
> - if (flags & FPB_IGNORE_DIRTY)
> - pte = pte_mkclean(pte);
> - if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
> - pte = pte_clear_soft_dirty(pte);
> - return pte_wrprotect(pte_mkold(pte));
> -}
> -
> -/*
> - * Detect a PTE batch: consecutive (present) PTEs that map consecutive
> - * pages of the same folio.
> - *
> - * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
> - * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and
> - * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
> - *
> - * If "any_writable" is set, it will indicate if any other PTE besides the
> - * first (given) PTE is writable.
> - */
> -static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
> - pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
> - bool *any_writable)
> -{
> - unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
> - const pte_t *end_ptep = start_ptep + max_nr;
> - pte_t expected_pte, *ptep;
> - bool writable;
> - int nr;
> -
> - if (any_writable)
> - *any_writable = false;
> -
> - VM_WARN_ON_FOLIO(!pte_present(pte), folio);
> -
> - nr = pte_batch_hint(start_ptep, pte);
> - expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
> - ptep = start_ptep + nr;
> -
> - while (ptep < end_ptep) {
> - pte = ptep_get(ptep);
> - if (any_writable)
> - writable = !!pte_write(pte);
> - pte = __pte_batch_clear_ignored(pte, flags);
> -
> - if (!pte_same(pte, expected_pte))
> - break;
> -
> - /*
> - * Stop immediately once we reached the end of the folio. In
> - * corner cases the next PFN might fall into a different
> - * folio.
> - */
> - if (pte_pfn(pte) >= folio_end_pfn)
> - break;
> -
> - if (any_writable)
> - *any_writable |= writable;
> -
> - nr = pte_batch_hint(ptep, pte);
> - expected_pte = pte_advance_pfn(expected_pte, nr);
> - ptep += nr;
> - }
> -
> - return min(ptep - start_ptep, max_nr);
> -}
> -
> /*
> * Copy one present PTE, trying to batch-process subsequent PTEs that map
> * consecutive pages of the same folio by copying them as well.
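
One further note for future users of the FPB flags, mostly as documentation:
with both flags passed, two entries that differ only in the accessed,
writable, dirty or soft-dirty bits are still considered part of the same
batch, because __pte_batch_clear_ignored() canonicalises those bits before
the pte_same() comparison (the PFN stepping is handled separately via
pte_advance_pfn() in the loop). A minimal, purely illustrative helper that
expresses this property:

/*
 * Illustrative only: true if two entries for the same PFN would be
 * treated as equivalent by folio_pte_batch() with both FPB flags set.
 */
static bool example_batch_equivalent(pte_t a, pte_t b)
{
	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;

	return pte_same(__pte_batch_clear_ignored(a, flags),
			__pte_batch_clear_ignored(b, flags));
}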