Re: [PATCH v5 2/9] mm: Add an apply_to_pfn_range interface

From: Christoph Hellwig
Date: Wed Jun 12 2019 - 08:20:44 EST


On Wed, Jun 12, 2019 at 08:42:36AM +0200, Thomas Hellström (VMware) wrote:
> From: Thomas Hellstrom <thellstrom@xxxxxxxxxx>
>
> This is basically apply_to_page_range with added functionality:
> Allocating missing parts of the page table becomes optional, which
> means that the function can be guaranteed not to error if allocation
> is disabled. Also passing of the closure struct and callback function
> becomes different and more in line with how things are done elsewhere.
>
> Finally we keep apply_to_page_range as a wrapper around apply_to_pfn_range.
>
> The reason for not using the page-walk code is that we want to perform
> the page-walk on vmas pointing to an address space without requiring the
> mmap_sem to be held rather than on vmas belonging to a process with the
> mmap_sem held.
>
> Notable changes since RFC:
> Don't export apply_to_pfn_range.
>
> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> Cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
> Cc: Will Deacon <will.deacon@xxxxxxx>
> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Cc: Rik van Riel <riel@xxxxxxxxxxx>
> Cc: Minchan Kim <minchan@xxxxxxxxxx>
> Cc: Michal Hocko <mhocko@xxxxxxxx>
> Cc: Huang Ying <ying.huang@xxxxxxxxx>
> Cc: Souptick Joarder <jrdr.linux@xxxxxxxxx>
> Cc: "Jérôme Glisse" <jglisse@xxxxxxxxxx>
> Cc: linux-mm@xxxxxxxxx
> Cc: linux-kernel@xxxxxxxxxxxxxxx
>
> Signed-off-by: Thomas Hellstrom <thellstrom@xxxxxxxxxx>
> Reviewed-by: Ralph Campbell <rcampbell@xxxxxxxxxx> #v1
> ---
> include/linux/mm.h | 10 ++++
> mm/memory.c | 135 ++++++++++++++++++++++++++++++++++-----------
> 2 files changed, 113 insertions(+), 32 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 0e8834ac32b7..3d06ce2a64af 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2675,6 +2675,16 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
> extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
> unsigned long size, pte_fn_t fn, void *data);
>
> +struct pfn_range_apply;
> +typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
> + struct pfn_range_apply *closure);
> +struct pfn_range_apply {
> + struct mm_struct *mm;
> + pter_fn_t ptefn;
> + unsigned int alloc;
> +};
> +extern int apply_to_pfn_range(struct pfn_range_apply *closure,
> + unsigned long address, unsigned long size);
>
> #ifdef CONFIG_PAGE_POISONING
> extern bool page_poisoning_enabled(void);
> diff --git a/mm/memory.c b/mm/memory.c
> index 168f546af1ad..462aa47f8878 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2032,18 +2032,17 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long
> }
> EXPORT_SYMBOL(vm_iomap_memory);
>
> -static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
> - unsigned long addr, unsigned long end,
> - pte_fn_t fn, void *data)
> +static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
> + unsigned long addr, unsigned long end)
> {
> pte_t *pte;
> int err;
> pgtable_t token;
> spinlock_t *uninitialized_var(ptl);
>
> - pte = (mm == &init_mm) ?
> + pte = (closure->mm == &init_mm) ?
> pte_alloc_kernel(pmd, addr) :
> - pte_alloc_map_lock(mm, pmd, addr, &ptl);
> + pte_alloc_map_lock(closure->mm, pmd, addr, &ptl);
> if (!pte)
> return -ENOMEM;
>
> @@ -2054,86 +2053,109 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
> token = pmd_pgtable(*pmd);
>
> do {
> - err = fn(pte++, token, addr, data);
> + err = closure->ptefn(pte++, token, addr, closure);
> if (err)
> break;
> } while (addr += PAGE_SIZE, addr != end);
>
> arch_leave_lazy_mmu_mode();
>
> - if (mm != &init_mm)
> + if (closure->mm != &init_mm)
> pte_unmap_unlock(pte-1, ptl);
> return err;
> }
>
> -static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
> - unsigned long addr, unsigned long end,
> - pte_fn_t fn, void *data)
> +static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud,
> + unsigned long addr, unsigned long end)
> {
> pmd_t *pmd;
> unsigned long next;
> - int err;
> + int err = 0;
>
> BUG_ON(pud_huge(*pud));
>
> - pmd = pmd_alloc(mm, pud, addr);
> + pmd = pmd_alloc(closure->mm, pud, addr);
> if (!pmd)
> return -ENOMEM;
> +
> do {
> next = pmd_addr_end(addr, end);
> - err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
> + if (!closure->alloc && pmd_none_or_clear_bad(pmd))
> + continue;
> + err = apply_to_pte_range(closure, pmd, addr, next);
> if (err)
> break;
> } while (pmd++, addr = next, addr != end);
> return err;
> }
>
> -static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
> - unsigned long addr, unsigned long end,
> - pte_fn_t fn, void *data)
> +static int apply_to_pud_range(struct pfn_range_apply *closure, p4d_t *p4d,
> + unsigned long addr, unsigned long end)
> {
> pud_t *pud;
> unsigned long next;
> - int err;
> + int err = 0;
>
> - pud = pud_alloc(mm, p4d, addr);
> + pud = pud_alloc(closure->mm, p4d, addr);
> if (!pud)
> return -ENOMEM;
> +
> do {
> next = pud_addr_end(addr, end);
> - err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
> + if (!closure->alloc && pud_none_or_clear_bad(pud))
> + continue;
> + err = apply_to_pmd_range(closure, pud, addr, next);
> if (err)
> break;
> } while (pud++, addr = next, addr != end);
> return err;
> }
>
> -static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
> - unsigned long addr, unsigned long end,
> - pte_fn_t fn, void *data)
> +static int apply_to_p4d_range(struct pfn_range_apply *closure, pgd_t *pgd,
> + unsigned long addr, unsigned long end)
> {
> p4d_t *p4d;
> unsigned long next;
> - int err;
> + int err = 0;
>
> - p4d = p4d_alloc(mm, pgd, addr);
> + p4d = p4d_alloc(closure->mm, pgd, addr);
> if (!p4d)
> return -ENOMEM;
> +
> do {
> next = p4d_addr_end(addr, end);
> - err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
> + if (!closure->alloc && p4d_none_or_clear_bad(p4d))
> + continue;
> + err = apply_to_pud_range(closure, p4d, addr, next);
> if (err)
> break;
> } while (p4d++, addr = next, addr != end);
> return err;
> }
>
> -/*
> - * Scan a region of virtual memory, filling in page tables as necessary
> - * and calling a provided function on each leaf page table.
> +/**
> + * apply_to_pfn_range - Scan a region of virtual memory, calling a provided
> + * function on each leaf page table entry
> + * @closure: Details about how to scan and what function to apply
> + * @addr: Start virtual address
> + * @size: Size of the region
> + *
> + * If @closure->alloc is set to 1, the function will fill in the page table
> + * as necessary. Otherwise it will skip non-present parts.
> + * Note: The caller must ensure that the range does not contain huge pages.
> + * The caller must also ensure that the proper mmu_notifier functions are
> + * called before and after the call to apply_to_pfn_range.
> + *
> + * WARNING: Do not use this function unless you know exactly what you are
> + * doing. It is lacking support for huge pages and transparent huge pages.
> + *
> + * Return: Zero on success. If the provided function returns a non-zero status,
> + * the page table walk will terminate and that status will be returned.
> + * If @closure->alloc is set to 1, then this function may also return memory
> + * allocation errors arising from allocating page table memory.
> */
> -int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
> - unsigned long size, pte_fn_t fn, void *data)
> +int apply_to_pfn_range(struct pfn_range_apply *closure,
> + unsigned long addr, unsigned long size)
> {
> pgd_t *pgd;
> unsigned long next;
> @@ -2143,16 +2165,65 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
> if (WARN_ON(addr >= end))
> return -EINVAL;
>
> - pgd = pgd_offset(mm, addr);
> + pgd = pgd_offset(closure->mm, addr);
> do {
> next = pgd_addr_end(addr, end);
> - err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
> + if (!closure->alloc && pgd_none_or_clear_bad(pgd))
> + continue;
> + err = apply_to_p4d_range(closure, pgd, addr, next);
> if (err)
> break;
> } while (pgd++, addr = next, addr != end);
>
> return err;
> }
> +
> +/**
> + * struct page_range_apply - Closure structure for apply_to_page_range()
> + * @pter: The base closure structure we derive from
> + * @fn: The leaf pte function to call
> + * @data: The leaf pte function closure
> + */
> +struct page_range_apply {
> + struct pfn_range_apply pter;
> + pte_fn_t fn;
> + void *data;
> +};
> +
> +/*
> + * Callback wrapper to enable use of apply_to_pfn_range for
> + * the apply_to_page_range interface
> + */
> +static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token,
> + unsigned long addr,
> + struct pfn_range_apply *pter)
> +{
> + struct page_range_apply *pra =
> + container_of(pter, typeof(*pra), pter);
> +
> + return pra->fn(pte, token, addr, pra->data);
> +}
> +
> +/*
> + * Scan a region of virtual memory, filling in page tables as necessary
> + * and calling a provided function on each leaf page table.
> + *
> + * WARNING: Do not use this function unless you know exactly what you are
> + * doing. It is lacking support for huge pages and transparent huge pages.
> + */
> +int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
> + unsigned long size, pte_fn_t fn, void *data)
> +{
> + struct page_range_apply pra = {
> + .pter = {.mm = mm,
> + .alloc = 1,
> + .ptefn = apply_to_page_range_wrapper },
> + .fn = fn,
> + .data = data
> + };
> +
> + return apply_to_pfn_range(&pra.pter, addr, size);
> +}
>
> EXPORT_SYMBOL_GPL(apply_to_page_range);

Actually - did you look into converting our two handfuls of
apply_to_page_range callers to your new scheme? It seems like that
might actually not be too bad, and it would avoid various layers of wrappers.