Re: [PATCH v3 10/12] mempolicy: alloc_pages_mpol() for NUMA policy without vma

From: Zi Yan
Date: Mon Oct 23 2023 - 14:34:48 EST


On 19 Oct 2023, at 16:39, Hugh Dickins wrote:

> Shrink shmem's stack usage by eliminating the pseudo-vma from its folio
> allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the
> principal actor for passing mempolicy choice down to __alloc_pages(),
> rather than vma_alloc_folio(gfp, order, vma, addr, hugepage).
>
> vma_alloc_folio() and alloc_pages() remain, but as wrappers around
> alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the
> additional args to policy_nodemask(), which subsumes policy_node().
> Cleanup throughout, cutting out some unhelpful "helpers".
>
> It would all be much simpler without MPOL_INTERLEAVE, but that adds a
> dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd
> ("tmpfs: distribute interleave better across nodes"), which added ino bias
> to the interleave, hidden from mm/mempolicy.c until this commit.
>
> Hence "ilx" throughout, the "interleave index". Originally I thought it
> could be done just with nid, but that's wrong: the nodemask may come from
> the shared policy layer below a shmem vma, or it may come from the task
> layer above a shmem vma; and without the final nodemask then nodeid cannot
> be decided. And how ilx is applied depends also on page order.
>
> The interleave index is almost always irrelevant unless MPOL_INTERLEAVE:
> with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX
> passed down from vma-less alloc_pages() is also used as hint not to use
> THP-style hugepage allocation - to avoid the overhead of a hugepage arg
> (though I don't understand why we never just added a GFP bit for THP - if
> it actually needs a different allocation strategy from other pages of the
> same order). vma_alloc_folio() still carries its hugepage arg here, but
> it is not used, and should be removed when agreed.
>
> get_vma_policy() no longer allows a NULL vma: over time I believe we've
> eradicated all the places which used to need it e.g. swapoff and madvise
> used to pass NULL vma to read_swap_cache_async(), but now know the vma.
>
> Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@xxxxxxxxxx
> Signed-off-by: Hugh Dickins <hughd@xxxxxxxxxx>
> Cc: Andi Kleen <ak@xxxxxxxxxxxxxxx>
> Cc: Christoph Lameter <cl@xxxxxxxxx>
> Cc: David Hildenbrand <david@xxxxxxxxxx>
> Cc: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
> Cc: Huang Ying <ying.huang@xxxxxxxxx>
> Cc: Kefeng Wang <wangkefeng.wang@xxxxxxxxxx>
> Cc: Matthew Wilcox (Oracle) <willy@xxxxxxxxxxxxx>
> Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
> Cc: Michal Hocko <mhocko@xxxxxxxx>
> Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
> Cc: Nhat Pham <nphamcs@xxxxxxxxx>
> Cc: Sidhartha Kumar <sidhartha.kumar@xxxxxxxxxx>
> Cc: Suren Baghdasaryan <surenb@xxxxxxxxxx>
> Cc: Tejun Heo <tj@xxxxxxxxxx>
> Cc: Vishal Moola (Oracle) <vishal.moola@xxxxxxxxx>
> Cc: Yang Shi <shy828301@xxxxxxxxx>
> Cc: Yosry Ahmed <yosryahmed@xxxxxxxxxx>
> ---
> Rebased to mm.git's current mm-stable, to resolve with removal of
> vma_policy() from include/linux/mempolicy.h, and temporary omission
> of Nhat's ZSWAP mods from mm/swap_state.c: no other changes.
>
> git cherry-pick 800caf44af25^..237d4ce921f0 # applies mm-unstable's 01-09
> then apply this "mempolicy: alloc_pages_mpol() for NUMA policy without vma"
> git cherry-pick e4fb3362b782^..ec6412928b8e # applies mm-unstable's 11-12
>
> fs/proc/task_mmu.c | 5 +-
> include/linux/gfp.h | 10 +-
> include/linux/mempolicy.h | 13 +-
> include/linux/mm.h | 2 +-
> ipc/shm.c | 21 +--
> mm/mempolicy.c | 383 +++++++++++++++++++---------------------------
> mm/shmem.c | 92 ++++++-----
> mm/swap.h | 9 +-
> mm/swap_state.c | 86 +++++++----
> 9 files changed, 299 insertions(+), 322 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 1d99450..66ae1c2 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -2673,8 +2673,9 @@ static int show_numa_map(struct seq_file *m, void *v)
> struct numa_maps *md = &numa_priv->md;
> struct file *file = vma->vm_file;
> struct mm_struct *mm = vma->vm_mm;
> - struct mempolicy *pol;
> char buffer[64];
> + struct mempolicy *pol;
> + pgoff_t ilx;
> int nid;
>
> if (!mm)
> @@ -2683,7 +2684,7 @@ static int show_numa_map(struct seq_file *m, void *v)
> /* Ensure we start with an empty set of numa_maps statistics. */
> memset(md, 0, sizeof(*md));
>
> - pol = __get_vma_policy(vma, vma->vm_start);
> + pol = __get_vma_policy(vma, vma->vm_start, &ilx);
> if (pol) {
> mpol_to_str(buffer, sizeof(buffer), pol);
> mpol_cond_put(pol);
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 665f066..f74f8d0 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -8,6 +8,7 @@
> #include <linux/topology.h>
>
> struct vm_area_struct;
> +struct mempolicy;
>
> /* Convert GFP flags to their corresponding migrate type */
> #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
> @@ -262,7 +263,9 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
>
> #ifdef CONFIG_NUMA
> struct page *alloc_pages(gfp_t gfp, unsigned int order);
> -struct folio *folio_alloc(gfp_t gfp, unsigned order);
> +struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
> + struct mempolicy *mpol, pgoff_t ilx, int nid);
> +struct folio *folio_alloc(gfp_t gfp, unsigned int order);
> struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
> unsigned long addr, bool hugepage);
> #else
> @@ -270,6 +273,11 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
> {
> return alloc_pages_node(numa_node_id(), gfp_mask, order);
> }
> +static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
> + struct mempolicy *mpol, pgoff_t ilx, int nid)
> +{
> + return alloc_pages(gfp, order);
> +}
> static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
> {
> return __folio_alloc_node(gfp, order, numa_node_id());
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index acdb12f..2801d5b 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -126,7 +126,9 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
>
> struct mempolicy *get_task_policy(struct task_struct *p);
> struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
> - unsigned long addr);
> + unsigned long addr, pgoff_t *ilx);
> +struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
> + unsigned long addr, int order, pgoff_t *ilx);
> bool vma_policy_mof(struct vm_area_struct *vma);
>
> extern void numa_default_policy(void);
> @@ -140,8 +142,6 @@ extern int huge_node(struct vm_area_struct *vma,
> extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
> extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
> const nodemask_t *mask);
> -extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);
> -
> extern unsigned int mempolicy_slab_node(void);
>
> extern enum zone_type policy_zone;
> @@ -213,6 +213,13 @@ static inline void mpol_free_shared_policy(struct shared_policy *sp)
> return NULL;
> }
>
> +static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
> + unsigned long addr, int order, pgoff_t *ilx)
> +{
> + *ilx = 0;
> + return NULL;
> +}
> +
> static inline int
> vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
> {
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 86e040e..b4d67a8 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -619,7 +619,7 @@ struct vm_operations_struct {
> * policy.
> */
> struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
> - unsigned long addr);
> + unsigned long addr, pgoff_t *ilx);
> #endif
> /*
> * Called by vm_normal_page() for special PTEs to find the
> diff --git a/ipc/shm.c b/ipc/shm.c
> index 576a543..222aaf0 100644
> --- a/ipc/shm.c
> +++ b/ipc/shm.c
> @@ -562,30 +562,25 @@ static unsigned long shm_pagesize(struct vm_area_struct *vma)
> }
>
> #ifdef CONFIG_NUMA
> -static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
> +static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
> {
> - struct file *file = vma->vm_file;
> - struct shm_file_data *sfd = shm_file_data(file);
> + struct shm_file_data *sfd = shm_file_data(vma->vm_file);
> int err = 0;
>
> if (sfd->vm_ops->set_policy)
> - err = sfd->vm_ops->set_policy(vma, new);
> + err = sfd->vm_ops->set_policy(vma, mpol);
> return err;
> }
>
> static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
> - unsigned long addr)
> + unsigned long addr, pgoff_t *ilx)
> {
> - struct file *file = vma->vm_file;
> - struct shm_file_data *sfd = shm_file_data(file);
> - struct mempolicy *pol = NULL;
> + struct shm_file_data *sfd = shm_file_data(vma->vm_file);
> + struct mempolicy *mpol = vma->vm_policy;
>
> if (sfd->vm_ops->get_policy)
> - pol = sfd->vm_ops->get_policy(vma, addr);
> - else if (vma->vm_policy)
> - pol = vma->vm_policy;
> -
> - return pol;
> + mpol = sfd->vm_ops->get_policy(vma, addr, ilx);
> + return mpol;
> }
> #endif
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 596d580..8df0503 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -114,6 +114,8 @@
> #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
> #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */
>
> +#define NO_INTERLEAVE_INDEX (-1UL)
> +
> static struct kmem_cache *policy_cache;
> static struct kmem_cache *sn_cache;
>
> @@ -898,6 +900,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
> }
>
> if (flags & MPOL_F_ADDR) {
> + pgoff_t ilx; /* ignored here */
> /*
> * Do NOT fall back to task policy if the
> * vma/shared policy at addr is NULL. We
> @@ -909,10 +912,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
> mmap_read_unlock(mm);
> return -EFAULT;
> }
> - if (vma->vm_ops && vma->vm_ops->get_policy)
> - pol = vma->vm_ops->get_policy(vma, addr);
> - else
> - pol = vma->vm_policy;
> + pol = __get_vma_policy(vma, addr, &ilx);
> } else if (addr)
> return -EINVAL;
>
> @@ -1170,6 +1170,15 @@ static struct folio *new_folio(struct folio *src, unsigned long start)
> break;
> }
>
> + /*
> + * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL
> + * when the page can no longer be located in a vma: that is not ideal
> + * (migrate_pages() will give up early, presuming ENOMEM), but good
> + * enough to avoid a crash by syzkaller or concurrent holepunch.
> + */
> + if (!vma)
> + return NULL;
> +

How often would this happen? I just want to point out that ENOMEM can cause
src THPs or large folios to be split by migrate_pages().

> if (folio_test_hugetlb(src)) {
> return alloc_hugetlb_folio_vma(folio_hstate(src),
> vma, address);
> @@ -1178,9 +1187,6 @@ static struct folio *new_folio(struct folio *src, unsigned long start)
> if (folio_test_large(src))
> gfp = GFP_TRANSHUGE;
>
> - /*
> - * if !vma, vma_alloc_folio() will use task or system default policy
> - */
> return vma_alloc_folio(gfp, folio_order(src), vma, address,
> folio_test_large(src));
> }


--
Best Regards,
Yan, Zi

Attachment: signature.asc
Description: OpenPGP digital signature