Re: [PATCH 4/6] mm: thp: refactor NUMA fault handling

From: Yang Shi
Date: Tue Mar 30 2021 - 13:04:06 EST


On Mon, Mar 29, 2021 at 5:41 PM Huang, Ying <ying.huang@xxxxxxxxx> wrote:
>
> Yang Shi <shy828301@xxxxxxxxx> writes:
>
> > When the THP NUMA fault support was added THP migration was not supported yet.
> > So the ad hoc THP migration was implemented in NUMA fault handling. Since v4.14
> > THP migration has been supported so it doesn't make too much sense to still keep
> > another THP migration implementation rather than using the generic migration
> > code.
> >
> > This patch reworked the NUMA fault handling to use generic migration implementation
> > to migrate misplaced page. There is no functional change.
> >
> > After the refactor the flow of NUMA fault handling looks just like its
> > PTE counterpart:
> > Acquire ptl
> > Restore PMD
> > Prepare for migration (elevate page refcount)
> > Release ptl
> > Isolate page from lru and elevate page refcount
> > Migrate the misplaced THP
>
> There's some behavior change between the original implementation and
> your new implementation. Originally, PMD is restored after trying to
> migrate the misplaced THP. I think this can reduce the TLB
> shooting-down IPI.

In theory, yes. However I'm not sure if it is really measurable to
real life workload. I would like to make the huge NUMA hinting behave
like its PTE counterpart for now. If your PTE NUMA hinting patch
proves the TLB shooting down optimization is really worth it we could
easily do it for the PMD side too.

>
> Best Regards,
> Huang, Ying
>
> > In the old code anon_vma lock was needed to serialize THP migration
> > against THP split, but since then the THP code has been reworked a lot,
> > it seems anon_vma lock is not required anymore to avoid the race.
> >
> > The page refcount elevation when holding ptl should prevent from THP
> > split.
> >
> > Signed-off-by: Yang Shi <shy828301@xxxxxxxxx>
> > ---
> > include/linux/migrate.h | 23 ------
> > mm/huge_memory.c | 132 ++++++++----------------------
> > mm/migrate.c | 173 ++++++----------------------------------
> > 3 files changed, 57 insertions(+), 271 deletions(-)
> >
> > diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> > index 6abd34986cc5..6c8640e9af4f 100644
> > --- a/include/linux/migrate.h
> > +++ b/include/linux/migrate.h
> > @@ -100,15 +100,10 @@ static inline void __ClearPageMovable(struct page *page)
> > #endif
> >
> > #ifdef CONFIG_NUMA_BALANCING
> > -extern bool pmd_trans_migrating(pmd_t pmd);
> > extern int migrate_misplaced_page(struct page *page,
> > struct vm_area_struct *vma, int node,
> > bool compound);
> > #else
> > -static inline bool pmd_trans_migrating(pmd_t pmd)
> > -{
> > - return false;
> > -}
> > static inline int migrate_misplaced_page(struct page *page,
> > struct vm_area_struct *vma, int node,
> > bool compound)
> > @@ -117,24 +112,6 @@ static inline int migrate_misplaced_page(struct page *page,
> > }
> > #endif /* CONFIG_NUMA_BALANCING */
> >
> > -#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
> > -extern int migrate_misplaced_transhuge_page(struct mm_struct *mm,
> > - struct vm_area_struct *vma,
> > - pmd_t *pmd, pmd_t entry,
> > - unsigned long address,
> > - struct page *page, int node);
> > -#else
> > -static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
> > - struct vm_area_struct *vma,
> > - pmd_t *pmd, pmd_t entry,
> > - unsigned long address,
> > - struct page *page, int node)
> > -{
> > - return -EAGAIN;
> > -}
> > -#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/
> > -
> > -
> > #ifdef CONFIG_MIGRATION
> >
> > /*
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 53f3843ce72a..157c63b0fd95 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -1419,94 +1419,20 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
> > {
> > struct vm_area_struct *vma = vmf->vma;
> > pmd_t pmd = vmf->orig_pmd;
> > - struct anon_vma *anon_vma = NULL;
> > struct page *page;
> > unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> > - int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
> > + int page_nid = NUMA_NO_NODE;
> > int target_nid, last_cpupid = -1;
> > - bool page_locked;
> > bool migrated = false;
> > - bool was_writable;
> > + bool was_writable = pmd_savedwrite(pmd);
> > int flags = 0;
> >
> > vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> > - if (unlikely(!pmd_same(pmd, *vmf->pmd)))
> > - goto out_unlock;
> > -
> > - /*
> > - * If there are potential migrations, wait for completion and retry
> > - * without disrupting NUMA hinting information. Do not relock and
> > - * check_same as the page may no longer be mapped.
> > - */
> > - if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
> > - page = pmd_page(*vmf->pmd);
> > - if (!get_page_unless_zero(page))
> > - goto out_unlock;
> > - spin_unlock(vmf->ptl);
> > - put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
> > - goto out;
> > - }
> > -
> > - page = pmd_page(pmd);
> > - BUG_ON(is_huge_zero_page(page));
> > - page_nid = page_to_nid(page);
> > - last_cpupid = page_cpupid_last(page);
> > - count_vm_numa_event(NUMA_HINT_FAULTS);
> > - if (page_nid == this_nid) {
> > - count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
> > - flags |= TNF_FAULT_LOCAL;
> > - }
> > -
> > - /* See similar comment in do_numa_page for explanation */
> > - if (!pmd_savedwrite(pmd))
> > - flags |= TNF_NO_GROUP;
> > -
> > - /*
> > - * Acquire the page lock to serialise THP migrations but avoid dropping
> > - * page_table_lock if at all possible
> > - */
> > - page_locked = trylock_page(page);
> > - target_nid = mpol_misplaced(page, vma, haddr);
> > - if (target_nid == NUMA_NO_NODE) {
> > - /* If the page was locked, there are no parallel migrations */
> > - if (page_locked)
> > - goto clear_pmdnuma;
> > - }
> > -
> > - /* Migration could have started since the pmd_trans_migrating check */
> > - if (!page_locked) {
> > - page_nid = NUMA_NO_NODE;
> > - if (!get_page_unless_zero(page))
> > - goto out_unlock;
> > + if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
> > spin_unlock(vmf->ptl);
> > - put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
> > goto out;
> > }
> >
> > - /*
> > - * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
> > - * to serialises splits
> > - */
> > - get_page(page);
> > - spin_unlock(vmf->ptl);
> > - anon_vma = page_lock_anon_vma_read(page);
> > -
> > - /* Confirm the PMD did not change while page_table_lock was released */
> > - spin_lock(vmf->ptl);
> > - if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
> > - unlock_page(page);
> > - put_page(page);
> > - page_nid = NUMA_NO_NODE;
> > - goto out_unlock;
> > - }
> > -
> > - /* Bail if we fail to protect against THP splits for any reason */
> > - if (unlikely(!anon_vma)) {
> > - put_page(page);
> > - page_nid = NUMA_NO_NODE;
> > - goto clear_pmdnuma;
> > - }
> > -
> > /*
> > * Since we took the NUMA fault, we must have observed the !accessible
> > * bit. Make sure all other CPUs agree with that, to avoid them
> > @@ -1533,38 +1459,44 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
> > haddr + HPAGE_PMD_SIZE);
> > }
> >
> > - /*
> > - * Migrate the THP to the requested node, returns with page unlocked
> > - * and access rights restored.
> > - */
> > - spin_unlock(vmf->ptl);
> > -
> > - migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
> > - vmf->pmd, pmd, vmf->address, page, target_nid);
> > - if (migrated) {
> > - flags |= TNF_MIGRATED;
> > - page_nid = target_nid;
> > - } else
> > - flags |= TNF_MIGRATE_FAIL;
> > -
> > - goto out;
> > -clear_pmdnuma:
> > - BUG_ON(!PageLocked(page));
> > - was_writable = pmd_savedwrite(pmd);
> > + /* Restore the PMD */
> > pmd = pmd_modify(pmd, vma->vm_page_prot);
> > pmd = pmd_mkyoung(pmd);
> > if (was_writable)
> > pmd = pmd_mkwrite(pmd);
> > set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
> > update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> > - unlock_page(page);
> > -out_unlock:
> > +
> > + page = vm_normal_page_pmd(vma, haddr, pmd);
> > + if (!page) {
> > + spin_unlock(vmf->ptl);
> > + goto out;
> > + }
> > +
> > + /* See similar comment in do_numa_page for explanation */
> > + if (!was_writable)
> > + flags |= TNF_NO_GROUP;
> > +
> > + page_nid = page_to_nid(page);
> > + last_cpupid = page_cpupid_last(page);
> > + target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
> > + &flags);
> > +
> > spin_unlock(vmf->ptl);
> >
> > -out:
> > - if (anon_vma)
> > - page_unlock_anon_vma_read(anon_vma);
> > + if (target_nid == NUMA_NO_NODE) {
> > + put_page(page);
> > + goto out;
> > + }
> > +
> > + migrated = migrate_misplaced_page(page, vma, target_nid, true);
> > + if (migrated) {
> > + flags |= TNF_MIGRATED;
> > + page_nid = target_nid;
> > + } else
> > + flags |= TNF_MIGRATE_FAIL;
> >
> > +out:
> > if (page_nid != NUMA_NO_NODE)
> > task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
> > flags);
> > diff --git a/mm/migrate.c b/mm/migrate.c
> > index 9c4ae5132919..86325c750c14 100644
> > --- a/mm/migrate.c
> > +++ b/mm/migrate.c
> > @@ -2066,6 +2066,23 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
> > return newpage;
> > }
> >
> > +static struct page *alloc_misplaced_dst_page_thp(struct page *page,
> > + unsigned long data)
> > +{
> > + int nid = (int) data;
> > + struct page *newpage;
> > +
> > + newpage = alloc_pages_node(nid, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
> > + HPAGE_PMD_ORDER);
> > + if (!newpage)
> > + goto out;
> > +
> > + prep_transhuge_page(newpage);
> > +
> > +out:
> > + return newpage;
> > +}
> > +
> > static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
> > {
> > int page_lru;
> > @@ -2104,12 +2121,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
> > return 1;
> > }
> >
> > -bool pmd_trans_migrating(pmd_t pmd)
> > -{
> > - struct page *page = pmd_page(pmd);
> > - return PageLocked(page);
> > -}
> > -
> > static inline bool is_shared_exec_page(struct vm_area_struct *vma,
> > struct page *page)
> > {
> > @@ -2133,6 +2144,12 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
> > int isolated;
> > int nr_remaining;
> > LIST_HEAD(migratepages);
> > + new_page_t *new;
> > +
> > + if (compound)
> > + new = alloc_misplaced_dst_page_thp;
> > + else
> > + new = alloc_misplaced_dst_page;
> >
> > /*
> > * Don't migrate file pages that are mapped in multiple processes
> > @@ -2153,9 +2170,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
> > goto out;
> >
> > list_add(&page->lru, &migratepages);
> > - nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
> > - NULL, node, MIGRATE_ASYNC,
> > - MR_NUMA_MISPLACED);
> > + nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
> > + MIGRATE_ASYNC, MR_NUMA_MISPLACED);
> > if (nr_remaining) {
> > if (!list_empty(&migratepages)) {
> > list_del(&page->lru);
> > @@ -2174,145 +2190,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
> > return 0;
> > }
> > #endif /* CONFIG_NUMA_BALANCING */
> > -
> > -#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
> > -/*
> > - * Migrates a THP to a given target node. page must be locked and is unlocked
> > - * before returning.
> > - */
> > -int migrate_misplaced_transhuge_page(struct mm_struct *mm,
> > - struct vm_area_struct *vma,
> > - pmd_t *pmd, pmd_t entry,
> > - unsigned long address,
> > - struct page *page, int node)
> > -{
> > - spinlock_t *ptl;
> > - pg_data_t *pgdat = NODE_DATA(node);
> > - int isolated = 0;
> > - struct page *new_page = NULL;
> > - int page_lru = page_is_file_lru(page);
> > - unsigned long start = address & HPAGE_PMD_MASK;
> > -
> > - if (is_shared_exec_page(vma, page))
> > - goto out;
> > -
> > - new_page = alloc_pages_node(node,
> > - (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
> > - HPAGE_PMD_ORDER);
> > - if (!new_page)
> > - goto out_fail;
> > - prep_transhuge_page(new_page);
> > -
> > - isolated = numamigrate_isolate_page(pgdat, page);
> > - if (!isolated) {
> > - put_page(new_page);
> > - goto out_fail;
> > - }
> > -
> > - /* Prepare a page as a migration target */
> > - __SetPageLocked(new_page);
> > - if (PageSwapBacked(page))
> > - __SetPageSwapBacked(new_page);
> > -
> > - /* anon mapping, we can simply copy page->mapping to the new page: */
> > - new_page->mapping = page->mapping;
> > - new_page->index = page->index;
> > - /* flush the cache before copying using the kernel virtual address */
> > - flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
> > - migrate_page_copy(new_page, page);
> > - WARN_ON(PageLRU(new_page));
> > -
> > - /* Recheck the target PMD */
> > - ptl = pmd_lock(mm, pmd);
> > - if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
> > - spin_unlock(ptl);
> > -
> > - /* Reverse changes made by migrate_page_copy() */
> > - if (TestClearPageActive(new_page))
> > - SetPageActive(page);
> > - if (TestClearPageUnevictable(new_page))
> > - SetPageUnevictable(page);
> > -
> > - unlock_page(new_page);
> > - put_page(new_page); /* Free it */
> > -
> > - /* Retake the callers reference and putback on LRU */
> > - get_page(page);
> > - putback_lru_page(page);
> > - mod_node_page_state(page_pgdat(page),
> > - NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
> > -
> > - goto out_unlock;
> > - }
> > -
> > - entry = mk_huge_pmd(new_page, vma->vm_page_prot);
> > - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> > -
> > - /*
> > - * Overwrite the old entry under pagetable lock and establish
> > - * the new PTE. Any parallel GUP will either observe the old
> > - * page blocking on the page lock, block on the page table
> > - * lock or observe the new page. The SetPageUptodate on the
> > - * new page and page_add_new_anon_rmap guarantee the copy is
> > - * visible before the pagetable update.
> > - */
> > - page_add_anon_rmap(new_page, vma, start, true);
> > - /*
> > - * At this point the pmd is numa/protnone (i.e. non present) and the TLB
> > - * has already been flushed globally. So no TLB can be currently
> > - * caching this non present pmd mapping. There's no need to clear the
> > - * pmd before doing set_pmd_at(), nor to flush the TLB after
> > - * set_pmd_at(). Clearing the pmd here would introduce a race
> > - * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
> > - * mmap_lock for reading. If the pmd is set to NULL at any given time,
> > - * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
> > - * pmd.
> > - */
> > - set_pmd_at(mm, start, pmd, entry);
> > - update_mmu_cache_pmd(vma, address, &entry);
> > -
> > - page_ref_unfreeze(page, 2);
> > - mlock_migrate_page(new_page, page);
> > - page_remove_rmap(page, true);
> > - set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
> > -
> > - spin_unlock(ptl);
> > -
> > - /* Take an "isolate" reference and put new page on the LRU. */
> > - get_page(new_page);
> > - putback_lru_page(new_page);
> > -
> > - unlock_page(new_page);
> > - unlock_page(page);
> > - put_page(page); /* Drop the rmap reference */
> > - put_page(page); /* Drop the LRU isolation reference */
> > -
> > - count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
> > - count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
> > -
> > - mod_node_page_state(page_pgdat(page),
> > - NR_ISOLATED_ANON + page_lru,
> > - -HPAGE_PMD_NR);
> > - return isolated;
> > -
> > -out_fail:
> > - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
> > - ptl = pmd_lock(mm, pmd);
> > - if (pmd_same(*pmd, entry)) {
> > - entry = pmd_modify(entry, vma->vm_page_prot);
> > - set_pmd_at(mm, start, pmd, entry);
> > - update_mmu_cache_pmd(vma, address, &entry);
> > - }
> > - spin_unlock(ptl);
> > -
> > -out_unlock:
> > - unlock_page(page);
> > -out:
> > - put_page(page);
> > - return 0;
> > -}
> > -#endif /* CONFIG_NUMA_BALANCING */
> > -
> > #endif /* CONFIG_NUMA */
> >
> > #ifdef CONFIG_DEVICE_PRIVATE