Re: [PATCH v4 05/11] mm: thp: enable thp migration in generic path

From: Kirill A. Shutemov
Date: Fri Mar 24 2017 - 10:29:01 EST


On Mon, Mar 13, 2017 at 11:45:01AM -0400, Zi Yan wrote:
> From: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
>
> This patch adds thp migration's core code, including conversions
> between a PMD entry and a swap entry, setting PMD migration entry,
> removing PMD migration entry, and waiting on PMD migration entries.
>
> This patch makes it possible to support thp migration.
> If you fail to allocate a destination page as a thp, you just split
> the source thp as we do now, and then enter the normal page migration.
> If you succeed to allocate destination thp, you enter thp migration.
> Subsequent patches actually enable thp migration for each caller of
> page migration by allowing its get_new_page() callback to
> allocate thps.
>
> ChangeLog v1 -> v2:
> - support pte-mapped thp, doubly-mapped thp
>
> Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
>
> ChangeLog v2 -> v3:
> - use page_vma_mapped_walk()
>
> ChangeLog v3 -> v4:
> - factor out the code of removing pte pgtable page in zap_huge_pmd()
>
> Signed-off-by: Zi Yan <zi.yan@xxxxxxxxxxxxxx>

See a few questions below.

It would be nice to split it into a few patches. Probably three or four.

> ---
> arch/x86/include/asm/pgtable_64.h | 2 +
> include/linux/swapops.h | 70 +++++++++++++++++-
> mm/huge_memory.c | 147 ++++++++++++++++++++++++++++++++++----
> mm/migrate.c | 29 +++++++-
> mm/page_vma_mapped.c | 13 +++-
> mm/pgtable-generic.c | 3 +-
> mm/rmap.c | 9 +++
> 7 files changed, 252 insertions(+), 21 deletions(-)
>
> diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
> index a5c4fc62e078..350397fd2129 100644
> --- a/arch/x86/include/asm/pgtable_64.h
> +++ b/arch/x86/include/asm/pgtable_64.h
> @@ -187,7 +187,9 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
> ((type) << (SWP_TYPE_FIRST_BIT)) \
> | ((offset) << SWP_OFFSET_FIRST_BIT) })
> #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
> +#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) })
> #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
> +#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val })
>
> extern int kern_addr_valid(unsigned long addr);
> extern void cleanup_highmap(void);
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index 5c3a5f3e7eec..6625bea13869 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -103,7 +103,8 @@ static inline void *swp_to_radix_entry(swp_entry_t entry)
> #ifdef CONFIG_MIGRATION
> static inline swp_entry_t make_migration_entry(struct page *page, int write)
> {
> - BUG_ON(!PageLocked(page));
> + BUG_ON(!PageLocked(compound_head(page)));
> +
> return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ,
> page_to_pfn(page));
> }
> @@ -126,7 +127,7 @@ static inline struct page *migration_entry_to_page(swp_entry_t entry)
> * Any use of migration entries may only occur while the
> * corresponding page is locked
> */
> - BUG_ON(!PageLocked(p));
> + BUG_ON(!PageLocked(compound_head(p)));
> return p;
> }
>
> @@ -163,6 +164,71 @@ static inline int is_write_migration_entry(swp_entry_t entry)
>
> #endif
>
> +struct page_vma_mapped_walk;
> +
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +extern void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
> + struct page *page);
> +
> +extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
> + struct page *new);
> +
> +extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);
> +
> +static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
> +{
> + swp_entry_t arch_entry;
> +
> + arch_entry = __pmd_to_swp_entry(pmd);
> + return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
> +}
> +
> +static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
> +{
> + swp_entry_t arch_entry;
> +
> + arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));
> + return __swp_entry_to_pmd(arch_entry);
> +}
> +
> +static inline int is_pmd_migration_entry(pmd_t pmd)
> +{
> + return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
> +}
> +#else
> +static inline void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
> + struct page *page)
> +{
> + BUILD_BUG();
> +}
> +
> +static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
> + struct page *new)
> +{
> + BUILD_BUG();
> + return 0;
> +}
> +
> +static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { }
> +
> +static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
> +{
> + BUILD_BUG();
> + return swp_entry(0, 0);
> +}
> +
> +static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
> +{
> + BUILD_BUG();
> + return (pmd_t){ 0 };
> +}
> +
> +static inline int is_pmd_migration_entry(pmd_t pmd)
> +{
> + return 0;
> +}
> +#endif
> +
> #ifdef CONFIG_MEMORY_FAILURE
>
> extern atomic_long_t num_poisoned_pages __read_mostly;
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index e32ccbd8ee3a..a9c2a0ef5b9b 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1588,6 +1588,26 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
> atomic_long_dec(&mm->nr_ptes);
> }
>
> +static inline void remove_trans_huge_pgtable(struct page *page,
> + struct mmu_gather *tlb, pmd_t *pmd)
> +{
> + if (PageAnon(page)) {
> + pgtable_t pgtable;
> +
> + pgtable = pgtable_trans_huge_withdraw(tlb->mm,
> + pmd);
> + pte_free(tlb->mm, pgtable);
> + atomic_long_dec(&tlb->mm->nr_ptes);
> + add_mm_counter(tlb->mm, MM_ANONPAGES,
> + -HPAGE_PMD_NR);
> + } else {
> + if (arch_needs_pgtable_deposit())
> + zap_deposited_table(tlb->mm, pmd);
> + add_mm_counter(tlb->mm, MM_FILEPAGES,
> + -HPAGE_PMD_NR);
> + }
> +}
> +
> int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> pmd_t *pmd, unsigned long addr)
> {
> @@ -1618,23 +1638,27 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> spin_unlock(ptl);
> tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
> } else {
> - struct page *page = pmd_page(orig_pmd);
> - page_remove_rmap(page, true);
> - VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
> - VM_BUG_ON_PAGE(!PageHead(page), page);
> - if (PageAnon(page)) {
> - pgtable_t pgtable;
> - pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
> - pte_free(tlb->mm, pgtable);
> - atomic_long_dec(&tlb->mm->nr_ptes);
> - add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
> + struct page *page;
> + int migration = 0;
> +
> + if (!is_pmd_migration_entry(orig_pmd)) {
> + page = pmd_page(orig_pmd);
> + page_remove_rmap(page, true);
> + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
> + VM_BUG_ON_PAGE(!PageHead(page), page);
> + remove_trans_huge_pgtable(page, tlb, pmd);
> } else {
> - if (arch_needs_pgtable_deposit())
> - zap_deposited_table(tlb->mm, pmd);
> - add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
> + swp_entry_t entry;
> +
> + entry = pmd_to_swp_entry(orig_pmd);
> + page = pfn_to_page(swp_offset(entry));
> + remove_trans_huge_pgtable(page, tlb, pmd);
> + free_swap_and_cache(entry); /* waring in failure? */
> + migration = 1;
> }
> spin_unlock(ptl);
> - tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
> + if (!migration)
> + tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
> }
> return 1;
> }
> @@ -2652,3 +2676,98 @@ static int __init split_huge_pages_debugfs(void)
> }
> late_initcall(split_huge_pages_debugfs);
> #endif
> +
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
> + struct page *page)
> +{
> + struct vm_area_struct *vma = pvmw->vma;
> + struct mm_struct *mm = vma->vm_mm;
> + unsigned long address = pvmw->address;
> + pmd_t pmdval;
> + swp_entry_t entry;
> +
> + if (pvmw->pmd && !pvmw->pte) {
> + pmd_t pmdswp;
> +
> + mmu_notifier_invalidate_range_start(mm, address,
> + address + HPAGE_PMD_SIZE);
> +
> + flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
> + pmdval = pmdp_huge_clear_flush(vma, address, pvmw->pmd);
> + if (pmd_dirty(pmdval))
> + set_page_dirty(page);
> + entry = make_migration_entry(page, pmd_write(pmdval));
> + pmdswp = swp_entry_to_pmd(entry);
> + set_pmd_at(mm, address, pvmw->pmd, pmdswp);
> + page_remove_rmap(page, true);
> + put_page(page);
> +
> + mmu_notifier_invalidate_range_end(mm, address,
> + address + HPAGE_PMD_SIZE);
> + } else { /* pte-mapped thp */
> + pte_t pteval;
> + struct page *subpage = page - page_to_pfn(page) + pte_pfn(*pvmw->pte);
> + pte_t swp_pte;
> +
> + pteval = ptep_clear_flush(vma, address, pvmw->pte);
> + if (pte_dirty(pteval))
> + set_page_dirty(subpage);
> + entry = make_migration_entry(subpage, pte_write(pteval));
> + swp_pte = swp_entry_to_pte(entry);
> + set_pte_at(mm, address, pvmw->pte, swp_pte);
> + page_remove_rmap(subpage, false);
> + put_page(subpage);
> + mmu_notifier_invalidate_page(mm, address);
> + }
> +}
> +
> +void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
> +{
> + struct vm_area_struct *vma = pvmw->vma;
> + struct mm_struct *mm = vma->vm_mm;
> + unsigned long address = pvmw->address;
> + swp_entry_t entry;
> +
> + /* PMD-mapped THP */
> + if (pvmw->pmd && !pvmw->pte) {
> + unsigned long mmun_start = address & HPAGE_PMD_MASK;
> + unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
> + pmd_t pmde;
> +
> + entry = pmd_to_swp_entry(*pvmw->pmd);
> + get_page(new);
> + pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
> + if (is_write_migration_entry(entry))
> + pmde = maybe_pmd_mkwrite(pmde, vma);
> +
> + flush_cache_range(vma, mmun_start, mmun_end);
> + page_add_anon_rmap(new, vma, mmun_start, true);
> + pmdp_huge_clear_flush_notify(vma, mmun_start, pvmw->pmd);
> + set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
> + flush_tlb_range(vma, mmun_start, mmun_end);
> + if (vma->vm_flags & VM_LOCKED)
> + mlock_vma_page(new);
> + update_mmu_cache_pmd(vma, address, pvmw->pmd);
> +
> + } else { /* pte-mapped thp */
> + pte_t pte;
> + pte_t *ptep = pvmw->pte;
> +
> + entry = pte_to_swp_entry(*pvmw->pte);
> + get_page(new);
> + pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
> + if (pte_swp_soft_dirty(*pvmw->pte))
> + pte = pte_mksoft_dirty(pte);
> + if (is_write_migration_entry(entry))
> + pte = maybe_mkwrite(pte, vma);
> + flush_dcache_page(new);
> + set_pte_at(mm, address, ptep, pte);
> + if (PageAnon(new))
> + page_add_anon_rmap(new, vma, address, false);
> + else
> + page_add_file_rmap(new, false);
> + update_mmu_cache(vma, address, ptep);
> + }
> +}
> +#endif
> diff --git a/mm/migrate.c b/mm/migrate.c
> index cda4c2778d04..0bbad6dcf95a 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -211,6 +211,12 @@ static int remove_migration_pte(struct page *page, struct vm_area_struct *vma,
> new = page - pvmw.page->index +
> linear_page_index(vma, pvmw.address);
>
> + /* PMD-mapped THP migration entry */
> + if (!PageHuge(page) && PageTransCompound(page)) {
> + remove_migration_pmd(&pvmw, new);
> + continue;
> + }
> +

Any reason not to share PTE handling of non-THP with THP?

> get_page(new);
> pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
> if (pte_swp_soft_dirty(*pvmw.pte))
> @@ -324,6 +330,27 @@ void migration_entry_wait_huge(struct vm_area_struct *vma,
> __migration_entry_wait(mm, pte, ptl);
> }
>
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
> +{
> + spinlock_t *ptl;
> + struct page *page;
> +
> + ptl = pmd_lock(mm, pmd);
> + if (!is_pmd_migration_entry(*pmd))
> + goto unlock;
> + page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
> + if (!get_page_unless_zero(page))
> + goto unlock;
> + spin_unlock(ptl);
> + wait_on_page_locked(page);
> + put_page(page);
> + return;
> +unlock:
> + spin_unlock(ptl);
> +}
> +#endif
> +
> #ifdef CONFIG_BLOCK
> /* Returns true if all buffers are successfully locked */
> static bool buffer_migrate_lock_buffers(struct buffer_head *head,
> @@ -1082,7 +1109,7 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
> goto out;
> }
>
> - if (unlikely(PageTransHuge(page))) {
> + if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) {
> lock_page(page);
> rc = split_huge_page(page);
> unlock_page(page);
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index a23001a22c15..0ed3aee62d50 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -137,16 +137,23 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
> if (!pud_present(*pud))
> return false;
> pvmw->pmd = pmd_offset(pud, pvmw->address);
> - if (pmd_trans_huge(*pvmw->pmd)) {
> + if (pmd_trans_huge(*pvmw->pmd) || is_pmd_migration_entry(*pvmw->pmd)) {
> pvmw->ptl = pmd_lock(mm, pvmw->pmd);
> - if (!pmd_present(*pvmw->pmd))
> - return not_found(pvmw);
> if (likely(pmd_trans_huge(*pvmw->pmd))) {
> if (pvmw->flags & PVMW_MIGRATION)
> return not_found(pvmw);
> if (pmd_page(*pvmw->pmd) != page)
> return not_found(pvmw);
> return true;
> + } else if (!pmd_present(*pvmw->pmd)) {
> + if (unlikely(is_migration_entry(pmd_to_swp_entry(*pvmw->pmd)))) {
> + swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd);
> +
> + if (migration_entry_to_page(entry) != page)
> + return not_found(pvmw);
> + return true;
> + }
> + return not_found(pvmw);
> } else {
> /* THP pmd was split under us: handle on pte level */
> spin_unlock(pvmw->ptl);
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index 4ed5908c65b0..9d550a8a0c71 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -118,7 +118,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
> {
> pmd_t pmd;
> VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> - VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
> + VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
> + !pmd_devmap(*pmdp));

How does this happen? _flush doesn't make sense for a !present entry.

> pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
> flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
> return pmd;
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 555cc7ebacf6..2c65abbd7a0e 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1298,6 +1298,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
> int ret = SWAP_AGAIN;
> enum ttu_flags flags = (enum ttu_flags)arg;
>
> +
> /* munlock has nothing to gain from examining un-locked vmas */
> if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
> return SWAP_AGAIN;
> @@ -1308,6 +1309,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
> }
>
> while (page_vma_mapped_walk(&pvmw)) {
> + /* THP migration */
> + if (flags & TTU_MIGRATION) {
> + if (!PageHuge(page) && PageTransCompound(page)) {
> + set_pmd_migration_entry(&pvmw, page);

Again, it would be nice to share PTE handling. It should be rather similar,
no?

> + continue;
> + }
> + }
> +
> /*
> * If the page is mlock()d, we cannot swap it out.
> * If it's recently referenced (perhaps page_referenced
> --
> 2.11.0
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@xxxxxxxxxx For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>

--
Kirill A. Shutemov