[PATCH v7 4/8] mm/rmap: Split migration into its own function

From: Alistair Popple
Date: Thu Mar 25 2021 - 20:09:38 EST


Migration is currently implemented as a mode of operation for
try_to_unmap_one() generally specified by passing the TTU_MIGRATION flag
or in the case of splitting a huge anonymous page TTU_SPLIT_FREEZE.

However it does not have much in common with the rest of the unmap
functionality of try_to_unmap_one() and thus splitting it into a
separate function reduces the complexity of try_to_unmap_one() making it
more readable.

Several simplifications can also be made in try_to_migrate_one() based
on the following observations:

- All users of TTU_MIGRATION also set TTU_IGNORE_MLOCK.
- No users of TTU_MIGRATION ever set TTU_IGNORE_HWPOISON.
- No users of TTU_MIGRATION ever set TTU_BATCH_FLUSH.

TTU_SPLIT_FREEZE is a special case of migration used when splitting an
anonymous page. This is most easily dealt with by calling the correct
function from unmap_page() in mm/huge_memory.c - either
try_to_migrate() for PageAnon or try_to_unmap().

Signed-off-by: Alistair Popple <apopple@xxxxxxxxxx>
Reviewed-by: Christoph Hellwig <hch@xxxxxx>
Reviewed-by: Ralph Campbell <rcampbell@xxxxxxxxxx>

---

v5:
* Added comments about how PMD splitting works for migration vs.
unmapping
* Tightened up the flag check in try_to_migrate() to be explicit about
which TTU_XXX flags are supported.
---
include/linux/rmap.h | 4 +-
mm/huge_memory.c | 15 +-
mm/migrate.c | 9 +-
mm/rmap.c | 358 ++++++++++++++++++++++++++++++++-----------
4 files changed, 280 insertions(+), 106 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index e26ac2d71346..6062e0cfca2d 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -86,8 +86,6 @@ struct anon_vma_chain {
};

enum ttu_flags {
- TTU_MIGRATION = 0x1, /* migration mode */
-
TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */
TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */
TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */
@@ -96,7 +94,6 @@ enum ttu_flags {
* do a final flush if necessary */
TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock:
* caller holds it */
- TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */
};

#ifdef CONFIG_MMU
@@ -193,6 +190,7 @@ static inline void page_dup_rmap(struct page *page, bool compound)
int page_referenced(struct page *, int is_locked,
struct mem_cgroup *memcg, unsigned long *vm_flags);

+bool try_to_migrate(struct page *page, enum ttu_flags flags);
bool try_to_unmap(struct page *, enum ttu_flags flags);

/* Avoid racy checks */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 89af065cea5b..eab004331b97 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2357,16 +2357,21 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,

static void unmap_page(struct page *page)
{
- enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK |
- TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
+ enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
bool unmap_success;

VM_BUG_ON_PAGE(!PageHead(page), page);

if (PageAnon(page))
- ttu_flags |= TTU_SPLIT_FREEZE;
-
- unmap_success = try_to_unmap(page, ttu_flags);
+ unmap_success = try_to_migrate(page, ttu_flags);
+ else
+ /*
+ * Don't install migration entries for file backed pages. This
+ * helps handle cases when i_size is in the middle of the page
+ * as there is no need to unmap pages beyond i_size manually.
+ */
+ unmap_success = try_to_unmap(page, ttu_flags |
+ TTU_IGNORE_MLOCK);
VM_BUG_ON_PAGE(!unmap_success, page);
}

diff --git a/mm/migrate.c b/mm/migrate.c
index b752543adb64..cc4612e2a246 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1130,7 +1130,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
/* Establish migration ptes */
VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
page);
- try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK);
+ try_to_migrate(page, 0);
page_was_mapped = 1;
}

@@ -1332,7 +1332,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,

if (page_mapped(hpage)) {
bool mapping_locked = false;
- enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK;
+ enum ttu_flags ttu = 0;

if (!PageAnon(hpage)) {
/*
@@ -1349,7 +1349,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
ttu |= TTU_RMAP_LOCKED;
}

- try_to_unmap(hpage, ttu);
+ try_to_migrate(hpage, ttu);
page_was_mapped = 1;

if (mapping_locked)
@@ -2756,7 +2756,6 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
*/
static void migrate_vma_unmap(struct migrate_vma *migrate)
{
- int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;
const unsigned long npages = migrate->npages;
const unsigned long start = migrate->start;
unsigned long addr, i, restore = 0;
@@ -2768,7 +2767,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
continue;

if (page_mapped(page)) {
- try_to_unmap(page, flags);
+ try_to_migrate(page, 0);
if (page_mapped(page))
goto restore;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index d02bade5245b..b540b44e299a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1405,14 +1405,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;

- if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
- is_zone_device_page(page) && !is_device_private_page(page))
- return true;
-
- if (flags & TTU_SPLIT_HUGE_PMD) {
- split_huge_pmd_address(vma, address,
- flags & TTU_SPLIT_FREEZE, page);
- }
+ if (flags & TTU_SPLIT_HUGE_PMD)
+ split_huge_pmd_address(vma, address, false, page);

/*
* For THP, we have to assume the worse case ie pmd for invalidation.
@@ -1436,16 +1430,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);

while (page_vma_mapped_walk(&pvmw)) {
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
- /* PMD-mapped THP migration entry */
- if (!pvmw.pte && (flags & TTU_MIGRATION)) {
- VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
-
- set_pmd_migration_entry(&pvmw, page);
- continue;
- }
-#endif
-
/*
* If the page is mlock()d, we cannot swap it out.
* If it's recently referenced (perhaps page_referenced
@@ -1507,46 +1491,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
}

- if (IS_ENABLED(CONFIG_MIGRATION) &&
- (flags & TTU_MIGRATION) &&
- is_zone_device_page(page)) {
- swp_entry_t entry;
- pte_t swp_pte;
-
- pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
-
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- entry = make_readable_migration_entry(page_to_pfn(page));
- swp_pte = swp_entry_to_pte(entry);
-
- /*
- * pteval maps a zone device page and is therefore
- * a swap pte.
- */
- if (pte_swp_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_swp_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
- /*
- * No need to invalidate here it will synchronize on
- * against the special swap migration pte.
- *
- * The assignment to subpage above was computed from a
- * swap PTE which results in an invalid pointer.
- * Since only PAGE_SIZE pages can currently be
- * migrated, just set it to page. This will need to be
- * changed when hugepage migrations to device private
- * memory are supported.
- */
- subpage = page;
- goto discard;
- }
-
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
if (should_defer_flush(mm, flags)) {
@@ -1599,39 +1543,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/* We have to invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm, address,
address + PAGE_SIZE);
- } else if (IS_ENABLED(CONFIG_MIGRATION) &&
- (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
- swp_entry_t entry;
- pte_t swp_pte;
-
- if (arch_unmap_one(mm, vma, address, pteval) < 0) {
- set_pte_at(mm, address, pvmw.pte, pteval);
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
-
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- if (pte_write(pteval))
- entry = make_writable_migration_entry(
- page_to_pfn(subpage));
- else
- entry = make_readable_migration_entry(
- page_to_pfn(subpage));
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- set_pte_at(mm, address, pvmw.pte, swp_pte);
- /*
- * No need to invalidate here it will synchronize on
- * against the special swap migration pte.
- */
} else if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(subpage) };
pte_t swp_pte;
@@ -1758,6 +1669,268 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
.anon_lock = page_lock_anon_vma_read,
};

+ if (flags & TTU_RMAP_LOCKED)
+ rmap_walk_locked(page, &rwc);
+ else
+ rmap_walk(page, &rwc);
+
+ return !page_mapcount(page) ? true : false;
+}
+
+/*
+ * @arg: enum ttu_flags will be passed to this argument.
+ *
+ * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
+ * containing migration entries. This and TTU_RMAP_LOCKED are the only supported
+ * flags.
+ */
+static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, void *arg)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = address,
+ };
+ pte_t pteval;
+ struct page *subpage;
+ bool ret = true;
+ struct mmu_notifier_range range;
+ enum ttu_flags flags = (enum ttu_flags)(long)arg;
+
+ if (is_zone_device_page(page) && !is_device_private_page(page))
+ return true;
+
+ /*
+ * unmap_page() in mm/huge_memory.c is the only user of migration with
+ * TTU_SPLIT_HUGE_PMD and it wants to freeze.
+ */
+ if (flags & TTU_SPLIT_HUGE_PMD)
+ split_huge_pmd_address(vma, address, true, page);
+
+ /*
+ * For THP, we have to assume the worse case ie pmd for invalidation.
+ * For hugetlb, it could be much worse if we need to do pud
+ * invalidation in the case of pmd sharing.
+ *
+ * Note that the page can not be free in this function as call of
+ * try_to_unmap() must hold a reference on the page.
+ */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address,
+ min(vma->vm_end, address + page_size(page)));
+ if (PageHuge(page)) {
+ /*
+ * If sharing is possible, start and end will be adjusted
+ * accordingly.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &range.start,
+ &range.end);
+ }
+ mmu_notifier_invalidate_range_start(&range);
+
+ while (page_vma_mapped_walk(&pvmw)) {
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ /* PMD-mapped THP migration entry */
+ if (!pvmw.pte) {
+ VM_BUG_ON_PAGE(PageHuge(page) ||
+ !PageTransCompound(page), page);
+
+ set_pmd_migration_entry(&pvmw, page);
+ continue;
+ }
+#endif
+
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_PAGE(!pvmw.pte, page);
+
+ subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+ address = pvmw.address;
+
+ if (PageHuge(page) && !PageAnon(page)) {
+ /*
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
+ */
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+ if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+ /*
+ * huge_pmd_unshare unmapped an entire PMD
+ * page. There is no way of knowing exactly
+ * which PMDs may be cached for this mm, so
+ * we must flush them all. start/end were
+ * already adjusted above to cover this range.
+ */
+ flush_cache_range(vma, range.start, range.end);
+ flush_tlb_range(vma, range.start, range.end);
+ mmu_notifier_invalidate_range(mm, range.start,
+ range.end);
+
+ /*
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ }
+
+ /* Nuke the page table entry. */
+ flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ pteval = ptep_clear_flush(vma, address, pvmw.pte);
+
+ /* Move the dirty bit to the page. Now the pte is gone. */
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
+
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+
+ if (is_zone_device_page(page)) {
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ entry = make_readable_migration_entry(
+ page_to_pfn(page));
+ swp_pte = swp_entry_to_pte(entry);
+
+ /*
+ * pteval maps a zone device page and is therefore
+ * a swap pte.
+ */
+ if (pte_swp_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+ /*
+ * No need to invalidate here it will synchronize on
+ * against the special swap migration pte.
+ *
+ * The assignment to subpage above was computed from a
+ * swap PTE which results in an invalid pointer.
+ * Since only PAGE_SIZE pages can currently be
+ * migrated, just set it to page. This will need to be
+ * changed when hugepage migrations to device private
+ * memory are supported.
+ */
+ subpage = page;
+ } else if (PageHWPoison(page)) {
+ pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
+ if (PageHuge(page)) {
+ hugetlb_count_sub(compound_nr(page), mm);
+ set_huge_swap_pte_at(mm, address,
+ pvmw.pte, pteval,
+ vma_mmu_pagesize(vma));
+ } else {
+ dec_mm_counter(mm, mm_counter(page));
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ }
+
+ } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+ /*
+ * The guest indicated that the page content is of no
+ * interest anymore. Simply discard the pte, vmscan
+ * will take care of the rest.
+ * A future reference will then fault in a new zero
+ * page. When userfaultfd is active, we must not drop
+ * this page though, as its main user (postcopy
+ * migration) will not expect userfaults on already
+ * copied pages.
+ */
+ dec_mm_counter(mm, mm_counter(page));
+ /* We have to invalidate as we cleared the pte */
+ mmu_notifier_invalidate_range(mm, address,
+ address + PAGE_SIZE);
+ } else {
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ if (arch_unmap_one(mm, vma, address, pteval) < 0) {
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ if (pte_write(pteval))
+ entry = make_writable_migration_entry(
+ page_to_pfn(subpage));
+ else
+ entry = make_readable_migration_entry(
+ page_to_pfn(subpage));
+
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ set_pte_at(mm, address, pvmw.pte, swp_pte);
+ /*
+ * No need to invalidate here it will synchronize on
+ * against the special swap migration pte.
+ */
+ }
+
+ /*
+ * No need to call mmu_notifier_invalidate_range() it has be
+ * done above for all cases requiring it to happen under page
+ * table lock before mmu_notifier_invalidate_range_end()
+ *
+ * See Documentation/vm/mmu_notifier.rst
+ */
+ page_remove_rmap(subpage, PageHuge(page));
+ put_page(page);
+ }
+
+ mmu_notifier_invalidate_range_end(&range);
+
+ return ret;
+}
+
+/**
+ * try_to_migrate - try to replace all page table mappings with swap entries
+ * @page: the page to replace page table entries for
+ * @flags: action and flags
+ *
+ * Tries to remove all the page table entries which are mapping this page and
+ * replace them with special swap entries. Caller must hold the page lock.
+ *
+ * If is successful, return true. Otherwise, false.
+ */
+bool try_to_migrate(struct page *page, enum ttu_flags flags)
+{
+ struct rmap_walk_control rwc = {
+ .rmap_one = try_to_migrate_one,
+ .arg = (void *)flags,
+ .done = page_not_mapped,
+ .anon_lock = page_lock_anon_vma_read,
+ };
+
+ /*
+ * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and
+ * TTU_SPLIT_HUGE_PMD flags.
+ */
+ if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD)))
+ return false;
+
/*
* During exec, a temporary VMA is setup and later moved.
* The VMA is moved under the anon_vma lock but not the
@@ -1766,8 +1939,7 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
* locking requirements of exec(), migration skips
* temporary VMAs until after exec() completes.
*/
- if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
- && !PageKsm(page) && PageAnon(page))
+ if (!PageKsm(page) && PageAnon(page))
rwc.invalid_vma = invalid_migration_vma;

if (flags & TTU_RMAP_LOCKED)
--
2.20.1