[PATCH v2 1/2] mm: clear pte for folios that are zero filled

From: Usama Arif
Date: Tue Jun 04 2024 - 07:00:16 EST


Approximately 10-20% of pages to be swapped out are zero pages [1].
Rather than reading/writing these pages to flash, which results in
increased I/O and flash wear, the pte can be cleared for those
addresses at unmap time while shrinking the folio list. When this
later causes a page fault, do_pte_missing() will take care of the
page. With this patch, NVMe writes in Meta's server fleet decreased
by almost 10% with a conventional swap setup (zswap disabled).

[1] https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/

Signed-off-by: Usama Arif <usamaarif642@xxxxxxxxx>
---
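
Not part of the patch, just an illustration for review: the mm/vmscan.c
hunk below adds is_folio_page_zero_filled()/is_folio_zero_filled() to
detect zero-filled folios before unmap. The standalone userspace sketch
below shows the same per-page scan (page_is_zero_filled() and main() are
names made up for this example only): treat the page as an array of
unsigned longs, test the last word first, then the rest, and report
zero-filled only if every word is zero.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define PAGE_SIZE 4096

static bool page_is_zero_filled(const void *page)
{
        const unsigned long *data = page;
        size_t last = PAGE_SIZE / sizeof(*data) - 1;

        /* Test the last word first, then scan the remainder. */
        if (data[last])
                return false;

        for (size_t i = 0; i < last; i++)
                if (data[i])
                        return false;

        return true;
}

int main(void)
{
        /* Static storage, so the buffer starts out zero-initialized. */
        static unsigned long page[PAGE_SIZE / sizeof(unsigned long)];

        printf("zero page:  %d\n", page_is_zero_filled(page));
        page[(PAGE_SIZE / sizeof(unsigned long)) - 1] = 1;
        printf("dirty page: %d\n", page_is_zero_filled(page));
        return 0;
}
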
include/linux/rmap.h | 1 +
mm/rmap.c | 163 ++++++++++++++++++++++---------------------
mm/vmscan.c | 89 ++++++++++++++++-------
3 files changed, 150 insertions(+), 103 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bb53e5920b88..b36db1e886e4 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -100,6 +100,7 @@ enum ttu_flags {
* do a final flush if necessary */
TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock:
* caller holds it */
+ TTU_ZERO_FOLIO = 0x100, /* clear pte of zero-filled folio */
};

#ifdef CONFIG_MMU
diff --git a/mm/rmap.c b/mm/rmap.c
index 52357d79917c..d98f70876327 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1819,96 +1819,101 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
dec_mm_counter(mm, mm_counter(folio));
} else if (folio_test_anon(folio)) {
- swp_entry_t entry = page_swap_entry(subpage);
- pte_t swp_pte;
- /*
- * Store the swap location in the pte.
- * See handle_pte_fault() ...
- */
- if (unlikely(folio_test_swapbacked(folio) !=
- folio_test_swapcache(folio))) {
+ if (flags & TTU_ZERO_FOLIO) {
+ pte_clear(mm, address, pvmw.pte);
+ dec_mm_counter(mm, MM_ANONPAGES);
+ } else {
+ swp_entry_t entry = page_swap_entry(subpage);
+ pte_t swp_pte;
/*
- * unmap_huge_pmd_locked() will unmark a
- * PMD-mapped folio as lazyfree if the folio or
- * its PMD was redirtied.
+ * Store the swap location in the pte.
+ * See handle_pte_fault() ...
*/
- if (!pmd_mapped)
- WARN_ON_ONCE(1);
- goto walk_done_err;
- }
+ if (unlikely(folio_test_swapbacked(folio) !=
+ folio_test_swapcache(folio))) {
+ /*
+ * unmap_huge_pmd_locked() will unmark a
+ * PMD-mapped folio as lazyfree if the folio or
+ * its PMD was redirtied.
+ */
+ if (!pmd_mapped)
+ WARN_ON_ONCE(1);
+ goto walk_done_err;
+ }

- /* MADV_FREE page check */
- if (!folio_test_swapbacked(folio)) {
- int ref_count, map_count;
+ /* MADV_FREE page check */
+ if (!folio_test_swapbacked(folio)) {
+ int ref_count, map_count;

- /*
- * Synchronize with gup_pte_range():
- * - clear PTE; barrier; read refcount
- * - inc refcount; barrier; read PTE
- */
- smp_mb();
+ /*
+ * Synchronize with gup_pte_range():
+ * - clear PTE; barrier; read refcount
+ * - inc refcount; barrier; read PTE
+ */
+ smp_mb();

- ref_count = folio_ref_count(folio);
- map_count = folio_mapcount(folio);
+ ref_count = folio_ref_count(folio);
+ map_count = folio_mapcount(folio);

- /*
- * Order reads for page refcount and dirty flag
- * (see comments in __remove_mapping()).
- */
- smp_rmb();
+ /*
+ * Order reads for page refcount and dirty flag
+ * (see comments in __remove_mapping()).
+ */
+ smp_rmb();

- /*
- * The only page refs must be one from isolation
- * plus the rmap(s) (dropped by discard:).
- */
- if (ref_count == 1 + map_count &&
- !folio_test_dirty(folio)) {
- dec_mm_counter(mm, MM_ANONPAGES);
- goto discard;
- }
+ /*
+ * The only page refs must be one from isolation
+ * plus the rmap(s) (dropped by discard:).
+ */
+ if (ref_count == 1 + map_count &&
+ !folio_test_dirty(folio)) {
+ dec_mm_counter(mm, MM_ANONPAGES);
+ goto discard;
+ }

- /*
- * If the folio was redirtied, it cannot be
- * discarded. Remap the page to page table.
- */
- set_pte_at(mm, address, pvmw.pte, pteval);
- folio_set_swapbacked(folio);
- goto walk_done_err;
- }
+ /*
+ * If the folio was redirtied, it cannot be
+ * discarded. Remap the page to page table.
+ */
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ folio_set_swapbacked(folio);
+ goto walk_done_err;
+ }

- if (swap_duplicate(entry) < 0) {
- set_pte_at(mm, address, pvmw.pte, pteval);
- goto walk_done_err;
- }
- if (arch_unmap_one(mm, vma, address, pteval) < 0) {
- swap_free(entry);
- set_pte_at(mm, address, pvmw.pte, pteval);
- goto walk_done_err;
- }
+ if (swap_duplicate(entry) < 0) {
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ goto walk_done_err;
+ }
+ if (arch_unmap_one(mm, vma, address, pteval) < 0) {
+ swap_free(entry);
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ goto walk_done_err;
+ }

- /* See folio_try_share_anon_rmap(): clear PTE first. */
- if (anon_exclusive &&
- folio_try_share_anon_rmap_pte(folio, subpage)) {
- swap_free(entry);
- set_pte_at(mm, address, pvmw.pte, pteval);
- goto walk_done_err;
- }
- if (list_empty(&mm->mmlist)) {
- spin_lock(&mmlist_lock);
- if (list_empty(&mm->mmlist))
- list_add(&mm->mmlist, &init_mm.mmlist);
- spin_unlock(&mmlist_lock);
+ /* See folio_try_share_anon_rmap(): clear PTE first. */
+ if (anon_exclusive &&
+ folio_try_share_anon_rmap_pte(folio, subpage)) {
+ swap_free(entry);
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ goto walk_done_err;
+ }
+ if (list_empty(&mm->mmlist)) {
+ spin_lock(&mmlist_lock);
+ if (list_empty(&mm->mmlist))
+ list_add(&mm->mmlist, &init_mm.mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ dec_mm_counter(mm, MM_ANONPAGES);
+ inc_mm_counter(mm, MM_SWAPENTS);
+ swp_pte = swp_entry_to_pte(entry);
+ if (anon_exclusive)
+ swp_pte = pte_swp_mkexclusive(swp_pte);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ set_pte_at(mm, address, pvmw.pte, swp_pte);
}
- dec_mm_counter(mm, MM_ANONPAGES);
- inc_mm_counter(mm, MM_SWAPENTS);
- swp_pte = swp_entry_to_pte(entry);
- if (anon_exclusive)
- swp_pte = pte_swp_mkexclusive(swp_pte);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- set_pte_at(mm, address, pvmw.pte, swp_pte);
} else {
/*
* This is a locked file-backed folio,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b9170f767353..d54f44b556f0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1026,6 +1026,38 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
}

+static bool is_folio_page_zero_filled(struct folio *folio, int i)
+{
+ unsigned long *data;
+ unsigned int pos, last_pos = PAGE_SIZE / sizeof(*data) - 1;
+ bool ret = false;
+
+ data = kmap_local_folio(folio, i * PAGE_SIZE);
+
+ if (data[last_pos])
+ goto out;
+
+ for (pos = 0; pos < last_pos; pos++) {
+ if (data[pos])
+ goto out;
+ }
+ ret = true;
+out:
+ kunmap_local(data);
+ return ret;
+}
+
+static bool is_folio_zero_filled(struct folio *folio)
+{
+ unsigned int i;
+
+ for (i = 0; i < folio_nr_pages(folio); i++) {
+ if (!is_folio_page_zero_filled(folio, i))
+ return false;
+ }
+ return true;
+}
+
/*
* shrink_folio_list() returns the number of reclaimed pages
*/
@@ -1053,6 +1085,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
enum folio_references references = FOLIOREF_RECLAIM;
bool dirty, writeback;
unsigned int nr_pages;
+ bool folio_zero_filled = false;

cond_resched();

@@ -1270,6 +1303,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
nr_pages = 1;
}

+ folio_zero_filled = is_folio_zero_filled(folio);
/*
* The folio is mapped into the page tables of one or more
* processes. Try to unmap it here.
@@ -1295,6 +1329,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
if (folio_test_large(folio) && list_empty(&folio->_deferred_list))
flags |= TTU_SYNC;

+ if (folio_zero_filled)
+ flags |= TTU_ZERO_FOLIO;
+
try_to_unmap(folio, flags);
if (folio_mapped(folio)) {
stat->nr_unmap_fail += nr_pages;
@@ -1358,32 +1395,36 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
* starts and then write it out here.
*/
try_to_unmap_flush_dirty();
- switch (pageout(folio, mapping, &plug)) {
- case PAGE_KEEP:
- goto keep_locked;
- case PAGE_ACTIVATE:
- goto activate_locked;
- case PAGE_SUCCESS:
- stat->nr_pageout += nr_pages;
+ if (folio_zero_filled) {
+ folio_clear_dirty(folio);
+ } else {
+ switch (pageout(folio, mapping, &plug)) {
+ case PAGE_KEEP:
+ goto keep_locked;
+ case PAGE_ACTIVATE:
+ goto activate_locked;
+ case PAGE_SUCCESS:
+ stat->nr_pageout += nr_pages;

- if (folio_test_writeback(folio))
- goto keep;
- if (folio_test_dirty(folio))
- goto keep;
+ if (folio_test_writeback(folio))
+ goto keep;
+ if (folio_test_dirty(folio))
+ goto keep;

- /*
- * A synchronous write - probably a ramdisk. Go
- * ahead and try to reclaim the folio.
- */
- if (!folio_trylock(folio))
- goto keep;
- if (folio_test_dirty(folio) ||
- folio_test_writeback(folio))
- goto keep_locked;
- mapping = folio_mapping(folio);
- fallthrough;
- case PAGE_CLEAN:
- ; /* try to free the folio below */
+ /*
+ * A synchronous write - probably a ramdisk. Go
+ * ahead and try to reclaim the folio.
+ */
+ if (!folio_trylock(folio))
+ goto keep;
+ if (folio_test_dirty(folio) ||
+ folio_test_writeback(folio))
+ goto keep_locked;
+ mapping = folio_mapping(folio);
+ fallthrough;
+ case PAGE_CLEAN:
+ ; /* try to free the folio below */
+ }
}
}

--
2.43.0