Approximately 10-20% of pages to be swapped out are zero pages [1].
Rather than reading/writing these pages to flash, which results
in increased I/O and flash wear, the PTE can be cleared for those
addresses at unmap time while shrinking the folio list. When a later
access to such an address causes a page fault, do_pte_missing() will
take care of the page.
With this patch, NVMe writes in the Meta server fleet decreased
by almost 10% with a conventional swap setup (zswap disabled).
[1] https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/
Signed-off-by: Usama Arif <usamaarif642@xxxxxxxxx>
---
include/linux/rmap.h | 1 +
mm/rmap.c | 163 ++++++++++++++++++++++---------------------
mm/vmscan.c | 89 ++++++++++++++++-------
3 files changed, 150 insertions(+), 103 deletions(-)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bb53e5920b88..b36db1e886e4 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -100,6 +100,7 @@ enum ttu_flags {
* do a final flush if necessary */
TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock:
* caller holds it */
+ TTU_ZERO_FOLIO = 0x100,/* zero folio */
};
#ifdef CONFIG_MMU
diff --git a/mm/rmap.c b/mm/rmap.c
index 52357d79917c..d98f70876327 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1819,96 +1819,101 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
dec_mm_counter(mm, mm_counter(folio));
} else if (folio_test_anon(folio)) {
- swp_entry_t entry = page_swap_entry(subpage);
- pte_t swp_pte;
- /*
- * Store the swap location in the pte.
- * See handle_pte_fault() ...
- */
- if (unlikely(folio_test_swapbacked(folio) !=
- folio_test_swapcache(folio))) {
+ if (flags & TTU_ZERO_FOLIO) {
+ pte_clear(mm, address, pvmw.pte);
+ dec_mm_counter(mm, MM_ANONPAGES);