[PATCH 08/13] mm: swap in PMD swap entries as whole THPs during swapoff

From: Usama Arif

Date: Mon Apr 27 2026 - 06:19:00 EST

Add unuse_pmd() and call it from unuse_pmd_range() to swap in
PMD-level swap entries as whole THPs during swapoff. This mirrors
the existing unuse_pte_range() but operates at PMD granularity.

The PMD swap entry is split into PTE-level entries via
__split_huge_pmd() and a non-zero error is returned when:

  - the PMD-order folio cannot be allocated, charged, or swapped in;
  - the cached folio is no longer PMD-sized (e.g. split in the swap
    cache by deferred_split_scan() or memory_failure() while the PMD
    swap entry was installed); or
  - the folio is not uptodate.

On a non-zero return, unuse_pmd_range() falls through to
unuse_pte_range(), which handles the individual entries at order-0.

swapin_alloc_pmd_folio() is a separate function in swap_state.c
as it will be reused in swapin in a later patch.

Signed-off-by: Usama Arif <usama.arif@xxxxxxxxx>
---
mm/swap.h | 7 +++
mm/swap_state.c | 35 +++++++++++++
mm/swapfile.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 179 insertions(+)

diff --git a/mm/swap.h b/mm/swap.h
index a77016f2423b..76752df71693 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -301,6 +301,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
struct vm_fault *vmf);
struct folio *swapin_folio(swp_entry_t entry, struct folio *folio);
+struct folio *swapin_alloc_pmd_folio(swp_entry_t entry, struct mm_struct *mm);
void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
unsigned long addr);

@@ -438,6 +439,12 @@ static inline struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
return NULL;
}

+static inline struct folio *swapin_alloc_pmd_folio(swp_entry_t entry,
+ struct mm_struct *mm)
+{
+ return NULL; /* stub: never produces a folio; callers must handle NULL */
+}
+
static inline void swap_update_readahead(struct folio *folio,
struct vm_area_struct *vma, unsigned long addr)
{
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 1415a5c54a43..c2e8c76658f5 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -584,6 +584,41 @@ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
return swapcache;
}

+#ifdef CONFIG_THP_SWAP
+/**
+ * swapin_alloc_pmd_folio - allocate, charge, and read a PMD-sized swap folio.
+ * @entry: starting swap entry to swap in
+ * @mm: mm to charge for the swap-in
+ *
+ * Allocate a HPAGE_PMD_ORDER folio, charge it to @mm's memcg for @entry, and
+ * issue the swap-in via swapin_folio(). Used by callers that need to map a
+ * PMD swap entry as a whole THP (PMD swapoff).
+ *
+ * Return: the folio handed back by swapin_folio(): normally the new folio,
+ * but it may be another folio already cached (callers must check its size).
+ * NULL on alloc/charge/swapin failure; the caller should then split the PMD.
+ */
+struct folio *swapin_alloc_pmd_folio(swp_entry_t entry, struct mm_struct *mm)
+{
+	struct folio *folio, *swapcache;
+
+	folio = folio_alloc(GFP_HIGHUSER_MOVABLE, HPAGE_PMD_ORDER);
+	if (!folio)
+		return NULL;
+
+	if (mem_cgroup_swapin_charge_folio(folio, mm, GFP_KERNEL, entry)) {
+		folio_put(folio);
+		return NULL;
+	}
+
+	/* Ours is unused if swapin_folio() failed or returned another folio. */
+	swapcache = swapin_folio(entry, folio);
+	if (swapcache != folio)
+		folio_put(folio);
+	return swapcache;
+}
+#endif /* CONFIG_THP_SWAP */
+
/*
* Locate a page of swap in physical memory, reserving swap cache space
* and reading the disk if it is not already cached.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 390f191be9a6..7256edf4ce66 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -42,6 +42,7 @@
#include <linux/suspend.h>
#include <linux/zswap.h>
#include <linux/plist.h>
+#include <linux/huge_mm.h>

#include <asm/tlbflush.h>
#include <linux/leafops.h>
@@ -2519,6 +2520,130 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
return 0;
}

+/*
+ * unuse_pmd - install @folio as a PMD mapping in place of a PMD swap entry.
+ *
+ * @folio is expected to be locked and swapped in by the caller. The PMD
+ * lock is taken and dropped internally. Return values:
+ *   0       - the PMD at @addr now maps @folio.
+ *   -EAGAIN - @folio does not match @entry, or the PMD no longer holds
+ *             @entry once the PMD lock is held; the PMD is left as is.
+ *   -EIO    - @folio is not uptodate; the PMD has been split with
+ *             __split_huge_pmd() so unuse_pte_range() can take over.
+ */
+static int unuse_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+		     unsigned long addr, softleaf_t entry,
+		     struct folio *folio)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	rmap_t flags = RMAP_NONE;
+	pmd_t swp_pmd, huge_pmd;
+	spinlock_t *lock;
+
+	if (unlikely(!folio_matches_swap_entry(folio, entry)))
+		return -EAGAIN;
+
+	if (unlikely(!folio_test_uptodate(folio))) {
+		__split_huge_pmd(vma, pmd, addr, false);
+		return -EIO;
+	}
+
+	lock = pmd_lock(mm, pmd);
+	swp_pmd = pmdp_get(pmd);
+
+	/* Bail out unless the PMD still holds exactly @entry. */
+	if (!(pmd_is_swap_entry(swp_pmd) &&
+	      softleaf_from_pmd(swp_pmd).val == entry.val)) {
+		spin_unlock(lock);
+		return -EAGAIN;
+	}
+
+	/*
+	 * Architectures may keep extra metadata indexed by swap entry that
+	 * must be restored to the folio; do it before folio_put_swap().
+	 */
+	arch_swap_restore(folio_swap(entry, folio), folio);
+
+	/* One PMD worth of swap entries turns into anonymous pages. */
+	add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+	add_mm_counter(mm, MM_SWAPENTS, -HPAGE_PMD_NR);
+
+	/* Build the present PMD, carrying over soft-dirty and uffd-wp. */
+	huge_pmd = pmd_mkold(folio_mk_pmd(folio, vma->vm_page_prot));
+	if (pmd_swp_soft_dirty(swp_pmd))
+		huge_pmd = pmd_mksoft_dirty(huge_pmd);
+	if (pmd_swp_uffd_wp(swp_pmd))
+		huge_pmd = pmd_mkuffd_wp(huge_pmd);
+
+	if (pmd_swp_exclusive(swp_pmd))
+		flags |= RMAP_EXCLUSIVE;
+
+	/* Extra folio reference, taken before the rmap is added. */
+	folio_get(folio);
+	if (folio_test_anon(folio))
+		folio_add_anon_rmap_pmd(folio, folio_page(folio, 0), vma, addr,
+					flags);
+	else
+		folio_add_new_anon_rmap(folio, vma, addr, flags);
+
+	set_pmd_at(mm, addr, pmd, huge_pmd);
+	folio_put_swap(folio, NULL);
+
+	spin_unlock(lock);
+
+	/* Called only after the PMD lock has been released. */
+	folio_free_swap(folio);
+	return 0;
+}
+
+/*
+ * unuse_pmd_entry - try to bring a PMD swap entry back in as one THP.
+ *
+ * Uses the folio already in the swap cache for @entry, or swaps in a new
+ * PMD-order one. Returns 0 when the THP was mapped. Otherwise returns:
+ *   -ENOMEM - no folio could be obtained (swapin_alloc_pmd_folio() failed);
+ *   -EAGAIN - the folio found is not PMD-sized (e.g. split in the swap
+ *             cache by deferred_split_scan() or memory_failure() while the
+ *             PMD swap entry was installed);
+ *   or whatever error unuse_pmd() reported.
+ * In the -ENOMEM and -EAGAIN cases here the PMD swap entry is split so
+ * that unuse_pte_range() can swap the per-slot folios in individually.
+ */
+static int unuse_pmd_entry(struct vm_area_struct *vma, pmd_t *pmd,
+			   unsigned long addr, softleaf_t entry)
+{
+	struct folio *folio = swap_cache_get_folio(entry);
+	int err;
+
+	if (!folio)
+		folio = swapin_alloc_pmd_folio(entry, vma->vm_mm);
+	if (!folio) {
+		err = -ENOMEM;
+		goto split;
+	}
+
+	folio_lock(folio);
+	folio_wait_writeback(folio);
+
+	if (folio_nr_pages(folio) == HPAGE_PMD_NR) {
+		err = unuse_pmd(vma, pmd, addr, entry, folio);
+		folio_unlock(folio);
+		folio_put(folio);
+		return err;
+	}
+
+	/*
+	 * The PMD swap entry no longer maps a single contiguous folio:
+	 * release this one and fall back to splitting the PMD.
+	 */
+	folio_unlock(folio);
+	folio_put(folio);
+	err = -EAGAIN;
+
+split:
+	__split_huge_pmd(vma, pmd, addr, false);
+	return err;
+}
+
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
unsigned int type)
@@ -2531,6 +2656,18 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
do {
cond_resched();
next = pmd_addr_end(addr, end);
+
+ pmd_t pmdval = pmdp_get(pmd);
+
+ if (pmd_is_swap_entry(pmdval)) {
+ softleaf_t sl = softleaf_from_pmd(pmdval);
+
+ if (swp_type(sl) == type) {
+ if (!unuse_pmd_entry(vma, pmd, addr, sl))
+ continue;
+ }
+ }
+
ret = unuse_pte_range(vma, pmd, addr, next, type);
if (ret)
return ret;
--
2.52.0