Re: [v2 15/16] mm: install PMD swap entries on swap-out
From: Usama Arif
Date: Mon Jun 22 2026 - 12:53:57 EST
On 22/06/2026 14:50, Alexandre Ghiti wrote:
> Hi Usama,
>
> On 6/2/26 16:24, Usama Arif wrote:
>> Reclaim today splits a PMD-mapped anonymous THP into 512 PTE swap
>> entries before unmap, losing the huge mapping across the swap
>> round-trip and forcing khugepaged to rebuild it later. The
>> contiguous swap range was already secured when the folio was added
>> to the swap cache (a non-contiguous allocation would have split the
>> folio earlier), so the PMD can be replaced by a single PMD-level
>> swap entry instead.
>> This patch mirrors the existing PTE swap-out path at PMD
>> granularity:
>> - shrink_folio_list() drops TTU_SPLIT_HUGE_PMD for PMD-mappable
>> swapcache folios, gated on zswap_never_enabled() since zswap
>> cannot reconstruct a 2 MB folio from per-page blobs (it is best
>> to handle the zswap case separately).
>> - try_to_unmap_one() now has a PMD branch that calls
>> set_pmd_swap_entry() and adjusts MM_ANONPAGES / MM_SWAPENTS by
>> HPAGE_PMD_NR before walk_done. TTU_SPLIT_HUGE_PMD remains the
>> fallback.
>> - set_pmd_swap_entry() is the installer. Mirroring the PTE
>> swap-out sequence at PMD granularity, it clears the present
>> mapping (keeping the original for rollback), bumps the swap_map
>> refcount for the folio's 512 slots, drops the exclusive mark if
>> the page was anon-exclusive, propagates the dirty bit to the
>> folio so writeback is not lost, and installs a swap PMD that
>> preserves the original soft-dirty / uffd-wp / exclusive bits.
>> Any failing step rolls back the present mapping.
>>
>> The swap entry value matches what 512 PTE swap entries would
>> encode, so swap_map refcounting is unchanged: each of the 512 slots
>> carries a count of 1, released individually on later split or
>> together on swap-in.
>>
>> Signed-off-by: Usama Arif <usama.arif@xxxxxxxxx>
>> ---
>> include/linux/huge_mm.h | 2 +
>> include/linux/vm_event_item.h | 1 +
>> mm/huge_memory.c | 78 +++++++++++++++++++++++++++++++++++
>> mm/rmap.c | 20 +++++++++
>> mm/vmscan.c | 14 ++++++-
>> mm/vmstat.c | 1 +
>> 6 files changed, 115 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 9ec475ccfc91..b746f8c8db69 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -533,6 +533,8 @@ vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf);
>> #ifdef CONFIG_THP_SWAP
>> vm_fault_t do_huge_pmd_swap_page(struct vm_fault *vmf);
>> +int set_pmd_swap_entry(struct page_vma_mapped_walk *pvmw,
>> + struct folio *folio);
>> #else
>> static inline vm_fault_t do_huge_pmd_swap_page(struct vm_fault *vmf)
>> {
>> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
>> index 03fe95f5a020..7267c06674c0 100644
>> --- a/include/linux/vm_event_item.h
>> +++ b/include/linux/vm_event_item.h
>> @@ -108,6 +108,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
>> THP_ZERO_PAGE_ALLOC_FAILED,
>> THP_SWPOUT,
>> THP_SWPOUT_FALLBACK,
>> + THP_SWPOUT_PMD,
>> #endif
>> #ifdef CONFIG_BALLOON
>> BALLOON_INFLATE,
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 3fc2f6e5eafa..1fed86065fd9 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -5385,3 +5385,81 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
>> trace_remove_migration_pmd(address, pmd_val(pmde));
>> }
>> #endif
>> +
>> +#ifdef CONFIG_THP_SWAP
>> +/**
>> + * set_pmd_swap_entry() - Replace a PMD mapping with a PMD-level swap entry.
>> + * @pvmw: Page vma mapped walk context, must have pvmw->pmd set and
>> + * pvmw->pte NULL (i.e. PMD-mapped).
>> + * @folio: The folio being swapped out. Must be in the swap cache.
>> + *
>> + * This installs a PMD-level swap entry in place of a present PMD mapping,
>> + * avoiding the need to split the PMD into PTE-level swap entries.
>> + *
>> + * Return: 0 on success, negative error code on failure.
>> + */
>> +int set_pmd_swap_entry(struct page_vma_mapped_walk *pvmw,
>> + struct folio *folio)
>> +{
>> + struct vm_area_struct *vma = pvmw->vma;
>> + struct mm_struct *mm = vma->vm_mm;
>> + unsigned long address = pvmw->address;
>> + unsigned long haddr = address & HPAGE_PMD_MASK;
>> + struct page *page = folio_page(folio, 0);
>> + bool anon_exclusive;
>> + pmd_t pmdval;
>> + swp_entry_t entry;
>> + pmd_t pmdswp;
>> +
>> + if (!(pvmw->pmd && !pvmw->pte))
>> + return 0;
>> +
>> + VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
>> + VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
>> +
>> + if (unlikely(folio_test_swapbacked(folio) !=
>> + folio_test_swapcache(folio))) {
>> + WARN_ON_ONCE(1);
>> + return -EBUSY;
>> + }
>> +
>> + flush_cache_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
>> +
>> + pmdval = pmdp_invalidate(vma, haddr, pvmw->pmd);
>> +
>> + /* Update high watermark before we lower rss */
>> + update_hiwater_rss(mm);
>> +
>> + if (folio_dup_swap(folio, NULL) < 0) {
>> + set_pmd_at(mm, haddr, pvmw->pmd, pmdval);
>> + return -ENOMEM;
>> + }
>> +
>> + /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
>> + anon_exclusive = PageAnonExclusive(page);
>> + if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
>> + folio_put_swap(folio, NULL);
>> + set_pmd_at(mm, haddr, pvmw->pmd, pmdval);
>> + return -EBUSY;
>> + }
>> +
>> + if (pmd_dirty(pmdval))
>> + folio_mark_dirty(folio);
>> +
>> + entry = folio->swap;
>> + pmdswp = softleaf_to_pmd(entry);
>> + if (pmd_soft_dirty(pmdval))
>> + pmdswp = pmd_swp_mksoft_dirty(pmdswp);
>> + if (pmd_uffd_wp(pmdval))
>> + pmdswp = pmd_swp_mkuffd_wp(pmdswp);
>> + if (anon_exclusive)
>> + pmdswp = pmd_swp_mkexclusive(pmdswp);
>> + set_pmd_at(mm, haddr, pvmw->pmd, pmdswp);
>> +
>> + folio_remove_rmap_pmd(folio, page, vma);
>> + folio_put(folio);
>> +
>> + count_vm_event(THP_SWPOUT_PMD);
>> + return 0;
>> +}
>> +#endif /* CONFIG_THP_SWAP */
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 0fb7a1b82cf3..ffc7aa62a29e 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -2079,6 +2079,26 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>> goto walk_abort;
>> }
>> +#ifdef CONFIG_THP_SWAP
>> + /*
>> + * If the folio is in the swap cache and we're not
>> + * asked to split, install a PMD-level swap entry.
>> + */
>> + if (!(flags & TTU_SPLIT_HUGE_PMD) &&
>> + folio_test_anon(folio) &&
>> + folio_test_swapcache(folio)) {
>> + if (set_pmd_swap_entry(&pvmw, folio))
>> + goto walk_abort;
>> +
>> + mm_prepare_for_swap_entries(mm);
>> + add_mm_counter(mm, MM_ANONPAGES,
>> + -HPAGE_PMD_NR);
>> + add_mm_counter(mm, MM_SWAPENTS,
>> + HPAGE_PMD_NR);
>> + goto walk_done;
>> + }
>> +#endif
>> +
>> if (flags & TTU_SPLIT_HUGE_PMD) {
>> /*
>> * We temporarily have to drop the PTL and
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index e8a90911bf88..0f376fbf9bb3 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -64,6 +64,7 @@
>> #include <linux/swapops.h>
>> #include <linux/sched/sysctl.h>
>> +#include <linux/zswap.h>
>> #include "internal.h"
>> #include "swap.h"
>> @@ -1332,7 +1333,18 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>> enum ttu_flags flags = TTU_BATCH_FLUSH;
>> bool was_swapbacked = folio_test_swapbacked(folio);
>> - if (folio_test_pmd_mappable(folio))
>> + /*
>> + * With THP_SWAP, PMD-mappable folios already in the
>> + * swap cache can be unmapped with a PMD-level swap
>> + * entry, avoiding the cost of splitting the PMD.
>> + * Skip this when zswap has been enabled because
>> + * zswap stores pages individually and cannot
>> + * reconstruct a large folio on swap-in.
>> + */
>> + if (folio_test_pmd_mappable(folio) &&
>> + !(IS_ENABLED(CONFIG_THP_SWAP) &&
>> + folio_test_swapcache(folio) &&
>> + zswap_never_enabled()))
>
>
> While working on the PMD zswap support, I noticed the following (small) problem: if zswap is enabled at runtime, PMD swap entries may already have been created before that point. After zswap is enabled, zswap_load() would then fail on those entries with -EINVAL, and swap_read_folio() would not even try to read the folio from the disk.
Thanks Alexandre!
I think it's similar to what Lance raised in https://lore.kernel.org/all/20260612142124.73367-1-lance.yang@xxxxxxxxx/.
What I have locally is at the end, but I need to test it and spend more time on it.
I will look at your change as well before sending the next revision (it looks better at first glance)!
The current strategy for this series is described in: https://lore.kernel.org/all/c5ae6e6f-9b48-4ec2-a1c1-33ec3b1d3143@xxxxxxxxx/
diff --git a/mm/zswap.c b/mm/zswap.c
index 761cd699e0a3..f94133d876a1 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1592,17 +1592,40 @@ int zswap_load(struct folio *folio)
if (zswap_never_enabled())
return -ENOENT;
+ entry = xa_load(tree, offset);
+
/*
* Large folios should not be swapped in while zswap is being used, as
* they are not properly handled. Zswap does not properly load large
* folios, and a large folio may only be partially in zswap.
+ *
+ * If no zswap entry exists for the folio, however, the caller can read
+ * it from the backing swap device. Scan the covered slots so a PMD
+ * swapin racing with zswap enable does not SIGBUS solely because zswap
+ * became available after the PMD swap entry was installed.
*/
- if (WARN_ON_ONCE(folio_test_large(folio))) {
+ if (folio_test_large(folio)) {
+ unsigned int type = swp_type(swp);
+ bool found = !!entry;
+ long index;
+
+ if (!found) {
+ for (index = 1; index < folio_nr_pages(folio); index++) {
+ swp_entry_t cur = swp_entry(type, offset + index);
+
+ if (xa_load(swap_zswap_tree(cur), offset + index)) {
+ found = true;
+ break;
+ }
+ }
+ }
+ if (!found)
+ return -ENOENT;
+
+ WARN_ON_ONCE(1);
folio_unlock(folio);
return -EINVAL;
}
-
- entry = xa_load(tree, offset);
if (!entry)
return -ENOENT;
>
> I fixed it with the following patch, let me know what you think:
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 761cd699e0a3..9c3931d42a42 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1571,10 +1571,9 @@ bool zswap_store(struct folio *folio)
> * NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page()
> * will SIGBUS).
> *
> - * -EINVAL: if the swapped out content was in zswap, but the page belongs
> - * to a large folio, which is not supported by zswap. The folio is unlocked,
> - * but NOT marked up-to-date, so that an IO error is emitted (e.g.
> - * do_swap_page() will SIGBUS).
> + * -EIO: if a slot in a large-folio range is unexpectedly still in zswap.
> + * The folio is unlocked, but NOT marked up-to-date, so that an IO error is
> + * emitted (e.g. do_swap_page() will SIGBUS).
> *
> * -ENOENT: if the swapped out content was not in zswap. The folio remains
> * locked on return.
> @@ -1593,13 +1592,29 @@ int zswap_load(struct folio *folio)
> return -ENOENT;
>
> /*
> - * Large folios should not be swapped in while zswap is being used, as
> - * they are not properly handled. Zswap does not properly load large
> - * folios, and a large folio may only be partially in zswap.
> + * A large (PMD) folio reaches zswap_load() only when its whole range
> + * is on disk: do_huge_pmd_swap_page() splits the PMD swap entry to
> + * PTEs and faults order-0 whenever any slot is still in zswap, so
> + * zswap never reconstructs a large folio. Confirm the range is
> + * entirely absent from zswap and return -ENOENT so the caller reads it
> + * from disk; if a slot is unexpectedly still in zswap, fail the read
> + * rather than return partially-initialised data.
> */
> - if (WARN_ON_ONCE(folio_test_large(folio))) {
> - folio_unlock(folio);
> - return -EINVAL;
> + if (folio_test_large(folio)) {
> + unsigned long nr_pages = folio_nr_pages(folio);
> + XA_STATE(xas, tree, offset);
> + bool any;
> +
> + /* One xa_state walk over the range, not a per-slot xa_load(). */
> + rcu_read_lock();
> + any = xas_find(&xas, offset + nr_pages - 1) != NULL;
> + rcu_read_unlock();
> +
> + if (any) {
> + folio_unlock(folio);
> + return -EIO;
> + }
> + return -ENOENT;
> }
>
> entry = xa_load(tree, offset);
>
>
> Thanks,
>
> Alex
>
>
>> flags |= TTU_SPLIT_HUGE_PMD;
>> /*
>> * Without TTU_SYNC, try_to_unmap will only begin to
>> diff --git a/mm/vmstat.c b/mm/vmstat.c
>> index f534972f517d..9b4963a7eb04 100644
>> --- a/mm/vmstat.c
>> +++ b/mm/vmstat.c
>> @@ -1421,6 +1421,7 @@ const char * const vmstat_text[] = {
>> [I(THP_ZERO_PAGE_ALLOC_FAILED)] = "thp_zero_page_alloc_failed",
>> [I(THP_SWPOUT)] = "thp_swpout",
>> [I(THP_SWPOUT_FALLBACK)] = "thp_swpout_fallback",
>> + [I(THP_SWPOUT_PMD)] = "thp_swpout_pmd",
>> #endif
>> #ifdef CONFIG_BALLOON
>> [I(BALLOON_INFLATE)] = "balloon_inflate",