Re: [v2 15/16] mm: install PMD swap entries on swap-out

From: Usama Arif

Date: Mon Jun 22 2026 - 12:53:57 EST

On 22/06/2026 14:50, Alexandre Ghiti wrote:
> Hi Usama,
>
> On 6/2/26 16:24, Usama Arif wrote:
>> Reclaim today splits a PMD-mapped anonymous THP into 512 PTE swap
>> entries before unmap, losing the huge mapping across the swap
>> round-trip and forcing khugepaged to rebuild it later. The
>> contiguous swap range was already secured when the folio was added
>> to the swap cache (a non-contiguous allocation would have split the
>> folio earlier), so the PMD can be replaced by a single PMD-level
>> swap entry instead.
>> This patch mirrors the existing PTE swap-out path at PMD
>> granularity:
>> - shrink_folio_list() drops TTU_SPLIT_HUGE_PMD for PMD-mappable
>>    swapcache folios, gated on zswap_never_enabled() since zswap
>>    cannot reconstruct a 2 MB folio from per-page blobs (it is
>>    best to handle the zswap case separately).
>> - try_to_unmap_one() now has a PMD branch that calls
>>    set_pmd_swap_entry() and adjusts MM_ANONPAGES / MM_SWAPENTS by
>>    HPAGE_PMD_NR before walk_done. TTU_SPLIT_HUGE_PMD remains the
>>    fallback.
>> - set_pmd_swap_entry() is the installer. Mirroring the PTE
>>    swap-out sequence at PMD granularity, it clears the present
>>    mapping (keeping the original for rollback), bumps the swap_map
>>    refcount for the folio's 512 slots, drops the exclusive mark if
>>    the page was anon-exclusive, propagates the dirty bit to the
>>    folio so writeback is not lost, and installs a swap PMD that
>>    preserves the original soft-dirty / uffd-wp / exclusive bits.
>>    Any failing step rolls back the present mapping.
>>
>> The swap entry value matches what 512 PTE swap entries would
>> encode, so swap_map refcounting is unchanged: each of the 512 slots
>> carries a count of 1, released individually on later split or
>> together on swap-in.
>>
>> Signed-off-by: Usama Arif <usama.arif@xxxxxxxxx>
>> ---
>> include/linux/huge_mm.h       | 2 +
>> include/linux/vm_event_item.h | 1 +
>> mm/huge_memory.c              | 78 +++++++++++++++++++++++++++++++++++
>> mm/rmap.c                     | 20 +++++++++
>> mm/vmscan.c                   | 14 ++++++-
>> mm/vmstat.c                   | 1 +
>> 6 files changed, 115 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 9ec475ccfc91..b746f8c8db69 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -533,6 +533,8 @@ vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf);
>> #ifdef CONFIG_THP_SWAP
>> vm_fault_t do_huge_pmd_swap_page(struct vm_fault *vmf);
>> +int set_pmd_swap_entry(struct page_vma_mapped_walk *pvmw,
>> +               struct folio *folio);
>> #else
>> static inline vm_fault_t do_huge_pmd_swap_page(struct vm_fault *vmf)
>> {
>> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
>> index 03fe95f5a020..7267c06674c0 100644
>> --- a/include/linux/vm_event_item.h
>> +++ b/include/linux/vm_event_item.h
>> @@ -108,6 +108,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
>>           THP_ZERO_PAGE_ALLOC_FAILED,
>>           THP_SWPOUT,
>>           THP_SWPOUT_FALLBACK,
>> +        THP_SWPOUT_PMD,
>> #endif
>> #ifdef CONFIG_BALLOON
>>           BALLOON_INFLATE,
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 3fc2f6e5eafa..1fed86065fd9 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -5385,3 +5385,81 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
>>       trace_remove_migration_pmd(address, pmd_val(pmde));
>> }
>> #endif
>> +
>> +#ifdef CONFIG_THP_SWAP
>> +/**
>> + * set_pmd_swap_entry() - Replace a PMD mapping with a PMD-level swap entry.
>> + * @pvmw: Page vma mapped walk context, must have pvmw->pmd set and
>> + *        pvmw->pte NULL (i.e. PMD-mapped).
>> + * @folio: The folio being swapped out. Must be in the swap cache.
>> + *
>> + * This installs a PMD-level swap entry in place of a present PMD mapping,
>> + * avoiding the need to split the PMD into PTE-level swap entries.
>> + *
>> + * Return: 0 on success, negative error code on failure.
>> + */
>> +int set_pmd_swap_entry(struct page_vma_mapped_walk *pvmw,
>> +               struct folio *folio)
>> +{
>> +    struct vm_area_struct *vma = pvmw->vma;
>> +    struct mm_struct *mm = vma->vm_mm;
>> +    unsigned long address = pvmw->address;
>> +    unsigned long haddr = address & HPAGE_PMD_MASK;
>> +    struct page *page = folio_page(folio, 0);
>> +    bool anon_exclusive;
>> +    pmd_t pmdval;
>> +    swp_entry_t entry;
>> +    pmd_t pmdswp;
>> +
>> +    if (!(pvmw->pmd && !pvmw->pte))
>> +        return 0;
>> +
>> +    VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
>> +    VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
>> +
>> +    if (unlikely(folio_test_swapbacked(folio) !=
>> +            folio_test_swapcache(folio))) {
>> +        WARN_ON_ONCE(1);
>> +        return -EBUSY;
>> +    }
>> +
>> +    flush_cache_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
>> +
>> +    pmdval = pmdp_invalidate(vma, haddr, pvmw->pmd);
>> +
>> +    /* Update high watermark before we lower rss */
>> +    update_hiwater_rss(mm);
>> +
>> +    if (folio_dup_swap(folio, NULL) < 0) {
>> +        set_pmd_at(mm, haddr, pvmw->pmd, pmdval);
>> +        return -ENOMEM;
>> +    }
>> +
>> +    /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
>> +    anon_exclusive = PageAnonExclusive(page);
>> +    if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
>> +        folio_put_swap(folio, NULL);
>> +        set_pmd_at(mm, haddr, pvmw->pmd, pmdval);
>> +        return -EBUSY;
>> +    }
>> +
>> +    if (pmd_dirty(pmdval))
>> +        folio_mark_dirty(folio);
>> +
>> +    entry = folio->swap;
>> +    pmdswp = softleaf_to_pmd(entry);
>> +    if (pmd_soft_dirty(pmdval))
>> +        pmdswp = pmd_swp_mksoft_dirty(pmdswp);
>> +    if (pmd_uffd_wp(pmdval))
>> +        pmdswp = pmd_swp_mkuffd_wp(pmdswp);
>> +    if (anon_exclusive)
>> +        pmdswp = pmd_swp_mkexclusive(pmdswp);
>> +    set_pmd_at(mm, haddr, pvmw->pmd, pmdswp);
>> +
>> +    folio_remove_rmap_pmd(folio, page, vma);
>> +    folio_put(folio);
>> +
>> +    count_vm_event(THP_SWPOUT_PMD);
>> +    return 0;
>> +}
>> +#endif /* CONFIG_THP_SWAP */
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 0fb7a1b82cf3..ffc7aa62a29e 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -2079,6 +2079,26 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>>                   goto walk_abort;
>>               }
>> +#ifdef CONFIG_THP_SWAP
>> +            /*
>> +             * If the folio is in the swap cache and we're not
>> +             * asked to split, install a PMD-level swap entry.
>> +             */
>> +            if (!(flags & TTU_SPLIT_HUGE_PMD) &&
>> +                folio_test_anon(folio) &&
>> +                folio_test_swapcache(folio)) {
>> +                if (set_pmd_swap_entry(&pvmw, folio))
>> +                    goto walk_abort;
>> +
>> +                mm_prepare_for_swap_entries(mm);
>> +                add_mm_counter(mm, MM_ANONPAGES,
>> +                           -HPAGE_PMD_NR);
>> +                add_mm_counter(mm, MM_SWAPENTS,
>> +                           HPAGE_PMD_NR);
>> +                goto walk_done;
>> +            }
>> +#endif
>> +
>>               if (flags & TTU_SPLIT_HUGE_PMD) {
>>                   /*
>>                    * We temporarily have to drop the PTL and
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index e8a90911bf88..0f376fbf9bb3 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -64,6 +64,7 @@
>> #include <linux/swapops.h>
>> #include <linux/sched/sysctl.h>
>> +#include <linux/zswap.h>
>> #include "internal.h"
>> #include "swap.h"
>> @@ -1332,7 +1333,18 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>>               enum ttu_flags flags = TTU_BATCH_FLUSH;
>>               bool was_swapbacked = folio_test_swapbacked(folio);
>> -            if (folio_test_pmd_mappable(folio))
>> +            /*
>> +             * With THP_SWAP, PMD-mappable folios already in the
>> +             * swap cache can be unmapped with a PMD-level swap
>> +             * entry, avoiding the cost of splitting the PMD.
>> +             * Skip this when zswap has been enabled because
>> +             * zswap stores pages individually and cannot
>> +             * reconstruct a large folio on swap-in.
>> +             */
>> +            if (folio_test_pmd_mappable(folio) &&
>> +                !(IS_ENABLED(CONFIG_THP_SWAP) &&
>> +                  folio_test_swapcache(folio) &&
>> +                  zswap_never_enabled()))
>
>
> While working on the PMD zswap support, I noticed the following (small) problem: if zswap is enabled at runtime, PMD swap entries may already have been created before it was enabled. After zswap is enabled, zswap_load() then fails on them with -EINVAL, and swap_read_folio() does not even try to read the folio from the disk.

Thanks Alexandre!

I think it's similar to what Lance raised in https://lore.kernel.org/all/20260612142124.73367-1-lance.yang@xxxxxxxxx/.

What I have locally is at the end, but I need to test it and spend more time on it.
I will look at your change as well before sending the next revision (it looks better at first glance)!
The current strategy for this series is in: https://lore.kernel.org/all/c5ae6e6f-9b48-4ec2-a1c1-33ec3b1d3143@xxxxxxxxx/

diff --git a/mm/zswap.c b/mm/zswap.c
index 761cd699e0a3..f94133d876a1 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1592,17 +1592,40 @@ int zswap_load(struct folio *folio)
if (zswap_never_enabled())
return -ENOENT;

+ entry = xa_load(tree, offset);
+
/*
* Large folios should not be swapped in while zswap is being used, as
* they are not properly handled. Zswap does not properly load large
* folios, and a large folio may only be partially in zswap.
+ *
+ * If no zswap entry exists for the folio, however, the caller can read
+ * it from the backing swap device. Scan the covered slots so a PMD
+ * swapin racing with zswap enable does not SIGBUS solely because zswap
+ * became available after the PMD swap entry was installed.
*/
- if (WARN_ON_ONCE(folio_test_large(folio))) {
+ if (folio_test_large(folio)) {
+ unsigned int type = swp_type(swp);
+ bool found = !!entry;
+ long index;
+
+ if (!found) {
+ for (index = 1; index < folio_nr_pages(folio); index++) {
+ swp_entry_t cur = swp_entry(type, offset + index);
+
+ if (xa_load(swap_zswap_tree(cur), offset + index)) {
+ found = true;
+ break;
+ }
+ }
+ }
+ if (!found)
+ return -ENOENT;
+
+ WARN_ON_ONCE(1);
folio_unlock(folio);
return -EINVAL;
}
-
- entry = xa_load(tree, offset);
if (!entry)
return -ENOENT;

>
> I fixed it with the following patch, let me know what you think:
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 761cd699e0a3..9c3931d42a42 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1571,10 +1571,9 @@ bool zswap_store(struct folio *folio)
> * NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page()
> * will SIGBUS).
> *
> - * -EINVAL: if the swapped out content was in zswap, but the page belongs
> - * to a large folio, which is not supported by zswap. The folio is unlocked,
> - * but NOT marked up-to-date, so that an IO error is emitted (e.g.
> - * do_swap_page() will SIGBUS).
> + * -EIO: if a slot in a large-folio range is unexpectedly still in zswap.
> + * The folio is unlocked, but NOT marked up-to-date, so that an IO error is
> + * emitted (e.g. do_swap_page() will SIGBUS).
> *
> * -ENOENT: if the swapped out content was not in zswap. The folio remains
> * locked on return.
> @@ -1593,13 +1592,29 @@ int zswap_load(struct folio *folio)
>                 return -ENOENT;
>
>         /*
> -        * Large folios should not be swapped in while zswap is being used, as
> -        * they are not properly handled. Zswap does not properly load large
> -        * folios, and a large folio may only be partially in zswap.
> +        * A large (PMD) folio reaches zswap_load() only when its whole range
> +        * is on disk: do_huge_pmd_swap_page() splits the PMD swap entry to
> +        * PTEs and faults order-0 whenever any slot is still in zswap, so
> +        * zswap never reconstructs a large folio. Confirm the range is
> +        * entirely absent from zswap and return -ENOENT so the caller reads it
> +        * from disk; if a slot is unexpectedly still in zswap, fail the read
> +        * rather than return partially-initialised data.
>          */
> -       if (WARN_ON_ONCE(folio_test_large(folio))) {
> -               folio_unlock(folio);
> -               return -EINVAL;
> +       if (folio_test_large(folio)) {
> +               unsigned long nr_pages = folio_nr_pages(folio);
> +               XA_STATE(xas, tree, offset);
> +               bool any;
> +
> +               /* One xa_state walk over the range, not a per-slot xa_load(). */
> +               rcu_read_lock();
> +               any = xas_find(&xas, offset + nr_pages - 1) != NULL;
> +               rcu_read_unlock();
> +
> +               if (any) {
> +                       folio_unlock(folio);
> +                       return -EIO;
> +               }
> +               return -ENOENT;
>         }
>
>         entry = xa_load(tree, offset);
>
>
> Thanks,
>
> Alex
>
>
>>                   flags |= TTU_SPLIT_HUGE_PMD;
>>               /*
>>                * Without TTU_SYNC, try_to_unmap will only begin to
>> diff --git a/mm/vmstat.c b/mm/vmstat.c
>> index f534972f517d..9b4963a7eb04 100644
>> --- a/mm/vmstat.c
>> +++ b/mm/vmstat.c
>> @@ -1421,6 +1421,7 @@ const char * const vmstat_text[] = {
>>       [I(THP_ZERO_PAGE_ALLOC_FAILED)]        = "thp_zero_page_alloc_failed",
>>       [I(THP_SWPOUT)]                = "thp_swpout",
>>       [I(THP_SWPOUT_FALLBACK)]        = "thp_swpout_fallback",
>> +    [I(THP_SWPOUT_PMD)]            = "thp_swpout_pmd",
>> #endif
>> #ifdef CONFIG_BALLOON
>>       [I(BALLOON_INFLATE)]            = "balloon_inflate",