[RFC PATCH 5/5] mm: swap: allow zswap-backed large folio swapin

From: fujunjie

Date: Fri May 08 2026 - 16:23:17 EST

alloc_swap_folio() has been falling back to order-0 in the anonymous
synchronous swapin path if zswap has ever been enabled, because a large
folio range could contain a mixture of zswap and non-zswap entries and
zswap_load() could not handle large folios.

zswap_load() can now load a range that is fully present in zswap, and
zswap_entry_batch() can identify mixed zswap ranges. Use that check
alongside the existing zeromap and swapcache checks when selecting a large
folio for anonymous swapin. Repeat it in swap_cache_add_folio(), while
holding the swap cluster lock, before inserting a large folio into the
swap cache; if the range is mixed at that point, fail with -EAGAIN so
that swapin falls back to order-0.

With mixed zswap ranges rejected and the insertion-race fallback in place,
remove the blanket !zswap_never_enabled() fallback from alloc_swap_folio()
so that anonymous ranges that are entirely in zswap or entirely on disk
can use mTHP swapin. Shmem keeps its existing zswap fallback and is
outside the scope of this RFC.

Signed-off-by: fujunjie <fujunjie1@xxxxxx>
---
mm/memory.c | 21 ++++++---------------
mm/swap_state.c | 23 +++++++++++++++--------
2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 84e3b77b8293..0be249108de1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -78,6 +78,7 @@
#include <linux/sched/sysctl.h>
#include <linux/pgalloc.h>
#include <linux/uaccess.h>
+#include <linux/zswap.h>

#include <trace/events/kmem.h>

@@ -4635,13 +4636,11 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
return false;

- /*
- * swap_read_folio() can't handle the case a large folio is hybridly
- * from different backends. And they are likely corner cases. Similar
- * things might be added once zswap support large folios.
- */
+ /* swap_read_folio() can't handle hybrid backend large folios. */
if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
return false;
+ if (unlikely(zswap_entry_batch(entry, nr_pages, NULL) != nr_pages))
+ return false;
if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
return false;

@@ -4690,14 +4689,6 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
if (unlikely(userfaultfd_armed(vma)))
goto fallback;

- /*
- * A large swapped out folio could be partially or fully in zswap. We
- * lack handling for such cases, so fallback to swapping in order-0
- * folio.
- */
- if (!zswap_never_enabled())
- goto fallback;
-
entry = softleaf_from_pte(vmf->orig_pte);
/*
* Get a list of all the (large) orders below PMD_ORDER that are enabled
@@ -4772,8 +4763,8 @@ static struct folio *swapin_synchronous_folio(swp_entry_t entry,
order = folio_order(folio);

/*
- * folio is charged, so swapin can only fail due to raced swapin and
- * return NULL.
+ * folio is charged, so NULL means the large folio could not be
+ * inserted and needs order-0 fallback.
*/
swapcache = swapin_folio(entry, folio);
if (swapcache == folio)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 1415a5c54a43..4e58fad5e5f0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -22,6 +22,7 @@
#include <linux/vmalloc.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
+#include <linux/zswap.h>
#include "internal.h"
#include "swap_table.h"
#include "swap.h"
@@ -207,6 +208,11 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
if (swp_tb_is_shadow(old_tb))
shadow = swp_tb_to_shadow(old_tb);
} while (++ci_off < ci_end);
+ if (unlikely(folio_test_large(folio) &&
+ zswap_entry_batch(entry, nr_pages, NULL) != nr_pages)) {
+ err = -EAGAIN;
+ goto failed;
+ }
__swap_cache_add_folio(ci, folio, entry);
swap_cluster_unlock(ci);
if (shadowp)
@@ -460,7 +466,8 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
*
* Context: Caller must protect the swap device with reference count or locks.
* Return: Returns the folio being added on success. Returns the existing folio
- * if @entry is already cached. Returns NULL if raced with swapin or swapoff.
+ * if @entry is already cached. Returns NULL if raced with swapin or swapoff,
+ * or if a large folio fails a backend recheck before insertion.
*/
static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
struct folio *folio,
@@ -483,10 +490,10 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,

/*
* Large order allocation needs special handling on
- * race: if a smaller folio exists in cache, swapin needs
- * to fallback to order 0, and doing a swap cache lookup
- * might return a folio that is irrelevant to the faulting
- * entry because @entry is aligned down. Just return NULL.
+ * race or backend recheck failure: swapin needs to fall back
+ * to order 0, and doing a swap cache lookup might return a
+ * folio that is irrelevant to the faulting entry because
+ * @entry is aligned down. Just return NULL.
*/
if (ret != -EEXIST || folio_test_large(folio))
goto failed;
@@ -567,9 +574,9 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
* with the folio size.
*
* Return: returns pointer to @folio on success. If folio is a large folio
- * and this raced with another swapin, NULL will be returned to allow fallback
- * to order 0. Else, if another folio was already added to the swap cache,
- * return that swap cache folio instead.
+ * and it raced with another swapin or failed a backend recheck, NULL will be
+ * returned to allow fallback to order 0. Else, if another folio was already
+ * added to the swap cache, return that swap cache folio instead.
*/
struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
{
--
2.34.1