[RFC PATCH v2 2/9] mm: let swap_read_folio() report retryable zswap races
From: fujunjie
Date: Fri May 29 2026 - 08:21:56 EST
Large zswap loads need a way to ask the caller to drop a speculative large
swapcache folio and retry with an order-0 folio. A void swap_read_folio()
cannot express that without turning a backend race into an I/O failure.
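Return int from swap_read_folio() and reserve -EAGAIN for retryable large
zswap races; on -EAGAIN the folio is left locked and the caller must drop
or retry it. Existing order-0 paths handle the read as before and only gain
a VM_WARN_ON_ONCE() for an unexpected -EAGAIN; the synchronous swapin path
likewise only warns for now. A later patch will consume -EAGAIN there and
retry with an order-0 folio.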
Signed-off-by: fujunjie <fujunjie1@xxxxxx>
---
mm/page_io.c | 19 +++++++++++++++++--
mm/swap.h | 5 +++--
mm/swap_state.c | 13 +++++++++++--
3 files changed, 31 insertions(+), 6 deletions(-)
diff --git a/mm/page_io.c b/mm/page_io.c
index f2d8fe7fd057..16724bdfb400 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -653,13 +653,21 @@ static void swap_read_folio_bdev_async(struct folio *folio,
submit_bio(bio);
}
-void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
+/*
+ * Return -EAGAIN only when a locked large swapcache folio hit a retryable
+ * zswap backend race. The caller owns that still-locked folio and must drop or
+ * retry it. Other zswap errors are still reported through the usual folio
+ * state: the folio is unlocked without PG_uptodate and the fault path will
+ * turn that into an I/O error.
+ */
+int swap_read_folio(struct folio *folio, struct swap_iocb **plug)
{
struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
bool synchronous = sis->flags & SWP_SYNCHRONOUS_IO;
bool workingset = folio_test_workingset(folio);
unsigned long pflags;
bool in_thrashing;
+ int ret = 0;
VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -681,8 +689,14 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
goto finish;
}
- if (zswap_load(folio) != -ENOENT)
+ ret = zswap_load(folio);
+ if (ret == -EAGAIN) {
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
goto finish;
+ }
+ if (ret != -ENOENT)
+ goto finish;
+ ret = 0;
/* We have to read from slower devices. Increase zswap protection. */
zswap_folio_swapin(folio);
@@ -701,6 +715,7 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
psi_memstall_leave(&pflags);
}
delayacct_swapin_end();
+ return ret;
}
void __swap_read_unplug(struct swap_iocb *sio)
diff --git a/mm/swap.h b/mm/swap.h
index 77d2d14eda42..ea7e1f3c4410 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -241,7 +241,7 @@ extern void __swap_cluster_free_entries(struct swap_info_struct *si,
/* linux/mm/page_io.c */
int sio_pool_init(void);
struct swap_iocb;
-void swap_read_folio(struct folio *folio, struct swap_iocb **plug);
+int swap_read_folio(struct folio *folio, struct swap_iocb **plug);
void __swap_read_unplug(struct swap_iocb *plug);
static inline void swap_read_unplug(struct swap_iocb *plug)
{
@@ -381,8 +381,9 @@ static inline void folio_put_swap(struct folio *folio, struct page *page)
{
}
-static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
+static inline int swap_read_folio(struct folio *folio, struct swap_iocb **plug)
{
+ return 0;
}
static inline void swap_write_unplug(struct swap_iocb *sio)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 04f5ce992401..d37097913b30 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -628,6 +628,7 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
struct swap_iocb **plug, bool readahead)
{
struct folio *folio;
+ int ret;
do {
folio = swap_cache_get_folio(entry);
@@ -639,7 +640,13 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
if (IS_ERR_OR_NULL(folio))
return NULL;
- swap_read_folio(folio, plug);
+ ret = swap_read_folio(folio, plug);
+ /*
+ * Swap readahead allocates order-0 folios. -EAGAIN is reserved for
+ * retryable large zswap backend races and must be handled by the
+ * synchronous common swapin path.
+ */
+ VM_WARN_ON_ONCE(ret == -EAGAIN);
if (readahead) {
folio_set_readahead(folio);
count_vm_event(SWAP_RA);
@@ -668,6 +675,7 @@ struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp, unsigned long orders,
struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx)
{
struct folio *folio;
+ int ret;
do {
folio = swap_cache_get_folio(entry);
@@ -679,7 +687,8 @@ struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp, unsigned long orders,
if (IS_ERR(folio))
return folio;
- swap_read_folio(folio, NULL);
+ ret = swap_read_folio(folio, NULL);
+ VM_WARN_ON_ONCE(ret == -EAGAIN);
return folio;
}
--
2.34.1