Re: [RFC PATCH v2 4/9] mm: admit large swapin by backend range in swapin_sync()

From: Nhat Pham

Date: Fri May 29 2026 - 14:34:49 EST

On Fri, May 29, 2026 at 5:19 AM fujunjie <fujunjie1@xxxxxx> wrote:
>
> A large swapin can only read one folio when the whole range has compatible
> backing. Mixed zswap/disk ranges must not reach large-folio IO, and zswap
> range probes are only snapshots.
>
> Filter the orders passed to swap_cache_alloc_folio() in swapin_sync().
> Uniform zeromap ranges and all-disk ranges keep the existing large swapin
> path. Fully zswap-backed ranges may be tried. Mixed zswap/disk ranges fall
> back before allocation.
>
> After a large swapcache folio is installed, recheck the zswap range and
> drop the fresh folio if it became mixed. Also consume -EAGAIN from
> swap_read_folio() the same way. Both cases retry order-0, where each slot
> can resolve its current backend independently.
>
> Signed-off-by: fujunjie <fujunjie1@xxxxxx>
> ---
> mm/memcontrol-v1.c | 8 ++-
> mm/memory.c | 31 ++++++++-
> mm/swap_state.c | 169 ++++++++++++++++++++++++++++++++++++++++++---
> 3 files changed, 194 insertions(+), 14 deletions(-)
>
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index 765069211567..5b11b8055c66 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -682,8 +682,8 @@ void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci)
> * memcg1_swapin - uncharge swap slot on swapin
> * @folio: folio being swapped in
> *
> - * Call this function after successfully adding the charged
> - * folio to swapcache.
> + * Call this after the charged folio has been added to swapcache and the caller
> + * is no longer going to drop it back to swapped-out state.
> *
> * Context: The folio has to be in swap cache and locked.
> */
> @@ -721,7 +721,9 @@ void memcg1_swapin(struct folio *folio)
> id = __swap_cgroup_clear(ci, swp_cluster_offset(folio->swap),
> nr_pages);
> swap_cluster_unlock(ci);
> - mem_cgroup_uncharge_swap(id, nr_pages);
> +
> + if (id)
> + mem_cgroup_uncharge_swap(id, nr_pages);
> }
> #endif
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 5a365492a9a2..d73a19692dea 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4538,6 +4538,24 @@ static inline bool should_try_to_free_swap(struct swap_info_struct *si,
> folio_ref_count(folio) == (extra_refs + folio_nr_pages(folio));
> }
>
> +static void memcg1_swapin_retry_folio(struct folio *folio,
> + struct vm_fault *vmf)
> +{
> + if (!folio_test_large(folio) || !folio_test_swapcache(folio))
> + return;
> +
> + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
> + if (!folio_trylock(folio))
> + return;
> + } else {
> + folio_lock(folio);
> + }
> +
> + if (folio_test_large(folio) && folio_test_swapcache(folio))
> + memcg1_swapin(folio);
> + folio_unlock(folio);
> +}
> +
> static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
> {
> vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
> @@ -4857,8 +4875,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>
> swapcache = folio;
> ret |= folio_lock_or_retry(folio, vmf);
> - if (ret & VM_FAULT_RETRY)
> + if (ret & VM_FAULT_RETRY) {
> + memcg1_swapin_retry_folio(folio, vmf);
> goto out_release;
> + }
>
> page = folio_file_page(folio, swp_offset(entry));
> /*
> @@ -5067,6 +5087,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> if (unlikely(folio != swapcache)) {
> folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
> folio_add_lru_vma(folio, vma);
> + if (folio_test_large(swapcache))
> + memcg1_swapin(swapcache);
> folio_put_swap(swapcache, NULL);
> } else if (!folio_test_anon(folio)) {
> /*
> @@ -5076,6 +5098,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> VM_WARN_ON_ONCE_FOLIO(folio_nr_pages(folio) != nr_pages, folio);
> VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
> folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
> + if (folio_test_large(folio))
> + memcg1_swapin(folio);
> folio_put_swap(folio, NULL);
> } else {
> VM_WARN_ON_ONCE(nr_pages != 1 && nr_pages != folio_nr_pages(folio));
> @@ -5132,8 +5156,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> if (vmf->pte)
> pte_unmap_unlock(vmf->pte, vmf->ptl);
> out_page:
> - if (folio_test_swapcache(folio))
> + if (folio_test_swapcache(folio)) {
> + if (folio_test_large(folio))
> + memcg1_swapin(folio);
> folio_free_swap(folio);
> + }
> folio_unlock(folio);
> out_release:
> folio_put(folio);
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index d37097913b30..f03ad4832f16 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -21,6 +21,7 @@
> #include <linux/migrate.h>
> #include <linux/vmalloc.h>
> #include <linux/huge_mm.h>
> +#include <linux/zswap.h>
> #include <linux/shmem_fs.h>
> #include "internal.h"
> #include "swap_table.h"
> @@ -403,7 +404,8 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci,
> static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
> swp_entry_t targ_entry, gfp_t gfp,
> unsigned int order, struct vm_fault *vmf,
> - struct mempolicy *mpol, pgoff_t ilx)
> + struct mempolicy *mpol, pgoff_t ilx,
> + bool defer_memcg1_swapin)
> {
> int err;
> swp_entry_t entry;
> @@ -466,7 +468,8 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
> }
>
> /* memsw uncharges swap when folio is added to swap cache */
> - memcg1_swapin(folio);
> + if (!defer_memcg1_swapin || !order)
> + memcg1_swapin(folio);
> if (shadow)
> workingset_refault(folio, shadow);
>
> @@ -495,9 +498,12 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
> * Return: Returns the folio if allocation succeeded and folio is in the swap
> * cache. Returns error code if failed due to race, OOM or invalid arguments.
> */
> -struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
> - unsigned long orders, struct vm_fault *vmf,
> - struct mempolicy *mpol, pgoff_t ilx)
> +static struct folio *__swap_cache_alloc_folio(swp_entry_t targ_entry,
> + gfp_t gfp, unsigned long orders,
> + struct vm_fault *vmf,
> + struct mempolicy *mpol,
> + pgoff_t ilx,
> + bool defer_memcg1_swapin)
> {
> int order, err;
> struct folio *ret;
> @@ -512,7 +518,8 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
>
> do {
> ret = __swap_cache_alloc(ci, targ_entry, gfp, order,
> - vmf, mpol, ilx);
> + vmf, mpol, ilx,
> + defer_memcg1_swapin);
> if (!IS_ERR(ret))
> break;
> err = PTR_ERR(ret);
> @@ -525,6 +532,124 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
> return ret;
> }
>
> +struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
> + unsigned long orders, struct vm_fault *vmf,
> + struct mempolicy *mpol, pgoff_t ilx)
> +{
> + return __swap_cache_alloc_folio(targ_entry, gfp, orders, vmf,
> + mpol, ilx, false);
> +}
> +
> +static struct folio *swap_cache_alloc_speculative_folio(swp_entry_t targ_entry,
> + gfp_t gfp,
> + unsigned long orders,
> + struct vm_fault *vmf,
> + struct mempolicy *mpol,
> + pgoff_t ilx)
> +{
> + /*
> + * Speculative large swapin may drop this fresh swapcache folio and
> + * retry order-0 after backend or page-table revalidation. Keep the
> + * cgroup v1 memsw swap owner until the caller commits the folio.
> + */
> + return __swap_cache_alloc_folio(targ_entry, gfp, orders, vmf,
> + mpol, ilx, true);
> +}
> +
> +static bool swapin_zeromap_same(swp_entry_t entry, unsigned int nr_pages)
> +{
> + unsigned int ci_start = swp_cluster_offset(entry);
> + struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
> + bool is_zero;
> + unsigned int i;
> +
> + if (ci_start + nr_pages > SWAPFILE_CLUSTER) {
> + VM_WARN_ON_ONCE(1);
> + return false;
> + }
> +
> + rcu_read_lock();
> + if (!rcu_dereference(ci->table)) {
> + rcu_read_unlock();
> + return true;
> + }
> +
> + is_zero = __swap_table_test_zero(ci, ci_start);
> + for (i = 1; i < nr_pages; i++) {
> + if (is_zero != __swap_table_test_zero(ci, ci_start + i)) {
> + rcu_read_unlock();
> + return false;
> + }
> + }
> + rcu_read_unlock();
> +
> + return true;
> +}
> +
> +static unsigned long swapin_admit_orders(swp_entry_t entry,
> + unsigned long orders)
> +{
> + unsigned long candidates = orders & ~BIT(0);
> + unsigned long admitted = orders & BIT(0);
> + int order;
> +
> + if (!candidates)
> + return orders;
> +
> + while (candidates) {
> + enum zswap_range_state state;
> + unsigned int nr_pages;
> + swp_entry_t range_entry;
> + bool admit = false;
> +
> + order = fls_long(candidates) - 1;
> + if (order > MAX_PAGE_ORDER) {
> + candidates &= ~BIT(order);
> + continue;
> + }
> +
> + nr_pages = 1U << order;
> + range_entry = swp_entry(swp_type(entry),
> + round_down(swp_offset(entry), nr_pages));
> + if (!swapin_zeromap_same(range_entry, nr_pages))
> + goto next;
> +
> + state = zswap_probe_range(range_entry, nr_pages);
> + switch (state) {
> + case ZSWAP_RANGE_MIXED:
> + break;
> + case ZSWAP_RANGE_ALL_ZSWAP:
> + case ZSWAP_RANGE_NEVER_ENABLED:
> + case ZSWAP_RANGE_NO_ZSWAP:
> + admit = true;
> + break;
> + }
> +
> +next:
> + if (admit)
> + admitted |= BIT(order);
> + else
> + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
> + candidates &= ~BIT(order);
> + }
> +
> + return admitted ? admitted : BIT(0);
> +}
> +
> +static bool zswap_needs_order0_retry(struct folio *folio)
> +{
> + if (!folio_test_large(folio))
> + return false;
> +
> + /*
> + * Admission sees only an advisory zswap snapshot. Recheck after the
> + * large swapcache folio is installed; if the range became mixed, drop
> + * the fresh folio before IO and let order-0 handle each slot.
> + */
> + return zswap_probe_range(folio->swap, folio_nr_pages(folio)) ==
> + ZSWAP_RANGE_MIXED;
> +}
> +
> /*
> * If we are the only user, then try to free up the swap cache.
> *
> @@ -634,7 +759,8 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
> folio = swap_cache_get_folio(entry);
> if (folio)
> return folio;
> - folio = swap_cache_alloc_folio(entry, gfp, BIT(0), NULL, mpol, ilx);
> + folio = swap_cache_alloc_folio(entry, gfp, BIT(0), NULL,
> + mpol, ilx);
> } while (PTR_ERR(folio) == -EEXIST);
>
> if (IS_ERR_OR_NULL(folio))
> @@ -677,18 +803,43 @@ struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp, unsigned long orders,
> struct folio *folio;
> int ret;
>
> + orders = swapin_admit_orders(entry, orders);
> +again:
> do {
> folio = swap_cache_get_folio(entry);
> if (folio)
> return folio;
> - folio = swap_cache_alloc_folio(entry, gfp, orders, vmf, mpol, ilx);
> + folio = swap_cache_alloc_speculative_folio(entry, gfp, orders,
> + vmf, mpol, ilx);
> } while (PTR_ERR(folio) == -EEXIST);
>
> if (IS_ERR(folio))
> return folio;
>
> + if (zswap_needs_order0_retry(folio)) {
> + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN_FALLBACK);
> + /*
> + * The folio is newly allocated, locked, clean and not uptodate;
> + * no data has been read into it. Removing it only restores the
> + * swap table entries so order-0 swapin can resolve a backend
> + * race without attempting speculative large-folio zswapin.
> + */
> + swap_cache_del_folio(folio);
> + folio_unlock(folio);
> + folio_put(folio);
> + orders = BIT(0);
> + goto again;
> + }
> +
> ret = swap_read_folio(folio, NULL);
> - VM_WARN_ON_ONCE(ret == -EAGAIN);
> + if (ret == -EAGAIN) {

Can this happen? After you add the entire swap range to the swap cache,
the backend is locked: zswap writeback bails out if it fails to add the
page to the swap cache.

I think you can just check (with zswap_probe_range() or whatever) before
swap_read_folio(). If the range is still fully backed by zswap, you
are good to go. Otherwise, bail here immediately.

Then you don't need all the complexity of extending swap_read_folio()
to handle mixed-range errors (for now at least).