Re: [RFC PATCH v2 4/9] mm: admit large swapin by backend range in swapin_sync()

From: Kairui Song

Date: Fri May 29 2026 - 10:51:32 EST


On Fri, May 29, 2026 at 8:26 PM fujunjie <fujunjie1@xxxxxx> wrote:
>
> A large swapin can only read one folio when the whole range has compatible
> backing. Mixed zswap/disk ranges must not reach large-folio IO, and zswap
> range probes are only snapshots.
>
> Filter the orders passed to swap_cache_alloc_folio() in swapin_sync().
> Uniform zeromap ranges and all-disk ranges keep the existing large swapin
> path. Fully zswap-backed ranges may be tried. Mixed zswap/disk ranges fall
> back before allocation.
>
> After a large swapcache folio is installed, recheck the zswap range and
> drop the fresh folio if it became mixed. Also consume -EAGAIN from
> swap_read_folio() the same way. Both cases retry order-0, where each slot
> can resolve its current backend independently.
>
> Signed-off-by: fujunjie <fujunjie1@xxxxxx>
> ---
> mm/memcontrol-v1.c | 8 ++-
> mm/memory.c | 31 ++++++++-
> mm/swap_state.c | 169 ++++++++++++++++++++++++++++++++++++++++++---
> 3 files changed, 194 insertions(+), 14 deletions(-)
>
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index 765069211567..5b11b8055c66 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -682,8 +682,8 @@ void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci)
> * memcg1_swapin - uncharge swap slot on swapin
> * @folio: folio being swapped in
> *
> - * Call this function after successfully adding the charged
> - * folio to swapcache.
> + * Call this after the charged folio has been added to swapcache and the caller
> + * is no longer going to drop it back to swapped-out state.
> *
> * Context: The folio has to be in swap cache and locked.
> */
> @@ -721,7 +721,9 @@ void memcg1_swapin(struct folio *folio)
> id = __swap_cgroup_clear(ci, swp_cluster_offset(folio->swap),
> nr_pages);
> swap_cluster_unlock(ci);
> - mem_cgroup_uncharge_swap(id, nr_pages);
> +
> + if (id)
> + mem_cgroup_uncharge_swap(id, nr_pages);
> }
> #endif
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 5a365492a9a2..d73a19692dea 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4538,6 +4538,24 @@ static inline bool should_try_to_free_swap(struct swap_info_struct *si,
> folio_ref_count(folio) == (extra_refs + folio_nr_pages(folio));
> }
>
> +static void memcg1_swapin_retry_folio(struct folio *folio,
> + struct vm_fault *vmf)
> +{
> + if (!folio_test_large(folio) || !folio_test_swapcache(folio))
> + return;
> +
> + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
> + if (!folio_trylock(folio))
> + return;
> + } else {
> + folio_lock(folio);
> + }
> +
> + if (folio_test_large(folio) && folio_test_swapcache(folio))
> + memcg1_swapin(folio);
> + folio_unlock(folio);
> +}
> +
> static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
> {
> vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
> @@ -4857,8 +4875,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>
> swapcache = folio;
> ret |= folio_lock_or_retry(folio, vmf);
> - if (ret & VM_FAULT_RETRY)
> + if (ret & VM_FAULT_RETRY) {
> + memcg1_swapin_retry_folio(folio, vmf);
> goto out_release;
> + }
>
> page = folio_file_page(folio, swp_offset(entry));
> /*
> @@ -5067,6 +5087,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> if (unlikely(folio != swapcache)) {
> folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
> folio_add_lru_vma(folio, vma);
> + if (folio_test_large(swapcache))
> + memcg1_swapin(swapcache);
> folio_put_swap(swapcache, NULL);
> } else if (!folio_test_anon(folio)) {
> /*
> @@ -5076,6 +5098,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> VM_WARN_ON_ONCE_FOLIO(folio_nr_pages(folio) != nr_pages, folio);
> VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
> folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
> + if (folio_test_large(folio))
> + memcg1_swapin(folio);
> folio_put_swap(folio, NULL);
> } else {
> VM_WARN_ON_ONCE(nr_pages != 1 && nr_pages != folio_nr_pages(folio));
> @@ -5132,8 +5156,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> if (vmf->pte)
> pte_unmap_unlock(vmf->pte, vmf->ptl);
> out_page:
> - if (folio_test_swapcache(folio))
> + if (folio_test_swapcache(folio)) {
> + if (folio_test_large(folio))
> + memcg1_swapin(folio);
> folio_free_swap(folio);
> + }
> folio_unlock(folio);
> out_release:
> folio_put(folio);
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index d37097913b30..f03ad4832f16 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -21,6 +21,7 @@
> #include <linux/migrate.h>
> #include <linux/vmalloc.h>
> #include <linux/huge_mm.h>
> +#include <linux/zswap.h>
> #include <linux/shmem_fs.h>
> #include "internal.h"
> #include "swap_table.h"
> @@ -403,7 +404,8 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci,
> static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
> swp_entry_t targ_entry, gfp_t gfp,
> unsigned int order, struct vm_fault *vmf,
> - struct mempolicy *mpol, pgoff_t ilx)
> + struct mempolicy *mpol, pgoff_t ilx,
> + bool defer_memcg1_swapin)

Hi Fujunjie,

Thanks for the update, but this whole defer_memcg1_swapin thing is so
ugly I don't think this is the right way at all.

If you really need this, maybe you can always defer the memcg1
uncharge; I don't see why we need to treat large folios differently.
This charge doesn't affect memory pressure. The reason we uncharge
memcg1's swap counter is to avoid a long-pinned swap cache folio
holding on to a cgroup's swap charge, which would leave the cgroup
unable to swap out more folios. Deferring it won't hurt.

> {
> int err;
> swp_entry_t entry;
> @@ -466,7 +468,8 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
> }
>
> /* memsw uncharges swap when folio is added to swap cache */
> - memcg1_swapin(folio);
> + if (!defer_memcg1_swapin || !order)
> + memcg1_swapin(folio);
> if (shadow)
> workingset_refault(folio, shadow);
>
> @@ -495,9 +498,12 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
> * Return: Returns the folio if allocation succeeded and folio is in the swap
> * cache. Returns error code if failed due to race, OOM or invalid arguments.
> */
> -struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
> - unsigned long orders, struct vm_fault *vmf,
> - struct mempolicy *mpol, pgoff_t ilx)
> +static struct folio *__swap_cache_alloc_folio(swp_entry_t targ_entry,
> + gfp_t gfp, unsigned long orders,
> + struct vm_fault *vmf,
> + struct mempolicy *mpol,
> + pgoff_t ilx,
> + bool defer_memcg1_swapin)
> {
> int order, err;
> struct folio *ret;
> @@ -512,7 +518,8 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
>
> do {
> ret = __swap_cache_alloc(ci, targ_entry, gfp, order,
> - vmf, mpol, ilx);
> + vmf, mpol, ilx,
> + defer_memcg1_swapin);
> if (!IS_ERR(ret))
> break;
> err = PTR_ERR(ret);
> @@ -525,6 +532,124 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
> return ret;
> }
>
> +struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
> + unsigned long orders, struct vm_fault *vmf,
> + struct mempolicy *mpol, pgoff_t ilx)
> +{
> + return __swap_cache_alloc_folio(targ_entry, gfp, orders, vmf,
> + mpol, ilx, false);
> +}
> +
> +static struct folio *swap_cache_alloc_speculative_folio(swp_entry_t targ_entry,
> + gfp_t gfp,
> + unsigned long orders,
> + struct vm_fault *vmf,
> + struct mempolicy *mpol,
> + pgoff_t ilx)
> +{
> + /*
> + * Speculative large swapin may drop this fresh swapcache folio and
> + * retry order-0 after backend or page-table revalidation. Keep the
> + * cgroup v1 memsw swap owner until the caller commits the folio.
> + */
> + return __swap_cache_alloc_folio(targ_entry, gfp, orders, vmf,
> + mpol, ilx, true);
> +}
> +
> +static bool swapin_zeromap_same(swp_entry_t entry, unsigned int nr_pages)
> +{
> + unsigned int ci_start = swp_cluster_offset(entry);
> + struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
> + bool is_zero;
> + unsigned int i;
> +
> + if (ci_start + nr_pages > SWAPFILE_CLUSTER) {
> + VM_WARN_ON_ONCE(1);
> + return false;
> + }
> +
> + rcu_read_lock();
> + if (!rcu_dereference(ci->table)) {
> + rcu_read_unlock();
> + return true;
> + }
> +
> + is_zero = __swap_table_test_zero(ci, ci_start);
> + for (i = 1; i < nr_pages; i++) {
> + if (is_zero != __swap_table_test_zero(ci, ci_start + i)) {
> + rcu_read_unlock();
> + return false;
> + }
> + }
> + rcu_read_unlock();
> +
> + return true;
> +}
> +
> +static unsigned long swapin_admit_orders(swp_entry_t entry,
> + unsigned long orders)

And this swapin_admit_orders chunk doesn't look good either...

> +{
> + unsigned long candidates = orders & ~BIT(0);
> + unsigned long admitted = orders & BIT(0);
> + int order;
> +
> + if (!candidates)
> + return orders;
> +
> + while (candidates) {
> + enum zswap_range_state state;
> + unsigned int nr_pages;
> + swp_entry_t range_entry;
> + bool admit = false;
> +
> + order = fls_long(candidates) - 1;
> + if (order > MAX_PAGE_ORDER) {
> + candidates &= ~BIT(order);
> + continue;
> + }
> +
> + nr_pages = 1U << order;
> + range_entry = swp_entry(swp_type(entry),
> + round_down(swp_offset(entry), nr_pages));
> + if (!swapin_zeromap_same(range_entry, nr_pages))
> + goto next;

I think you don't need to test zeromap at all? __swap_cache_alloc
handles that already.

> +
> + state = zswap_probe_range(range_entry, nr_pages);

If you just move the zswap_probe_range() call into __swap_cache_alloc
and do the fallback there (or maybe you can shrink the order faster),
then these two new helpers both become redundant.

> + switch (state) {
> + case ZSWAP_RANGE_MIXED:
> + break;
> + case ZSWAP_RANGE_ALL_ZSWAP:
> + case ZSWAP_RANGE_NEVER_ENABLED:
> + case ZSWAP_RANGE_NO_ZSWAP:
> + admit = true;
> + break;
> + }
> +
> +next:
> + if (admit)
> + admitted |= BIT(order);
> + else
> + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
> + candidates &= ~BIT(order);
> + }
> +
> + return admitted ? admitted : BIT(0);
> +}
> +
> +static bool zswap_needs_order0_retry(struct folio *folio)
> +{
> + if (!folio_test_large(folio))
> + return false;
> +
> + /*
> + * Admission sees only an advisory zswap snapshot. Recheck after the
> + * large swapcache folio is installed; if the range became mixed, drop
> + * the fresh folio before IO and let order-0 handle each slot.
> + */
> + return zswap_probe_range(folio->swap, folio_nr_pages(folio)) ==
> + ZSWAP_RANGE_MIXED;
> +}
> +

Again, I think you can just probe for a suitable size in
__swap_cache_alloc directly. That way we avoid diverging between sync /
non-sync devices, and we avoid this whole chunk, which makes the code
much simpler too, just like what we are already doing for the zeromap
in __swap_cache_alloc. Or am I oversimplifying it?