Re: [RFC PATCH v2 4/9] mm: admit large swapin by backend range in swapin_sync()

From: Fujunjie

Date: Sun May 31 2026 - 09:21:20 EST

On 5/30/2026 2:34 AM, Nhat Pham wrote:
> On Fri, May 29, 2026 at 5:19 AM fujunjie <fujunjie1@xxxxxx> wrote:
>>
>> A large swapin can only read one folio when the whole range has compatible
>> backing. Mixed zswap/disk ranges must not reach large-folio IO, and zswap
>> range probes are only snapshots.
>>
>> Filter the orders passed to swap_cache_alloc_folio() in swapin_sync().
>> Uniform zeromap ranges and all-disk ranges keep the existing large swapin
>> path. Fully zswap-backed ranges may be tried. Mixed zswap/disk ranges fall
>> back before allocation.
>>
>> After a large swapcache folio is installed, recheck the zswap range and
>> drop the fresh folio if it became mixed. Also consume -EAGAIN from
>> swap_read_folio() the same way. Both cases retry order-0, where each slot
>> can resolve its current backend independently.
>>
>> Signed-off-by: fujunjie <fujunjie1@xxxxxx>
>> ---
>> mm/memcontrol-v1.c | 8 ++-
>> mm/memory.c | 31 ++++++++-
>> mm/swap_state.c | 169 ++++++++++++++++++++++++++++++++++++++++++---
>> 3 files changed, 194 insertions(+), 14 deletions(-)
>>
>> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
>> index 765069211567..5b11b8055c66 100644
>> --- a/mm/memcontrol-v1.c
>> +++ b/mm/memcontrol-v1.c
>> @@ -682,8 +682,8 @@ void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci)
>> * memcg1_swapin - uncharge swap slot on swapin
>> * @folio: folio being swapped in
>> *
>> - * Call this function after successfully adding the charged
>> - * folio to swapcache.
>> + * Call this after the charged folio has been added to swapcache and the caller
>> + * is no longer going to drop it back to swapped-out state.
>> *
>> * Context: The folio has to be in swap cache and locked.
>> */
>> @@ -721,7 +721,9 @@ void memcg1_swapin(struct folio *folio)
>> id = __swap_cgroup_clear(ci, swp_cluster_offset(folio->swap),
>> nr_pages);
>> swap_cluster_unlock(ci);
>> - mem_cgroup_uncharge_swap(id, nr_pages);
>> +
>> + if (id)
>> + mem_cgroup_uncharge_swap(id, nr_pages);
>> }
>> #endif
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 5a365492a9a2..d73a19692dea 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4538,6 +4538,24 @@ static inline bool should_try_to_free_swap(struct swap_info_struct *si,
>> folio_ref_count(folio) == (extra_refs + folio_nr_pages(folio));
>> }
>>
>> +static void memcg1_swapin_retry_folio(struct folio *folio,
>> + struct vm_fault *vmf)
>> +{
>> + if (!folio_test_large(folio) || !folio_test_swapcache(folio))
>> + return;
>> +
>> + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
>> + if (!folio_trylock(folio))
>> + return;
>> + } else {
>> + folio_lock(folio);
>> + }
>> +
>> + if (folio_test_large(folio) && folio_test_swapcache(folio))
>> + memcg1_swapin(folio);
>> + folio_unlock(folio);
>> +}
>> +
>> static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
>> {
>> vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
>> @@ -4857,8 +4875,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>
>> swapcache = folio;
>> ret |= folio_lock_or_retry(folio, vmf);
>> - if (ret & VM_FAULT_RETRY)
>> + if (ret & VM_FAULT_RETRY) {
>> + memcg1_swapin_retry_folio(folio, vmf);
>> goto out_release;
>> + }
>>
>> page = folio_file_page(folio, swp_offset(entry));
>> /*
>> @@ -5067,6 +5087,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> if (unlikely(folio != swapcache)) {
>> folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
>> folio_add_lru_vma(folio, vma);
>> + if (folio_test_large(swapcache))
>> + memcg1_swapin(swapcache);
>> folio_put_swap(swapcache, NULL);
>> } else if (!folio_test_anon(folio)) {
>> /*
>> @@ -5076,6 +5098,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> VM_WARN_ON_ONCE_FOLIO(folio_nr_pages(folio) != nr_pages, folio);
>> VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
>> folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
>> + if (folio_test_large(folio))
>> + memcg1_swapin(folio);
>> folio_put_swap(folio, NULL);
>> } else {
>> VM_WARN_ON_ONCE(nr_pages != 1 && nr_pages != folio_nr_pages(folio));
>> @@ -5132,8 +5156,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> if (vmf->pte)
>> pte_unmap_unlock(vmf->pte, vmf->ptl);
>> out_page:
>> - if (folio_test_swapcache(folio))
>> + if (folio_test_swapcache(folio)) {
>> + if (folio_test_large(folio))
>> + memcg1_swapin(folio);
>> folio_free_swap(folio);
>> + }
>> folio_unlock(folio);
>> out_release:
>> folio_put(folio);
>> diff --git a/mm/swap_state.c b/mm/swap_state.c
>> index d37097913b30..f03ad4832f16 100644
>> --- a/mm/swap_state.c
>> +++ b/mm/swap_state.c
>> @@ -21,6 +21,7 @@
>> #include <linux/migrate.h>
>> #include <linux/vmalloc.h>
>> #include <linux/huge_mm.h>
>> +#include <linux/zswap.h>
>> #include <linux/shmem_fs.h>
>> #include "internal.h"
>> #include "swap_table.h"
>> @@ -403,7 +404,8 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci,
>> static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
>> swp_entry_t targ_entry, gfp_t gfp,
>> unsigned int order, struct vm_fault *vmf,
>> - struct mempolicy *mpol, pgoff_t ilx)
>> + struct mempolicy *mpol, pgoff_t ilx,
>> + bool defer_memcg1_swapin)
>> {
>> int err;
>> swp_entry_t entry;
>> @@ -466,7 +468,8 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
>> }
>>
>> /* memsw uncharges swap when folio is added to swap cache */
>> - memcg1_swapin(folio);
>> + if (!defer_memcg1_swapin || !order)
>> + memcg1_swapin(folio);
>> if (shadow)
>> workingset_refault(folio, shadow);
>>
>> @@ -495,9 +498,12 @@ static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
>> * Return: Returns the folio if allocation succeeded and folio is in the swap
>> * cache. Returns error code if failed due to race, OOM or invalid arguments.
>> */
>> -struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
>> - unsigned long orders, struct vm_fault *vmf,
>> - struct mempolicy *mpol, pgoff_t ilx)
>> +static struct folio *__swap_cache_alloc_folio(swp_entry_t targ_entry,
>> + gfp_t gfp, unsigned long orders,
>> + struct vm_fault *vmf,
>> + struct mempolicy *mpol,
>> + pgoff_t ilx,
>> + bool defer_memcg1_swapin)
>> {
>> int order, err;
>> struct folio *ret;
>> @@ -512,7 +518,8 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
>>
>> do {
>> ret = __swap_cache_alloc(ci, targ_entry, gfp, order,
>> - vmf, mpol, ilx);
>> + vmf, mpol, ilx,
>> + defer_memcg1_swapin);
>> if (!IS_ERR(ret))
>> break;
>> err = PTR_ERR(ret);
>> @@ -525,6 +532,124 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
>> return ret;
>> }
>>
>> +struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
>> + unsigned long orders, struct vm_fault *vmf,
>> + struct mempolicy *mpol, pgoff_t ilx)
>> +{
>> + return __swap_cache_alloc_folio(targ_entry, gfp, orders, vmf,
>> + mpol, ilx, false);
>> +}
>> +
>> +static struct folio *swap_cache_alloc_speculative_folio(swp_entry_t targ_entry,
>> + gfp_t gfp,
>> + unsigned long orders,
>> + struct vm_fault *vmf,
>> + struct mempolicy *mpol,
>> + pgoff_t ilx)
>> +{
>> + /*
>> + * Speculative large swapin may drop this fresh swapcache folio and
>> + * retry order-0 after backend or page-table revalidation. Keep the
>> + * cgroup v1 memsw swap owner until the caller commits the folio.
>> + */
>> + return __swap_cache_alloc_folio(targ_entry, gfp, orders, vmf,
>> + mpol, ilx, true);
>> +}
>> +
>> +static bool swapin_zeromap_same(swp_entry_t entry, unsigned int nr_pages)
>> +{
>> + unsigned int ci_start = swp_cluster_offset(entry);
>> + struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
>> + bool is_zero;
>> + unsigned int i;
>> +
>> + if (ci_start + nr_pages > SWAPFILE_CLUSTER) {
>> + VM_WARN_ON_ONCE(1);
>> + return false;
>> + }
>> +
>> + rcu_read_lock();
>> + if (!rcu_dereference(ci->table)) {
>> + rcu_read_unlock();
>> + return true;
>> + }
>> +
>> + is_zero = __swap_table_test_zero(ci, ci_start);
>> + for (i = 1; i < nr_pages; i++) {
>> + if (is_zero != __swap_table_test_zero(ci, ci_start + i)) {
>> + rcu_read_unlock();
>> + return false;
>> + }
>> + }
>> + rcu_read_unlock();
>> +
>> + return true;
>> +}
>> +
>> +static unsigned long swapin_admit_orders(swp_entry_t entry,
>> + unsigned long orders)
>> +{
>> + unsigned long candidates = orders & ~BIT(0);
>> + unsigned long admitted = orders & BIT(0);
>> + int order;
>> +
>> + if (!candidates)
>> + return orders;
>> +
>> + while (candidates) {
>> + enum zswap_range_state state;
>> + unsigned int nr_pages;
>> + swp_entry_t range_entry;
>> + bool admit = false;
>> +
>> + order = fls_long(candidates) - 1;
>> + if (order > MAX_PAGE_ORDER) {
>> + candidates &= ~BIT(order);
>> + continue;
>> + }
>> +
>> + nr_pages = 1U << order;
>> + range_entry = swp_entry(swp_type(entry),
>> + round_down(swp_offset(entry), nr_pages));
>> + if (!swapin_zeromap_same(range_entry, nr_pages))
>> + goto next;
>> +
>> + state = zswap_probe_range(range_entry, nr_pages);
>> + switch (state) {
>> + case ZSWAP_RANGE_MIXED:
>> + break;
>> + case ZSWAP_RANGE_ALL_ZSWAP:
>> + case ZSWAP_RANGE_NEVER_ENABLED:
>> + case ZSWAP_RANGE_NO_ZSWAP:
>> + admit = true;
>> + break;
>> + }
>> +
>> +next:
>> + if (admit)
>> + admitted |= BIT(order);
>> + else
>> + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
>> + candidates &= ~BIT(order);
>> + }
>> +
>> + return admitted ? admitted : BIT(0);
>> +}
>> +
>> +static bool zswap_needs_order0_retry(struct folio *folio)
>> +{
>> + if (!folio_test_large(folio))
>> + return false;
>> +
>> + /*
>> + * Admission sees only an advisory zswap snapshot. Recheck after the
>> + * large swapcache folio is installed; if the range became mixed, drop
>> + * the fresh folio before IO and let order-0 handle each slot.
>> + */
>> + return zswap_probe_range(folio->swap, folio_nr_pages(folio)) ==
>> + ZSWAP_RANGE_MIXED;
>> +}
>> +
>> /*
>> * If we are the only user, then try to free up the swap cache.
>> *
>> @@ -634,7 +759,8 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
>> folio = swap_cache_get_folio(entry);
>> if (folio)
>> return folio;
>> - folio = swap_cache_alloc_folio(entry, gfp, BIT(0), NULL, mpol, ilx);
>> + folio = swap_cache_alloc_folio(entry, gfp, BIT(0), NULL,
>> + mpol, ilx);
>> } while (PTR_ERR(folio) == -EEXIST);
>>
>> if (IS_ERR_OR_NULL(folio))
>> @@ -677,18 +803,43 @@ struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp, unsigned long orders,
>> struct folio *folio;
>> int ret;
>>
>> + orders = swapin_admit_orders(entry, orders);
>> +again:
>> do {
>> folio = swap_cache_get_folio(entry);
>> if (folio)
>> return folio;
>> - folio = swap_cache_alloc_folio(entry, gfp, orders, vmf, mpol, ilx);
>> + folio = swap_cache_alloc_speculative_folio(entry, gfp, orders,
>> + vmf, mpol, ilx);
>> } while (PTR_ERR(folio) == -EEXIST);
>>
>> if (IS_ERR(folio))
>> return folio;
>>
>> + if (zswap_needs_order0_retry(folio)) {
>> + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN_FALLBACK);
>> + /*
>> + * The folio is newly allocated, locked, clean and not uptodate;
>> + * no data has been read into it. Removing it only restores the
>> + * swap table entries so order-0 swapin can resolve a backend
>> + * race without attempting speculative large-folio zswapin.
>> + */
>> + swap_cache_del_folio(folio);
>> + folio_unlock(folio);
>> + folio_put(folio);
>> + orders = BIT(0);
>> + goto again;
>> + }
>> +
>> ret = swap_read_folio(folio, NULL);
>> - VM_WARN_ON_ONCE(ret == -EAGAIN);
>> + if (ret == -EAGAIN) {
>
> Can this happen? After you add the entire swap range to swap cache,
> backend is locked. Zswap writeback bails out if it fails to add the
> page to swap cache.
>
> I think you can just check (zswap_probe_range or whatever) before
> swap_read_folio(). If the range is still fully backed by zswap, you
> are good to go. Otherwise, bail here immediately.
>
> Then you don't need all the complexity with extending swap_read_folio
> to handle mixed range errors (for now at least).

Yes, I think you are right.

I missed that property of zswap writeback. Once the whole range is covered by
the large swapcache folio, writeback should not be able to move a subslot to
disk, because it first has to add an order-0 folio to the swap cache for that
slot, and that should fail while the large folio occupies the range.

Sorry for adding this extra complexity. I will rework this in a more unified way for the
next version.