Re: [PATCH v3 05/12] mm, swap: unify large folio allocation

From: Baolin Wang

Date: Tue May 12 2026 - 06:14:39 EST

On 4/21/26 2:16 PM, Kairui Song via B4 Relay wrote:

From: Kairui Song <kasong@xxxxxxxxxxx>

Now that direct large order allocation is supported in the swap cache,
both anon and shmem can use it instead of implementing their own methods.
This unifies the fallback and swap cache check, which also reduces the
TOCTOU race window of swap cache state: previously, high order swapin
required checking swap cache states first, then allocating and falling
back separately. Now all these steps happen in the same compact loop.

Order fallback and statistics are also unified, callers just need to
check and pass the acceptable order bitmask.

There is basically no behavior change. This only makes things more
unified and prepares for later commits. Cgroup and zero map checks can
also be moved into the compact loop, further reducing race windows and
redundancy

Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
---
mm/memory.c | 77 ++++++------------------------
mm/shmem.c | 94 +++++++++---------------------------
mm/swap.h | 30 ++----------
mm/swap_state.c | 145 ++++++++++----------------------------------------------
mm/swapfile.c | 3 +-
5 files changed, 67 insertions(+), 282 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index ea6568571131..404734a5bcff 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4593,26 +4593,6 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
-static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
-{
- struct vm_area_struct *vma = vmf->vma;
- struct folio *folio;
- softleaf_t entry;
-
- folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
- if (!folio)
- return NULL;
-
- entry = softleaf_from_pte(vmf->orig_pte);
- if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
- GFP_KERNEL, entry)) {
- folio_put(folio);
- return NULL;
- }
-
- return folio;
-}
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* Check if the PTEs within a range are contiguous swap entries
@@ -4642,8 +4622,6 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
*/
if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
return false;
- if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
- return false;
return true;
}
@@ -4671,16 +4649,14 @@ static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
return orders;
}
-static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
unsigned long orders;
- struct folio *folio;
unsigned long addr;
softleaf_t entry;
spinlock_t *ptl;
pte_t *pte;
- gfp_t gfp;
int order;
/*
@@ -4688,7 +4664,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
* maintain the uffd semantics.
*/
if (unlikely(userfaultfd_armed(vma)))
- goto fallback;
+ return 0;
/*
* A large swapped out folio could be partially or fully in zswap. We
@@ -4696,7 +4672,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
* folio.
*/
if (!zswap_never_enabled())
- goto fallback;
+ return 0;
entry = softleaf_from_pte(vmf->orig_pte);
/*
@@ -4710,12 +4686,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
vmf->address, orders);
if (!orders)
- goto fallback;
+ return 0;
pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
vmf->address & PMD_MASK, &ptl);
if (unlikely(!pte))
- goto fallback;
+ return 0;
/*
* For do_swap_page, find the highest order where the aligned range is
@@ -4731,29 +4707,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
pte_unmap_unlock(pte, ptl);
- /* Try allocating the highest of the remaining orders. */
- gfp = vma_thp_gfp_mask(vma);
- while (orders) {
- addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
- folio = vma_alloc_folio(gfp, order, vma, addr);
- if (folio) {
- if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
- gfp, entry))
- return folio;
- count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
- folio_put(folio);
- }
- count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
- order = next_order(&orders, order);
- }
-
-fallback:
- return __alloc_swap_folio(vmf);
+ return orders;
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
{
- return __alloc_swap_folio(vmf);
+ return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -4859,21 +4818,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (folio)
swap_update_readahead(folio, vma, vmf->address);
if (!folio) {
- if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
- folio = alloc_swap_folio(vmf);
- if (folio) {
- /*
- * folio is charged, so swapin can only fail due
- * to raced swapin and return NULL.
- */
- swapcache = swapin_folio(entry, folio);
- if (swapcache != folio)
- folio_put(folio);
- folio = swapcache;
- }
- } else {
+ /* Swapin bypasses readahead for SWP_SYNCHRONOUS_IO devices */
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
+ folio = swapin_entry(entry, GFP_HIGHUSER_MOVABLE,
+ thp_swapin_suitable_orders(vmf),
+ vmf, NULL, 0);
+ else
folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
- }
if (!folio) {
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 5916acf594a8..17e3da11bb1d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -159,7 +159,7 @@ static unsigned long shmem_default_max_inodes(void)
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
- struct vm_area_struct *vma, vm_fault_t *fault_type);
+ struct vm_fault *vmf, vm_fault_t *fault_type);
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
@@ -2017,68 +2017,24 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
}
static struct folio *shmem_swap_alloc_folio(struct inode *inode,
- struct vm_area_struct *vma, pgoff_t index,
+ struct vm_fault *vmf, pgoff_t index,
swp_entry_t entry, int order, gfp_t gfp)
{
+ pgoff_t ilx;
+ struct folio *folio;
+ struct mempolicy *mpol;
+ unsigned long orders = BIT(order);
struct shmem_inode_info *info = SHMEM_I(inode);
- struct folio *new, *swapcache;
- int nr_pages = 1 << order;
- gfp_t alloc_gfp = gfp;
-
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- if (WARN_ON_ONCE(order))
- return ERR_PTR(-EINVAL);
- } else if (order) {
- /*
- * If uffd is active for the vma, we need per-page fault
- * fidelity to maintain the uffd semantics, then fallback
- * to swapin order-0 folio, as well as for zswap case.
- * Any existing sub folio in the swap cache also blocks
- * mTHP swapin.
- */
- if ((vma && unlikely(userfaultfd_armed(vma))) ||
- !zswap_never_enabled() ||
- non_swapcache_batch(entry, nr_pages) != nr_pages)
- goto fallback;
- alloc_gfp = thp_limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
- }
-retry:
- new = shmem_alloc_folio(alloc_gfp, order, info, index);
- if (!new) {
- new = ERR_PTR(-ENOMEM);
- goto fallback;
- }
+ if ((vmf && unlikely(userfaultfd_armed(vmf->vma))) ||
+ !zswap_never_enabled())
+ orders = 0;
- if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
- alloc_gfp, entry)) {
- folio_put(new);
- new = ERR_PTR(-ENOMEM);
- goto fallback;
- }
+ mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
+ folio = swapin_entry(entry, gfp, orders, vmf, mpol, ilx);
+ mpol_cond_put(mpol);
- swapcache = swapin_folio(entry, new);
- if (swapcache != new) {
- folio_put(new);
- if (!swapcache) {
- /*
- * The new folio is charged already, swapin can
- * only fail due to another raced swapin.
- */
- new = ERR_PTR(-EEXIST);
- goto fallback;
- }
- }
- return swapcache;
-fallback:
- /* Order 0 swapin failed, nothing to fallback to, abort */
- if (!order)
- return new;
- entry.val += index - round_down(index, nr_pages);
- alloc_gfp = gfp;
- nr_pages = 1;
- order = 0;
- goto retry;
+ return folio;
}

IIUC, in the __swap_cache_alloc() implementation in patch 4, when shmem swapin falls back to order 0, it doesn't adjust the swap entry value like here. Because the original swap entry may not correspond to the swap entry for the order 0 index.

Of course, I haven't tested this yet, just pointing it out for you to double check.