Re: [PATCH v3 05/12] mm, swap: unify large folio allocation

From: Kairui Song

Date: Wed May 13 2026 - 02:49:16 EST

On Tue, May 12, 2026 at 6:14 PM Baolin Wang
<baolin.wang@xxxxxxxxxxxxxxxxx> wrote:
>
> On 4/21/26 2:16 PM, Kairui Song via B4 Relay wrote:
> > From: Kairui Song <kasong@xxxxxxxxxxx>
> >
> > Now that direct large order allocation is supported in the swap cache,
> > both anon and shmem can use it instead of implementing their own methods.
> > This unifies the fallback and swap cache check, which also reduces the
> > TOCTOU race window of swap cache state: previously, high order swapin
> > required checking swap cache states first, then allocating and falling
> > back separately. Now all these steps happen in the same compact loop.
> >
> > Order fallback and statistics are also unified, callers just need to
> > check and pass the acceptable order bitmask.
> >
> > There is basically no behavior change. This only makes things more
> > unified and prepares for later commits. Cgroup and zero map checks can
> > also be moved into the compact loop, further reducing race windows and
> > redundancy
> >
> > Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
> > ---
> > mm/memory.c | 77 ++++++------------------------
> > mm/shmem.c | 94 +++++++++---------------------------
> > mm/swap.h | 30 ++----------
> > mm/swap_state.c | 145 ++++++++++----------------------------------------------
> > mm/swapfile.c | 3 +-
> > 5 files changed, 67 insertions(+), 282 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index ea6568571131..404734a5bcff 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -4593,26 +4593,6 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
> > return VM_FAULT_SIGBUS;
> > }
> >
> > -static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
> > -{
> > - struct vm_area_struct *vma = vmf->vma;
> > - struct folio *folio;
> > - softleaf_t entry;
> > -
> > - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
> > - if (!folio)
> > - return NULL;
> > -
> > - entry = softleaf_from_pte(vmf->orig_pte);
> > - if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> > - GFP_KERNEL, entry)) {
> > - folio_put(folio);
> > - return NULL;
> > - }
> > -
> > - return folio;
> > -}
> > -
> > #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > /*
> > * Check if the PTEs within a range are contiguous swap entries
> > @@ -4642,8 +4622,6 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
> > */
> > if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
> > return false;
> > - if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
> > - return false;
> >
> > return true;
> > }
> > @@ -4671,16 +4649,14 @@ static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
> > return orders;
> > }
> >
> > -static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> > +static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
> > {
> > struct vm_area_struct *vma = vmf->vma;
> > unsigned long orders;
> > - struct folio *folio;
> > unsigned long addr;
> > softleaf_t entry;
> > spinlock_t *ptl;
> > pte_t *pte;
> > - gfp_t gfp;
> > int order;
> >
> > /*
> > @@ -4688,7 +4664,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> > * maintain the uffd semantics.
> > */
> > if (unlikely(userfaultfd_armed(vma)))
> > - goto fallback;
> > + return 0;
> >
> > /*
> > * A large swapped out folio could be partially or fully in zswap. We
> > @@ -4696,7 +4672,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> > * folio.
> > */
> > if (!zswap_never_enabled())
> > - goto fallback;
> > + return 0;
> >
> > entry = softleaf_from_pte(vmf->orig_pte);
> > /*
> > @@ -4710,12 +4686,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> > vmf->address, orders);
> >
> > if (!orders)
> > - goto fallback;
> > + return 0;
> >
> > pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
> > vmf->address & PMD_MASK, &ptl);
> > if (unlikely(!pte))
> > - goto fallback;
> > + return 0;
> >
> > /*
> > * For do_swap_page, find the highest order where the aligned range is
> > @@ -4731,29 +4707,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> >
> > pte_unmap_unlock(pte, ptl);
> >
> > - /* Try allocating the highest of the remaining orders. */
> > - gfp = vma_thp_gfp_mask(vma);
> > - while (orders) {
> > - addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> > - folio = vma_alloc_folio(gfp, order, vma, addr);
> > - if (folio) {
> > - if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> > - gfp, entry))
> > - return folio;
> > - count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
> > - folio_put(folio);
> > - }
> > - count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
> > - order = next_order(&orders, order);
> > - }
> > -
> > -fallback:
> > - return __alloc_swap_folio(vmf);
> > + return orders;
> > }
> > #else /* !CONFIG_TRANSPARENT_HUGEPAGE */
> > -static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> > +static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
> > {
> > - return __alloc_swap_folio(vmf);
> > + return 0;
> > }
> > #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
> >
> > @@ -4859,21 +4818,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > if (folio)
> > swap_update_readahead(folio, vma, vmf->address);
> > if (!folio) {
> > - if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
> > - folio = alloc_swap_folio(vmf);
> > - if (folio) {
> > - /*
> > - * folio is charged, so swapin can only fail due
> > - * to raced swapin and return NULL.
> > - */
> > - swapcache = swapin_folio(entry, folio);
> > - if (swapcache != folio)
> > - folio_put(folio);
> > - folio = swapcache;
> > - }
> > - } else {
> > + /* Swapin bypasses readahead for SWP_SYNCHRONOUS_IO devices */
> > + if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
> > + folio = swapin_entry(entry, GFP_HIGHUSER_MOVABLE,
> > + thp_swapin_suitable_orders(vmf),
> > + vmf, NULL, 0);
> > + else
> > folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
> > - }
> >
> > if (!folio) {
> > /*
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 5916acf594a8..17e3da11bb1d 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -159,7 +159,7 @@ static unsigned long shmem_default_max_inodes(void)
> >
> > static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> > struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
> > - struct vm_area_struct *vma, vm_fault_t *fault_type);
> > + struct vm_fault *vmf, vm_fault_t *fault_type);
> >
> > static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
> > {
> > @@ -2017,68 +2017,24 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
> > }
> >
> > static struct folio *shmem_swap_alloc_folio(struct inode *inode,
> > - struct vm_area_struct *vma, pgoff_t index,
> > + struct vm_fault *vmf, pgoff_t index,
> > swp_entry_t entry, int order, gfp_t gfp)
> > {
> > + pgoff_t ilx;
> > + struct folio *folio;
> > + struct mempolicy *mpol;
> > + unsigned long orders = BIT(order);
> > struct shmem_inode_info *info = SHMEM_I(inode);
> > - struct folio *new, *swapcache;
> > - int nr_pages = 1 << order;
> > - gfp_t alloc_gfp = gfp;
> > -
> > - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
> > - if (WARN_ON_ONCE(order))
> > - return ERR_PTR(-EINVAL);
> > - } else if (order) {
> > - /*
> > - * If uffd is active for the vma, we need per-page fault
> > - * fidelity to maintain the uffd semantics, then fallback
> > - * to swapin order-0 folio, as well as for zswap case.
> > - * Any existing sub folio in the swap cache also blocks
> > - * mTHP swapin.
> > - */
> > - if ((vma && unlikely(userfaultfd_armed(vma))) ||
> > - !zswap_never_enabled() ||
> > - non_swapcache_batch(entry, nr_pages) != nr_pages)
> > - goto fallback;
> >
> > - alloc_gfp = thp_limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
> > - }
> > -retry:
> > - new = shmem_alloc_folio(alloc_gfp, order, info, index);
> > - if (!new) {
> > - new = ERR_PTR(-ENOMEM);
> > - goto fallback;
> > - }
> > + if ((vmf && unlikely(userfaultfd_armed(vmf->vma))) ||
> > + !zswap_never_enabled())
> > + orders = 0;
> >
> > - if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
> > - alloc_gfp, entry)) {
> > - folio_put(new);
> > - new = ERR_PTR(-ENOMEM);
> > - goto fallback;
> > - }
> > + mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
> > + folio = swapin_entry(entry, gfp, orders, vmf, mpol, ilx);
> > + mpol_cond_put(mpol);
> >
> > - swapcache = swapin_folio(entry, new);
> > - if (swapcache != new) {
> > - folio_put(new);
> > - if (!swapcache) {
> > - /*
> > - * The new folio is charged already, swapin can
> > - * only fail due to another raced swapin.
> > - */
> > - new = ERR_PTR(-EEXIST);
> > - goto fallback;
> > - }
> > - }
> > - return swapcache;
> > -fallback:
> > - /* Order 0 swapin failed, nothing to fallback to, abort */
> > - if (!order)
> > - return new;
> > - entry.val += index - round_down(index, nr_pages);
> > - alloc_gfp = gfp;
> > - nr_pages = 1;
> > - order = 0;
> > - goto retry;
> > + return folio;
> > }
>
> IIUC, in the __swap_cache_alloc() implementation in patch 4, when shmem
> swapin falls back to order 0, it doesn't adjust the swap entry value
> like here. Because the original swap entry may not correspond to the
> swap entry for the order 0 index.
>
> Of course, I haven't tested this yet, just pointing it out for you to
> double check.

Thanks for pointing it out. No worries, this commit already contains the
change below:

/* Direct swapin skipping swap cache & readahead */
- folio = shmem_swap_alloc_folio(inode, vma, index,
- index_entry, order, gfp);
- if (IS_ERR(folio)) {
- error = PTR_ERR(folio);
- folio = NULL;
- goto failed;
- }
+ folio = shmem_swap_alloc_folio(inode, vmf, index,
+ swap, order, gfp);

It's using swap instead of index_entry now, so __swap_cache_alloc() will
do the round down for large orders instead, and skip the round_down if
the order is zero. So we are fine here.