[PATCH 07/13] mm: handle PMD swap entries in fork path
From: Usama Arif
Date: Mon Apr 27 2026 - 06:20:19 EST
Teach copy_huge_pmd()/copy_huge_non_present_pmd() about swap entries,
mirroring copy_nonpresent_pte().
swap_dup_entry_direct() gains a nr parameter (and is renamed to
swap_dup_entries_direct()) so it can duplicate a contiguous range of
swap slots in one call, matching the existing
swap_put_entries_direct(entry, nr) API. Existing callers pass 1.
copy_huge_non_present_pmd() "copies" PMD swap entries during fork
instead of splitting, preserving the THP. This mirrors
copy_nonpresent_pte() which duplicates the swap slot refcount,
clears the exclusive bit on the source, and adds the destination
mm to mmlist. If swap_dup_entries_direct() fails (GFP_ATOMIC table
alloc), copy_huge_pmd() retries after swap_retry_table_alloc() with
GFP_KERNEL, matching the PTE retry in copy_pte_range(). The PMD is
stable across the retry because dup_mmap() holds write mmap_lock on
both mm_structs.
Signed-off-by: Usama Arif <usama.arif@xxxxxxxxx>
---
include/linux/swap.h | 4 ++--
mm/huge_memory.c | 52 +++++++++++++++++++++++++++++++++++++++-----
mm/memory.c | 2 +-
mm/swapfile.c | 7 +++---
4 files changed, 53 insertions(+), 12 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1930f81e6be4..2f12c20baba1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -457,7 +457,7 @@ sector_t swap_folio_sector(struct folio *folio);
* All entries must be allocated by folio_alloc_swap(). And they must have
* a swap count > 1. See comments of folio_*_swap helpers for more info.
*/
-int swap_dup_entry_direct(swp_entry_t entry);
+int swap_dup_entries_direct(swp_entry_t entry, int nr);
void swap_put_entries_direct(swp_entry_t entry, int nr);
/*
@@ -501,7 +501,7 @@ static inline void free_swap_cache(struct folio *folio)
{
}
-static inline int swap_dup_entry_direct(swp_entry_t ent)
+static inline int swap_dup_entries_direct(swp_entry_t ent, int nr)
{
return 0;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9f67638e43c8..42887cf518cd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1867,7 +1867,7 @@ bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
return false;
}
-static void copy_huge_non_present_pmd(
+static int copy_huge_non_present_pmd(
struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
@@ -1913,14 +1913,35 @@ static void copy_huge_non_present_pmd(
*/
folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
dst_vma, src_vma);
+ } else if (softleaf_is_swap(entry)) {
+ int err;
+
+ /*
+ * PMD swap entry: duplicate swap references and clear
+ * exclusive on source, matching copy_nonpresent_pte().
+ */
+ err = swap_dup_entries_direct(entry, HPAGE_PMD_NR);
+ if (err < 0)
+ return err;
+
+ ensure_on_mmlist(dst_mm);
+
+ if (pmd_swp_exclusive(pmd)) {
+ pmd = pmd_swp_clear_exclusive(pmd);
+ set_pmd_at(src_mm, addr, src_pmd, pmd);
+ }
}
- add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ if (softleaf_is_swap(entry))
+ add_mm_counter(dst_mm, MM_SWAPENTS, HPAGE_PMD_NR);
+ else
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
if (!userfaultfd_wp(dst_vma))
pmd = pmd_swp_clear_uffd_wp(pmd);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+ return 0;
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1961,6 +1982,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (unlikely(!pgtable))
goto out;
+retry:
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -1968,10 +1990,28 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
- if (unlikely(thp_migration_supported() &&
- pmd_is_valid_softleaf(pmd))) {
- copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
- dst_vma, src_vma, pmd, pgtable);
+ if (unlikely(pmd_is_valid_softleaf(pmd))) {
+ ret = copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
+ addr, dst_vma, src_vma, pmd,
+ pgtable);
+ if (ret) {
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ /*
+ * For PMD swap entries -ENOMEM means the per-cluster
+ * swap-extend table couldn't be GFP_ATOMIC-allocated.
+ * try the GFP_KERNEL fallback once before giving up.
+ */
+ if (ret == -ENOMEM) {
+ softleaf_t entry = softleaf_from_pmd(pmd);
+
+ if (softleaf_is_swap(entry) &&
+ !swap_retry_table_alloc(entry, GFP_KERNEL))
+ goto retry;
+ }
+ pte_free(dst_mm, pgtable);
+ goto out;
+ }
ret = 0;
goto out_unlock;
}
diff --git a/mm/memory.c b/mm/memory.c
index 33d7cc274e23..8aa90afd601a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -934,7 +934,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
struct page *page;
if (likely(softleaf_is_swap(entry))) {
- if (swap_dup_entry_direct(entry) < 0)
+ if (swap_dup_entries_direct(entry, 1) < 0)
return -EIO;
ensure_on_mmlist(dst_mm);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c7e173b93e11..390f191be9a6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3801,8 +3801,9 @@ void si_swapinfo(struct sysinfo *val)
}
/*
- * swap_dup_entry_direct() - Increase reference count of a swap entry by one.
+ * swap_dup_entries_direct() - Increase reference count of swap entries by one.
* @entry: first swap entry from which we want to increase the refcount.
+ * @nr: number of contiguous swap entries to duplicate.
*
* Returns 0 for success, or -ENOMEM if the extend table is required
* but could not be atomically allocated. Returns -EINVAL if the swap
@@ -3814,7 +3815,7 @@ void si_swapinfo(struct sysinfo *val)
* Also the swap entry must have a count >= 1. Otherwise folio_dup_swap should
* be used.
*/
-int swap_dup_entry_direct(swp_entry_t entry)
+int swap_dup_entries_direct(swp_entry_t entry, int nr)
{
struct swap_info_struct *si;
@@ -3831,7 +3832,7 @@ int swap_dup_entry_direct(swp_entry_t entry)
*/
VM_WARN_ON_ONCE(!swap_entry_swapped(si, entry));
- return swap_dup_entries_cluster(si, swp_offset(entry), 1);
+ return swap_dup_entries_cluster(si, swp_offset(entry), nr);
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
--
2.52.0