[RFC PATCH v2 04/11] mm: khugepaged: add shmem mTHP collapse support
From: Baolin Wang
Date: Wed Jun 10 2026 - 06:39:50 EST
Khugepaged already supports anonymous mTHP collapse. Similarly, let
khugepaged support shmem mTHP collapse as well. The strategy for shmem
mTHP collapse follows that of anonymous mTHP collapse:
Track present pages via a bitmap while scanning PMD ranges for collapse
candidates. After the scan completes, use the bitmap to determine the
most efficient mTHP order to collapse to. Scale 'max_ptes_none' by the
attempted collapse order to determine the minimum fill threshold for
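eligibility. Similarly, shmem mTHP collapse rejects regions containing
swapped-out pages to avoid collapse "creep": bringing in new pages via
swapin may cause a further, higher-order collapse on a later rescan of
the same range.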
Currently, collapse_pte_mapped_thp() does not build the mapping for mTHP,
because we still expect the mTHP mapping to be established via refault
under the control of fault_around. So collapse_pte_mapped_thp() remains
responsible only for building the mapping for PMD-sized THP, which is
reasonable and keeps things simple.
Note that we do not need to remove pte page tables for shmem mTHP collapse.
Signed-off-by: Baolin Wang <baolin.wang@xxxxxxxxxxxxxxxxx>
---
mm/khugepaged.c | 115 ++++++++++++++++++++++++++++++++++++++----------
1 file changed, 91 insertions(+), 24 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0c8dfbd48410..818d51915748 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -135,6 +135,10 @@ static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
+static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
+ struct file *file, pgoff_t start,
+ struct collapse_control *cc, int order);
+
#ifdef CONFIG_SYSFS
static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -1487,6 +1491,7 @@ static unsigned int max_order_from_offset(unsigned int offset)
* mTHP.
*/
static enum scan_result mthp_collapse(struct mm_struct *mm,
+ struct file *file, pgoff_t start,
unsigned long address, int referenced, int unmapped,
struct collapse_control *cc, unsigned long enabled_orders)
{
@@ -1512,8 +1517,12 @@ static enum scan_result mthp_collapse(struct mm_struct *mm,
enum scan_result ret;
collapse_address = address + offset * PAGE_SIZE;
- ret = collapse_huge_page(mm, collapse_address, referenced,
- unmapped, cc, order);
+ if (file)
+ ret = collapse_file(mm, collapse_address, file,
+ start + offset, cc, order);
+ else
+ ret = collapse_huge_page(mm, collapse_address,
+ referenced, unmapped, cc, order);
switch (ret) {
/* Cases where we continue to next collapse candidate */
@@ -1521,6 +1530,7 @@ static enum scan_result mthp_collapse(struct mm_struct *mm,
collapsed += nr_ptes;
fallthrough;
case SCAN_PTE_MAPPED_HUGEPAGE:
+ case SCAN_PAGE_COMPOUND:
goto next_offset;
/* Cases where lower orders might still succeed */
case SCAN_ALLOC_HUGE_PAGE_FAIL:
@@ -1774,7 +1784,7 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
if (result == SCAN_SUCCEED) {
/* collapse_huge_page expects the lock to be dropped before calling */
mmap_read_unlock(mm);
- result = mthp_collapse(mm, start_addr, referenced,
+ result = mthp_collapse(mm, NULL, 0, start_addr, referenced,
unmapped, cc, enabled_orders);
/* mmap_lock was released above, set lock_dropped */
*lock_dropped = true;
@@ -2306,7 +2316,9 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
if (++nr_none > max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
- count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ if (is_pmd_order(order))
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
goto xa_locked;
}
@@ -2316,6 +2328,19 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
xas_unlock_irq(&xas);
+
+ /*
+ * TODO: Support swapin without leading to further mTHP
+ * collapses. Currently bringing in new pages via swapin may
+ * cause a future higher order collapse on a rescan of the same
+ * range.
+ */
+ if (!is_pmd_order(order)) {
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
+ result = SCAN_EXCEED_SWAP_PTE;
+ goto xa_unlocked;
+ }
+
/* swap in or instantiate fallocated page */
if (shmem_get_folio(mapping->host, index, 0,
&folio, SGP_NOALLOC)) {
@@ -2399,6 +2424,18 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
goto out_unlock;
}
+ /*
+ * If the folio order is already greater than or equal to the collapse
+ * order, there is no need to continue attempting to collapse.
+ * Return SCAN_PAGE_COMPOUND rather than SCAN_PTE_MAPPED_HUGEPAGE so
+ * that the mapping is built under the control of fault_around on
+ * refault.
+ */
+ if (folio_order(folio) >= order) {
+ result = SCAN_PAGE_COMPOUND;
+ goto out_unlock;
+ }
+
if (folio_mapping(folio) != mapping) {
result = SCAN_TRUNCATED;
goto out_unlock;
@@ -2621,12 +2658,11 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
xas_unlock_irq(&xas);
/*
- * Remove pte page tables, so we can re-fault the page as huge.
- * If MADV_COLLAPSE, adjust result to call try_collapse_pte_mapped_thp().
+ * Remove pte page tables for PMD-sized THP collapse, so we can
+ * re-fault the page as huge.
*/
- retract_page_tables(mapping, start);
- if (cc && !cc->is_khugepaged)
- result = SCAN_PTE_MAPPED_HUGEPAGE;
+ if (is_pmd_order(order))
+ retract_page_tables(mapping, start);
folio_unlock(new_folio);
/*
@@ -2675,22 +2711,35 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
}
static enum scan_result collapse_scan_file(struct mm_struct *mm,
- unsigned long addr, struct file *file, pgoff_t start,
- struct collapse_control *cc)
+ struct vm_area_struct *vma, unsigned long addr,
+ struct file *file, pgoff_t start, struct collapse_control *cc)
{
- const unsigned int max_ptes_none = collapse_max_ptes_none(cc, NULL, HPAGE_PMD_ORDER);
+ enum tva_type tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
+ unsigned int max_ptes_none = collapse_max_ptes_none(cc, NULL, HPAGE_PMD_ORDER);
const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
- struct folio *folio = NULL;
struct address_space *mapping = file->f_mapping;
XA_STATE(xas, &mapping->i_pages, start);
- int present, swap;
- int node = NUMA_NO_NODE;
enum scan_result result = SCAN_SUCCEED;
+ unsigned long enabled_orders, nr_pages;
+ struct folio *folio = NULL;
+ int node = NUMA_NO_NODE;
+ int present, swap;
+ pgoff_t pgoff;
present = 0;
swap = 0;
+ bitmap_zero(cc->mthp_present_ptes, MAX_PTRS_PER_PTE);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
+
+ enabled_orders = collapse_possible_orders(vma, vma->vm_flags, tva_flags);
+ /*
+ * If PMD is the only enabled order, enforce max_ptes_none, otherwise
+ * scan all pages to populate the bitmap for mTHP collapse.
+ */
+ if (enabled_orders != BIT(HPAGE_PMD_ORDER))
+ max_ptes_none = KHUGEPAGED_MAX_PTES_LIMIT;
+
rcu_read_lock();
xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) {
if (xas_retry(&xas, folio))
@@ -2754,7 +2803,17 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
* is just too costly...
*/
- present += folio_nr_pages(folio);
+ nr_pages = folio_nr_pages(folio);
+ present += nr_pages;
+
+ /*
+ * If there are folios present, keep track of them in the bitmap
+ * for file/shmem mTHP collapse.
+ */
+ pgoff = max_t(pgoff_t, start, folio->index) - start;
+ nr_pages = min_t(int, HPAGE_PMD_NR - pgoff, nr_pages);
+ bitmap_set(cc->mthp_present_ptes, pgoff, nr_pages);
+
folio_put(folio);
if (need_resched()) {
@@ -2768,15 +2827,23 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
else
cc->progress += HPAGE_PMD_NR;
- if (result == SCAN_SUCCEED) {
- if (present < HPAGE_PMD_NR - max_ptes_none) {
- result = SCAN_EXCEED_NONE_PTE;
- count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
- } else {
- result = collapse_file(mm, addr, file, start, cc, HPAGE_PMD_ORDER);
- }
+ if (result != SCAN_SUCCEED)
+ goto out;
+
+ if (present < HPAGE_PMD_NR - max_ptes_none) {
+ result = SCAN_EXCEED_NONE_PTE;
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ count_mthp_stat(HPAGE_PMD_ORDER,
+ MTHP_STAT_COLLAPSE_EXCEED_NONE);
+ goto out;
}
+ result = mthp_collapse(mm, file, start, addr, 0, 0, cc, enabled_orders);
+ if (result == SCAN_SUCCEED && !cc->is_khugepaged) {
+ /* If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp(). */
+ result = SCAN_PTE_MAPPED_HUGEPAGE;
+ }
+out:
trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
return result;
}
@@ -2808,7 +2875,7 @@ static enum scan_result collapse_single_pmd(unsigned long addr,
mmap_read_unlock(mm);
*lock_dropped = true;
retry:
- result = collapse_scan_file(mm, addr, file, pgoff, cc);
+ result = collapse_scan_file(mm, vma, addr, file, pgoff, cc);
/*
* For MADV_COLLAPSE, when encountering dirty pages, try to writeback,
--
2.47.3