[RFC PATCH 14/40] mm: page_alloc: add per-superpageblock free lists

From: Rik van Riel

Date: Wed May 20 2026 - 11:47:42 EST


Per-superpageblock free lists keep allocation steering effective at every
order: all pages belonging to a superpageblock are tracked on its own
free_area[NR_PAGE_ORDERS], not on the zone-level free_area. This lets
__rmqueue_smallest target a specific SPB by category/fullness without
walking the whole zone.

Sub-pageblock-order frees route to the containing SPB's free list via
__free_one_page; whole-pageblock and higher orders likewise. PCP refill,
buddy coalescing, and migratetype steering all consult the per-SPB
free_area.

Memory-hotplug correctness: once resize_zone_superpageblocks() can be
invoked on a previously-empty zone (memoryless NUMA node receiving its
first online memory, CXL hot-add into a zone with no prior pages), two
latent bugs surface:

- The SPB list heads (zone->spb_empty and the spb_lists[cat][full]
matrix) are initialized only by setup_superpageblocks(), which is
__init and runs only at boot. Hot-add into a previously-empty zone
invokes init_one_superpageblock() with zero-initialized list_heads,
and the inlined list_add_tail() NULL-derefs walking ->next->prev.
Factor list-head init out of setup_superpageblocks() into
init_zone_spb_lists(), call it from resize_zone_superpageblocks()
on the first-time path (zone->superpageblocks == NULL); subsequent
resizes skip it.

- The resize loop copies struct superpageblock entries to a newly
kvmalloc()'d array but does not fix up the embedded
free_area[order].free_list[mt] list_heads. Pages on those lists
have buddy_list.prev/next pointing into the *old* array's list
heads, so as soon as the swap takes effect, __rmqueue_smallest
walks pointers into freed memory. Extend the per-SPB list_replace
pass to walk all NR_PAGE_ORDERS * MIGRATE_TYPES free lists too.

The struct copy, the list-head fixup and the array swap must all happen
in a single critical section under zone->lock, so that a concurrent
allocator never observes partially relocated state; take the lock
around the copy+fixup+swap.

Signed-off-by: Rik van Riel <riel@xxxxxxxxxxx>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
include/linux/mmzone.h | 10 +
mm/compaction.c | 36 +-
mm/internal.h | 10 +
mm/mm_init.c | 146 +++++--
mm/page_alloc.c | 853 ++++++++++++++++++++++++++++++++---------
mm/vmstat.c | 66 ++--
6 files changed, 883 insertions(+), 238 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b8ada3d13a34..85846bb041a8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1021,9 +1021,19 @@ struct superpageblock {
u16 nr_reserved; /* holes, firmware, etc. */
u16 total_pageblocks; /* zone-clipped total */

+ /* Total free pages across all per-superpageblock free lists */
+ unsigned long nr_free_pages;
+
/* For organizing superpageblocks by fullness category */
struct list_head list;

+ /*
+ * Per-superpageblock free lists for all buddy orders.
+ * All pages belonging to this superpageblock are tracked here,
+ * keeping allocation steering effective at every order.
+ */
+ struct free_area free_area[NR_PAGE_ORDERS];
+
/* Identity */
unsigned long start_pfn;
struct zone *zone;
diff --git a/mm/compaction.c b/mm/compaction.c
index e8ca651e2b07..6d2aefdbc0c8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -979,6 +979,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
low_pfn += (1UL << order) - 1;
nr_scanned += (1UL << order) - 1;
}
+ /*
+ * Skipped a movable page; clearing
+ * PB_has_movable here would orphan SPB type
+ * counters (debugfs invariant 1).
+ */
+ movable_skipped = true;
goto isolate_fail;
}
/* for alloc_contig case */
@@ -1058,6 +1064,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
low_pfn += (1UL << order) - 1;
nr_scanned += (1UL << order) - 1;
}
+ /*
+ * Skipped a movable compound page; clearing
+ * PB_has_movable here would orphan SPB type
+ * counters (debugfs invariant 1).
+ */
+ movable_skipped = true;
goto isolate_fail;
}
}
@@ -1083,6 +1095,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
movable_skipped = true;
}

+ /*
+ * Non-LRU non-movable_ops page: still occupies the
+ * pageblock, so clearing PB_has_movable here would
+ * orphan SPB type counters (debugfs invariant 1).
+ */
+ movable_skipped = true;
goto isolate_fail;
}

@@ -1320,12 +1338,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* isolated (pinned, writeback, dirty, etc.), leave the
* flag set so a future migration attempt can try again.
*/
- if (!nr_isolated && !movable_skipped && valid_page &&
- get_pfnblock_bit(valid_page, pageblock_start_pfn(start_pfn),
- PB_has_movable))
- clear_pfnblock_bit(valid_page,
- pageblock_start_pfn(start_pfn),
- PB_has_movable);
+ if (!nr_isolated && !movable_skipped && valid_page)
+ superpageblock_clear_has_movable(cc->zone,
+ valid_page);
}

trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
@@ -1873,6 +1888,15 @@ static struct folio *compaction_alloc_noprof(struct folio *src, unsigned long da
prep_compound_page(&dst->page, order);
cc->nr_freepages -= 1 << order;
cc->nr_migratepages -= 1 << order;
+
+ /*
+ * Compaction isolates free pages via __isolate_free_page, which
+ * bypasses page_del_and_expand and its PB_has_* tracking. The
+ * destination will hold movable pages after migration, so mark
+ * PB_has_movable on the destination pageblock now.
+ */
+ superpageblock_set_has_movable(cc->zone, &dst->page);
+
return page_rmappable_folio(&dst->page);
}

diff --git a/mm/internal.h b/mm/internal.h
index 6a089bc4aa09..7091dc557f1f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1101,6 +1101,16 @@ void init_cma_reserved_pageblock(struct page *page);

#endif /* CONFIG_COMPACTION || CONFIG_CMA */

+#ifdef CONFIG_COMPACTION
+void superpageblock_clear_has_movable(struct zone *zone, struct page *page);
+void superpageblock_set_has_movable(struct zone *zone, struct page *page);
+#else
+static inline void superpageblock_clear_has_movable(struct zone *zone,
+ struct page *page) {}
+static inline void superpageblock_set_has_movable(struct zone *zone,
+ struct page *page) {}
+#endif
+
#ifdef CONFIG_MEMORY_HOTPLUG
void resize_zone_superpageblocks(struct zone *zone);
#endif
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 2dc73d8a8d6c..92e5f396cbd7 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1523,16 +1523,27 @@ static void __meminit init_one_superpageblock(struct superpageblock *sb,
unsigned long sb_end = start_pfn + SUPERPAGEBLOCK_NR_PAGES;
unsigned long pb_start = max(start_pfn, zone_start);
unsigned long pb_end = min(sb_end, zone_end);
+ int order, t;
u16 actual_pbs;

sb->nr_unmovable = 0;
sb->nr_reclaimable = 0;
sb->nr_movable = 0;
sb->nr_free = 0;
+ sb->nr_free_pages = 0;
INIT_LIST_HEAD(&sb->list);
sb->start_pfn = start_pfn;
sb->zone = zone;

+ /* Initialize per-superpageblock free areas */
+ for (order = 0; order < NR_PAGE_ORDERS; order++) {
+ struct free_area *area = &sb->free_area[order];
+
+ for (t = 0; t < MIGRATE_TYPES; t++)
+ INIT_LIST_HEAD(&area->free_list[t]);
+ area->nr_free = 0;
+ }
+
/*
* Start with all pageblock slots as reserved.
* init_pageblock_migratetype() will decrement nr_reserved and
@@ -1561,6 +1572,22 @@ static void __meminit init_one_superpageblock(struct superpageblock *sb,
}
}

+/*
+ * Initialize the per-zone SPB list heads. Called from boot
+ * (setup_superpageblocks) and from memory hotplug
+ * (resize_zone_superpageblocks) the first time SPBs are set up
+ * for a zone.
+ */
+static void __meminit init_zone_spb_lists(struct zone *zone)
+{
+ int cat, full;
+
+ INIT_LIST_HEAD(&zone->spb_empty);
+ for (cat = 0; cat < __NR_SB_CATEGORIES; cat++)
+ for (full = 0; full < __NR_SB_FULLNESS; full++)
+ INIT_LIST_HEAD(&zone->spb_lists[cat][full]);
+}
+
static void __init setup_superpageblocks(struct zone *zone)
{
unsigned long zone_start = zone->zone_start_pfn;
@@ -1568,17 +1595,22 @@ static void __init setup_superpageblocks(struct zone *zone)
unsigned long sb_base, nr_superpageblocks;
size_t alloc_size;
unsigned long i;
- int cat, full;

zone->superpageblocks = NULL;
zone->nr_superpageblocks = 0;
zone->superpageblock_base_pfn = 0;

/* Fullness lists steer allocations to preferred superpageblocks */
- INIT_LIST_HEAD(&zone->spb_empty);
- for (cat = 0; cat < __NR_SB_CATEGORIES; cat++)
- for (full = 0; full < __NR_SB_FULLNESS; full++)
- INIT_LIST_HEAD(&zone->spb_lists[cat][full]);
+ init_zone_spb_lists(zone);
+
+ /*
+ * Warn if pages have already been freed into this zone's
+ * free_area before superpageblocks are set up -- those pages
+ * would become stranded because __rmqueue_smallest only
+ * searches per-superpageblock free lists.
+ */
+ for (i = 0; i < NR_PAGE_ORDERS; i++)
+ WARN_ON_ONCE(zone->free_area[i].nr_free);

if (!zone->spanned_pages)
return;
@@ -1619,8 +1651,10 @@ static void __init setup_superpageblocks(struct zone *zone)
* the full zone span, copies existing superpageblocks (fixing up list heads),
* and initializes new superpageblocks for the added range.
*
- * Must be called under mem_hotplug_lock (write). No concurrent
- * allocations can occur since the hotplugged pages are not yet online.
+ * Must be called under mem_hotplug_lock (write). The hot-added pages
+ * themselves are not yet online, but allocations on previously-online
+ * pages within the same zone can still race the superpageblock-array
+ * swap; the function takes zone->lock for that critical section.
*/
void __meminit resize_zone_superpageblocks(struct zone *zone)
{
@@ -1634,6 +1668,7 @@ void __meminit resize_zone_superpageblocks(struct zone *zone)
size_t alloc_size;
unsigned long i;
int nid = zone_to_nid(zone);
+ unsigned long flags;

if (!zone->spanned_pages)
return;
@@ -1648,6 +1683,18 @@ void __meminit resize_zone_superpageblocks(struct zone *zone)
new_nr_sbs == zone->nr_superpageblocks)
return;

+ /*
+ * First time superpageblocks are being set up for this zone
+ * (memory hot-added to a previously-empty zone, e.g. CXL bringing
+ * a memoryless node online): the SPB fullness/category list heads
+ * are still zero-initialized from the zone struct allocation.
+ * setup_superpageblocks() runs only at boot via __init, so do that
+ * piece of init here for the hotplug path. Subsequent calls for
+ * the same zone will skip this -- superpageblocks is non-NULL.
+ */
+ if (!zone->superpageblocks)
+ init_zone_spb_lists(zone);
+
alloc_size = new_nr_sbs * sizeof(struct superpageblock);
new_sbs = kvmalloc_node(alloc_size, GFP_KERNEL | __GFP_ZERO, nid);
if (!new_sbs) {
@@ -1656,6 +1703,37 @@ void __meminit resize_zone_superpageblocks(struct zone *zone)
return;
}

+ /* Initialize new superpageblocks (not from old array) first, outside lock */
+ if (zone->superpageblocks) {
+ old_offset = (zone->superpageblock_base_pfn - new_sb_base) >>
+ SUPERPAGEBLOCK_ORDER;
+ } else {
+ old_offset = 0;
+ }
+
+ for (i = 0; i < new_nr_sbs; i++) {
+ struct superpageblock *sb = &new_sbs[i];
+ bool is_old = false;
+
+ if (zone->superpageblocks &&
+ i >= old_offset &&
+ i < old_offset + zone->nr_superpageblocks)
+ is_old = true;
+
+ if (is_old)
+ continue;
+
+ init_one_superpageblock(sb, zone,
+ new_sb_base + (i << SUPERPAGEBLOCK_ORDER),
+ zone_start, zone_end);
+ }
+
+ /*
+ * Take zone->lock for the copy+fixup+swap to prevent concurrent
+ * allocations from traversing free lists while we relocate them.
+ */
+ spin_lock_irqsave(&zone->lock, flags);
+
/*
* Copy existing superpageblocks to their new position.
* The old array covers [old_base, old_base + old_nr * SB_SIZE).
@@ -1669,39 +1747,39 @@ void __meminit resize_zone_superpageblocks(struct zone *zone)
zone->nr_superpageblocks * sizeof(struct superpageblock));

/*
- * Fix up list_head pointers that were self-referencing
- * (empty lists) or pointing into the old array.
+ * Fix up all list_head pointers: both the SPB category list
+ * and every free_area[order].free_list[migratetype]. Pages on
+ * buddy free lists have buddy_list.prev/next pointing at the
+ * old array's list heads -- those must be updated to point at
+ * the new array.
*/
for (i = old_offset; i < old_offset + zone->nr_superpageblocks; i++) {
struct superpageblock *sb = &new_sbs[i];
+ struct superpageblock *old_sb =
+ &zone->superpageblocks[i - old_offset];
+ int order, mt;

- if (list_empty(&sb->list))
+ /* Fix up sb->list (zone category/fullness list) */
+ if (list_empty(&old_sb->list))
INIT_LIST_HEAD(&sb->list);
else
- list_replace(&zone->superpageblocks[i - old_offset].list,
- &sb->list);
- }
- }
-
- /* Initialize new superpageblocks (slots not covered by old array) */
- for (i = 0; i < new_nr_sbs; i++) {
- struct superpageblock *sb = &new_sbs[i];
- bool is_old = false;
-
- if (zone->superpageblocks) {
- old_offset = (zone->superpageblock_base_pfn - new_sb_base) >>
- SUPERPAGEBLOCK_ORDER;
- if (i >= old_offset &&
- i < old_offset + zone->nr_superpageblocks)
- is_old = true;
+ list_replace(&old_sb->list, &sb->list);
+
+ /* Fix up all free_area list heads */
+ for (order = 0; order < NR_PAGE_ORDERS; order++) {
+ for (mt = 0; mt < MIGRATE_TYPES; mt++) {
+ struct list_head *old_list =
+ &old_sb->free_area[order].free_list[mt];
+ struct list_head *new_list =
+ &sb->free_area[order].free_list[mt];
+
+ if (list_empty(old_list))
+ INIT_LIST_HEAD(new_list);
+ else
+ list_replace(old_list, new_list);
+ }
+ }
}
-
- if (is_old)
- continue;
-
- init_one_superpageblock(sb, zone,
- new_sb_base + (i << SUPERPAGEBLOCK_ORDER),
- zone_start, zone_end);
}

/*
@@ -1740,6 +1818,8 @@ void __meminit resize_zone_superpageblocks(struct zone *zone)
zone->superpageblock_base_pfn = new_sb_base;
zone->spb_kvmalloced = true;

+ spin_unlock_irqrestore(&zone->lock, flags);
+
/*
* The boot-time array was allocated with memblock_alloc, which
* is not individually freeable after boot. Only kvfree arrays
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1b619304864a..b9c957fb4783 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -515,6 +515,140 @@ static void __spb_set_has_type(struct page *page, int migratetype)
}
}

+/*
+ * __spb_clear_has_type - clear PB_has_* and decrement type counter
+ *
+ * Idempotent: only decrements the counter on the 1→0 bit transition.
+ */
+static void __spb_clear_has_type(struct page *page, int migratetype)
+{
+ unsigned long pfn = page_to_pfn(page);
+ struct superpageblock *sb = pfn_to_superpageblock(page_zone(page), pfn);
+ int bit;
+
+ if (!sb)
+ return;
+
+ bit = migratetype_to_has_bit(migratetype);
+ if (bit < 0)
+ return;
+
+ if (get_pfnblock_bit(page, pfn, bit)) {
+ clear_pfnblock_bit(page, pfn, bit);
+ switch (bit) {
+ case PB_has_unmovable:
+ if (sb->nr_unmovable)
+ sb->nr_unmovable--;
+ break;
+ case PB_has_reclaimable:
+ if (sb->nr_reclaimable)
+ sb->nr_reclaimable--;
+ break;
+ case PB_has_movable:
+ if (sb->nr_movable)
+ sb->nr_movable--;
+ break;
+ }
+ }
+}
+
+#ifdef CONFIG_COMPACTION
+/*
+ * spb_pageblock_has_free_movable_fragments - probe SPB free lists for movable
+ * @zone: zone containing @page
+ * @page: any page within the target pageblock
+ *
+ * Returns true if the SPB containing @page has any free MOVABLE pages on its
+ * per-order free lists at orders below pageblock_order whose PFN falls within
+ * the target pageblock. The compaction migrate scanner only sees in-use pages,
+ * so a pageblock can look "empty of movable" to the scanner while the SPB
+ * still owns small-order MOVABLE fragments inside it. Clearing PB_has_movable
+ * in that case would orphan those fragments from the SPB type accounting and
+ * trigger debugfs invariant 1 (sum_types undercount).
+ *
+ * Returns false (no fragments found) when the SPB lookup fails, which
+ * preserves the legacy clear-on-empty behavior for edge cases.
+ *
+ * Caller must hold zone->lock.
+ */
+static bool spb_pageblock_has_free_movable_fragments(struct zone *zone,
+ struct page *page)
+{
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long pb_start = pageblock_start_pfn(pfn);
+ unsigned long pb_end = pb_start + pageblock_nr_pages;
+ unsigned long frag_pfn;
+ struct superpageblock *sb;
+ struct list_head *list;
+ struct page *frag;
+ unsigned int order;
+
+ sb = pfn_to_superpageblock(zone, pfn);
+ if (!sb)
+ return false;
+
+ for (order = 0; order < pageblock_order; order++) {
+ list = &sb->free_area[order].free_list[MIGRATE_MOVABLE];
+ list_for_each_entry(frag, list, buddy_list) {
+ frag_pfn = page_to_pfn(frag);
+ if (frag_pfn >= pb_start && frag_pfn < pb_end)
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/**
+ * superpageblock_clear_has_movable - clear PB_has_movable with SPB counter update
+ * @zone: zone containing the page
+ * @page: page within the pageblock
+ *
+ * Called from compaction when a full pageblock scan finds no movable pages.
+ * Clears PB_has_movable and decrements the SPB's nr_movable under zone->lock.
+ *
+ * Without this, clearing PB_has_movable directly via clear_pfnblock_bit()
+ * would leave the SPB counter stale, causing nr_movable to grow unbounded
+ * as subsequent movable allocations re-set the bit and re-increment.
+ *
+ * The migrate scanner only inspects in-use pages, so it is blind to MOVABLE
+ * fragments below pageblock_order sitting on the SPB free lists. Probe those
+ * lists first; if any fragment of @page's pageblock is still tracked by the
+ * SPB, leave PB_has_movable set so the SPB type accounting stays consistent
+ * (debugfs invariant 1: unmov + recl + mov + free >= total - rsv).
+ */
+void superpageblock_clear_has_movable(struct zone *zone, struct page *page)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ if (!spb_pageblock_has_free_movable_fragments(zone, page))
+ __spb_clear_has_type(page, MIGRATE_MOVABLE);
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/**
+ * superpageblock_set_has_movable - set PB_has_movable with SPB counter update
+ * @zone: zone containing the page
+ * @page: page within the pageblock
+ *
+ * Called from compaction when a movable page is migrated into a pageblock.
+ * Compaction bypasses page_del_and_expand (which normally sets PB_has_*)
+ * by using __isolate_free_page + direct migration, so PB_has_movable must
+ * be set explicitly for the destination pageblock.
+ *
+ * Idempotent: only increments the counter on the 0→1 bit transition.
+ */
+void superpageblock_set_has_movable(struct zone *zone, struct page *page)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ __spb_set_has_type(page, MIGRATE_MOVABLE);
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+#endif /* CONFIG_COMPACTION */
+
/**
* spb_get_category - Determine if a superpageblock is clean or tainted
* @sb: superpageblock to classify
@@ -585,7 +719,7 @@ static void spb_update_list(struct superpageblock *sb)

list_del_init(&sb->list);

- if (sb->nr_free == SUPERPAGEBLOCK_NR_PAGEBLOCKS) {
+ if (sb->nr_free == sb->total_pageblocks) {
list_add_tail(&sb->list, &zone->spb_empty);
return;
}
@@ -1023,12 +1157,41 @@ static inline void account_freepages(struct zone *zone, int nr_pages,
zone->nr_free_highatomic + nr_pages);
}

+/**
+ * pfn_sb_free_area - Get the correct free_area for a page at given order
+ * @zone: the zone
+ * @pfn: page frame number
+ * @order: buddy order
+ * @sbp: if non-NULL, receives the superpageblock containing @pfn, or NULL
+ *
+ * Returns the per-superpageblock free_area if @pfn belongs to a valid
+ * superpageblock; otherwise the zone free_area (zones whose SPB setup failed).
+ */
+static inline struct free_area *pfn_sb_free_area(struct zone *zone,
+ unsigned long pfn,
+ unsigned int order,
+ struct superpageblock **sbp)
+{
+ struct superpageblock *sb = pfn_to_superpageblock(zone, pfn);
+
+ if (sb) {
+ if (sbp)
+ *sbp = sb;
+ return &sb->free_area[order];
+ }
+ if (sbp)
+ *sbp = NULL;
+ return &zone->free_area[order];
+}
+
/* Used for pages not on another list */
static inline void __add_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype,
bool tail)
{
- struct free_area *area = &zone->free_area[order];
+ unsigned long pfn = page_to_pfn(page);
+ struct superpageblock *sb;
+ struct free_area *area = pfn_sb_free_area(zone, pfn, order, &sb);
int nr_pages = 1 << order;

VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
@@ -1041,6 +1204,13 @@ static inline void __add_to_free_list(struct page *page, struct zone *zone,
list_add(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;

+ if (sb) {
+ /* Keep zone-level nr_free accurate for watermark checks */
+ zone->free_area[order].nr_free++;
+ /* Track total free pages per superpageblock */
+ sb->nr_free_pages += nr_pages;
+ }
+
if (order >= pageblock_order && !is_migrate_isolate(migratetype))
__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
}
@@ -1053,7 +1223,8 @@ static inline void __add_to_free_list(struct page *page, struct zone *zone,
static inline void move_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int old_mt, int new_mt)
{
- struct free_area *area = &zone->free_area[order];
+ unsigned long pfn = page_to_pfn(page);
+ struct free_area *area = pfn_sb_free_area(zone, pfn, order, NULL);
int nr_pages = 1 << order;

/* Free page moving can fail, so it happens before the type update */
@@ -1077,6 +1248,9 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
static inline void __del_page_from_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
+ unsigned long pfn = page_to_pfn(page);
+ struct superpageblock *sb;
+ struct free_area *area = pfn_sb_free_area(zone, pfn, order, &sb);
int nr_pages = 1 << order;

VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
@@ -1090,7 +1264,14 @@ static inline void __del_page_from_free_list(struct page *page, struct zone *zon
list_del(&page->buddy_list);
__ClearPageBuddy(page);
set_page_private(page, 0);
- zone->free_area[order].nr_free--;
+ area->nr_free--;
+
+ if (sb) {
+ /* Keep zone-level nr_free accurate for watermark checks */
+ zone->free_area[order].nr_free--;
+ /* Track total free pages per superpageblock */
+ sb->nr_free_pages -= nr_pages;
+ }

if (order >= pageblock_order && !is_migrate_isolate(migratetype))
__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, -nr_pages);
@@ -1146,33 +1327,44 @@ static void change_pageblock_range(struct page *pageblock_page,
}
}

-/*
+/**
* mark_pageblock_free - handle a pageblock becoming fully free
* @page: page at the start of the pageblock
* @pfn: page frame number
+ * @migratetype: pointer to the caller's migratetype variable (may be updated)
*
- * Clear stale PCP ownership and actual-contents tracking flags when
- * buddy merging reconstructs a full pageblock or a whole pageblock is
- * freed directly. No PCP can still hold pages from this block (otherwise
- * the buddy merge couldn't have completed), so the ownership entry would
- * just cause misrouted frees.
+ * Clear stale PCP ownership and actual-contents tracking flags, mark the
+ * pageblock as fully free for superpageblock accounting, and reset an
+ * UNMOVABLE, RECLAIMABLE or HIGHATOMIC migratetype to MOVABLE so the page
+ * lands on free_list[MIGRATE_MOVABLE]. Non-movable allocations must go through
+ * RMQUEUE_CLAIM to reuse it, which handles PB_all_free and SPB accounting.
*/
-static void mark_pageblock_free(struct page *page, unsigned long pfn)
+static void mark_pageblock_free(struct page *page, unsigned long pfn,
+ int *migratetype)
{
clear_pcpblock_owner(page);

/*
- * The entire block is now free -- clear actual-contents tracking
- * flags since no allocated pages remain.
+ * Clear PB_has_* bits and decrement corresponding SPB type
+ * counters. Use __spb_clear_has_type (no list update) to avoid
+ * bouncing the SPB between lists; pb_now_free's spb_update_list
+ * handles the final reclassification.
*/
- clear_pfnblock_bit(page, pfn, PB_has_unmovable);
- clear_pfnblock_bit(page, pfn, PB_has_reclaimable);
- clear_pfnblock_bit(page, pfn, PB_has_movable);
+ __spb_clear_has_type(page, MIGRATE_UNMOVABLE);
+ __spb_clear_has_type(page, MIGRATE_RECLAIMABLE);
+ __spb_clear_has_type(page, MIGRATE_MOVABLE);

if (!get_pfnblock_bit(page, pfn, PB_all_free)) {
set_pfnblock_bit(page, pfn, PB_all_free);
superpageblock_pb_now_free(page);
}
+
+ if (*migratetype == MIGRATE_UNMOVABLE ||
+ *migratetype == MIGRATE_RECLAIMABLE ||
+ *migratetype == MIGRATE_HIGHATOMIC) {
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ *migratetype = MIGRATE_MOVABLE;
+ }
}

/*
@@ -1205,6 +1397,7 @@ static inline void __free_one_page(struct page *page,
int migratetype, fpi_t fpi_flags)
{
struct capture_control *capc = task_capc(zone);
+ unsigned int orig_order = order;
unsigned long buddy_pfn = 0;
unsigned long combined_pfn;
struct page *buddy;
@@ -1217,18 +1410,31 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);

- account_freepages(zone, 1 << order, migratetype);
+ if (order >= pageblock_order) {
+ int i, nr_pbs = 1 << (order - pageblock_order);

- /*
- * When freeing a whole pageblock, clear stale PCP ownership
- * and actual-contents tracking flags up front, and mark it
- * as fully free for superpageblock accounting. The in-loop
- * check only fires when sub-pageblock pages merge *up to*
- * pageblock_order, not when entering at pageblock_order
- * directly.
- */
- if (order == pageblock_order)
- mark_pageblock_free(page, pfn);
+ for (i = 0; i < nr_pbs; i++) {
+ int pb_mt = get_pfnblock_migratetype(
+ page + i * pageblock_nr_pages,
+ pfn + i * pageblock_nr_pages);
+ mark_pageblock_free(page + i * pageblock_nr_pages,
+ pfn + i * pageblock_nr_pages,
+ &pb_mt);
+ }
+ /*
+ * After mark_pageblock_free, non-CMA sub-pageblocks are
+ * MOVABLE. CMA pageblocks retain their CMA type so pages
+ * land on the correct free list for CMA allocations.
+ * ISOLATE pageblocks must stay ISOLATE so that
+ * account_freepages() correctly skips them -- otherwise
+ * NR_FREE_PAGES gets incremented for isolated pages.
+ */
+ if (!is_migrate_cma(migratetype) &&
+ !is_migrate_isolate(migratetype))
+ migratetype = MIGRATE_MOVABLE;
+ }
+
+ account_freepages(zone, 1 << order, migratetype);

while (order < MAX_PAGE_ORDER) {
int buddy_mt = migratetype;
@@ -1285,8 +1491,29 @@ static inline void __free_one_page(struct page *page,
* clear any stale PCP ownership and actual-contents
* tracking flags.
*/
- if (order == pageblock_order)
- mark_pageblock_free(page, pfn);
+ if (order == pageblock_order) {
+ int old_mt = migratetype;
+
+ mark_pageblock_free(page, pfn, &migratetype);
+ /*
+ * mark_pageblock_free may convert migratetype to
+ * MOVABLE. Transfer the accounting done earlier so
+ * nr_free_highatomic doesn't leak.
+ *
+ * We transfer 1 << orig_order pages -- the amount
+ * credited by this __free_one_page call. Buddies
+ * consumed during merging may also have HIGHATOMIC
+ * credits from their own frees; those are not tracked
+ * here. In practice HIGHATOMIC reserves are small and
+ * short-lived, so any residual drift is minor.
+ */
+ if (old_mt != migratetype) {
+ account_freepages(zone, -(1 << orig_order),
+ old_mt);
+ account_freepages(zone, 1 << orig_order,
+ migratetype);
+ }
+ }
}

done_merging:
@@ -2163,20 +2390,42 @@ static __always_inline void page_del_and_expand(struct zone *zone,
struct page *page, int low,
int high, int migratetype)
{
+ struct superpageblock *sb;
int nr_pages = 1 << high;

/*
* If we're splitting a page that spans at least a full pageblock,
- * the allocated pageblock transitions from fully-free to in-use.
- * Clear PB_all_free and update superpageblock accounting.
+ * each constituent pageblock transitions from fully-free to in-use.
+ * Clear PB_all_free and update superpageblock accounting for ALL
+ * pageblocks in the range, not just the first one.
*/
if (high >= pageblock_order) {
unsigned long pfn = page_to_pfn(page);
+ unsigned long end_pfn = pfn + (1 << high);

- if (get_pfnblock_bit(page, pfn, PB_all_free)) {
- clear_pfnblock_bit(page, pfn, PB_all_free);
- superpageblock_pb_now_used(page);
+ for (; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ struct page *pb_page = pfn_to_page(pfn);
+
+ if (get_pfnblock_bit(pb_page, pfn, PB_all_free)) {
+ clear_pfnblock_bit(pb_page, pfn, PB_all_free);
+ superpageblock_pb_now_used(pb_page);
+ }
+ __spb_set_has_type(pb_page, migratetype);
}
+ /* Single list update after all pageblocks processed */
+ sb = pfn_to_superpageblock(zone, page_to_pfn(page));
+ if (sb)
+ spb_update_list(sb);
+ } else {
+ /*
+ * Sub-pageblock allocation: set PB_has_<migratetype> for
+ * the containing pageblock. Idempotent: only increments
+ * the counter on the first allocation of this type.
+ */
+ __spb_set_has_type(page, migratetype);
+ sb = pfn_to_superpageblock(zone, page_to_pfn(page));
+ if (sb)
+ spb_update_list(sb);
}

__del_page_from_free_list(page, zone, high, migratetype);
@@ -2330,6 +2579,15 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
/* Bounded scan limit when searching free lists for tainted superpageblock pages */
#define SPB_SCAN_LIMIT 8

+/*
+ * Reserve free pageblocks in tainted superpageblocks for unmovable/reclaimable
+ * allocations. Movable allocations skip tainted superpageblocks that have
+ * fewer than this many free pageblocks, ensuring that unmovable claims
+ * always find room in existing tainted superpageblocks instead of spilling
+ * into clean ones.
+ */
+#define SPB_TAINTED_RESERVE 4
+
/**
* sb_preferred_for_movable - Find the fullest clean superpageblock for movable
* @zone: zone to search
@@ -2369,38 +2627,38 @@ static struct page *__rmqueue_from_sb(struct zone *zone, unsigned int order,
int migratetype, struct superpageblock *sb)
{
unsigned int current_order;
- unsigned long sb_start = sb->start_pfn;
- unsigned long sb_end = sb_start + (1UL << SUPERPAGEBLOCK_ORDER);
struct free_area *area;
struct page *page;
- int scanned;

- for (current_order = order; current_order < NR_PAGE_ORDERS;
+ /*
+ * Search the superpageblock's own free lists for all orders.
+ */
+ for (current_order = order;
+ current_order < NR_PAGE_ORDERS;
++current_order) {
- area = &zone->free_area[current_order];
- scanned = 0;
-
- list_for_each_entry(page, &area->free_list[migratetype],
- buddy_list) {
- unsigned long pfn = page_to_pfn(page);
+ area = &sb->free_area[current_order];
+ page = get_page_from_free_area(area, migratetype);
+ if (!page)
+ continue;

- if (pfn >= sb_start && pfn < sb_end) {
- page_del_and_expand(zone, page, order,
- current_order,
- migratetype);
- return page;
- }
- if (++scanned >= SPB_SCAN_LIMIT)
- break;
- }
+ page_del_and_expand(zone, page, order, current_order,
+ migratetype);
+ return page;
}
+
return NULL;
}

/*
* Go through the free lists for the given migratetype and remove
- * the smallest available page from the freelists
+ * the smallest available page from the freelists.
+ *
+ * When superpageblocks are enabled, search per-superpageblock free lists first,
+ * falling back to zone free lists for pages not in any superpageblock.
*/
+static struct page *claim_whole_block(struct zone *zone, struct page *page,
+ int current_order, int order, int new_type, int old_type);
+
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
@@ -2408,14 +2666,179 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
unsigned int current_order;
struct free_area *area;
struct page *page;
+ int full;
+ struct superpageblock *sb;
+ /*
+ * Category search order: 2 passes.
+ * Movable: clean first, then tainted (pack into clean SBs).
+ * Others: tainted first, then clean (concentrate in tainted SBs).
+ */
+ static const enum sb_category cat_order[2][2] = {
+ [0] = { SB_TAINTED, SB_CLEAN }, /* unmovable/reclaimable */
+ [1] = { SB_CLEAN, SB_TAINTED }, /* movable */
+ };
+ int movable = (migratetype == MIGRATE_MOVABLE) ? 1 : 0;

- /* Find a page of the appropriate size in the preferred list */
- for (current_order = order; current_order < NR_PAGE_ORDERS; ++current_order) {
+ /*
+ * Search per-superpageblock free lists for pages of the requested
+ * migratetype, walking superpageblocks from fullest to emptiest
+ * to pack allocations.
+ *
+ * For unmovable/reclaimable, prefer tainted superpageblocks to
+ * concentrate non-movable allocations into fewer superpageblocks.
+ * For movable, prefer clean superpageblocks to keep them homogeneous.
+ *
+ * Search empty superpageblocks between the preferred and fallback
+ * category passes to avoid movable allocations consuming free
+ * pageblocks in tainted superpageblocks (which unmovable needs for
+ * future CLAIMs), and vice versa.
+ */
+ for (full = SB_FULL; full < __NR_SB_FULLNESS; full++) {
+ enum sb_category cat = cat_order[movable][0];
+
+ list_for_each_entry(sb,
+ &zone->spb_lists[cat][full], list) {
+ if (!sb->nr_free_pages)
+ continue;
+ for (current_order = order;
+ current_order < NR_PAGE_ORDERS;
+ ++current_order) {
+ area = &sb->free_area[current_order];
+ page = get_page_from_free_area(
+ area, migratetype);
+ if (!page)
+ continue;
+ page_del_and_expand(zone, page,
+ order, current_order,
+ migratetype);
+ trace_mm_page_alloc_zone_locked(
+ page, order, migratetype,
+ pcp_allowed_order(order) &&
+ migratetype < MIGRATE_PCPTYPES);
+ return page;
+ }
+ }
+ }
+
+ /*
+ * For non-movable allocations, try to reclaim free pageblocks
+ * from tainted superpageblocks before looking at empty or clean
+ * ones. Free pageblocks in tainted SBs have pages on the MOVABLE
+ * free list (reset by mark_pageblock_free), so the search above
+ * misses them. Claim them inline to keep non-movable allocations
+ * concentrated in already-tainted superpageblocks.
+ */
+ if (!movable && !is_migrate_cma(migratetype)) {
+ for (full = SB_FULL; full < __NR_SB_FULLNESS; full++) {
+ list_for_each_entry(sb,
+ &zone->spb_lists[SB_TAINTED][full], list) {
+ if (!sb->nr_free)
+ continue;
+ for (current_order = max_t(unsigned int,
+ order, pageblock_order);
+ current_order < NR_PAGE_ORDERS;
+ ++current_order) {
+ area = &sb->free_area[current_order];
+ page = get_page_from_free_area(
+ area, MIGRATE_MOVABLE);
+ if (!page)
+ continue;
+ if (get_pageblock_isolate(page))
+ continue;
+ if (is_migrate_cma(
+ get_pageblock_migratetype(page)))
+ continue;
+ page = claim_whole_block(zone, page,
+ current_order, order,
+ migratetype, MIGRATE_MOVABLE);
+ trace_mm_page_alloc_zone_locked(
+ page, order, migratetype,
+ pcp_allowed_order(order) &&
+ migratetype < MIGRATE_PCPTYPES);
+ return page;
+ }
+ }
+ }
+ }
+
+ /* Empty superpageblocks: try before falling back to non-preferred category */
+ list_for_each_entry(sb, &zone->spb_empty, list) {
+ if (!sb->nr_free_pages)
+ continue;
+ for (current_order = max(order, pageblock_order);
+ current_order < NR_PAGE_ORDERS;
+ ++current_order) {
+ area = &sb->free_area[current_order];
+ page = get_page_from_free_area(area, migratetype);
+ if (!page)
+ continue;
+ page_del_and_expand(zone, page, order,
+ current_order, migratetype);
+ trace_mm_page_alloc_zone_locked(page, order,
+ migratetype,
+ pcp_allowed_order(order) &&
+ migratetype < MIGRATE_PCPTYPES);
+ return page;
+ }
+ }
+
+ /*
+ * Pass 4: movable allocations fall back to tainted SPBs.
+ * Non-movable allocations must NOT search clean SPBs here;
+ * stale migratetype labels create phantom non-movable free
+ * pages in clean SPBs that would cause unnecessary tainting.
+ * Let __rmqueue_claim and __rmqueue_steal handle non-movable
+ * fallback with proper ALLOC_NOFRAGMENT protection.
+ */
+ if (movable) {
+ for (full = SB_FULL; full < __NR_SB_FULLNESS; full++) {
+ enum sb_category cat = cat_order[movable][1];
+
+ list_for_each_entry(sb,
+ &zone->spb_lists[cat][full], list) {
+ if (!sb->nr_free_pages)
+ continue;
+ /*
+ * Movable falling back to tainted: skip SBs
+ * with few free pageblocks to reserve space
+ * for future unmovable/reclaimable claims.
+ */
+ if (sb->nr_free <= SPB_TAINTED_RESERVE)
+ continue;
+ for (current_order = order;
+ current_order < NR_PAGE_ORDERS;
+ ++current_order) {
+ area = &sb->free_area[current_order];
+ page = get_page_from_free_area(
+ area, migratetype);
+ if (!page)
+ continue;
+ page_del_and_expand(zone, page,
+ order, current_order,
+ migratetype);
+ trace_mm_page_alloc_zone_locked(
+ page, order, migratetype,
+ pcp_allowed_order(order) &&
+ migratetype < MIGRATE_PCPTYPES);
+ return page;
+ }
+ }
+ }
+ }
+
+ /*
+ * Zone free lists: all pages should be on superpageblock lists.
+ * Finding a page here means zone hotplug added memory without
+ * setting up superpageblocks for the new range.
+ */
+ for (current_order = order;
+ current_order < NR_PAGE_ORDERS; ++current_order) {
area = &(zone->free_area[current_order]);
page = get_page_from_free_area(area, migratetype);
if (!page)
continue;

+ WARN_ON_ONCE(zone->superpageblocks);
page_del_and_expand(zone, page, order, current_order,
migratetype);
trace_mm_page_alloc_zone_locked(page, order, migratetype,
@@ -2761,6 +3184,8 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
*
* Handle the PB_all_free → used transition, change the pageblock
* migratetype, split the block down to @order, and return the page.
+ * Used by both the claim fallback path and __rmqueue_smallest when
+ * reclaiming free pageblocks from tainted superpageblocks.
*/
static struct page *
claim_whole_block(struct zone *zone, struct page *page,
@@ -2772,11 +3197,6 @@ claim_whole_block(struct zone *zone, struct page *page,

VM_WARN_ON_ONCE(current_order < order);

- /*
- * Clear PB_all_free for pageblocks being claimed.
- * This path bypasses page_del_and_expand(), so we
- * must handle the free→used transition here.
- */
for (pb_pfn = page_to_pfn(page);
pb_pfn < page_to_pfn(page) + (1 << current_order);
pb_pfn += pageblock_nr_pages) {
@@ -2827,6 +3247,16 @@ try_to_claim_block(struct zone *zone, struct page *page,
if (get_pageblock_isolate(page))
return NULL;

+ /*
+ * Never steal from CMA pageblocks. CMA pages freed through
+ * PCP may land on the MOVABLE free list (PCP caches the
+ * allocation-time migratetype), making them visible to the
+ * fallback search. Stealing would corrupt CMA by changing
+ * the pageblock type away from MIGRATE_CMA.
+ */
+ if (is_migrate_cma(get_pageblock_migratetype(page)))
+ return NULL;
+
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order)
return claim_whole_block(zone, page, current_order, order,
@@ -2893,10 +3323,134 @@ try_to_claim_block(struct zone *zone, struct page *page,
return NULL;
}

+/*
+ * Search per-superpageblock free lists for a page of a fallback migratetype.
+ * Free pages of every order live on superpageblock free lists, not zone
+ * free lists, so __rmqueue_claim and __rmqueue_steal need this helper to
+ * find fallback pages.
+ *
+ * For unmovable/reclaimable allocations, prefer tainted superpageblocks to
+ * keep clean ones clean for future large contiguous allocations.
+ * For movable allocations, prefer clean superpageblocks to keep movable
+ * pages consolidated and superpageblocks homogeneous.
+ *
+ * @search_cats: bitmask controlling which categories to search.
+ * bit 0: search the preferred category (tainted for unmov, clean for mov)
+ * bit 1: search empty superpageblocks
+ * bit 2: search the fallback category (clean for unmov, tainted for mov)
+ * All bits set (SB_SEARCH_ALL) searches preferred, empty, then fallback.
+ */
+#define SB_SEARCH_PREFERRED (1 << 0)
+#define SB_SEARCH_EMPTY (1 << 1)
+#define SB_SEARCH_FALLBACK (1 << 2)
+#define SB_SEARCH_ALL (SB_SEARCH_PREFERRED | SB_SEARCH_EMPTY | SB_SEARCH_FALLBACK)
+
+/* Return the first free page on one of @start_migratetype's fallback lists. */
+static struct page *
+spb_area_find_fallback(struct free_area *area, int start_migratetype,
+		       int *fallback_mt)
+{
+	int i;
+
+	for (i = 0; i < MIGRATE_PCPTYPES - 1; i++) {
+		int fmt = fallbacks[start_migratetype][i];
+		struct page *page = get_page_from_free_area(area, fmt);
+
+		if (page) {
+			*fallback_mt = fmt;
+			return page;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the first fallback page at @order from the superpageblocks on @head.
+ * @keep_reserve skips those with nr_free <= SPB_TAINTED_RESERVE.
+ */
+static struct page *
+spb_list_find_fallback(struct list_head *head, unsigned int order,
+		       int start_migratetype, int *fallback_mt,
+		       bool keep_reserve)
+{
+	struct superpageblock *sb;
+	struct page *page;
+
+	list_for_each_entry(sb, head, list) {
+		if (keep_reserve && sb->nr_free <= SPB_TAINTED_RESERVE)
+			continue;
+
+		page = spb_area_find_fallback(&sb->free_area[order],
+					      start_migratetype, fallback_mt);
+		if (page)
+			return page;
+	}
+
+	return NULL;
+}
+
+/* Search every fullness list of category @cat, starting at SB_FULL. */
+static struct page *
+spb_cat_find_fallback(struct zone *zone, enum sb_category cat,
+		      unsigned int order, int start_migratetype,
+		      int *fallback_mt)
+{
+	/* Movable requests leave tainted SPBs a reserve for future claims. */
+	bool keep_reserve = cat == SB_TAINTED &&
+			    start_migratetype == MIGRATE_MOVABLE;
+	struct page *page;
+	int full;
+
+	for (full = SB_FULL; full < __NR_SB_FULLNESS; full++) {
+		page = spb_list_find_fallback(&zone->spb_lists[cat][full],
+					      order, start_migratetype,
+					      fallback_mt, keep_reserve);
+		if (page)
+			return page;
+	}
+
+	return NULL;
+}
+
+/* Entry point; @search_cats is described with the SB_SEARCH_* flags above. */
+static struct page *
+__rmqueue_sb_find_fallback(struct zone *zone, unsigned int order,
+			   int start_migratetype, int *fallback_mt,
+			   unsigned int search_cats)
+{
+	/* Movable: clean, then tainted. Others: tainted, then clean. */
+	static const enum sb_category cat_order[2][2] = {
+		[0] = { SB_TAINTED, SB_CLEAN },	/* unmovable/reclaimable */
+		[1] = { SB_CLEAN, SB_TAINTED },	/* movable */
+	};
+	int movable = (start_migratetype == MIGRATE_MOVABLE) ? 1 : 0;
+	struct page *page = NULL;
+
+	if (search_cats & SB_SEARCH_PREFERRED)
+		page = spb_cat_find_fallback(zone, cat_order[movable][0], order,
+					     start_migratetype, fallback_mt);
+
+	/* Empty superpageblocks sit between preferred and fallback. */
+	if (!page && (search_cats & SB_SEARCH_EMPTY))
+		page = spb_list_find_fallback(&zone->spb_empty, order,
+					      start_migratetype, fallback_mt, false);
+
+	if (!page && (search_cats & SB_SEARCH_FALLBACK))
+		page = spb_cat_find_fallback(zone, cat_order[movable][1], order,
+					     start_migratetype, fallback_mt);
+
+	return page;
+}
+
/*
* Try to allocate from some fallback migratetype by claiming the entire block,
* i.e. converting it to the allocation's start migratetype.
*
+ * Search by category first, then by order within each category, to avoid
+ * claiming clean/empty superpageblocks when tainted ones still have space
+ * at smaller orders.
+ *
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
@@ -2905,11 +3459,16 @@ static __always_inline struct page *
__rmqueue_claim(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
- struct free_area *area;
int current_order;
int min_order = order;
struct page *page;
int fallback_mt;
+ static const unsigned int cat_search[] = {
+ SB_SEARCH_PREFERRED,
+ SB_SEARCH_EMPTY,
+ SB_SEARCH_FALLBACK,
+ };
+ int c;

/*
* Do not steal pages from freelists belonging to other pageblocks
@@ -2920,65 +3479,34 @@ __rmqueue_claim(struct zone *zone, int order, int start_migratetype,
min_order = pageblock_order;

/*
- * Find the largest available free page in the other list. This roughly
- * approximates finding the pageblock with the most free pages, which
- * would be too costly to do exactly.
+ * Find the largest available free page in a fallback migratetype.
+ * Search each superpageblock category across all orders before
+ * moving to the next category, so that smaller blocks in tainted
+ * superpageblocks are preferred over larger blocks in empty/clean
+ * ones.
*/
- for (current_order = MAX_PAGE_ORDER; current_order >= min_order;
- --current_order) {
- area = &(zone->free_area[current_order]);
- fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, true);
-
- /* No block in that order */
- if (fallback_mt == -1)
- continue;
-
- /* Advanced into orders too low to claim, abort */
- if (fallback_mt == -2)
- break;
-
- page = get_page_from_free_area(area, fallback_mt);
+ for (c = 0; c < ARRAY_SIZE(cat_search); c++) {
+ for (current_order = MAX_PAGE_ORDER;
+ current_order >= min_order; --current_order) {
+ if (!should_try_claim_block(current_order,
+ start_migratetype))
+ break;
+ page = __rmqueue_sb_find_fallback(zone, current_order,
+ start_migratetype,
+ &fallback_mt, cat_search[c]);
+ if (!page)
+ continue;

- /*
- * For unmovable/reclaimable stealing, prefer pages from
- * tainted superpageblocks (already contaminated) to keep clean
- * superpageblocks clean for future 1GB allocations.
- */
- if (start_migratetype != MIGRATE_MOVABLE &&
- zone->superpageblocks && page) {
- struct superpageblock *sb;
- struct page *alt;
- int scanned = 0;
-
- sb = pfn_to_superpageblock(zone, page_to_pfn(page));
- if (sb && spb_get_category(sb) == SB_CLEAN) {
- list_for_each_entry(alt,
- &area->free_list[fallback_mt],
- buddy_list) {
- struct superpageblock *asb;
-
- if (++scanned > SPB_SCAN_LIMIT)
- break;
- asb = pfn_to_superpageblock(zone,
- page_to_pfn(alt));
- if (asb && spb_get_category(asb) ==
- SB_TAINTED) {
- page = alt;
- break;
- }
- }
+ page = try_to_claim_block(zone, page, current_order,
+ order, start_migratetype,
+ fallback_mt, alloc_flags);
+ if (page) {
+ trace_mm_page_alloc_extfrag(page, order,
+ current_order, start_migratetype,
+ fallback_mt);
+ return page;
}
}
-
- page = try_to_claim_block(zone, page, current_order, order,
- start_migratetype, fallback_mt,
- alloc_flags);
- if (page) {
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, fallback_mt);
- return page;
- }
}

return NULL;
@@ -2992,19 +3520,23 @@ static __always_inline struct page *
__rmqueue_steal(struct zone *zone, int order, int start_migratetype)
{
struct superpageblock *sb;
- struct free_area *area;
int current_order;
struct page *page;
int fallback_mt;

+ /*
+ * Search per-superpageblock free lists for fallback migratetypes.
+ * Superpageblocks are always enabled for populated zones.
+ */
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
- area = &(zone->free_area[current_order]);
- fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false);
- if (fallback_mt == -1)
+ page = __rmqueue_sb_find_fallback(zone, current_order,
+ start_migratetype,
+ &fallback_mt,
+ SB_SEARCH_PREFERRED | SB_SEARCH_FALLBACK);
+
+ if (!page)
continue;

- page = get_page_from_free_area(area, fallback_mt);
page_del_and_expand(zone, page, order, current_order, fallback_mt);

/*
@@ -3239,33 +3771,11 @@ static bool rmqueue_bulk(struct zone *zone, unsigned int order,
goto out;

/*
- * Phase 2: Zone too fragmented for whole pageblocks.
- * Sweep zone free lists top-down for same-migratetype
- * chunks. Avoids cross-type stealing and keeps PCP
- * functional under fragmentation.
- *
- * No ownership claim or PagePCPBuddy - these are
- * sub-pageblock fragments cached for batching only.
- *
- * Stop above the requested order -- at that point,
- * phase 3's __rmqueue() does the same lookup but with
- * migratetype fallback.
+ * Phase 2 was removed: it swept zone free lists for sub-pageblock
+ * fragments, which are always empty when superpageblocks are enabled.
+ * Phase 3's __rmqueue() -> __rmqueue_smallest() properly searches
+ * per-superpageblock free lists at all orders.
*/
- for (o = pageblock_order - 1;
- o > (int)order && refilled < pages_needed; o--) {
- struct free_area *area = &zone->free_area[o];
- struct page *page;
-
- while (refilled + (1 << o) <= pages_needed) {
- page = get_page_from_free_area(area, migratetype);
- if (!page)
- break;
-
- del_page_from_free_list(page, zone, o, migratetype);
- pcp_enqueue_tail(pcp, page, migratetype, o);
- refilled += 1 << o;
- }
- }

/*
* Phase 3: Last resort. Use __rmqueue() which does
@@ -4367,10 +4877,19 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,

spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < NR_PAGE_ORDERS; order++) {
- struct free_area *area = &(zone->free_area[order]);
+ struct free_area *area;
+ struct superpageblock *sb;
unsigned long size;
-
- page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
+ unsigned long i;
+
+ page = NULL;
+ /* Search per-superpageblock free lists */
+ for (i = 0; i < zone->nr_superpageblocks && !page; i++) {
+ sb = &zone->superpageblocks[i];
+ area = &sb->free_area[order];
+ page = get_page_from_free_area(area,
+ MIGRATE_HIGHATOMIC);
+ }
if (!page)
continue;

@@ -4501,29 +5020,20 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
if (!order)
return true;

- /* For a high-order request, check at least one suitable page is free */
+ /*
+ * For a high-order request, check that a page of a suitable order is
+ * free. Zone free_area nr_free is shadowed: it includes pages on
+ * per-superpageblock free lists, so zone list heads may be empty.
+ * NOTE(review): nr_free spans all migratetypes, so this now passes even
+ * when only CMA/HIGHATOMIC/isolated pages are free -- confirm intended.
+ */
for (o = order; o < NR_PAGE_ORDERS; o++) {
struct free_area *area = &z->free_area[o];
- int mt;

if (!area->nr_free)
continue;

- for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
- if (!free_area_empty(area, mt))
- return true;
- }
-
-#ifdef CONFIG_CMA
- if ((alloc_flags & ALLOC_CMA) &&
- !free_area_empty(area, MIGRATE_CMA)) {
- return true;
- }
-#endif
- if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) &&
- !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
- return true;
- }
+ return true;
}
return false;
}
@@ -8991,11 +9501,12 @@ static int superpageblock_debugfs_show(struct seq_file *m, void *v)
/* Per-superpageblock detail */
for (i = 0; i < zone->nr_superpageblocks; i++) {
sb = &zone->superpageblocks[i];
- seq_printf(m, " sb[%lu] pfn=0x%lx: unmov=%u recl=%u mov=%u rsv=%u free=%u total=%u\n",
+ seq_printf(m, " sb[%lu] pfn=0x%lx: unmov=%u recl=%u mov=%u rsv=%u free=%u total=%u free_pages=%lu\n",
i, sb->start_pfn,
sb->nr_unmovable, sb->nr_reclaimable,
sb->nr_movable, sb->nr_reserved,
- sb->nr_free, sb->total_pageblocks);
+ sb->nr_free, sb->total_pageblocks,
+ sb->nr_free_pages);
}
}
return 0;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7b48b84287a7..9133254b6b87 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1575,41 +1575,51 @@ static int frag_show(struct seq_file *m, void *arg)
static void pagetypeinfo_showfree_print(struct seq_file *m,
pg_data_t *pgdat, struct zone *zone)
{
+ unsigned long counts[MIGRATE_TYPES][NR_PAGE_ORDERS] = { };
+ bool overflow[MIGRATE_TYPES][NR_PAGE_ORDERS] = { };
+ unsigned long sb_idx, nr_sbs = zone->nr_superpageblocks;
int order, mtype;

+ /*
+ * Free pages live on per-superpageblock free lists. Walk the SPBs,
+ * accumulating per (migratetype, order) counts; each cell saturates
+ * at 100000 across all SPBs, "a lot" is enough for a debug interface.
+ * zone->lock is dropped between SPBs to bound hold time, so concurrent
+ * hotplug may skew the counts. NOTE(review): nr_sbs is sampled once --
+ * confirm the SPB array cannot shrink while the lock is dropped.
+ */
+ for (sb_idx = 0; sb_idx < nr_sbs; sb_idx++) {
+ struct superpageblock *sb = &zone->superpageblocks[sb_idx];
+
+ for (order = 0; order < NR_PAGE_ORDERS; order++) {
+ struct free_area *area = &sb->free_area[order];
+ struct list_head *curr;
+
+ for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
+ if (overflow[mtype][order])
+ continue;
+ list_for_each(curr, &area->free_list[mtype]) {
+ if (++counts[mtype][order] >= 100000) {
+ overflow[mtype][order] = true;
+ break;
+ }
+ }
+ }
+ }
+ spin_unlock_irq(&zone->lock);
+ cond_resched();
+ spin_lock_irq(&zone->lock);
+ }
+
for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
seq_printf(m, "Node %4d, zone %8s, type %12s ",
pgdat->node_id,
zone->name,
migratetype_names[mtype]);
- for (order = 0; order < NR_PAGE_ORDERS; ++order) {
- unsigned long freecount = 0;
- struct free_area *area;
- struct list_head *curr;
- bool overflow = false;
-
- area = &(zone->free_area[order]);
-
- list_for_each(curr, &area->free_list[mtype]) {
- /*
- * Cap the free_list iteration because it might
- * be really large and we are under a spinlock
- * so a long time spent here could trigger a
- * hard lockup detector. Anyway this is a
- * debugging tool so knowing there is a handful
- * of pages of this order should be more than
- * sufficient.
- */
- if (++freecount >= 100000) {
- overflow = true;
- break;
- }
- }
- seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
- spin_unlock_irq(&zone->lock);
- cond_resched();
- spin_lock_irq(&zone->lock);
- }
+ for (order = 0; order < NR_PAGE_ORDERS; order++)
+ seq_printf(m, "%s%6lu ",
+ overflow[mtype][order] ? ">" : "",
+ counts[mtype][order]);
seq_putc(m, '\n');
}
}
--
2.54.0