[RFC PATCH 06/40] mm: page_alloc: async evacuation of stolen movable pageblocks

From: Rik van Riel

Date: Wed May 20 2026 - 11:47:23 EST


When the page allocator steals a movable pageblock for unmovable or
reclaimable allocations (via try_to_claim_block()), the movable pages
still in that block keep occupying space that later
unmovable/reclaimable allocations could otherwise use. Those
allocations then spill into additional pageblocks instead of staying
concentrated in a few, leading to long-term memory fragmentation.

Add a lightweight asynchronous evacuation mechanism: when a movable
pageblock is claimed for unmovable/reclaimable use, queue a work item to
migrate the remaining movable pages out. This allows future
unmovable/reclaimable allocations to be satisfied from the now-evacuated
block, keeping those allocation types concentrated and reducing
fragmentation.

Signed-off-by: Rik van Riel <riel@xxxxxxxxxxx>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
include/linux/mmzone.h | 4 +
mm/page_alloc.c | 223 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 227 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 13e29b2ebb86..90498bbbf60b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -22,6 +22,7 @@
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/local_lock.h>
+#include <linux/irq_work_types.h>
#include <linux/zswap.h>
#include <linux/sizes.h>
#include <asm/page.h>
@@ -1540,6 +1541,9 @@ typedef struct pglist_data {
wait_queue_head_t kcompactd_wait;
struct task_struct *kcompactd;
bool proactive_compact_trigger;
+ struct workqueue_struct *evacuate_wq;
+ struct llist_head evacuate_pending;
+ struct irq_work evacuate_irq_work;
#endif
/*
* This is a per-node reserve of pages that are not available
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e01e58aca54..0f3d734bd296 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,6 +18,7 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/interrupt.h>
+#include <linux/irq_work.h>
#include <linux/jiffies.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
@@ -51,6 +52,7 @@
#include <linux/lockdep.h>
#include <linux/psi.h>
#include <linux/khugepaged.h>
+#include <linux/workqueue.h>
#include <linux/delayacct.h>
#include <linux/cacheinfo.h>
#include <linux/pgalloc_tag.h>
@@ -59,6 +61,10 @@
#include "shuffle.h"
#include "page_reporting.h"

+#ifdef CONFIG_COMPACTION
+/*
+ * Forward declaration: try_to_claim_block() calls this, but the definition
+ * sits further down in this file, inside its own CONFIG_COMPACTION section.
+ */
+static void queue_pageblock_evacuate(struct zone *zone, unsigned long pfn);
+#endif
+
/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;

@@ -2428,6 +2434,13 @@ try_to_claim_block(struct zone *zone, struct page *page,
int free_pages, movable_pages, alike_pages;
unsigned long start_pfn;

+ /*
+ * Don't steal from pageblocks that are isolated for
+ * evacuation -- that would undo the work in progress.
+ */
+ if (get_pageblock_isolate(page))
+ return NULL;
+
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
unsigned int nr_added;
@@ -2473,6 +2486,18 @@ try_to_claim_block(struct zone *zone, struct page *page,
page_group_by_mobility_disabled) {
__move_freepages_block(zone, start_pfn, block_type, start_type);
set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
+#ifdef CONFIG_COMPACTION
+ /*
+ * A movable pageblock was just claimed for unmovable or
+ * reclaimable use. Queue async evacuation of the remaining
+ * movable pages so future unmovable/reclaimable allocations
+ * can stay concentrated in fewer pageblocks.
+ */
+ if (block_type == MIGRATE_MOVABLE &&
+ (start_type == MIGRATE_UNMOVABLE ||
+ start_type == MIGRATE_RECLAIMABLE))
+ queue_pageblock_evacuate(zone, start_pfn);
+#endif
return __rmqueue_smallest(zone, order, start_type);
}

@@ -7184,6 +7209,204 @@ void __init page_alloc_sysctl_init(void)
register_sysctl_init("vm", page_alloc_sysctl_table);
}

+#ifdef CONFIG_COMPACTION
+/*
+ * Pageblock evacuation: asynchronously migrate movable pages out of
+ * pageblocks that were stolen for unmovable/reclaimable allocations.
+ * This keeps unmovable/reclaimable allocations concentrated in fewer
+ * pageblocks, reducing long-term fragmentation.
+ *
+ * Uses a global pool of 64 pre-allocated work items (~3.5KB total)
+ * and a per-pgdat workqueue to keep migration node-local.
+ */
+
+/*
+ * One pending request to evacuate a single pageblock. Items are never
+ * allocated dynamically; they live in evacuate_pool[] and cycle between the
+ * free list, a node's pending list and that node's workqueue.
+ */
+struct evacuate_item {
+ struct work_struct work; /* set up with evacuate_work_fn() right before queue_work() */
+ struct zone *zone; /* zone that contains the pageblock */
+ unsigned long start_pfn; /* first PFN of the pageblock to evacuate */
+ struct llist_node free_node; /* links the item on evacuate_freelist or, despite the name, on pgdat->evacuate_pending */
+};
+
+/*
+ * Fixed number of request slots. When every slot is in use,
+ * evacuate_item_alloc() returns NULL and queue_pageblock_evacuate()
+ * silently drops the request.
+ */
+#define NR_EVACUATE_ITEMS 64
+static struct evacuate_item evacuate_pool[NR_EVACUATE_ITEMS];
+/* Lock-less list of the evacuate_pool[] entries that are currently unused. */
+static struct llist_head evacuate_freelist;
+
+/*
+ * llist_del_first() may only race with llist_add() when there is a single
+ * consumer at a time; two concurrent llist_del_first() callers can hit an
+ * ABA problem on the list head. The allocator can reach
+ * queue_pageblock_evacuate() on several CPUs at once (the zone lock only
+ * serialises callers within one zone), so pops from the shared free list
+ * are serialised here. Pushes via llist_add() stay lock-free.
+ */
+static DEFINE_SPINLOCK(evacuate_freelist_lock);
+
+/*
+ * Take one unused request slot from the global pool.
+ *
+ * Does not sleep and does not allocate memory, so it is usable from the
+ * page allocator's locked, IRQ-disabled paths.
+ *
+ * Returns the slot, or NULL when all NR_EVACUATE_ITEMS slots are in use.
+ */
+static struct evacuate_item *evacuate_item_alloc(void)
+{
+	struct llist_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&evacuate_freelist_lock, flags);
+	node = llist_del_first(&evacuate_freelist);
+	spin_unlock_irqrestore(&evacuate_freelist_lock, flags);
+
+	if (!node)
+		return NULL;
+	return container_of(node, struct evacuate_item, free_node);
+}
+
+/*
+ * Hand a request slot back to the global pool so a later
+ * evacuate_item_alloc() can reuse it.
+ */
+static void evacuate_item_free(struct evacuate_item *item)
+{
+	struct llist_node *slot = &item->free_node;
+
+	llist_add(slot, &evacuate_freelist);
+}
+
+/*
+ * Try to move the pages that are still in use out of one pageblock.
+ * @zone: zone containing the pageblock
+ * @start_pfn: first PFN of the pageblock; the range handled is
+ * [start_pfn, start_pfn + pageblock_nr_pages)
+ *
+ * Best effort only: migration runs in MIGRATE_ASYNC mode, and the function
+ * gives up on the first isolation error other than -EAGAIN or on the first
+ * migrate_pages() call that returns non-zero. Pages that were isolated but
+ * not migrated are handed to putback_movable_pages().
+ */
+static void evacuate_pageblock(struct zone *zone, unsigned long start_pfn)
+{
+ unsigned long end_pfn = start_pfn + pageblock_nr_pages;
+ unsigned long pfn = start_pfn;
+ int nr_reclaimed;
+ int ret = 0;
+ struct compact_control cc = {
+ .nr_migratepages = 0,
+ .order = -1,
+ .zone = zone,
+ .mode = MIGRATE_ASYNC,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE,
+ };
+ /* NOTE(review): nid is presumably the preferred node for replacement pages -- confirm against alloc_migration_target() */
+ struct migration_target_control mtc = {
+ .nid = zone_to_nid(zone),
+ .gfp_mask = GFP_HIGHUSER_MOVABLE,
+ };
+
+ /*
+ * Verify this pageblock is still worth evacuating: if its
+ * migratetype reads MIGRATE_MOVABLE by the time the work item runs,
+ * there is nothing to do.
+ */
+ if (get_pageblock_migratetype(pfn_to_page(start_pfn)) == MIGRATE_MOVABLE)
+ return;
+
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ /*
+ * Loop through the entire pageblock, isolating and migrating
+ * in batches. isolate_migratepages_range stops at
+ * COMPACT_CLUSTER_MAX, so we must loop to cover the full block.
+ * The scan position is carried between batches in cc.migrate_pfn.
+ */
+ while (pfn < end_pfn || !list_empty(&cc.migratepages)) {
+ if (list_empty(&cc.migratepages)) {
+ cc.nr_migratepages = 0;
+ cc.migrate_pfn = pfn;
+ ret = isolate_migratepages_range(&cc, pfn, end_pfn);
+ /* -EAGAIN is tolerated; any other error ends the evacuation */
+ if (ret && ret != -EAGAIN)
+ break;
+ pfn = cc.migrate_pfn;
+ /* nothing was isolated in this pass: stop scanning */
+ if (list_empty(&cc.migratepages))
+ break;
+ }
+
+ /*
+ * NOTE(review): presumably frees clean pages outright so they
+ * need no migration -- confirm against
+ * reclaim_clean_pages_from_list(). Its return value is removed
+ * from the count of pages still waiting for migration.
+ */
+ nr_reclaimed = reclaim_clean_pages_from_list(zone,
+ &cc.migratepages);
+ cc.nr_migratepages -= nr_reclaimed;
+
+ if (!list_empty(&cc.migratepages)) {
+ ret = migrate_pages(&cc.migratepages,
+ alloc_migration_target, NULL,
+ (unsigned long)&mtc, cc.mode,
+ MR_COMPACTION, NULL);
+ /* any non-zero result: return the leftovers and give up */
+ if (ret) {
+ putback_movable_pages(&cc.migratepages);
+ break;
+ }
+ }
+
+ cond_resched();
+ }
+
+ /*
+ * Safety net for the exits above that do not put pages back
+ * themselves, e.g. an isolation error that may leave pages on the list.
+ */
+ if (!list_empty(&cc.migratepages))
+ putback_movable_pages(&cc.migratepages);
+}
+
+/*
+ * Workqueue callback for one evacuation request: evacuate the pageblock
+ * described by the enclosing evacuate_item, then recycle the item.
+ */
+static void evacuate_work_fn(struct work_struct *work)
+{
+	struct evacuate_item *req;
+
+	req = container_of(work, struct evacuate_item, work);
+	evacuate_pageblock(req->zone, req->start_pfn);
+
+	/* The slot may be reused only once the evacuation attempt is over. */
+	evacuate_item_free(req);
+}
+
+/**
+ * evacuate_irq_work_fn - IRQ work callback to drain pending evacuations
+ * @work: the irq_work embedded in pg_data_t
+ *
+ * queue_work() can deadlock when called from inside the page allocator
+ * because it may try to allocate memory with locks already held.
+ * Use irq_work to defer the queue_work() calls to a safe context.
+ *
+ * Takes every item currently on the node's evacuate_pending list and
+ * queues each one on the node's evacuate_wq with evacuate_work_fn() as
+ * its handler.
+ */
+static void evacuate_irq_work_fn(struct irq_work *work)
+{
+ pg_data_t *pgdat = container_of(work, pg_data_t,
+ evacuate_irq_work);
+ struct llist_node *pending;
+ struct evacuate_item *item, *next;
+
+ /*
+ * NOTE(review): returning here would leave any pending items
+ * stranded on evacuate_pending. pageblock_evacuate_init() only
+ * installs this callback after the workqueue was created, so the
+ * branch looks unreachable -- confirm before relying on it.
+ */
+ if (!pgdat->evacuate_wq)
+ return;
+
+ /*
+ * Collect all pending items first, then queue them. Use _safe
+ * because evacuate_work_fn() may run immediately on another
+ * CPU and free the item before we follow the next pointer.
+ */
+ pending = llist_del_all(&pgdat->evacuate_pending);
+ llist_for_each_entry_safe(item, next, pending, free_node) {
+ INIT_WORK(&item->work, evacuate_work_fn);
+ queue_work(pgdat->evacuate_wq, &item->work);
+ }
+}
+
+/**
+ * queue_pageblock_evacuate - request background evacuation of a pageblock
+ * @zone: zone the pageblock belongs to
+ * @pfn: first PFN of the pageblock (must be pageblock-aligned)
+ *
+ * Invoked by the page allocator right after a movable pageblock has been
+ * claimed for unmovable or reclaimable allocations. The request is parked
+ * on the node's pending list and an irq_work is raised; the irq_work
+ * handler performs the actual queue_work() outside the allocator's lock
+ * context. The request is dropped without notice when the node has not
+ * been set up or when no request slot is free.
+ */
+static void queue_pageblock_evacuate(struct zone *zone, unsigned long pfn)
+{
+	pg_data_t *node = zone->zone_pgdat;
+	struct evacuate_item *req;
+
+	/* No callback installed: this node was not (or not yet) initialised. */
+	if (!node->evacuate_irq_work.func)
+		return;
+
+	/* Best effort: with the pool exhausted the request is simply skipped. */
+	req = evacuate_item_alloc();
+	if (!req)
+		return;
+
+	req->start_pfn = pfn;
+	req->zone = zone;
+
+	llist_add(&req->free_node, &node->evacuate_pending);
+	irq_work_queue(&node->evacuate_irq_work);
+}
+
+/*
+ * Boot-time setup of pageblock evacuation (runs as a late_initcall).
+ *
+ * Puts every slot of evacuate_pool[] on the free list, then gives each
+ * node that is online at this point its own workqueue, pending list and
+ * irq_work. A node whose workqueue cannot be created is left without an
+ * irq_work callback, which makes queue_pageblock_evacuate() skip it; the
+ * same applies to nodes that are not online yet when this runs.
+ *
+ * Always returns 0: a failure on one node only disables evacuation there.
+ */
+static int __init pageblock_evacuate_init(void)
+{
+	int nid, i;
+
+	/* Initialize the global freelist of work items */
+	init_llist_head(&evacuate_freelist);
+	for (i = 0; i < NR_EVACUATE_ITEMS; i++)
+		llist_add(&evacuate_pool[i].free_node, &evacuate_freelist);
+
+	/* Create a per-pgdat workqueue */
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+
+		/*
+		 * alloc_workqueue() takes a printf-style format itself, so
+		 * hand it the format and the node id directly instead of
+		 * passing a pre-formatted buffer as the format string.
+		 */
+		pgdat->evacuate_wq = alloc_workqueue("kevacuate/%d",
+						     WQ_MEM_RECLAIM, 1, nid);
+		if (!pgdat->evacuate_wq) {
+			pr_warn("Failed to create evacuate workqueue for node %d\n",
+				nid);
+			continue;
+		}
+
+		/*
+		 * Install the irq_work callback last: a non-NULL callback is
+		 * what queue_pageblock_evacuate() tests before using the node.
+		 */
+		init_llist_head(&pgdat->evacuate_pending);
+		init_irq_work(&pgdat->evacuate_irq_work,
+			      evacuate_irq_work_fn);
+	}
+
+	return 0;
+}
+late_initcall(pageblock_evacuate_init);
+#endif /* CONFIG_COMPACTION */
+
#ifdef CONFIG_CONTIG_ALLOC
/* Usage: See admin-guide/dynamic-debug-howto.rst */
static void alloc_contig_dump_pages(struct list_head *page_list)
--
2.54.0