Re: [PATCH V2 0/6] mm: page_alloc: freelist migratetype hygiene
From: Zi Yan
Date: Mon Oct 16 2023 - 09:36:14 EST
> The attached patch has all the suggested changes, let me know how it
> looks to you. Thanks.
The one I sent has free page accounting issues. The attached one fixes them.
--
Best Regards,
Yan, Zi
From b428b4919e30dc0556406325d3c173a87f45f135 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@xxxxxxxxxx>
Date: Mon, 25 Sep 2023 16:55:18 -0400
Subject: [PATCH v2] mm/page_isolation: split cross-pageblock free pages during
isolation
alloc_contig_range() uses set_migratetype_isolate(), which eventually calls
move_freepages(), to isolate free pages. But move_freepages() is not able
to move free pages that are only partially covered by the specified range,
leaving a race window open[1]. Fix it by splitting such pages before calling
move_freepages().
The common code for finding the start pfn of a free page straddling a given
pfn is factored out into find_straddling_buddy(). split_free_page() is
modified to set the pageblock migratetypes on both sides of the split offset
inside the function.
[1] https://lore.kernel.org/linux-mm/20230920160400.GC124289@xxxxxxxxxxx/
Suggested-by: Johannes Weiner <hannes@xxxxxxxxxxx>
Signed-off-by: Zi Yan <ziy@xxxxxxxxxx>
---
include/linux/page-isolation.h | 12 +++-
mm/internal.h | 3 -
mm/page_alloc.c | 103 ++++++++++++++++++------------
mm/page_isolation.c | 113 ++++++++++++++++++++++-----------
4 files changed, 151 insertions(+), 80 deletions(-)
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 901915747960..e82ab67867df 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -33,9 +33,17 @@ static inline bool is_migrate_isolate(int migratetype)
#define MEMORY_OFFLINE 0x1
#define REPORT_FAILURE 0x2
+unsigned long find_straddling_buddy(unsigned long start_pfn);
+int split_free_page(struct page *free_page,
+ unsigned int order, unsigned long split_pfn_offset,
+ int mt1, int mt2);
void set_pageblock_migratetype(struct page *page, int migratetype);
-int move_freepages_block(struct zone *zone, struct page *page,
- int old_mt, int new_mt);
+int move_freepages(struct zone *zone, unsigned long start_pfn,
+ unsigned long end_pfn, int old_mt, int new_mt);
+bool prep_move_freepages_block(struct zone *zone, struct page *page,
+ unsigned long *start_pfn,
+ unsigned long *end_pfn,
+ int *num_free, int *num_movable);
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
int migratetype, int flags, gfp_t gfp_flags);
diff --git a/mm/internal.h b/mm/internal.h
index 8c90e966e9f8..cda702359c0f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -457,9 +457,6 @@ void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
unsigned long, enum meminit_context, struct vmem_altmap *, int);
-int split_free_page(struct page *free_page,
- unsigned int order, unsigned long split_pfn_offset);
-
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 928bb595d7cc..e877fbdb700e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -851,6 +851,8 @@ static inline void __free_one_page(struct page *page,
* @free_page: the original free page
* @order: the order of the page
* @split_pfn_offset: split offset within the page
+ * @mt1: migratetype set before the offset
+ * @mt2: migratetype set after the offset
*
* Return -ENOENT if the free page is changed, otherwise 0
*
@@ -860,20 +862,21 @@ static inline void __free_one_page(struct page *page,
* nothing.
*/
int split_free_page(struct page *free_page,
- unsigned int order, unsigned long split_pfn_offset)
+ unsigned int order, unsigned long split_pfn_offset,
+ int mt1, int mt2)
{
struct zone *zone = page_zone(free_page);
unsigned long free_page_pfn = page_to_pfn(free_page);
unsigned long pfn;
- unsigned long flags;
int free_page_order;
int mt;
int ret = 0;
- if (split_pfn_offset == 0)
- return ret;
+ /* zone lock should be held when this function is called */
+ lockdep_assert_held(&zone->lock);
- spin_lock_irqsave(&zone->lock, flags);
+ if (split_pfn_offset == 0 || split_pfn_offset >= (1 << order))
+ return ret;
if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
ret = -ENOENT;
@@ -883,6 +886,10 @@ int split_free_page(struct page *free_page,
mt = get_pfnblock_migratetype(free_page, free_page_pfn);
del_page_from_free_list(free_page, zone, order, mt);
+ set_pageblock_migratetype(free_page, mt1);
+ set_pageblock_migratetype(pfn_to_page(free_page_pfn + split_pfn_offset),
+ mt2);
+
for (pfn = free_page_pfn;
pfn < free_page_pfn + (1UL << order);) {
int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
@@ -899,7 +906,6 @@ int split_free_page(struct page *free_page,
split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
}
out:
- spin_unlock_irqrestore(&zone->lock, flags);
return ret;
}
/*
@@ -1588,21 +1594,52 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order) { return NULL; }
#endif
+/*
+ * Scan the range before this pfn for a buddy that straddles it
+ */
+unsigned long find_straddling_buddy(unsigned long start_pfn)
+{
+ int order = 0;
+ struct page *page;
+ unsigned long pfn = start_pfn;
+
+ while (!PageBuddy(page = pfn_to_page(pfn))) {
+ /* Nothing found */
+ if (++order > MAX_ORDER)
+ return start_pfn;
+ pfn &= ~0UL << order;
+ }
+
+ /*
+ * Found a preceding buddy, but does it straddle?
+ */
+ if (pfn + (1 << buddy_order(page)) > start_pfn)
+ return pfn;
+
+ /* Nothing found */
+ return start_pfn;
+}
+
/*
* Move the free pages in a range to the freelist tail of the requested type.
* Note that start_page and end_pages are not aligned on a pageblock
* boundary. If alignment is required, use move_freepages_block()
*/
-static int move_freepages(struct zone *zone, unsigned long start_pfn,
+int move_freepages(struct zone *zone, unsigned long start_pfn,
unsigned long end_pfn, int old_mt, int new_mt)
{
- struct page *page;
- unsigned long pfn;
- unsigned int order;
+ struct page *start_page = pfn_to_page(start_pfn);
int pages_moved = 0;
+ unsigned long pfn = start_pfn;
+
+ VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1));
+ VM_WARN_ON(start_pfn + pageblock_nr_pages - 1 != end_pfn);
+
+ /* Move buddies within the block */
+ while (pfn <= end_pfn) {
+ struct page *page = pfn_to_page(pfn);
+ int order, nr_pages;
- for (pfn = start_pfn; pfn <= end_pfn;) {
- page = pfn_to_page(pfn);
if (!PageBuddy(page)) {
pfn++;
continue;
@@ -1613,16 +1650,20 @@ static int move_freepages(struct zone *zone, unsigned long start_pfn,
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
order = buddy_order(page);
+ nr_pages = 1 << order;
+
move_to_free_list(page, zone, order, old_mt, new_mt);
- pfn += 1 << order;
- pages_moved += 1 << order;
+
+ pfn += nr_pages;
+ pages_moved += nr_pages;
}
- set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);
+
+ set_pageblock_migratetype(start_page, new_mt);
return pages_moved;
}
-static bool prep_move_freepages_block(struct zone *zone, struct page *page,
+bool prep_move_freepages_block(struct zone *zone, struct page *page,
unsigned long *start_pfn,
unsigned long *end_pfn,
int *num_free, int *num_movable)
@@ -6138,7 +6179,6 @@ int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype, gfp_t gfp_mask)
{
unsigned long outer_start, outer_end;
- int order;
int ret = 0;
struct compact_control cc = {
@@ -6212,28 +6252,13 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* isolated thus they won't get removed from buddy.
*/
- order = 0;
- outer_start = start;
- while (!PageBuddy(pfn_to_page(outer_start))) {
- if (++order > MAX_ORDER) {
- outer_start = start;
- break;
- }
- outer_start &= ~0UL << order;
- }
-
- if (outer_start != start) {
- order = buddy_order(pfn_to_page(outer_start));
-
- /*
- * outer_start page could be small order buddy page and
- * it doesn't include start page. Adjust outer_start
- * in this case to report failed page properly
- * on tracepoint in test_pages_isolated()
- */
- if (outer_start + (1UL << order) <= start)
- outer_start = start;
- }
+ /*
+ * outer_start page could be small order buddy page and it doesn't
+ * include start page. outer_start is set to start in
+ * find_straddling_buddy() to report failed page properly on tracepoint
+ * in test_pages_isolated()
+ */
+ outer_start = find_straddling_buddy(start);
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, 0)) {
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 5f8c658c0853..0500dff477f8 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -139,6 +139,62 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
return NULL;
}
+/*
+ * additional steps for moving free pages during page isolation
+ */
+static int move_freepages_for_isolation(struct zone *zone, unsigned long start_pfn,
+ unsigned long end_pfn, int old_mt, int new_mt)
+{
+ struct page *start_page = pfn_to_page(start_pfn);
+ unsigned long pfn;
+
+ VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1));
+ VM_WARN_ON(start_pfn + pageblock_nr_pages - 1 != end_pfn);
+
+ /*
+ * A free page may be comprised of 2^n blocks, which means our
+ * block of interest could be head or tail in such a page.
+ *
+ * If we're a tail, update the type of our block, then split
+ * the page into pageblocks. The splitting will do the leg
+ * work of sorting the blocks into the right freelists.
+ *
+ * If we're a head, split the page into pageblocks first. This
+ * ensures the migratetypes still match up during the freelist
+ * removal. Then do the regular scan for buddies in the block
+ * of interest, which will handle the rest.
+ *
+ * In theory, we could try to preserve 2^1 and larger blocks
+ * that lie outside our range. In practice, MAX_ORDER is
+ * usually one or two pageblocks anyway, so don't bother.
+ *
+ * Note that this only applies to page isolation, which calls
+ * this on random blocks in the pfn range! When we move stuff
+ * from inside the page allocator, the pages are coming off
+ * the freelist (can't be tail) and multi-block pages are
+ * handled directly in the stealing code (can't be a head).
+ */
+
+ /* We're a tail */
+ pfn = find_straddling_buddy(start_pfn);
+ if (pfn != start_pfn) {
+ struct page *free_page = pfn_to_page(pfn);
+
+ split_free_page(free_page, buddy_order(free_page),
+ pageblock_nr_pages, old_mt, new_mt);
+ return pageblock_nr_pages;
+ }
+
+ /* We're a head */
+ if (PageBuddy(start_page) && buddy_order(start_page) > pageblock_order) {
+ split_free_page(start_page, buddy_order(start_page),
+ pageblock_nr_pages, new_mt, old_mt);
+ return pageblock_nr_pages;
+ }
+
+ return 0;
+}
+
/*
* This function set pageblock migratetype to isolate if no unmovable page is
* present in [start_pfn, end_pfn). The pageblock must intersect with
@@ -178,15 +234,17 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
migratetype, isol_flags);
if (!unmovable) {
- int nr_pages;
int mt = get_pageblock_migratetype(page);
+ unsigned long start_pfn, end_pfn;
- nr_pages = move_freepages_block(zone, page, mt, MIGRATE_ISOLATE);
- /* Block spans zone boundaries? */
- if (nr_pages == -1) {
+ if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, NULL, NULL)) {
spin_unlock_irqrestore(&zone->lock, flags);
return -EBUSY;
}
+
+ if (!move_freepages_for_isolation(zone, start_pfn, end_pfn, mt, MIGRATE_ISOLATE))
+ move_freepages(zone, start_pfn, end_pfn, mt, MIGRATE_ISOLATE);
+
zone->nr_isolate_pageblock++;
spin_unlock_irqrestore(&zone->lock, flags);
return 0;
@@ -253,13 +311,16 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
* allocation.
*/
if (!isolated_page) {
- int nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE,
- migratetype);
+ unsigned long start_pfn, end_pfn;
+
/*
* Isolating this block already succeeded, so this
* should not fail on zone boundaries.
*/
- WARN_ON_ONCE(nr_pages == -1);
+ if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, NULL, NULL))
+ WARN_ON_ONCE(1);
+ else if (!move_freepages_for_isolation(zone, start_pfn, end_pfn, MIGRATE_ISOLATE, migratetype))
+ move_freepages(zone, start_pfn, end_pfn, MIGRATE_ISOLATE, migratetype);
} else {
set_pageblock_migratetype(page, migratetype);
__putback_isolated_page(page, order, migratetype);
@@ -380,11 +441,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
if (PageBuddy(page)) {
int order = buddy_order(page);
- if (pfn + (1UL << order) > boundary_pfn) {
- /* free page changed before split, check it again */
- if (split_free_page(page, order, boundary_pfn - pfn))
- continue;
- }
+ VM_WARN_ONCE(pfn + (1UL << order) > boundary_pfn,
+ "a free page sits across isolation boundary");
pfn += 1UL << order;
continue;
@@ -408,8 +466,6 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
* can be migrated. Otherwise, fail the isolation.
*/
if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) {
- int order;
- unsigned long outer_pfn;
int page_mt = get_pageblock_migratetype(page);
bool isolate_page = !is_migrate_isolate_page(page);
struct compact_control cc = {
@@ -427,9 +483,11 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
/*
* XXX: mark the page as MIGRATE_ISOLATE so that
* no one else can grab the freed page after migration.
- * Ideally, the page should be freed as two separate
- * pages to be added into separate migratetype free
- * lists.
+ * The page should be freed into separate migratetype
+ * free lists, unless the free page order is greater
+ * than pageblock order. It is not the case now,
+ * since gigantic hugetlb is freed as order-0
+ * pages and LRU pages do not cross pageblocks.
*/
if (isolate_page) {
ret = set_migratetype_isolate(page, page_mt,
@@ -451,25 +509,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
if (ret)
goto failed;
- /*
- * reset pfn to the head of the free page, so
- * that the free page handling code above can split
- * the free page to the right migratetype list.
- *
- * head_pfn is not used here as a hugetlb page order
- * can be bigger than MAX_ORDER, but after it is
- * freed, the free page order is not. Use pfn within
- * the range to find the head of the free page.
- */
- order = 0;
- outer_pfn = pfn;
- while (!PageBuddy(pfn_to_page(outer_pfn))) {
- /* stop if we cannot find the free page */
- if (++order > MAX_ORDER)
- goto failed;
- outer_pfn &= ~0UL << order;
- }
- pfn = outer_pfn;
+
+ pfn = head_pfn + nr_pages;
continue;
} else
#endif
--
2.42.0
Attachment:
signature.asc
Description: OpenPGP digital signature