Re: [PATCH] mm: Fix endless reclaim on machines with unaccepted memory.

From: Kirill A. Shutemov
Date: Tue Jul 23 2024 - 09:53:40 EST


On Tue, Jul 23, 2024 at 01:55:13PM +0200, Michal Hocko wrote:
> On Tue 23-07-24 12:49:41, Kirill A. Shutemov wrote:
> > On Tue, Jul 23, 2024 at 09:30:27AM +0200, Vlastimil Babka wrote:
> [...]
> > > Although just removing the lazy accept mode would be much more appealing
> > > solution than this :)
> >
> > :P
> >
> > Not really an option for big VMs. It might add many minutes to boot time.
>
> Well a huge part of that can be done in the background so the boot
> doesn't really have to wait for all of it. If we really have to start
> playing whack-a-mole to plug all the potential ways to trigger reclaim
> imbalance I think it is fair to re-evaluate how much lazy should the
> initialization really be.

One other option I see is to treat unaccepted memory as free, so watermark
checks would not fail as long as we still have unaccepted memory. No
spinning in kswapd in this case.

Only get_page_from_freelist() and __alloc_pages_bulk() are aware of
unaccepted memory.
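
To make the accounting concrete, here is a tiny user-space model of the
idea (a simplified sketch, not kernel code; zone_model, watermark_ok(),
cond_accept() and PAGES_PER_ACCEPT are made-up names): unaccepted pages
keep counting as free for the watermark check, and memory only gets
accepted when the usable part of the free pages would fall below the
watermark.

#include <stdbool.h>
#include <stdio.h>

#define PAGES_PER_ACCEPT 512	/* stands in for MAX_ORDER_NR_PAGES */

struct zone_model {
	long free_pages;	/* like NR_FREE_PAGES, includes unaccepted pages */
	long unaccepted_pages;	/* like NR_UNACCEPTED */
	long high_wmark;
};

/* Watermark check: unaccepted memory is no longer subtracted as unusable. */
static bool watermark_ok(const struct zone_model *z)
{
	return z->free_pages > z->high_wmark;
}

/* Accept just enough memory to put the *usable* free pages above the watermark. */
static bool cond_accept(struct zone_model *z)
{
	long to_accept = z->high_wmark -
			 (z->free_pages - z->unaccepted_pages);
	bool ret = false;

	while (to_accept > 0 && z->unaccepted_pages > 0) {
		long chunk = z->unaccepted_pages < PAGES_PER_ACCEPT ?
			     z->unaccepted_pages : PAGES_PER_ACCEPT;

		z->unaccepted_pages -= chunk;	/* pages become allocatable */
		to_accept -= chunk;
		ret = true;
	}

	return ret;
}

int main(void)
{
	struct zone_model z = {
		.free_pages = 4096,
		.unaccepted_pages = 3584,
		.high_wmark = 1024,
	};

	printf("watermark ok: %d\n", watermark_ok(&z));	/* passes: 4096 > 1024 */
	printf("accepted memory: %d\n", cond_accept(&z));
	printf("usable free pages: %ld\n", z.free_pages - z.unaccepted_pages);
	return 0;
}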

The quick patch below shows the idea.

I am not sure how it would affect __isolate_free_page() callers. IIUC,
they expect to find pages on the free lists, but in this scenario they
might not be there because the pages have not been accepted yet.
I need to look closer at this.

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c11b7cde81ef..5e0bdfbe2f1f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -667,6 +667,7 @@ enum zone_watermarks {
#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
+#define promo_wmark_pages(z) (z->_watermark[WMARK_PROMO] + z->watermark_boost)
#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)

/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 14d39f34d336..254bfe29eaf1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -304,7 +304,7 @@ EXPORT_SYMBOL(nr_online_nodes);

static bool page_contains_unaccepted(struct page *page, unsigned int order);
static void accept_page(struct page *page, unsigned int order);
-static bool try_to_accept_memory(struct zone *zone, unsigned int order);
+static bool cond_accept_memory(struct zone *zone, unsigned int order);
static inline bool has_unaccepted_memory(void);
static bool __free_unaccepted(struct page *page);

@@ -2947,9 +2947,6 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
if (!(alloc_flags & ALLOC_CMA))
unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
-#ifdef CONFIG_UNACCEPTED_MEMORY
- unusable_free += zone_page_state(z, NR_UNACCEPTED);
-#endif

return unusable_free;
}
@@ -3243,6 +3240,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
}
}

+ cond_accept_memory(zone, order);
+
/*
* Detect whether the number of free pages is below high
* watermark. If so, we will decrease pcp->high and free
@@ -3268,10 +3267,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
gfp_mask)) {
int ret;

- if (has_unaccepted_memory()) {
- if (try_to_accept_memory(zone, order))
- goto try_this_zone;
- }
+ if (cond_accept_memory(zone, order))
+ goto try_this_zone;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
@@ -3325,10 +3322,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,

return page;
} else {
- if (has_unaccepted_memory()) {
- if (try_to_accept_memory(zone, order))
- goto try_this_zone;
- }
+ if (cond_accept_memory(zone, order))
+ goto try_this_zone;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
@@ -4456,12 +4451,25 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
goto failed;
}

+ cond_accept_memory(zone, 0);
+retry_this_zone:
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
if (zone_watermark_fast(zone, 0, mark,
zonelist_zone_idx(ac.preferred_zoneref),
alloc_flags, gfp)) {
break;
}
+
+ if (cond_accept_memory(zone, 0))
+ goto retry_this_zone;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+ /* Try again if zone has deferred pages */
+ if (deferred_pages_enabled()) {
+ if (_deferred_grow_zone(zone, 0))
+ goto retry_this_zone;
+ }
+#endif
}

/*
@@ -6833,9 +6841,6 @@ static bool try_to_accept_memory_one(struct zone *zone)
struct page *page;
bool last;

- if (list_empty(&zone->unaccepted_pages))
- return false;
-
spin_lock_irqsave(&zone->lock, flags);
page = list_first_entry_or_null(&zone->unaccepted_pages,
struct page, lru);
@@ -6861,23 +6866,29 @@ static bool try_to_accept_memory_one(struct zone *zone)
return true;
}

-static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order)
{
long to_accept;
int ret = false;

- /* How much to accept to get to high watermark? */
- to_accept = high_wmark_pages(zone) -
- (zone_page_state(zone, NR_FREE_PAGES) -
- __zone_watermark_unusable_free(zone, order, 0));
+ if (!has_unaccepted_memory())
+ return false;

- /* Accept at least one page */
- do {
+ if (list_empty(&zone->unaccepted_pages))
+ return false;
+
+ /* How much to accept to get to promo watermark? */
+ to_accept = promo_wmark_pages(zone) -
+ (zone_page_state(zone, NR_FREE_PAGES) -
+ __zone_watermark_unusable_free(zone, order, 0) -
+ zone_page_state(zone, NR_UNACCEPTED));
+
+ while (to_accept > 0) {
if (!try_to_accept_memory_one(zone))
break;
ret = true;
to_accept -= MAX_ORDER_NR_PAGES;
- } while (to_accept > 0);
+ }

return ret;
}
@@ -6920,7 +6931,7 @@ static void accept_page(struct page *page, unsigned int order)
{
}

-static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order)
{
return false;
}
--
Kiryl Shutsemau / Kirill A. Shutemov