Re: [PATCH] mm/mglru: fix cgroup OOM during MGLRU state switching
From: Barry Song
Date: Sat Feb 28 2026 - 23:10:41 EST
On Sun, Mar 1, 2026 at 6:41 AM Barry Song <21cnbao@xxxxxxxxx> wrote:
>
> On Sun, Mar 1, 2026 at 5:28 AM Barry Song <21cnbao@xxxxxxxxx> wrote:
> [...]
> >
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 3e51190a55e4..ba306e986050 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -509,6 +509,8 @@ struct lru_gen_folio {
> > atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
> > /* whether the multi-gen LRU is enabled */
> > bool enabled;
> > + /* whether the multi-gen LRU is switching from/to active/inactive LRU */
> > + bool switching;
> > /* the memcg generation this lru_gen_folio belongs to */
> > u8 gen;
> > /* the list segment this lru_gen_folio belongs to */
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 0fc9373e8251..60fc611067c7 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -5196,6 +5196,7 @@ static void lru_gen_change_state(bool enabled)
> > VM_WARN_ON_ONCE(!state_is_valid(lruvec));
> >
> > lruvec->lrugen.enabled = enabled;
> > + smp_store_release(&lruvec->lrugen.switching, true);
>
> Sorry, I actually meant:
>
> + smp_store_release(&lruvec->lrugen.switching, true);
> lruvec->lrugen.enabled = enabled;
>
> But I guess we could still hit a race condition in extreme cases — for
> example, if MGLRU is toggled on and off as rapidly as possible. The only
> reliable way is to check 'enabled' during shrinking while holding the
> lruvec’s lock.
Sorry, I was talking to myself.... Since the 'switching' flag and the
'enabled' state are not inherently serialized with shrink_lruvec(), their
values can change at any time, leading to race conditions.
Therefore, I believe the only safe approach is:
1. Do not allow enabling or disabling MGLRU on an lruvec while
shrink_lruvec() is running.
2. Do not allow shrink_lruvec() to run while MGLRU is being enabled
or disabled on that lruvec.
Something like the following:
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3e51190a55e4..c4b07159577e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -509,6 +509,7 @@ struct lru_gen_folio {
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
/* whether the multi-gen LRU is enabled */
bool enabled;
+ struct rw_semaphore switch_lock;
/* the memcg generation this lru_gen_folio belongs to */
u8 gen;
/* the list segment this lru_gen_folio belongs to */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0fc9373e8251..aadf1e7c31cf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5190,6 +5190,7 @@ static void lru_gen_change_state(bool enabled)
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
+ down_write(&lruvec->lrugen.switch_lock);
spin_lock_irq(&lruvec->lru_lock);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
@@ -5204,6 +5205,7 @@ static void lru_gen_change_state(bool enabled)
}
spin_unlock_irq(&lruvec->lru_lock);
+ up_write(&lruvec->lrugen.switch_lock);
}
cond_resched();
@@ -5680,6 +5682,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
lrugen->max_seq = MIN_NR_GENS + 1;
lrugen->enabled = lru_gen_enabled();
+ init_rwsem(&lrugen->switch_lock);
for (i = 0; i <= MIN_NR_GENS + 1; i++)
lrugen->timestamps[i] = jiffies;
@@ -5780,10 +5783,14 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
bool proportional_reclaim;
struct blk_plug plug;
- if (lru_gen_enabled() && !root_reclaim(sc)) {
+#ifdef CONFIG_LRU_GEN
+ down_read(&lruvec->lrugen.switch_lock);
+ if (lruvec->lrugen.enabled && !root_reclaim(sc)) {
lru_gen_shrink_lruvec(lruvec, sc);
+ up_read(&lruvec->lrugen.switch_lock);
return;
}
+#endif
get_scan_count(lruvec, sc, nr);
@@ -5885,6 +5892,9 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
+#ifdef CONFIG_LRU_GEN
+ up_read(&lruvec->lrugen.switch_lock);
+#endif
}
/* Use reclaim/compaction for costly allocs or under memory pressure */
--
Thanks
Barry