[PATCH mm-unstable v1] mm/mglru: fix ineffective protection calculation

From: Yu Zhao
Date: Fri Jul 12 2024 - 19:30:11 EST


mem_cgroup_calculate_protection() is not stateless and should only be
used as part of a top-down tree traversal. shrink_one() traverses the
per-node memcg LRU instead of the root_mem_cgroup tree, and therefore
it should not call mem_cgroup_calculate_protection().

The existing misuse in shrink_one() can cause ineffective protection
of sub-trees that are grandchildren of root_mem_cgroup. Fix it by
reusing lru_gen_age_node(), which already traverses the
root_mem_cgroup tree, to calculate the protection.

Previously lru_gen_age_node() opportunistically skips the first pass,
i.e., when scan_control->priority is DEF_PRIORITY. On the second pass,
lruvec_is_sizable() uses appropriate scan_control->priority, set by
set_initial_priority() from lru_gen_shrink_node(), to decide whether a
memcg is too small to reclaim from.

Now lru_gen_age_node() unconditionally traverses the root_mem_cgroup
tree. So it should call set_initial_priority() upfront, to make sure
lruvec_is_sizable() uses appropriate scan_control->priority on the
first pass. Otherwise, lruvec_is_reclaimable() can return false
negatives and result in premature OOM kills when min_ttl_ms is used.

Reported-by: T.J. Mercier <tjmercier@xxxxxxxxxx>
Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
Cc: stable@xxxxxxxxxxxxxxx
Signed-off-by: Yu Zhao <yuzhao@xxxxxxxxxx>
---
mm/vmscan.c | 86 +++++++++++++++++++++++++----------------------------
1 file changed, 40 insertions(+), 46 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6216d79edb7f..525d3ffa8451 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3915,6 +3915,32 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
* working set protection
******************************************************************************/

+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int priority;
+ unsigned long reclaimable;
+
+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+ return;
+ /*
+ * Determine the initial priority based on
+ * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
+ * where reclaimed_to_scanned_ratio = inactive / total.
+ */
+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ /* round down reclaimable and round up sc->nr_to_reclaim */
+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+ /*
+ * The estimation is based on LRU pages only, so cap it to prevent
+ * overshoots of shrinker objects by large margins.
+ */
+ sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
+}
+
static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
int gen, type, zone;
@@ -3948,19 +3974,17 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);

+ if (mem_cgroup_below_min(NULL, memcg))
+ return false;
+
+ if (!lruvec_is_sizable(lruvec, sc))
+ return false;
+
/* see the comment on lru_gen_folio */
gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);

- if (time_is_after_jiffies(birth + min_ttl))
- return false;
-
- if (!lruvec_is_sizable(lruvec, sc))
- return false;
-
- mem_cgroup_calculate_protection(NULL, memcg);
-
- return !mem_cgroup_below_min(NULL, memcg);
+ return time_is_before_jiffies(birth + min_ttl);
}

/* to protect the working set of the last N jiffies */
@@ -3970,23 +3994,20 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
+ bool reclaimable = !min_ttl;

VM_WARN_ON_ONCE(!current_is_kswapd());

- /* check the order to exclude compaction-induced reclaim */
- if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
- return;
+ set_initial_priority(pgdat, sc);

memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);

- if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
- mem_cgroup_iter_break(NULL, memcg);
- return;
- }
+ mem_cgroup_calculate_protection(NULL, memcg);

- cond_resched();
+ if (!reclaimable)
+ reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));

/*
@@ -3994,7 +4015,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
* younger than min_ttl. However, another possibility is all memcgs are
* either too small or below min.
*/
- if (mutex_trylock(&oom_lock)) {
+ if (!reclaimable && mutex_trylock(&oom_lock)) {
struct oom_control oc = {
.gfp_mask = sc->gfp_mask,
};
@@ -4786,8 +4807,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);

- mem_cgroup_calculate_protection(NULL, memcg);
-
+ /* lru_gen_age_node() called mem_cgroup_calculate_protection() */
if (mem_cgroup_below_min(NULL, memcg))
return MEMCG_LRU_YOUNG;

@@ -4911,32 +4931,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
blk_finish_plug(&plug);
}

-static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
-{
- int priority;
- unsigned long reclaimable;
-
- if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
- return;
- /*
- * Determine the initial priority based on
- * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
- * where reclaimed_to_scanned_ratio = inactive / total.
- */
- reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
- reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
-
- /* round down reclaimable and round up sc->nr_to_reclaim */
- priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
-
- /*
- * The estimation is based on LRU pages only, so cap it to prevent
- * overshoots of shrinker objects by large margins.
- */
- sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
-}
-
static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct blk_plug plug;
--
2.45.2.993.g49e7a77208-goog