[PATCH v7 mm-unstable] mm: vmscan: retry folios written back while isolated for traditional LRU

From: Chen Ridong
Date: Sat Jan 11 2025 - 04:26:07 EST


From: Chen Ridong <chenridong@xxxxxxxxxx>

As commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back
while isolated") mentioned:

"The page reclaim isolates a batch of folios from the tail of one of the
LRU lists and works on those folios one by one. For a suitable
swap-backed folio, if the swap device is async, it queues that folio for
writeback. After the page reclaim finishes an entire batch, it puts back
the folios it queued for writeback to the head of the original LRU list.

In the meantime, the page writeback flushes the queued folios also by
batches. Its batching logic is independent from that of the page
reclaim. For each of the folios it writes back, the page writeback calls
folio_rotate_reclaimable() which tries to rotate a folio to the tail.

folio_rotate_reclaimable() only works for a folio after the page reclaim
has put it back. If an async swap device is fast enough, the page
writeback can finish with that folio while the page reclaim is still
working on the rest of the batch containing it. In this case, that folio
will remain at the head and the page reclaim will not retry it before
reaching there".

The commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back
while isolated") only fixed the issue for mglru. However, this issue
also exists in the traditional active/inactive LRU and was found at [1].

It can be reproduced with the following steps:

1. Compile with CONFIG_TRANSPARENT_HUGEPAGE=y
2. Mount memcg v1, and create memcg named test_memcg and set
limit_in_bytes=1G, memsw.limit_in_bytes=2G.
3. Create a 1G swap file, and allocate 1.05G anon memory in test_memcg.

It was found that:

cat memory.limit_in_bytes
1073741824
cat memory.memsw.limit_in_bytes
2147483648
cat memory.usage_in_bytes
1073664000
cat memory.memsw.usage_in_bytes
1129840640

free -h
total used free
Mem: 31Gi 1.2Gi 28Gi
Swap: 1.0Gi 1.0Gi 2.0Mi

As shown above, test_memcg used only about 50M of swap, yet almost 1G of
swap was consumed, which means that 900M+ may be wasted because other
memcgs cannot use that swap space.

This issue should be fixed in the same way as for mglru. Therefore, the
common logic is first extracted into the 'find_folios_written_back'
function, which is then reused in the 'shrink_inactive_list' function.
Finally, the folios that may have missed the rotation are retried for the
traditional LRU.

After the change, with the same test case, only 54M of swap was used.

cat memory.usage_in_bytes
1073463296
cat memory.memsw.usage_in_bytes
1129828352

free -h
total used free
Mem: 31Gi 1.2Gi 28Gi
Swap: 1.0Gi 54Mi 969Mi

[1] https://lore.kernel.org/linux-kernel/20241010081802.290893-1-chenridong@xxxxxxxxxxxxxxx/
[2] https://lore.kernel.org/linux-kernel/CAGsJ_4zqL8ZHNRZ44o_CC69kE7DBVXvbZfvmQxMGiFqRxqHQdA@xxxxxxxxxxxxxx/
Signed-off-by: Chen Ridong <chenridong@xxxxxxxxxx>
---

v6->v7:
- fix conflict based on mm-unstable.
- update the commit message (quote from Yu's commit message, and add the
improvements observed after the change).
- restore 'is_retrying' to 'skip_retry' to keep original semantics.

v6: https://lore.kernel.org/linux-kernel/20241223082004.3759152-1-chenridong@xxxxxxxxxxxxxxx/

mm/vmscan.c | 114 ++++++++++++++++++++++++++++++++++------------------
1 file changed, 76 insertions(+), 38 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 01dce6f26..6861b6937 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -183,6 +183,9 @@ struct scan_control {
struct reclaim_state reclaim_state;
};

+static inline void find_folios_written_back(struct list_head *list,
+ struct list_head *clean, struct lruvec *lruvec, int type, bool skip_retry);
+
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_folio(_folio, _base, _field) \
do { \
@@ -1960,14 +1963,18 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
enum lru_list lru)
{
LIST_HEAD(folio_list);
+ LIST_HEAD(clean_list);
unsigned long nr_scanned;
- unsigned int nr_reclaimed = 0;
+ unsigned int nr_reclaimed, total_reclaimed = 0;
+ unsigned int nr_pageout = 0;
+ unsigned int nr_unqueued_dirty = 0;
unsigned long nr_taken;
struct reclaim_stat stat;
bool file = is_file_lru(lru);
enum vm_event_item item;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
bool stalled = false;
+ bool skip_retry = false;

while (unlikely(too_many_isolated(pgdat, file, sc))) {
if (stalled)
@@ -2001,22 +2008,47 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
if (nr_taken == 0)
return 0;

+retry:
nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);

+ sc->nr.dirty += stat.nr_dirty;
+ sc->nr.congested += stat.nr_congested;
+ sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
+ sc->nr.writeback += stat.nr_writeback;
+ sc->nr.immediate += stat.nr_immediate;
+ total_reclaimed += nr_reclaimed;
+ nr_pageout += stat.nr_pageout;
+ nr_unqueued_dirty += stat.nr_unqueued_dirty;
+
+ trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
+ nr_scanned, nr_reclaimed, &stat, sc->priority, file);
+
+ find_folios_written_back(&folio_list, &clean_list, lruvec, 0, skip_retry);
+
spin_lock_irq(&lruvec->lru_lock);
move_folios_to_lru(lruvec, &folio_list);

__mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
stat.nr_demoted);
- __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
+
+ if (!list_empty(&clean_list)) {
+ list_splice_init(&clean_list, &folio_list);
+ skip_retry = true;
+ spin_unlock_irq(&lruvec->lru_lock);
+ goto retry;
+ }
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&lruvec->lru_lock);
+ sc->nr.taken += nr_taken;
+ if (file)
+ sc->nr.file_taken += nr_taken;

- lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
+ lru_note_cost(lruvec, file, nr_pageout, nr_scanned - total_reclaimed);

/*
* If dirty folios are scanned that are not queued for IO, it
@@ -2029,7 +2061,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
* the flushers simply cannot keep up with the allocation
* rate. Nudge the flusher threads in case they are asleep.
*/
- if (stat.nr_unqueued_dirty == nr_taken) {
+ if (nr_unqueued_dirty == nr_taken) {
wakeup_flusher_threads(WB_REASON_VMSCAN);
/*
* For cgroupv1 dirty throttling is achieved by waking up
@@ -2044,18 +2076,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
}

- sc->nr.dirty += stat.nr_dirty;
- sc->nr.congested += stat.nr_congested;
- sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
- sc->nr.writeback += stat.nr_writeback;
- sc->nr.immediate += stat.nr_immediate;
- sc->nr.taken += nr_taken;
- if (file)
- sc->nr.file_taken += nr_taken;
-
- trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
- nr_scanned, nr_reclaimed, &stat, sc->priority, file);
- return nr_reclaimed;
+ return total_reclaimed;
}

/*
@@ -4637,8 +4658,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
int reclaimed;
LIST_HEAD(list);
LIST_HEAD(clean);
- struct folio *folio;
- struct folio *next;
enum vm_event_item item;
struct reclaim_stat stat;
struct lru_gen_mm_walk *walk;
@@ -4668,26 +4687,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
scanned, reclaimed, &stat, sc->priority,
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);

- list_for_each_entry_safe_reverse(folio, next, &list, lru) {
- DEFINE_MIN_SEQ(lruvec);
-
- if (!folio_evictable(folio)) {
- list_del(&folio->lru);
- folio_putback_lru(folio);
- continue;
- }
-
- /* retry folios that may have missed folio_rotate_reclaimable() */
- if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) &&
- !folio_test_dirty(folio) && !folio_test_writeback(folio)) {
- list_move(&folio->lru, &clean);
- continue;
- }
-
- /* don't add rejected folios to the oldest generation */
- if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
- set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
- }
+ find_folios_written_back(&list, &clean, lruvec, type, skip_retry);

spin_lock_irq(&lruvec->lru_lock);

@@ -5706,6 +5706,44 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *

#endif /* CONFIG_LRU_GEN */

+/**
+ * find_folios_written_back - Move folios that look written back from @list to @clean.
+ * @list: folios list, walked in reverse; unevictable folios are put back to the LRU
+ * @clean: receives folios that are not active, mapped, dirty or under writeback
+ * @lruvec: the lruvec
+ * @type: LRU_GEN_ANON/LRU_GEN_FILE, only for multi-gen LRU
+ * @skip_retry: when true, no folio is moved to @clean
+ */
+static inline void find_folios_written_back(struct list_head *list,
+ struct list_head *clean, struct lruvec *lruvec, int type, bool skip_retry)
+{
+ struct folio *folio;
+ struct folio *next;
+
+ list_for_each_entry_safe_reverse(folio, next, list, lru) {
+#ifdef CONFIG_LRU_GEN
+ DEFINE_MIN_SEQ(lruvec);
+#endif
+ if (!folio_evictable(folio)) {
+ list_del(&folio->lru);
+ folio_putback_lru(folio);
+ continue;
+ }
+
+ /* retry folios that may have missed folio_rotate_reclaimable() */
+ if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) &&
+ !folio_test_dirty(folio) && !folio_test_writeback(folio)) {
+ list_move(&folio->lru, clean);
+ continue;
+ }
+#ifdef CONFIG_LRU_GEN
+ /* multi-gen LRU only: don't add rejected folios to the oldest generation */
+ if (lru_gen_enabled() && lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
+ set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
+#endif
+ }
+}
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
--
2.34.1