[PATCH v14 09/14] mm: multi-gen LRU: optimize multiple memcgs

From: Yu Zhao
Date: Mon Aug 15 2022 - 03:15:26 EST


When multiple memcgs are available, it is possible to make better
choices based on generations and tiers and therefore improve the
overall performance under global memory pressure. This patch adds a
rudimentary optimization to select memcgs that can drop single-use
unmapped clean pages first. Doing so reduces the chance of going into
the aging path or swapping. These two decisions can be costly.

A typical example that benefits from this optimization is a server
running mixed types of workloads, e.g., heavy anon workload in one
memcg and heavy buffered I/O workload in the other.

Though this optimization can be applied to both kswapd and direct
reclaim, it is only added to kswapd to keep the patchset manageable.
Later improvements will cover the direct reclaim path.

Server benchmark results:
Mixed workloads:
fio (buffered I/O): +[19, 21]%
IOPS BW
patch1-8: 1880k 7343MiB/s
patch1-9: 2252k 8796MiB/s

memcached (anon): +[119, 123]%
Ops/sec KB/sec
patch1-8: 862768.65 33514.68
patch1-9: 1911022.12 74234.54

Mixed workloads:
fio (buffered I/O): +[75, 77]%
IOPS BW
5.19-rc1: 1279k 4996MiB/s
patch1-9: 2252k 8796MiB/s

memcached (anon): +[13, 15]%
Ops/sec KB/sec
5.19-rc1: 1673524.04 65008.87
patch1-9: 1911022.12 74234.54

Configurations:
(changes since patch 6)

cat mixed.sh
modprobe brd rd_nr=2 rd_size=56623104

swapoff -a
mkswap /dev/ram0
swapon /dev/ram0

mkfs.ext4 /dev/ram1
mount -t ext4 /dev/ram1 /mnt

memtier_benchmark -S /var/run/memcached/memcached.sock \
-P memcache_binary -n allkeys --key-minimum=1 \
--key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
--ratio 1:0 --pipeline 8 -d 2000

fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
--buffered=1 --ioengine=io_uring --iodepth=128 \
--iodepth_batch_submit=32 --iodepth_batch_complete=32 \
--rw=randread --random_distribution=random --norandommap \
--time_based --ramp_time=10m --runtime=90m --group_reporting &
pid=$!

sleep 200

memtier_benchmark -S /var/run/memcached/memcached.sock \
-P memcache_binary -n allkeys --key-minimum=1 \
--key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
--ratio 0:1 --pipeline 8 --randomize --distinct-client-seed

kill -INT $pid
wait

Client benchmark results:
no change (CONFIG_MEMCG=n)

Signed-off-by: Yu Zhao <yuzhao@xxxxxxxxxx>
Acked-by: Brian Geffon <bgeffon@xxxxxxxxxx>
Acked-by: Jan Alexander Steffens (heftig) <heftig@xxxxxxxxxxxxx>
Acked-by: Oleksandr Natalenko <oleksandr@xxxxxxxxxxxxxx>
Acked-by: Steven Barrett <steven@xxxxxxxxxxxx>
Acked-by: Suleiman Souhlal <suleiman@xxxxxxxxxx>
Tested-by: Daniel Byrne <djbyrne@xxxxxxx>
Tested-by: Donald Carr <d@xxxxxxxxxxxxxxx>
Tested-by: Holger Hoffstätte <holger@xxxxxxxxxxxxxxxxxxxxxx>
Tested-by: Konstantin Kharlamov <Hi-Angel@xxxxxxxxx>
Tested-by: Shuang Zhai <szhai2@xxxxxxxxxxxxxxxx>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@xxxxxxxxxxxxx>
---
mm/vmscan.c | 55 ++++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 46 insertions(+), 9 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index d1dfc0a77b6f..ee51c752a3af 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -131,6 +131,13 @@ struct scan_control {
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;

+#ifdef CONFIG_LRU_GEN
+ /* help make better choices when multiple memcgs are available */
+ unsigned int memcgs_need_aging:1;
+ unsigned int memcgs_need_swapping:1;
+ unsigned int memcgs_avoid_swapping:1;
+#endif
+
/* Allocation order */
s8 order;

@@ -4437,6 +4444,22 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)

VM_WARN_ON_ONCE(!current_is_kswapd());

+ /*
+ * To reduce the chance of going into the aging path or swapping, which
+ * can be costly, optimistically skip them unless their corresponding
+ * flags were cleared in the eviction path. This improves the overall
+ * performance when multiple memcgs are available.
+ */
+ if (!sc->memcgs_need_aging) {
+ sc->memcgs_need_aging = true;
+ sc->memcgs_avoid_swapping = !sc->memcgs_need_swapping;
+ sc->memcgs_need_swapping = true;
+ return;
+ }
+
+ sc->memcgs_need_swapping = true;
+ sc->memcgs_avoid_swapping = true;
+
set_mm_walk(pgdat);

memcg = mem_cgroup_iter(NULL, NULL, NULL);
@@ -4846,7 +4869,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
return scanned;
}

-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ bool *need_swapping)
{
int type;
int scanned;
@@ -4909,6 +4933,9 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap

sc->nr_reclaimed += reclaimed;

+ if (type == LRU_GEN_ANON && need_swapping)
+ *need_swapping = true;
+
return scanned;
}

@@ -4918,10 +4945,9 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
* reclaim.
*/
static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap, unsigned long reclaimed)
+ bool can_swap, unsigned long reclaimed, bool *need_aging)
{
int priority;
- bool need_aging;
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
@@ -4936,7 +4962,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
(mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
return 0;

- nr_to_scan = get_nr_evictable(lruvec, max_seq, min_seq, can_swap, &need_aging);
+ nr_to_scan = get_nr_evictable(lruvec, max_seq, min_seq, can_swap, need_aging);
if (!nr_to_scan)
return 0;

@@ -4952,7 +4978,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
if (!nr_to_scan)
return 0;

- if (!need_aging)
+ if (!*need_aging)
return nr_to_scan;

/* skip the aging path at the default priority */
@@ -4972,6 +4998,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
+ bool need_aging = false;
+ bool need_swapping = false;
unsigned long scanned = 0;
unsigned long reclaimed = sc->nr_reclaimed;

@@ -4993,21 +5021,30 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
else
swappiness = 0;

- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, reclaimed);
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, reclaimed, &need_aging);
if (!nr_to_scan)
- break;
+ goto done;

- delta = evict_folios(lruvec, sc, swappiness);
+ delta = evict_folios(lruvec, sc, swappiness, &need_swapping);
if (!delta)
- break;
+ goto done;

scanned += delta;
if (scanned >= nr_to_scan)
break;

+ if (sc->memcgs_avoid_swapping && swappiness < 200 && need_swapping)
+ break;
+
cond_resched();
}

+ /* see the comment in lru_gen_age_node() */
+ if (!need_aging)
+ sc->memcgs_need_aging = false;
+ if (!need_swapping)
+ sc->memcgs_need_swapping = false;
+done:
clear_mm_walk();

blk_finish_plug(&plug);
--
2.37.1.595.g718a3a8f04-goog