Re: [PATCH 5/8] mm: vmscan: replace shrink_node() loop with a retry jump

From: Roman Gushchin
Date: Tue Oct 22 2019 - 15:56:49 EST


On Tue, Oct 22, 2019 at 10:48:00AM -0400, Johannes Weiner wrote:
> Most of the function body is inside a loop, which imposes an
> additional indentation and scoping level that makes the code a bit
> hard to follow and modify.
>
> The looping only happens in case of reclaim-compaction, which isn't
> the common case. So rather than adding yet another function level to
> the reclaim path and having every reclaim invocation go through a level
> that only exists for one specific corner case, use a retry goto.
>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> ---
> mm/vmscan.c | 231 ++++++++++++++++++++++++++--------------------------
> 1 file changed, 115 insertions(+), 116 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 302dad112f75..235d1fc72311 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2729,144 +2729,143 @@ static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
> static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> {
> struct reclaim_state *reclaim_state = current->reclaim_state;
> + struct mem_cgroup *root = sc->target_mem_cgroup;
> unsigned long nr_reclaimed, nr_scanned;
> bool reclaimable = false;
> + struct mem_cgroup *memcg;
> +again:
> + memset(&sc->nr, 0, sizeof(sc->nr));
>
> - do {
> - struct mem_cgroup *root = sc->target_mem_cgroup;
> - struct mem_cgroup *memcg;
> -
> - memset(&sc->nr, 0, sizeof(sc->nr));
> -
> - nr_reclaimed = sc->nr_reclaimed;
> - nr_scanned = sc->nr_scanned;
> + nr_reclaimed = sc->nr_reclaimed;
> + nr_scanned = sc->nr_scanned;
>
> - memcg = mem_cgroup_iter(root, NULL, NULL);
> - do {
> - unsigned long reclaimed;
> - unsigned long scanned;
> + memcg = mem_cgroup_iter(root, NULL, NULL);
> + do {
> + unsigned long reclaimed;
> + unsigned long scanned;
>
> - switch (mem_cgroup_protected(root, memcg)) {
> - case MEMCG_PROT_MIN:
> - /*
> - * Hard protection.
> - * If there is no reclaimable memory, OOM.
> - */
> + switch (mem_cgroup_protected(root, memcg)) {
> + case MEMCG_PROT_MIN:
> + /*
> + * Hard protection.
> + * If there is no reclaimable memory, OOM.
> + */
> + continue;
> + case MEMCG_PROT_LOW:
> + /*
> + * Soft protection.
> + * Respect the protection only as long as
> + * there is an unprotected supply
> + * of reclaimable memory from other cgroups.
> + */
> + if (!sc->memcg_low_reclaim) {
> + sc->memcg_low_skipped = 1;
> continue;
> - case MEMCG_PROT_LOW:
> - /*
> - * Soft protection.
> - * Respect the protection only as long as
> - * there is an unprotected supply
> - * of reclaimable memory from other cgroups.
> - */
> - if (!sc->memcg_low_reclaim) {
> - sc->memcg_low_skipped = 1;
> - continue;
> - }
> - memcg_memory_event(memcg, MEMCG_LOW);
> - break;
> - case MEMCG_PROT_NONE:
> - /*
> - * All protection thresholds breached. We may
> - * still choose to vary the scan pressure
> - * applied based on by how much the cgroup in
> - * question has exceeded its protection
> - * thresholds (see get_scan_count).
> - */
> - break;
> }
> + memcg_memory_event(memcg, MEMCG_LOW);
> + break;
> + case MEMCG_PROT_NONE:
> + /*
> + * All protection thresholds breached. We may
> + * still choose to vary the scan pressure
> + * applied based on by how much the cgroup in
> + * question has exceeded its protection
> + * thresholds (see get_scan_count).
> + */
> + break;
> + }
>
> - reclaimed = sc->nr_reclaimed;
> - scanned = sc->nr_scanned;
> - shrink_node_memcg(pgdat, memcg, sc);
> -
> - shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
> - sc->priority);
> -
> - /* Record the group's reclaim efficiency */
> - vmpressure(sc->gfp_mask, memcg, false,
> - sc->nr_scanned - scanned,
> - sc->nr_reclaimed - reclaimed);
> -
> - } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
> + reclaimed = sc->nr_reclaimed;
> + scanned = sc->nr_scanned;
> + shrink_node_memcg(pgdat, memcg, sc);
>
> - if (reclaim_state) {
> - sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> - reclaim_state->reclaimed_slab = 0;
> - }
> + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
> + sc->priority);
>
> - /* Record the subtree's reclaim efficiency */
> - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> - sc->nr_scanned - nr_scanned,
> - sc->nr_reclaimed - nr_reclaimed);
> + /* Record the group's reclaim efficiency */
> + vmpressure(sc->gfp_mask, memcg, false,
> + sc->nr_scanned - scanned,
> + sc->nr_reclaimed - reclaimed);

It doesn't look like a trivial change. I'd add a few words to the commit message
on why it's safe to do.
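
For reference, the pure control-flow part is the usual tail-tested
do/while vs. retry-goto equivalence. A toy sketch outside the kernel
context, with should_retry() standing in for should_continue_reclaim(),
just to show the two forms iterate identically:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for should_continue_reclaim(): retry a fixed number of rounds. */
static bool should_retry(int round)
{
        return round < 3;
}

static void with_loop(void)
{
        int round = 0;

        do {
                printf("loop body, round %d\n", round);
        } while (should_retry(++round));
}

static void with_goto(void)
{
        int round = 0;
again:
        printf("goto body, round %d\n", round);
        if (should_retry(++round))
                goto again;
}

int main(void)
{
        with_loop();    /* rounds 0, 1, 2 */
        with_goto();    /* same rounds */
        return 0;
}

What's less obvious at a glance is the root/memcg declarations moving
out of the loop scope, so a few sentences on why that's safe would help.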

Thanks!

>
> - if (sc->nr_reclaimed - nr_reclaimed)
> - reclaimable = true;
> + } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
>
> - if (current_is_kswapd()) {
> - /*
> - * If reclaim is isolating dirty pages under writeback,
> - * it implies that the long-lived page allocation rate
> - * is exceeding the page laundering rate. Either the
> - * global limits are not being effective at throttling
> - * processes due to the page distribution throughout
> - * zones or there is heavy usage of a slow backing
> - * device. The only option is to throttle from reclaim
> - * context which is not ideal as there is no guarantee
> - * the dirtying process is throttled in the same way
> - * balance_dirty_pages() manages.
> - *
> - * Once a node is flagged PGDAT_WRITEBACK, kswapd will
> - * count the number of pages under pages flagged for
> - * immediate reclaim and stall if any are encountered
> - * in the nr_immediate check below.
> - */
> - if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
> - set_bit(PGDAT_WRITEBACK, &pgdat->flags);
> + if (reclaim_state) {
> + sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> + reclaim_state->reclaimed_slab = 0;
> + }
>
> - /*
> - * Tag a node as congested if all the dirty pages
> - * scanned were backed by a congested BDI and
> - * wait_iff_congested will stall.
> - */
> - if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> - set_bit(PGDAT_CONGESTED, &pgdat->flags);
> + /* Record the subtree's reclaim efficiency */
> + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> + sc->nr_scanned - nr_scanned,
> + sc->nr_reclaimed - nr_reclaimed);
>
> - /* Allow kswapd to start writing pages during reclaim.*/
> - if (sc->nr.unqueued_dirty == sc->nr.file_taken)
> - set_bit(PGDAT_DIRTY, &pgdat->flags);
> + if (sc->nr_reclaimed - nr_reclaimed)
> + reclaimable = true;
>
> - /*
> - * If kswapd scans pages marked marked for immediate
> - * reclaim and under writeback (nr_immediate), it
> - * implies that pages are cycling through the LRU
> - * faster than they are written so also forcibly stall.
> - */
> - if (sc->nr.immediate)
> - congestion_wait(BLK_RW_ASYNC, HZ/10);
> - }

Don't you want to move the block below into a separate function?
That would probably make the big loop shorter and easier to follow.
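
Something like the sketch below is what I have in mind (the helper name
is made up and it's untested, just to show the shape):

static void shrink_node_kswapd_flags(pg_data_t *pgdat, struct scan_control *sc)
{
        /*
         * kswapd-only bookkeeping: translate the per-iteration sc->nr
         * counters into pgdat flags, and stall if pages are cycling
         * through the LRU faster than they are being written back.
         */
        if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
                set_bit(PGDAT_WRITEBACK, &pgdat->flags);

        if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
                set_bit(PGDAT_CONGESTED, &pgdat->flags);

        /* Allow kswapd to start writing pages during reclaim. */
        if (sc->nr.unqueued_dirty == sc->nr.file_taken)
                set_bit(PGDAT_DIRTY, &pgdat->flags);

        if (sc->nr.immediate)
                congestion_wait(BLK_RW_ASYNC, HZ/10);
}

so the retry body only keeps:

        if (current_is_kswapd())
                shrink_node_kswapd_flags(pgdat, sc);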

Thanks!

> + if (current_is_kswapd()) {
> + /*
> + * If reclaim is isolating dirty pages under writeback,
> + * it implies that the long-lived page allocation rate
> + * is exceeding the page laundering rate. Either the
> + * global limits are not being effective at throttling
> + * processes due to the page distribution throughout
> + * zones or there is heavy usage of a slow backing
> + * device. The only option is to throttle from reclaim
> + * context which is not ideal as there is no guarantee
> + * the dirtying process is throttled in the same way
> + * balance_dirty_pages() manages.
> + *
> + * Once a node is flagged PGDAT_WRITEBACK, kswapd will
> + * count the number of pages under pages flagged for
> + * immediate reclaim and stall if any are encountered
> + * in the nr_immediate check below.
> + */
> + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
> + set_bit(PGDAT_WRITEBACK, &pgdat->flags);
>
> /*
> - * Legacy memcg will stall in page writeback so avoid forcibly
> - * stalling in wait_iff_congested().
> + * Tag a node as congested if all the dirty pages
> + * scanned were backed by a congested BDI and
> + * wait_iff_congested will stall.
> */
> - if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) &&
> - sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> - set_memcg_congestion(pgdat, root, true);
> + if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> + set_bit(PGDAT_CONGESTED, &pgdat->flags);
> +
> + /* Allow kswapd to start writing pages during reclaim.*/
> + if (sc->nr.unqueued_dirty == sc->nr.file_taken)
> + set_bit(PGDAT_DIRTY, &pgdat->flags);
>
> /*
> - * Stall direct reclaim for IO completions if underlying BDIs
> - * and node is congested. Allow kswapd to continue until it
> - * starts encountering unqueued dirty pages or cycling through
> - * the LRU too quickly.
> + * If kswapd scans pages marked marked for immediate
> + * reclaim and under writeback (nr_immediate), it
> + * implies that pages are cycling through the LRU
> + * faster than they are written so also forcibly stall.
> */
> - if (!sc->hibernation_mode && !current_is_kswapd() &&
> - current_may_throttle() && pgdat_memcg_congested(pgdat, root))
> - wait_iff_congested(BLK_RW_ASYNC, HZ/10);
> + if (sc->nr.immediate)
> + congestion_wait(BLK_RW_ASYNC, HZ/10);
> + }
> +
> + /*
> + * Legacy memcg will stall in page writeback so avoid forcibly
> + * stalling in wait_iff_congested().
> + */
> + if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) &&
> + sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> + set_memcg_congestion(pgdat, root, true);
> +
> + /*
> + * Stall direct reclaim for IO completions if underlying BDIs
> + * and node is congested. Allow kswapd to continue until it
> + * starts encountering unqueued dirty pages or cycling through
> + * the LRU too quickly.
> + */
> + if (!sc->hibernation_mode && !current_is_kswapd() &&
> + current_may_throttle() && pgdat_memcg_congested(pgdat, root))
> + wait_iff_congested(BLK_RW_ASYNC, HZ/10);
>
> - } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
> - sc));
> + if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
> + sc))
> + goto again;
>
> /*
> * Kswapd gives up on balancing particular nodes after too
> --
> 2.23.0
>
>