Re: [PATCH 3/3] mm: vmscan: enforce inactive:active ratio at the reclaim root

From: Suren Baghdasaryan
Date: Sun Nov 10 2019 - 21:16:37 EST


On Thu, Nov 7, 2019 at 12:53 PM Johannes Weiner <hannes@xxxxxxxxxxx> wrote:
>
> We split the LRU lists into an inactive and an active part to maximize
> workingset protection while allowing just enough inactive cache space
> to facilitate readahead and writeback for one-off file accesses (e.g. a
> linear scan through a file, or logging); or just enough inactive anon
> to maintain recent reference information when reclaim needs to swap.
>
> With cgroups and their nested LRU lists, we currently don't do this
> correctly. While recursive cgroup reclaim establishes a relative LRU
> order among the pages of all involved cgroups, inactive:active size
> decisions are done on a per-cgroup level. As a result, we'll reclaim a
> cgroup's workingset when it doesn't have cold pages, even when one of
> its siblings has plenty of cold pages that should be reclaimed first.
>
> For example: workload A has 50M worth of hot cache but doesn't do any
> one-off file accesses; meanwhile, parallel workload B scans files and
> rarely accesses the same page twice.
>
> If these workloads were to run in an uncgrouped system, A would be
> protected from the high rate of cache faults from B. But if they were
> put in parallel cgroups for memory accounting purposes, B's fast cache
> fault rate would push out the hot cache pages of A. This is unexpected
> and undesirable - the "scan resistance" of the page cache is broken.
>
> This patch moves inactive:active size balancing decisions to the root
> of reclaim - the same level where the LRU order is established.
>
> It does this by looking at the recursize

recursive?

> size of the inactive and the
> active file sets of the cgroup subtree at the beginning of the reclaim
> cycle, and then making a decision - scan or skip active pages - that
> applies throughout the entire run and to every cgroup involved.
>
> With that in place, in the test above, the VM will recognize that
> there are plenty of inactive pages in the combined cache set of
> workloads A and B and prefer the one-off cache in B over the hot pages
> in A. The scan resistance of the cache is restored.
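
The split between the one-time sizing decision and its application
took me a moment, so to check my understanding, the flow is roughly
(simplified pseudo-C lifted from the hunks below; "refaults_changed"
stands in for the WORKINGSET_ACTIVATE refault check, and the else
branches and the force_deactivate path are elided):

	/* once per reclaim cycle, at the reclaim root (shrink_node) */
	if (inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
		sc->may_deactivate |= DEACTIVATE_ANON;
	if (refaults_changed || inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
		sc->may_deactivate |= DEACTIVATE_FILE;

	/* then, for every cgroup in the subtree (shrink_list) */
	if (is_active_lru(lru)) {
		if (sc->may_deactivate & (1 << is_file_lru(lru)))
			shrink_active_list(nr_to_scan, lruvec, sc, lru);
		else
			sc->skipped_deactivate = 1;
	}

IOW the inactive:active sizing is sampled once per cycle at the
reclaim root and then applied uniformly to every lruvec in the
subtree.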
>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> ---
> include/linux/mmzone.h | 4 +-
> mm/vmscan.c | 185 ++++++++++++++++++++++++++---------------
> 2 files changed, 118 insertions(+), 71 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 7a09087e8c77..454a230ad417 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -229,12 +229,12 @@ enum lru_list {
>
> #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
>
> -static inline int is_file_lru(enum lru_list lru)
> +static inline bool is_file_lru(enum lru_list lru)
> {
> return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE);
> }
>
> -static inline int is_active_lru(enum lru_list lru)
> +static inline bool is_active_lru(enum lru_list lru)
> {
> return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
> }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 527617ee9b73..df859b1d583c 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -79,6 +79,13 @@ struct scan_control {
> */
> struct mem_cgroup *target_mem_cgroup;
>
> + /* Can active pages be deactivated as part of reclaim? */
> +#define DEACTIVATE_ANON 1
> +#define DEACTIVATE_FILE 2
> + unsigned int may_deactivate:2;
> + unsigned int force_deactivate:1;
> + unsigned int skipped_deactivate:1;
> +
> /* Writepage batching in laptop mode; RECLAIM_WRITE */
> unsigned int may_writepage:1;
>
> @@ -101,6 +108,9 @@ struct scan_control {
> /* One of the zones is ready for compaction */
> unsigned int compaction_ready:1;
>
> + /* There is easily reclaimable cold cache in the current node */
> + unsigned int cache_trim_mode:1;
> +
> /* The file pages on the current node are dangerously low */
> unsigned int file_is_tiny:1;
>
> @@ -2154,6 +2164,20 @@ unsigned long reclaim_pages(struct list_head *page_list)
> return nr_reclaimed;
> }
>
> +static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
> + struct lruvec *lruvec, struct scan_control *sc)
> +{
> + if (is_active_lru(lru)) {
> + if (sc->may_deactivate & (1 << is_file_lru(lru)))
> + shrink_active_list(nr_to_scan, lruvec, sc, lru);
> + else
> + sc->skipped_deactivate = 1;
> + return 0;
> + }
> +
> + return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
> +}
> +
> /*
> * The inactive anon list should be small enough that the VM never has
> * to do too much work.
> @@ -2182,59 +2206,25 @@ unsigned long reclaim_pages(struct list_head *page_list)
> * 1TB 101 10GB
> * 10TB 320 32GB
> */
> -static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
> - struct scan_control *sc, bool trace)
> +static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
> {
> - enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
> - struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> - enum lru_list inactive_lru = file * LRU_FILE;
> + enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
> unsigned long inactive, active;
> unsigned long inactive_ratio;
> - struct lruvec *target_lruvec;
> - unsigned long refaults;
> unsigned long gb;
>
> - inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
> - active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
> + inactive = lruvec_page_state(lruvec, inactive_lru);
> + active = lruvec_page_state(lruvec, active_lru);
>
> - /*
> - * When refaults are being observed, it means a new workingset
> - * is being established. Disable active list protection to get
> - * rid of the stale workingset quickly.
> - */
> - target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
> - refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE);
> - if (file && target_lruvec->refaults != refaults) {
> - inactive_ratio = 0;
> - } else {
> - gb = (inactive + active) >> (30 - PAGE_SHIFT);
> - if (gb)
> - inactive_ratio = int_sqrt(10 * gb);
> - else
> - inactive_ratio = 1;
> - }
> -
> - if (trace)
> - trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
> - lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
> - lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
> - inactive_ratio, file);
> + gb = (inactive + active) >> (30 - PAGE_SHIFT);
> + if (gb)
> + inactive_ratio = int_sqrt(10 * gb);
> + else
> + inactive_ratio = 1;
>
> return inactive * inactive_ratio < active;
> }
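
As an aside for anyone who wants to double-check the ratio math, here
is a throwaway user-space program (plain C, with a naive floor integer
square root standing in for the kernel's int_sqrt()) that reproduces
the numbers:

	#include <stdio.h>

	/* naive floor integer sqrt, stand-in for the kernel's int_sqrt() */
	static unsigned long isqrt(unsigned long x)
	{
		unsigned long r = 0;

		while ((r + 1) * (r + 1) <= x)
			r++;
		return r;
	}

	int main(void)
	{
		/* node sizes in GB: 1GB, 10GB, 100GB, 1TB, 10TB */
		static const unsigned long gbs[] = { 1, 10, 100, 1024, 10240 };
		unsigned int i;

		for (i = 0; i < sizeof(gbs) / sizeof(gbs[0]); i++)
			printf("%6luGB -> inactive_ratio %lu\n",
			       gbs[i], isqrt(10 * gbs[i]));
		return 0;
	}

It prints 3, 10, 31, 101 and 320 respectively, which matches the 1TB
(101) and 10TB (320) rows of the table quoted above.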
>
> -static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
> - struct lruvec *lruvec, struct scan_control *sc)
> -{
> - if (is_active_lru(lru)) {
> - if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
> - shrink_active_list(nr_to_scan, lruvec, sc, lru);
> - return 0;
> - }
> -
> - return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
> -}
> -
> enum scan_balance {
> SCAN_EQUAL,
> SCAN_FRACT,
> @@ -2296,28 +2286,17 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
>
> /*
> * If the system is almost out of file pages, force-scan anon.
> - * But only if there are enough inactive anonymous pages on
> - * the LRU. Otherwise, the small LRU gets thrashed.
> */
> - if (sc->file_is_tiny &&
> - !inactive_list_is_low(lruvec, false, sc, false) &&
> - lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
> - sc->reclaim_idx) >> sc->priority) {
> + if (sc->file_is_tiny) {
> scan_balance = SCAN_ANON;
> goto out;
> }
>
> /*
> - * If there is enough inactive page cache, i.e. if the size of the
> - * inactive list is greater than that of the active list *and* the
> - * inactive list actually has some pages to scan on this priority, we
> - * do not reclaim anything from the anonymous working set right now.
> - * Without the second condition we could end up never scanning an
> - * lruvec even if it has plenty of old anonymous pages unless the
> - * system is under heavy pressure.
> + * If there is enough inactive page cache, we do not reclaim
> + * anything from the anonymous working set right now.
> */
> - if (!inactive_list_is_low(lruvec, true, sc, false) &&
> - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
> + if (sc->cache_trim_mode) {
> scan_balance = SCAN_FILE;
> goto out;
> }
> @@ -2582,7 +2561,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> * Even if we did not try to evict anon pages at all, we want to
> * rebalance the anon lru active/inactive ratio.
> */
> - if (total_swap_pages && inactive_list_is_low(lruvec, false, sc, true))
> + if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
> shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
> sc, LRU_ACTIVE_ANON);
> }
> @@ -2722,6 +2701,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> unsigned long nr_reclaimed, nr_scanned;
> struct lruvec *target_lruvec;
> bool reclaimable = false;
> + unsigned long file;
>
> target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
>
> @@ -2731,6 +2711,44 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> nr_reclaimed = sc->nr_reclaimed;
> nr_scanned = sc->nr_scanned;
>
> + /*
> + * Target desirable inactive:active list ratios for the anon
> + * and file LRU lists.
> + */
> + if (!sc->force_deactivate) {
> + unsigned long refaults;
> +
> + if (inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
> + sc->may_deactivate |= DEACTIVATE_ANON;
> + else
> + sc->may_deactivate &= ~DEACTIVATE_ANON;
> +
> + /*
> + * When refaults are being observed, it means a new
> + * workingset is being established. Deactivate to get
> + * rid of any stale active pages quickly.
> + */
> + refaults = lruvec_page_state(target_lruvec,
> + WORKINGSET_ACTIVATE);
> + if (refaults != target_lruvec->refaults ||
> + inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
> + sc->may_deactivate |= DEACTIVATE_FILE;
> + else
> + sc->may_deactivate &= ~DEACTIVATE_FILE;
> + } else
> + sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
> +
> + /*
> + * If we have plenty of inactive file pages that aren't
> + * thrashing, try to reclaim those first before touching
> + * anonymous pages.
> + */
> + file = lruvec_page_state(target_lruvec, LRU_INACTIVE_FILE);
> + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
> + sc->cache_trim_mode = 1;
> + else
> + sc->cache_trim_mode = 0;
> +
> /*
> * Prevent the reclaimer from falling into the cache trap: as
> * cache pages start out inactive, every cache fault will tip
> @@ -2741,10 +2759,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> * anon pages. Try to detect this based on file LRU size.
> */
> if (!cgroup_reclaim(sc)) {
> - unsigned long file;
> - unsigned long free;
> - int z;
> unsigned long total_high_wmark = 0;
> + unsigned long free, anon;
> + int z;
>
> free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
> file = node_page_state(pgdat, NR_ACTIVE_FILE) +
> @@ -2758,7 +2775,17 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> total_high_wmark += high_wmark_pages(zone);
> }
>
> - sc->file_is_tiny = file + free <= total_high_wmark;
> + /*
> + * Consider anon: if that's low too, this isn't a
> + * runaway file reclaim problem, but rather just
> + * extreme pressure. Reclaim as per usual then.
> + */
> + anon = node_page_state(pgdat, NR_INACTIVE_ANON);
> +
> + sc->file_is_tiny =
> + file + free <= total_high_wmark &&
> + !(sc->may_deactivate & DEACTIVATE_ANON) &&
> + anon >> sc->priority;

The name of the file_is_tiny flag no longer seems to correspond to its
actual semantics. Maybe rename it to "skip_file"?

I'm confused about why !(sc->may_deactivate & DEACTIVATE_ANON) should
be a prerequisite for skipping file LRU reclaim. IIUC this means we
will skip reclaiming from the file LRU only when anonymous page
deactivation is not allowed. Could you please add a comment explaining
this?
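
For reference, here is the new condition pulled out into a standalone
helper, purely as a restatement for discussion (hypothetical code, not
part of the patch; DEACTIVATE_ANON as defined in scan_control above):

	/*
	 * My reading: the node's file cache plus free memory is below the
	 * high watermarks, DEACTIVATE_ANON is not set (i.e. inactive anon
	 * was not judged too small at the reclaim root), and there is a
	 * meaningful amount of inactive anon to scan at this priority.
	 */
	static bool file_is_tiny(unsigned long file, unsigned long free,
				 unsigned long anon,
				 unsigned long total_high_wmark,
				 unsigned int may_deactivate, int priority)
	{
		return file + free <= total_high_wmark &&
		       !(may_deactivate & DEACTIVATE_ANON) &&
		       (anon >> priority) != 0;
	}

It's the middle clause that I'd like the comment to spell out.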

> }
>
> shrink_node_memcgs(pgdat, sc);
> @@ -3062,9 +3089,27 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> if (sc->compaction_ready)
> return 1;
>
> + /*
> + * We make inactive:active ratio decisions based on the node's
> + * composition of memory, but a restrictive reclaim_idx or a
> + * memory.low cgroup setting can exempt large amounts of
> + * memory from reclaim. Neither of which are very common, so
> + * instead of doing costly eligibility calculations of the
> + * entire cgroup subtree up front, we assume the estimates are
> + * good, and retry with forcible deactivation if that fails.
> + */
> + if (sc->skipped_deactivate) {
> + sc->priority = initial_priority;
> + sc->force_deactivate = 1;
> + sc->skipped_deactivate = 0;
> + goto retry;
> + }
> +
> /* Untapped cgroup reserves? Don't OOM, retry. */
> if (sc->memcg_low_skipped) {
> sc->priority = initial_priority;
> + sc->force_deactivate = 0;
> + sc->skipped_deactivate = 0;
> sc->memcg_low_reclaim = 1;
> sc->memcg_low_skipped = 0;
> goto retry;
> @@ -3347,18 +3392,20 @@ static void age_active_anon(struct pglist_data *pgdat,
> struct scan_control *sc)
> {
> struct mem_cgroup *memcg;
> + struct lruvec *lruvec;
>
> if (!total_swap_pages)
> return;
>
> + lruvec = mem_cgroup_lruvec(NULL, pgdat);
> + if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
> + return;
> +
> memcg = mem_cgroup_iter(NULL, NULL, NULL);
> do {
> - struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
> -
> - if (inactive_list_is_low(lruvec, false, sc, true))
> - shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
> - sc, LRU_ACTIVE_ANON);
> -
> + lruvec = mem_cgroup_lruvec(memcg, pgdat);
> + shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
> + sc, LRU_ACTIVE_ANON);
> memcg = mem_cgroup_iter(NULL, memcg, NULL);
> } while (memcg);
> }
> --
> 2.24.0
>