Re: [PATCH 12/27] mm, vmscan: Make shrink_node decisions more node-centric

From: Michal Hocko
Date: Wed Jun 22 2016 - 09:21:01 EST


On Tue 21-06-16 15:15:51, Mel Gorman wrote:
> Earlier patches focused on having direct reclaim and kswapd use data that
> is node-centric for reclaiming but shrink_node() itself still uses too much
> zone information. This patch removes unnecessary zone-based information
> with the most important decision being whether to continue reclaim or
> not. Some memcg APIs are adjusted as a result even though memcg itself
> still uses some zone information.
>
> Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>

Acked-by: Michal Hocko <mhocko@xxxxxxxx>

> ---
> include/linux/memcontrol.h | 9 +++----
> include/linux/mmzone.h | 4 ++--
> include/linux/swap.h | 2 +-
> mm/memcontrol.c | 17 +++++++-------
> mm/page_alloc.c | 2 +-
> mm/vmscan.c | 58 ++++++++++++++++++++++++++--------------------
> mm/workingset.c | 6 ++---
> 7 files changed, 54 insertions(+), 44 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index cda436c79d8c..a13328851fea 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -306,7 +306,8 @@ void mem_cgroup_uncharge_list(struct list_head *page_list);
>
> void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
>
> -struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
> +struct lruvec *mem_cgroup_lruvec(struct pglist_data *, struct zone *zone,
> + struct mem_cgroup *);
> struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
>
> bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
> @@ -573,10 +574,10 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new)
> {
> }
>
> -static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
> - struct mem_cgroup *memcg)
> +static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
> + struct zone *zone, struct mem_cgroup *memcg)
> {
> - return zone_lruvec(zone);
> + return node_lruvec(pgdat);
> }
>
> static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 890d1858aa22..6991eded0ffd 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -737,9 +737,9 @@ static inline spinlock_t *zone_lru_lock(struct zone *zone)
> return &zone->zone_pgdat->lru_lock;
> }
>
> -static inline struct lruvec *zone_lruvec(struct zone *zone)
> +static inline struct lruvec *node_lruvec(struct pglist_data *pgdat)
> {
> - return &zone->zone_pgdat->lruvec;
> + return &pgdat->lruvec;
> }
>
> static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 916e2eddecd6..0ad616d7c381 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -316,7 +316,7 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> unsigned long nr_pages,
> gfp_t gfp_mask,
> bool may_swap);
> -extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
> +extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
> gfp_t gfp_mask, bool noswap,
> struct zone *zone,
> unsigned long *nr_scanned);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 864a4e3a82c1..aac5fae56ea4 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -944,22 +944,23 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> iter = mem_cgroup_iter(NULL, iter, NULL))
>
> /**
> - * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
> + * mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone
> + * @node: node of the wanted lruvec
> * @zone: zone of the wanted lruvec
> * @memcg: memcg of the wanted lruvec
> *
> - * Returns the lru list vector holding pages for the given @zone and
> - * @mem. This can be the global zone lruvec, if the memory controller
> + * Returns the lru list vector holding pages for a given @node or a given
> + * @memcg and @zone. This can be the node lruvec, if the memory controller
> * is disabled.
> */
> -struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
> - struct mem_cgroup *memcg)
> +struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
> + struct zone *zone, struct mem_cgroup *memcg)
> {
> struct mem_cgroup_per_zone *mz;
> struct lruvec *lruvec;
>
> if (mem_cgroup_disabled()) {
> - lruvec = zone_lruvec(zone);
> + lruvec = node_lruvec(pgdat);
> goto out;
> }
>
> @@ -1474,8 +1475,8 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
> }
> continue;
> }
> - total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
> - zone, &nr_scanned);
> + total += mem_cgroup_shrink_node(victim, gfp_mask, false,
> + zone, &nr_scanned);
> *total_scanned += nr_scanned;
> if (!soft_limit_excess(root_memcg))
> break;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index e128af8de05f..d62b147fd426 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5897,6 +5897,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
> #endif
> pgdat_page_ext_init(pgdat);
> spin_lock_init(&pgdat->lru_lock);
> + lruvec_init(node_lruvec(pgdat));
>
> for (j = 0; j < MAX_NR_ZONES; j++) {
> struct zone *zone = pgdat->node_zones + j;
> @@ -5959,7 +5960,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
> /* For bootup, initialized properly in watermark setup */
> mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
>
> - lruvec_init(zone_lruvec(zone));
> if (!size)
> continue;
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index d42a86e603e8..3774ebf19f63 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2220,10 +2220,11 @@ static inline void init_tlb_ubc(void)
> /*
> * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
> */
> -static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
> +static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
> struct scan_control *sc, unsigned long *lru_pages)
> {
> - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> + struct zone *zone = &pgdat->node_zones[sc->reclaim_idx];
> + struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, zone, memcg);
> unsigned long nr[NR_LRU_LISTS];
> unsigned long targets[NR_LRU_LISTS];
> unsigned long nr_to_scan;
> @@ -2356,13 +2357,14 @@ static bool in_reclaim_compaction(struct scan_control *sc)
> * calls try_to_compact_zone() that it will have enough free pages to succeed.
> * It will give up earlier than that if there is difficulty reclaiming pages.
> */
> -static inline bool should_continue_reclaim(struct zone *zone,
> +static inline bool should_continue_reclaim(struct pglist_data *pgdat,
> unsigned long nr_reclaimed,
> unsigned long nr_scanned,
> struct scan_control *sc)
> {
> unsigned long pages_for_compaction;
> unsigned long inactive_lru_pages;
> + int z;
>
> /* If not in reclaim/compaction mode, stop */
> if (!in_reclaim_compaction(sc))
> @@ -2396,21 +2398,27 @@ static inline bool should_continue_reclaim(struct zone *zone,
> * inactive lists are large enough, continue reclaiming
> */
> pages_for_compaction = (2UL << sc->order);
> - inactive_lru_pages = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE);
> + inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
> if (get_nr_swap_pages() > 0)
> - inactive_lru_pages += node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
> + inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
> if (sc->nr_reclaimed < pages_for_compaction &&
> inactive_lru_pages > pages_for_compaction)
> return true;
>
> /* If compaction would go ahead or the allocation would succeed, stop */
> - switch (compaction_suitable(zone, sc->order, 0, 0)) {
> - case COMPACT_PARTIAL:
> - case COMPACT_CONTINUE:
> - return false;
> - default:
> - return true;
> + for (z = 0; z <= sc->reclaim_idx; z++) {
> + struct zone *zone = &pgdat->node_zones[z];
> +
> + switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
> + case COMPACT_PARTIAL:
> + case COMPACT_CONTINUE:
> + return false;
> + default:
> + /* check next zone */
> + ;
> + }
> }
> + return true;
> }
>
> static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
> @@ -2419,15 +2427,14 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
> struct reclaim_state *reclaim_state = current->reclaim_state;
> unsigned long nr_reclaimed, nr_scanned;
> bool reclaimable = false;
> - struct zone *zone = &pgdat->node_zones[classzone_idx];
>
> do {
> struct mem_cgroup *root = sc->target_mem_cgroup;
> struct mem_cgroup_reclaim_cookie reclaim = {
> - .zone = zone,
> + .zone = &pgdat->node_zones[classzone_idx],
> .priority = sc->priority,
> };
> - unsigned long zone_lru_pages = 0;
> + unsigned long node_lru_pages = 0;
> struct mem_cgroup *memcg;
>
> nr_reclaimed = sc->nr_reclaimed;
> @@ -2448,11 +2455,11 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
> reclaimed = sc->nr_reclaimed;
> scanned = sc->nr_scanned;
>
> - shrink_zone_memcg(zone, memcg, sc, &lru_pages);
> - zone_lru_pages += lru_pages;
> + shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
> + node_lru_pages += lru_pages;
>
> if (!global_reclaim(sc) && sc->reclaim_idx == classzone_idx)
> - shrink_slab(sc->gfp_mask, zone_to_nid(zone),
> + shrink_slab(sc->gfp_mask, pgdat->node_id,
> memcg, sc->nr_scanned - scanned,
> lru_pages);
>
> @@ -2464,7 +2471,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
> /*
> * Direct reclaim and kswapd have to scan all memory
> * cgroups to fulfill the overall scan target for the
> - * zone.
> + * node.
> *
> * Limit reclaim, on the other hand, only cares about
> * nr_to_reclaim pages to be reclaimed and it will
> @@ -2483,9 +2490,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
> * the eligible LRU pages were scanned.
> */
> if (global_reclaim(sc) && sc->reclaim_idx == classzone_idx)
> - shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
> + shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
> sc->nr_scanned - nr_scanned,
> - zone_lru_pages);
> + node_lru_pages);
>
> if (reclaim_state) {
> sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> @@ -2500,7 +2507,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
> if (sc->nr_reclaimed - nr_reclaimed)
> reclaimable = true;
>
> - } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
> + } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
> sc->nr_scanned - nr_scanned, sc));
>
> return reclaimable;
> @@ -2896,7 +2903,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>
> #ifdef CONFIG_MEMCG
>
> -unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
> +unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
> gfp_t gfp_mask, bool noswap,
> struct zone *zone,
> unsigned long *nr_scanned)
> @@ -2906,6 +2913,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
> .target_mem_cgroup = memcg,
> .may_writepage = !laptop_mode,
> .may_unmap = 1,
> + .reclaim_idx = zone_idx(zone),
> .may_swap = !noswap,
> };
> unsigned long lru_pages;
> @@ -2920,11 +2928,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
> /*
> * NOTE: Although we can get the priority field, using it
> * here is not a good idea, since it limits the pages we can scan.
> - * if we don't reclaim here, the shrink_zone from balance_pgdat
> + * if we don't reclaim here, the shrink_node from balance_pgdat
> * will pick up pages from other mem cgroup's as well. We hack
> * the priority and make it zero.
> */
> - shrink_zone_memcg(zone, memcg, &sc, &lru_pages);
> + shrink_node_memcg(zone->zone_pgdat, memcg, &sc, &lru_pages);
>
> trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
>
> @@ -2982,7 +2990,7 @@ static void age_active_anon(struct pglist_data *pgdat,
>
> memcg = mem_cgroup_iter(NULL, NULL, NULL);
> do {
> - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> + struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, zone, memcg);
>
> if (inactive_list_is_low(lruvec, false))
> shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
> diff --git a/mm/workingset.c b/mm/workingset.c
> index c0820e06aaff..2d81ca11317d 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -218,7 +218,7 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
> VM_BUG_ON_PAGE(page_count(page), page);
> VM_BUG_ON_PAGE(!PageLocked(page), page);
>
> - lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> + lruvec = mem_cgroup_lruvec(zone->zone_pgdat, zone, memcg);
> eviction = atomic_long_inc_return(&lruvec->inactive_age);
> return pack_shadow(memcgid, zone, eviction);
> }
> @@ -267,7 +267,7 @@ bool workingset_refault(void *shadow)
> rcu_read_unlock();
> return false;
> }
> - lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> + lruvec = mem_cgroup_lruvec(zone->zone_pgdat, zone, memcg);
> refault = atomic_long_read(&lruvec->inactive_age);
> active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
> rcu_read_unlock();
> @@ -317,7 +317,7 @@ void workingset_activation(struct page *page)
> */
> if (!mem_cgroup_disabled() && !page_memcg(page))
> goto out;
> - lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
> + lruvec = mem_cgroup_lruvec(page_pgdat(page), page_zone(page), page_memcg(page));
> atomic_long_inc(&lruvec->inactive_age);
> out:
> unlock_page_memcg(page);
> --
> 2.6.4
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@xxxxxxxxxx For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href="mailto:dont@xxxxxxxxx">email@xxxxxxxxx</a>

--
Michal Hocko
SUSE Labs