[PATCH] mm: get_scan_count consider reclaimable lru pages

From: Minchan Kim
Date: Mon Jul 25 2016 - 22:57:27 EST


With node-lru, if there are enough reclaimable pages in highmem but
none in lowmem, the VM can keep trying to shrink the inactive list
even though the requested zone is lowmem.

The problem is that if the inactive list is full of highmem pages, a
direct reclaimer searching for a lowmem page wastes CPU scanning them
uselessly; it just burns CPU. Worse, many direct reclaimers get
stalled by too_many_isolated when lots of parallel reclaim is going
on, even though there is no reclaimable memory on the inactive list.

To solve the issue, get_scan_count should consider the reclaimable
LRU size, i.e. only the pages in zones eligible for the requested
classzone index.
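
Concretely, that means trimming the node-wide LRU count by whatever
sits in zones above the requested classzone index, so a
lowmem-constrained reclaimer no longer counts highmem pages it cannot
use. A minimal sketch of that accounting (a simplified, non-memcg
restatement of the lruvec_lru_size() change below; the helper name
eligible_lru_size() is only for illustration):

/*
 * Sketch only: number of pages on @lru that a reclaim limited to
 * zones [0..classzone_idx] can actually use (non-memcg case).
 */
static unsigned long eligible_lru_size(struct pglist_data *pgdat,
				       enum lru_list lru, int classzone_idx)
{
	unsigned long nr = node_page_state(pgdat, NR_LRU_BASE + lru);
	int zid;

	/* Subtract pages living in zones the allocation cannot use. */
	for (zid = classzone_idx + 1; zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];

		if (!populated_zone(zone))
			continue;
		nr -= min(nr, zone_page_state(zone, NR_ZONE_LRU_BASE + lru));
	}

	return nr;
}

With get_scan_count() working from this eligible size, "size >>
sc->priority" drops to zero once the eligible zones are empty, so
direct reclaimers back off instead of burning CPU on highmem pages
and piling up in too_many_isolated.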

I ran the experiment 4 times on a 32-bit, 2G, 8-CPU KVM machine and
measured the elapsed time of the following workload:

hackbench 500 process 2

= Old =

1st: 289s 2nd: 310s 3rd: 112s 4th: 272s

= Now =

1st: 31s 2nd: 132s 3rd: 162s 4th: 50s

Not-yet-Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
---
include/linux/mmzone.h | 3 +-
mm/vmscan.c | 91 ++++++++++++++++++++++----------------------------
mm/workingset.c | 2 +-
3 files changed, 43 insertions(+), 53 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d572b78..87d186f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -805,7 +805,8 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
#endif
}

-extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
+extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
+ int classzone);

#ifdef CONFIG_HAVE_MEMORY_PRESENT
void memory_present(int nid, unsigned long start, unsigned long end);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5af357..c27e307 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -234,12 +234,33 @@ bool pgdat_reclaimable(struct pglist_data *pgdat)
pgdat_reclaimable_pages(pgdat) * 6;
}

-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
+/*
+ * Return the size of the lru list in zones [0..classzone_idx] if memcg is disabled.
+ */
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
+ int classzone_idx)
{
+ struct pglist_data *pgdat;
+ unsigned long nr_pages, nr_zone_pages;
+ int zid;
+ struct zone *zone;
+
if (!mem_cgroup_disabled())
return mem_cgroup_get_lru_size(lruvec, lru);

- return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
+ pgdat = lruvec_pgdat(lruvec);
+ nr_pages = node_page_state(pgdat, NR_LRU_BASE + lru);
+
+ for (zid = classzone_idx + 1; zid < MAX_NR_ZONES; zid++) {
+ zone = &pgdat->node_zones[zid];
+ if (!populated_zone(zone))
+ continue;
+
+ nr_zone_pages = zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
+ nr_pages -= min(nr_pages, nr_zone_pages);
+ }
+
+ return nr_pages;
}

/*
@@ -1481,13 +1502,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
total_skipped += nr_skipped[zid];
}

- /*
- * Account skipped pages as a partial scan as the pgdat may be
- * close to unreclaimable. If the LRU list is empty, account
- * skipped pages as a full scan.
- */
- scan += list_empty(src) ? total_skipped : total_skipped >> 2;
-
list_splice(&pages_skipped, src);
}
*nr_scanned = scan;
@@ -1995,34 +2009,9 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
if (!file && !total_swap_pages)
return false;

- inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
- active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
-
- /*
- * For global reclaim on zone-constrained allocations, it is necessary
- * to check if rotations are required for lowmem to be reclaimed. This
- * calculates the inactive/active pages available in eligible zones.
- */
- if (global_reclaim(sc)) {
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- int zid;
-
- for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {
- struct zone *zone = &pgdat->node_zones[zid];
- unsigned long inactive_zone, active_zone;
-
- if (!populated_zone(zone))
- continue;
-
- inactive_zone = zone_page_state(zone,
- NR_ZONE_LRU_BASE + (file * LRU_FILE));
- active_zone = zone_page_state(zone,
- NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE);
-
- inactive -= min(inactive, inactive_zone);
- active -= min(active, active_zone);
- }
- }
+ inactive = lruvec_lru_size(lruvec, file * LRU_FILE, sc->reclaim_idx);
+ active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE,
+ sc->reclaim_idx);

gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
@@ -2136,21 +2125,20 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* anon pages. Try to detect this based on file LRU size.
*/
if (global_reclaim(sc)) {
- unsigned long pgdatfile;
- unsigned long pgdatfree;
- int z;
+ unsigned long pgdatfile = 0;
+ unsigned long pgdatfree = 0;
unsigned long total_high_wmark = 0;
+ int z;

- pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
- pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
- node_page_state(pgdat, NR_INACTIVE_FILE);
-
- for (z = 0; z < MAX_NR_ZONES; z++) {
+ for (z = 0; z <= sc->reclaim_idx; z++) {
struct zone *zone = &pgdat->node_zones[z];
if (!populated_zone(zone))
continue;

total_high_wmark += high_wmark_pages(zone);
+ pgdatfree += zone_page_state(zone, NR_FREE_PAGES);
+ pgdatfile += zone_page_state(zone, NR_ZONE_ACTIVE_FILE);
+ pgdatfile += zone_page_state(zone, NR_ZONE_INACTIVE_FILE);
}

if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
@@ -2169,7 +2157,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* system is under heavy pressure.
*/
if (!inactive_list_is_low(lruvec, true, sc) &&
- lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx)
+ >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2195,10 +2184,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* anon in [0], file in [1]
*/

- anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
- lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
- file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
- lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+ anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, sc->reclaim_idx) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx);
+ file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, sc->reclaim_idx) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx);

spin_lock_irq(&pgdat->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2236,7 +2225,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
unsigned long size;
unsigned long scan;

- size = lruvec_lru_size(lruvec, lru);
+ size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
scan = size >> sc->priority;

if (!scan && pass && force_scan)
diff --git a/mm/workingset.c b/mm/workingset.c
index 69551cf..0c71027 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -266,7 +266,7 @@ bool workingset_refault(void *shadow)
}
lruvec = mem_cgroup_lruvec(pgdat, memcg);
refault = atomic_long_read(&lruvec->inactive_age);
- active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+ active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES - 1);
rcu_read_unlock();

/*
--
1.9.1