[resend][PATCH] mm, vmscan: fix do_try_to_free_pages() livelock

From: KOSAKI Motohiro
Date: Thu Jun 14 2012 - 04:13:33 EST


From: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx>

Currently, do_try_to_free_pages() can enter a livelock, because vmscan
now has two conflicting policies:

1) kswapd sleeps when it cannot reclaim any page after reaching
   priority 0, to avoid an infinite loop in kswapd(). In other words,
   kswapd assumes that direct reclaim will make enough free pages
   available, either through regular page reclaim or through the
   oom-killer. This logic creates a kswapd -> direct-reclaim
   dependency.
2) direct reclaim keeps reclaiming, without invoking the oom-killer,
   until kswapd turns on zone->all_unreclaimable, to avoid a too
   early oom-kill. This logic creates a direct-reclaim -> kswapd
   dependency.

In the worst case, direct reclaim may continue reclaiming pages forever
while kswapd sleeps forever.
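
To make the cycle concrete, the retry logic reduces to roughly the
following loop. This is an illustrative sketch only, not actual kernel
code: the names mirror the real functions, but the control flow is
condensed from do_try_to_free_pages() and its callers:

	/* Illustrative sketch of the direct reclaim retry cycle. */
	for (;;) {
		if (do_try_to_free_pages(zonelist, &sc))
			break;	/* made progress; the allocation retries */
		if (all_unreclaimable(zonelist, &sc))
			break;	/* give up; the oom-killer may now run */
		/*
		 * all_unreclaimable() only reports true after kswapd
		 * has set zone->all_unreclaimable.  If kswapd is
		 * already asleep, the flag is never set and this
		 * loop spins forever.
		 */
	}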

We can't turn on zone->all_unreclaimable from the direct reclaim path,
because direct reclaim doesn't take any lock there, so setting the flag
that way would be racy.

Thus this patch removes the zone->all_unreclaimable field completely
and recalculates a zone's reclaimable state every time it is needed.

Note: we can't take the alternative approach where direct reclaim looks
at zone->pages_scanned directly while kswapd continues to use
zone->all_unreclaimable, because it is racy. Commit 929bea7c71
("vmscan: all_unreclaimable() use zone->all_unreclaimable as a name")
describes the details.
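
For reference, the reclaimable-state check this patch adds to
include/linux/mm_inline.h boils down to the following heuristic,
restated here with an explanatory comment (the exact code is in the
first hunk below):

	/*
	 * A zone still counts as reclaimable until its LRU pages have
	 * been scanned six times over without any page being freed:
	 * zone->pages_scanned is reset to 0 whenever pages are freed
	 * back to the zone (see the page_alloc.c hunks below).
	 */
	static inline bool zone_reclaimable(struct zone *zone)
	{
		return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
	}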

Reported-by: Aaditya Kumar <aaditya.kumar.30@xxxxxxxxx>
Reported-by: Ying Han <yinghan@xxxxxxxxxx>
Cc: Nick Piggin <npiggin@xxxxxxxxx>
Acked-by: Rik van Riel <riel@xxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Mel Gorman <mel@xxxxxxxxx>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
Cc: Minchan Kim <minchan.kim@xxxxxxxxx>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx>
---
include/linux/mm_inline.h | 19 +++++++++++++++++
include/linux/mmzone.h | 2 +-
include/linux/vmstat.h | 1 -
mm/page-writeback.c | 2 +
mm/page_alloc.c | 5 +--
mm/vmscan.c | 48 ++++++++++++--------------------------------
mm/vmstat.c | 3 +-
7 files changed, 39 insertions(+), 41 deletions(-)

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 1397ccf..04f32e1 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -2,6 +2,7 @@
#define LINUX_MM_INLINE_H

#include <linux/huge_mm.h>
+#include <linux/swap.h>

/**
* page_is_file_cache - should the page be on a file LRU or anon LRU?
@@ -99,4 +100,22 @@ static __always_inline enum lru_list page_lru(struct page *page)
return lru;
}

+static inline unsigned long zone_reclaimable_pages(struct zone *zone)
+{
+	int nr;
+
+	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
+	     zone_page_state(zone, NR_INACTIVE_FILE);
+
+	if (nr_swap_pages > 0)
+		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
+		      zone_page_state(zone, NR_INACTIVE_ANON);
+
+	return nr;
+}
+
+static inline bool zone_reclaimable(struct zone *zone)
+{
+	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+}
#endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2427706..9d2a720 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -368,7 +368,7 @@ struct zone {
* free areas of different sizes
*/
spinlock_t lock;
- int all_unreclaimable; /* All pages pinned */
+
#ifdef CONFIG_MEMORY_HOTPLUG
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 65efb92..9607256 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -140,7 +140,6 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
}

extern unsigned long global_reclaimable_pages(void);
-extern unsigned long zone_reclaimable_pages(struct zone *zone);

#ifdef CONFIG_NUMA
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 93d8d2f..d2d957f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,8 @@
#include <linux/syscalls.h>
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
#include <linux/pagevec.h>
+#include <linux/mm_inline.h>
+
#include <trace/events/writeback.h>

/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4403009..5716b00 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -59,6 +59,7 @@
#include <linux/prefetch.h>
#include <linux/migrate.h>
#include <linux/page-debug-flags.h>
+#include <linux/mm_inline.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -638,7 +639,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
int to_free = count;

spin_lock(&zone->lock);
- zone->all_unreclaimable = 0;
zone->pages_scanned = 0;

while (to_free) {
@@ -680,7 +680,6 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
int migratetype)
{
spin_lock(&zone->lock);
- zone->all_unreclaimable = 0;
zone->pages_scanned = 0;

__free_one_page(page, zone, order, migratetype);
@@ -2870,7 +2869,7 @@ void show_free_areas(unsigned int filter)
K(zone_page_state(zone, NR_BOUNCE)),
K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
zone->pages_scanned,
- (zone->all_unreclaimable ? "yes" : "no")
+ (!zone_reclaimable(zone) ? "yes" : "no")
);
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eeb3bc9..033671c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1592,7 +1592,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* latencies, so it's better to scan a minimum amount there as
* well.
*/
- if (current_is_kswapd() && zone->all_unreclaimable)
+ if (current_is_kswapd() && !zone_reclaimable(zone))
force_scan = true;
if (!global_reclaim(sc))
force_scan = true;
@@ -1936,8 +1936,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (global_reclaim(sc)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- if (zone->all_unreclaimable &&
- sc->priority != DEF_PRIORITY)
+ if (!zone_reclaimable(zone) &&
+ sc->priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
if (COMPACTION_BUILD) {
/*
@@ -1975,11 +1975,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
return aborted_reclaim;
}

-static bool zone_reclaimable(struct zone *zone)
-{
-	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
-}
-
/* All zones in zonelist are unreclaimable? */
static bool all_unreclaimable(struct zonelist *zonelist,
struct scan_control *sc)
@@ -1993,7 +1988,7 @@ static bool all_unreclaimable(struct zonelist *zonelist,
continue;
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- if (!zone->all_unreclaimable)
+ if (zone_reclaimable(zone))
return false;
}

@@ -2299,7 +2294,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
* they must be considered balanced here as well if kswapd
* is to sleep
*/
- if (zone->all_unreclaimable) {
+ if (!zone_reclaimable(zone)) {
balanced += zone->present_pages;
continue;
}
@@ -2393,8 +2388,7 @@ loop_again:
if (!populated_zone(zone))
continue;

- if (zone->all_unreclaimable &&
- sc.priority != DEF_PRIORITY)
+ if (!zone_reclaimable(zone) && sc.priority != DEF_PRIORITY)
continue;

/*
@@ -2443,14 +2437,13 @@ loop_again:
*/
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
- int nr_slab, testorder;
+ int testorder;
unsigned long balance_gap;

if (!populated_zone(zone))
continue;

- if (zone->all_unreclaimable &&
- sc.priority != DEF_PRIORITY)
+ if (!zone_reclaimable(zone) && sc.priority != DEF_PRIORITY)
continue;

sc.nr_scanned = 0;
@@ -2497,12 +2490,11 @@ loop_again:
shrink_zone(zone, &sc);

reclaim_state->reclaimed_slab = 0;
- nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
+ shrink_slab(&shrink, sc.nr_scanned, lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;

- if (nr_slab == 0 && !zone_reclaimable(zone))
- zone->all_unreclaimable = 1;
+
}

/*
@@ -2514,7 +2506,7 @@ loop_again:
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;

- if (zone->all_unreclaimable) {
+ if (!zone_reclaimable(zone)) {
if (end_zone && end_zone == i)
end_zone--;
continue;
@@ -2616,7 +2608,7 @@ out:
if (!populated_zone(zone))
continue;

- if (zone->all_unreclaimable &&
+ if (!zone_reclaimable(zone) &&
sc.priority != DEF_PRIORITY)
continue;

@@ -2850,20 +2842,6 @@ unsigned long global_reclaimable_pages(void)
return nr;
}

-unsigned long zone_reclaimable_pages(struct zone *zone)
-{
-	int nr;
-
-	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
-	     zone_page_state(zone, NR_INACTIVE_FILE);
-
-	if (nr_swap_pages > 0)
-		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
-		      zone_page_state(zone, NR_INACTIVE_ANON);
-
-	return nr;
-}
-
#ifdef CONFIG_HIBERNATION
/*
* Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -3158,7 +3136,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
return ZONE_RECLAIM_FULL;

- if (zone->all_unreclaimable)
+ if (!zone_reclaimable(zone))
return ZONE_RECLAIM_FULL;

/*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1bbbbd9..94b9d4c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -19,6 +19,7 @@
#include <linux/math64.h>
#include <linux/writeback.h>
#include <linux/compaction.h>
+#include <linux/mm_inline.h>

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -1022,7 +1023,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n all_unreclaimable: %u"
"\n start_pfn: %lu"
"\n inactive_ratio: %u",
- zone->all_unreclaimable,
+ !zone_reclaimable(zone),
zone->zone_start_pfn,
zone->inactive_ratio);
seq_putc(m, '\n');
--
1.7.1
