Silent hang up caused by pages being not scanned?

From: Tetsuo Handa
Date: Mon Oct 12 2015 - 11:26:22 EST


Tetsuo Handa wrote:
> Uptime between 101 and 300 is a silent hang up (i.e. no OOM killer messages,
> no SIGKILL pending tasks, no TIF_MEMDIE tasks) which I solved using SysRq-f
> at uptime = 289. I don't know the reason of this silent hang up, but the
> memory unzapping kernel thread will not help because there is no OOM victim.
>
> ----------
> [ 101.438951] MemAlloc-Info: 10 stalling task, 0 dying task, 0 victim task.
> (...snipped...)
> [ 111.817922] MemAlloc-Info: 12 stalling task, 0 dying task, 0 victim task.
> (...snipped...)
> [ 122.281828] MemAlloc-Info: 13 stalling task, 0 dying task, 0 victim task.
> (...snipped...)
> [ 132.793724] MemAlloc-Info: 14 stalling task, 0 dying task, 0 victim task.
> (...snipped...)
> [ 143.336154] MemAlloc-Info: 16 stalling task, 0 dying task, 0 victim task.
> (...snipped...)
> [ 289.343187] sysrq: SysRq : Manual OOM execution
> (...snipped...)
> [ 292.065650] MemAlloc-Info: 16 stalling task, 0 dying task, 0 victim task.
> (...snipped...)
> [ 302.590736] kworker/3:2 invoked oom-killer: gfp_mask=0x24000c0, order=-1, oom_score_adj=0
> (...snipped...)
> [ 302.690047] MemAlloc-Info: 4 stalling task, 0 dying task, 0 victim task.
> ----------

I examined this hang up using additional debug printk() patch. And it was
observed that when this silent hang up occurs, zone_reclaimable() called from
shrink_zones() called from a __GFP_FS memory allocation request is returning
true forever. Since the __GFP_FS memory allocation request can never call
out_of_memory() due to did_some_progree > 0, the system will silently hang up
with 100% CPU usage.

----------
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0473eec..fda0bb5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2821,6 +2821,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
}
#endif /* CONFIG_COMPACTION */

+pid_t dump_target_pid;
+
/* Perform direct synchronous page reclaim */
static int
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -2847,6 +2849,9 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,

cond_resched();

+ if (dump_target_pid == current->pid)
+ printk(KERN_INFO "__perform_reclaim returned %u at line %u\n",
+ progress, __LINE__);
return progress;
}

@@ -3007,6 +3012,7 @@ static int malloc_watchdog(void *unused)
unsigned int memdie_pending;
unsigned int stalling_tasks;
u8 index;
+ pid_t pid;

not_stalling: /* Healty case. */
/*
@@ -3025,12 +3031,16 @@ static int malloc_watchdog(void *unused)
* and stop_memalloc_timer() within timeout duration.
*/
if (likely(!memalloc_counter[index]))
+ {
+ dump_target_pid = 0;
goto not_stalling;
+ }
maybe_stalling: /* Maybe something is wrong. Let's check. */
/* First, report whether there are SIGKILL tasks and/or OOM victims. */
sigkill_pending = 0;
memdie_pending = 0;
stalling_tasks = 0;
+ pid = 0;
preempt_disable();
rcu_read_lock();
for_each_process_thread(g, p) {
@@ -3062,8 +3072,11 @@ static int malloc_watchdog(void *unused)
(fatal_signal_pending(p) ? "-dying" : ""),
p->comm, p->pid, m->gfp, m->order, spent);
show_stack(p, NULL);
+ if (!pid && (m->gfp & __GFP_FS))
+ pid = p->pid;
}
spin_unlock(&memalloc_list_lock);
+ dump_target_pid = -pid;
/* Wait until next timeout duration. */
schedule_timeout_interruptible(timeout);
if (memalloc_counter[index])
@@ -3155,6 +3168,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
goto nopage;

retry:
+ if (dump_target_pid == -current->pid)
+ dump_target_pid = -dump_target_pid;
+
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);

@@ -3280,6 +3296,11 @@ retry:
goto noretry;

/* Keep reclaiming pages as long as there is reasonable progress */
+ if (dump_target_pid == current->pid) {
+ printk(KERN_INFO "did_some_progress=%lu at line %u\n",
+ did_some_progress, __LINE__);
+ dump_target_pid = 0;
+ }
pages_reclaimed += did_some_progress;
if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 27d580b..cb0c22e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2527,6 +2527,8 @@ static inline bool compaction_ready(struct zone *zone, int order)
return watermark_ok;
}

+extern pid_t dump_target_pid;
+
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
@@ -2619,16 +2621,41 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
sc->nr_reclaimed += nr_soft_reclaimed;
sc->nr_scanned += nr_soft_scanned;
if (nr_soft_reclaimed)
+ {
+ if (dump_target_pid == current->pid)
+ printk(KERN_INFO "nr_soft_reclaimed=%lu at line %u\n",
+ nr_soft_reclaimed, __LINE__);
reclaimable = true;
+ }
/* need some check for avoid more shrink_zone() */
}

if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx))
+ {
+ if (dump_target_pid == current->pid)
+ printk(KERN_INFO "shrink_zone returned 1 at line %u\n",
+ __LINE__);
reclaimable = true;
+ }

if (global_reclaim(sc) &&
!reclaimable && zone_reclaimable(zone))
+ {
+ if (dump_target_pid == current->pid) {
+ printk(KERN_INFO "zone_reclaimable returned 1 at line %u\n",
+ __LINE__);
+ printk(KERN_INFO "(ACTIVE_FILE=%lu+INACTIVE_FILE=%lu",
+ zone_page_state(zone, NR_ACTIVE_FILE),
+ zone_page_state(zone, NR_INACTIVE_FILE));
+ if (get_nr_swap_pages() > 0)
+ printk(KERN_CONT "+ACTIVE_ANON=%lu+INACTIVE_ANON=%lu",
+ zone_page_state(zone, NR_ACTIVE_ANON),
+ zone_page_state(zone, NR_INACTIVE_ANON));
+ printk(KERN_CONT ") * 6 > PAGES_SCANNED=%lu\n",
+ zone_page_state(zone, NR_PAGES_SCANNED));
+ }
reclaimable = true;
+ }
}

/*
@@ -2674,6 +2701,9 @@ retry:
sc->priority);
sc->nr_scanned = 0;
zones_reclaimable = shrink_zones(zonelist, sc);
+ if (dump_target_pid == current->pid)
+ printk(KERN_INFO "shrink_zones returned %u at line %u\n",
+ zones_reclaimable, __LINE__);

total_scanned += sc->nr_scanned;
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
@@ -2707,11 +2737,21 @@ retry:
delayacct_freepages_end();

if (sc->nr_reclaimed)
+ {
+ if (dump_target_pid == current->pid)
+ printk(KERN_INFO "sc->nr_reclaimed=%lu at line %u\n",
+ sc->nr_reclaimed, __LINE__);
return sc->nr_reclaimed;
+ }

/* Aborted reclaim to try compaction? don't OOM, then */
if (sc->compaction_ready)
+ {
+ if (dump_target_pid == current->pid)
+ printk(KERN_INFO "sc->compaction_ready=%u at line %u\n",
+ sc->compaction_ready, __LINE__);
return 1;
+ }

/* Untapped cgroup reserves? Don't OOM, retry. */
if (!sc->may_thrash) {
@@ -2720,6 +2760,9 @@ retry:
goto retry;
}

+ if (dump_target_pid == current->pid)
+ printk(KERN_INFO "zones_reclaimable=%u at line %u\n",
+ zones_reclaimable, __LINE__);
/* Any of the zones still reclaimable? Don't OOM. */
if (zones_reclaimable)
return 1;
@@ -2875,7 +2918,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
* point.
*/
if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
+ {
+ if (dump_target_pid == current->pid)
+ printk(KERN_INFO "throttle_direct_reclaim returned 1 at line %u\n",
+ __LINE__);
return 1;
+ }

trace_mm_vmscan_direct_reclaim_begin(order,
sc.may_writepage,
@@ -2885,6 +2933,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,

trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);

+ if (dump_target_pid == current->pid)
+ printk(KERN_INFO "do_try_to_free_pages returned %lu at line %u\n",
+ nr_reclaimed, __LINE__);
return nr_reclaimed;
}

----------

What is strange, the values printed by this debug printk() patch did not
change as time went by. Thus, I think that this is not a problem of lack of
CPU time for scanning pages. I suspect that there is a bug that nobody is
scanning pages.

----------
[ 66.821450] zone_reclaimable returned 1 at line 2646
[ 66.823020] (ACTIVE_FILE=26+INACTIVE_FILE=10) * 6 > PAGES_SCANNED=32
[ 66.824935] shrink_zones returned 1 at line 2706
[ 66.826392] zones_reclaimable=1 at line 2765
[ 66.827865] do_try_to_free_pages returned 1 at line 2938
[ 67.102322] __perform_reclaim returned 1 at line 2854
[ 67.103968] did_some_progress=1 at line 3301
(...snipped...)
[ 281.439977] zone_reclaimable returned 1 at line 2646
[ 281.439977] (ACTIVE_FILE=26+INACTIVE_FILE=10) * 6 > PAGES_SCANNED=32
[ 281.439978] shrink_zones returned 1 at line 2706
[ 281.439978] zones_reclaimable=1 at line 2765
[ 281.439979] do_try_to_free_pages returned 1 at line 2938
[ 281.439979] __perform_reclaim returned 1 at line 2854
[ 281.439980] did_some_progress=1 at line 3301
----------

Complete log is at http://I-love.SAKURA.ne.jp/tmp/serial-20151013.txt.xz
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/