Re: mm, vmscan: commit makes PAE kernel crash nightly (bisected)

From: Trevor Cordes
Date: Sun Jan 22 2017 - 19:46:36 EST


On 2017-01-20 Mel Gorman wrote:
> >
> > Thanks for the OOM report. I was expecting it to be a particular
> > shape and my expectations were not matched so it took time to
> > consider it further. Can you try the cumulative patch below? It
> > combines three patches that
> >
> > 1. Allow slab shrinking even if the LRU pages are unreclaimable in
> > direct reclaim
> > 2. Shrinks slab once based on the contents of all memcgs
> > instead of shrinking one at a time
> > 3. Tries to shrink slabs if the lowmem usage is too high
> >
> > Unfortunately it's only boot tested on x86-64 as I didn't get the
> > chance to setup an i386 test bed.
> >
>
> There was one major flaw in that patch. This version fixes it and
> addresses other minor issues. It may still be too aggressive in
> shrinking slab, but it is worth trying out. Thanks.

I ran with your patch below and it oom'd on the first night. It was
weird: it didn't hang the system, and my rebooter script started a
reboot, but the system never got more than halfway down before it just
sat there in a weird state where a local console user could still log in
but not much was working. So the patches don't seem to solve the problem.

For the above compile I applied your patches to 4.10.0-rc4+; I hope
that's ok.

Attached is the first oom from that night. I include some stuff below
the oom where the kernel is obviously having issues and dumping more
strange output. I don't think I've seen that before. That probably
explains the strange state it was left in.

Also, completely separate from your patch, I ran mhocko's 4.9 tree with
mem=2G to see if a lower amount of RAM would help, but it didn't. Even
with 2G the system oom'd and hung the same as usual. So far the only
thing that has helped at all is the cgroup_disable=memory option, which
makes the problem disappear completely for me. I added that option to 3
other boxes I admin with PAE, and that plus limiting RAM to <4GB gets rid
of the bug. However, on the RHBZ for this bug, where I have been
commenting, someone reports that cgroup_disable=memory doesn't help him
at all.

Hopefully the oom attached can help you figure out a next step. Thanks!

> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 2281ad310d06..2c735ea24a85 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2318,6 +2318,59 @@ static void get_scan_count(struct lruvec
> *lruvec, struct mem_cgroup *memcg, }
> }
>
> +#ifdef CONFIG_HIGHMEM
> +static void balance_slab_lowmem(struct pglist_data *pgdat,
> + struct scan_control *sc)
> +{
> + unsigned long lru_pages = 0;
> + unsigned long slab_pages = 0;
> + unsigned long managed_pages = 0;
> + int zid;
> +
> + for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> + struct zone *zone = &pgdat->node_zones[zid];
> +
> + if (!populated_zone(zone) || is_highmem_idx(zid))
> + continue;
> +
> + lru_pages += zone_page_state(zone,
> NR_ZONE_INACTIVE_FILE);
> + lru_pages += zone_page_state(zone,
> NR_ZONE_ACTIVE_FILE);
> + lru_pages += zone_page_state(zone,
> NR_ZONE_INACTIVE_ANON);
> + lru_pages += zone_page_state(zone,
> NR_ZONE_ACTIVE_ANON);
> + slab_pages += zone_page_state(zone,
> NR_SLAB_RECLAIMABLE);
> + slab_pages += zone_page_state(zone,
> NR_SLAB_UNRECLAIMABLE);
> + }
> +
> + /* Do not balance until LRU and slab exceeds 50% of lowmem */
> + if (lru_pages + slab_pages < (managed_pages >> 1))
> + return;
> +
> + /*
> + * Shrink reclaimable slabs if the number of lowmem slab
> pages is
> + * over twice the size of LRU pages. Apply pressure relative
> to
> + * the imbalance between LRU and slab pages.
> + */
> + if (slab_pages > lru_pages << 1) {
> + struct reclaim_state *reclaim_state =
> current->reclaim_state;
> + unsigned long exceed = slab_pages - (lru_pages << 1);
> + int nid = pgdat->node_id;
> +
> + exceed = min(exceed, slab_pages);
> + shrink_slab(sc->gfp_mask, nid, NULL, exceed >> 3,
> slab_pages);
> + if (reclaim_state) {
> + sc->nr_reclaimed +=
> reclaim_state->reclaimed_slab;
> + reclaim_state->reclaimed_slab = 0;
> + }
> + }
> +}
> +#else
> +static void balance_slab_lowmem(struct pglist_data *pgdat,
> + struct scan_control *sc)
> +{
> + return;
> +}
> +#endif
> +
> /*
> * This is a basic per-node page freer. Used by both kswapd and
> direct reclaim. */
> @@ -2336,6 +2389,27 @@ static void shrink_node_memcg(struct
> pglist_data *pgdat, struct mem_cgroup *memc
> get_scan_count(lruvec, memcg, sc, nr, lru_pages);
>
> + /*
> + * If direct reclaiming at elevated priority and the node is
> + * unreclaimable then skip LRU reclaim and let kswapd poll
> it.
> + */
> + if (!current_is_kswapd() &&
> + sc->priority != DEF_PRIORITY &&
> + !pgdat_reclaimable(pgdat)) {
> + unsigned long nr_scanned;
> +
> + /*
> + * Fake scanning so that slab shrinking will
> continue. For
> + * lowmem restricted allocations, shrink
> aggressively.
> + */
> + nr_scanned = SWAP_CLUSTER_MAX << (DEF_PRIORITY -
> sc->priority);
> + if (!(sc->gfp_mask & __GFP_HIGHMEM))
> + nr_scanned = max(nr_scanned, *lru_pages);
> + sc->nr_scanned += nr_scanned;
> +
> + return;
> + }
> +
> /* Record the original scan target for proportional
> adjustments later */ memcpy(targets, nr, sizeof(nr));
>
> @@ -2435,6 +2509,8 @@ static void shrink_node_memcg(struct
> pglist_data *pgdat, struct mem_cgroup *memc if
> (inactive_list_is_low(lruvec, false, sc, true))
> shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON);
> +
> + balance_slab_lowmem(pgdat, sc);
> }
>
> /* Use reclaim/compaction for costly allocs or under memory pressure
> */ @@ -2533,7 +2609,8 @@ static bool shrink_node(pg_data_t *pgdat,
> struct scan_control *sc) .pgdat = pgdat,
> .priority = sc->priority,
> };
> - unsigned long node_lru_pages = 0;
> + unsigned long slab_pressure = 0;
> + unsigned long slab_eligible = 0;
> struct mem_cgroup *memcg;
>
> nr_reclaimed = sc->nr_reclaimed;
> @@ -2555,12 +2632,8 @@ static bool shrink_node(pg_data_t *pgdat,
> struct scan_control *sc) scanned = sc->nr_scanned;
>
> shrink_node_memcg(pgdat, memcg, sc,
> &lru_pages);
> - node_lru_pages += lru_pages;
> -
> - if (memcg)
> - shrink_slab(sc->gfp_mask,
> pgdat->node_id,
> - memcg, sc->nr_scanned -
> scanned,
> - lru_pages);
> + slab_eligible += lru_pages;
> + slab_pressure += sc->nr_reclaimed -
> reclaimed;
> /* Record the group's reclaim efficiency */
> vmpressure(sc->gfp_mask, memcg, false,
> @@ -2586,12 +2659,12 @@ static bool shrink_node(pg_data_t *pgdat,
> struct scan_control *sc)
> /*
> * Shrink the slab caches in the same proportion that
> - * the eligible LRU pages were scanned.
> + * the eligible LRU pages were scanned. For memcg,
> this
> + * will apply the cumulative scanning pressure over
> all
> + * memcgs.
> */
> - if (global_reclaim(sc))
> - shrink_slab(sc->gfp_mask, pgdat->node_id,
> NULL,
> - sc->nr_scanned - nr_scanned,
> - node_lru_pages);
> + shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
> slab_pressure,
> +
> slab_eligible);
> if (reclaim_state) {
> sc->nr_reclaimed +=
> reclaim_state->reclaimed_slab; @@ -2683,10 +2756,6 @@ static void
> shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> GFP_KERNEL | __GFP_HARDWALL)) continue;
>
> - if (sc->priority != DEF_PRIORITY &&
> - !pgdat_reclaimable(zone->zone_pgdat))
> - continue; /* Let kswapd poll
> it */ -
> /*
> * If we already have plenty of memory free
> for
> * compaction in this zone, don't free any
> more.

Attachment: oom4
Description: Binary data