Re: [PATCH 06/27] mm, vmscan: Make kswapd reclaim in terms of nodes

From: Hillf Danton
Date: Wed Jun 22 2016 - 04:50:08 EST


> /*
> - * kswapd shrinks the zone by the number of pages required to reach
> - * the high watermark.
> + * kswapd shrinks a node of pages that are at or below the highest usable
> + * zone that is currently unbalanced.
> *
> * Returns true if kswapd scanned at least the requested number of pages to
> * reclaim or if the lack of progress was due to pages under writeback.
> * This is used to determine if the scanning priority needs to be raised.
> */
> -static bool kswapd_shrink_zone(struct zone *zone,
> +static bool kswapd_shrink_node(pg_data_t *pgdat,
> int classzone_idx,
> struct scan_control *sc)
> {
> - unsigned long balance_gap;
> - bool lowmem_pressure;
> - struct pglist_data *pgdat = zone->zone_pgdat;
> + struct zone *zone;
> + int z;
>
> - /* Reclaim above the high watermark. */
> - sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
> + /* Reclaim a number of pages proportional to the number of zones */
> + sc->nr_to_reclaim = 0;
> + for (z = 0; z <= classzone_idx; z++) {
> + zone = pgdat->node_zones + z;
> + if (!populated_zone(zone))
> + continue;
>
> - /*
> - * We put equal pressure on every zone, unless one zone has way too
> - * many pages free already. The "too many pages" is defined as the
> - * high wmark plus a "gap" where the gap is either the low
> - * watermark or 1% of the zone, whichever is smaller.
> - */
> - balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
> - zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
> + sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
> + }
>
> /*
> - * If there is no low memory pressure or the zone is balanced then no
> - * reclaim is necessary
> + * Historically care was taken to put equal pressure on all zones but
> + * now pressure is applied based on node LRU order.
> */
> - lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
> - if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
> - balance_gap, classzone_idx))
> - return true;
> -
> - shrink_node(zone->zone_pgdat, sc, classzone_idx);
> -
> - /* TODO: ANOMALY */
> - clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
> + shrink_node(pgdat, sc, classzone_idx);
>
> /*
> - * If a zone reaches its high watermark, consider it to be no longer
> - * congested. It's possible there are dirty pages backed by congested
> - * BDIs but as pressure is relieved, speculatively avoid congestion
> - * waits.
> + * Fragmentation may mean that the system cannot be rebalanced for
> + * high-order allocations. If twice the allocation size has been
> + * reclaimed then recheck watermarks only at order-0 to prevent
> + * excessive reclaim. Assume that a process requested a high-order
> + * can direct reclaim/compact.
> */
> - if (pgdat_reclaimable(zone->zone_pgdat) &&
> - zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
> - clear_bit(PGDAT_CONGESTED, &pgdat->flags);
> - clear_bit(PGDAT_DIRTY, &pgdat->flags);
> - }
> + if (sc->order && sc->nr_reclaimed >= 2UL << sc->order)
> + sc->order = 0;
>

The reclaim order (sc->order) is changed here.
Btw, I can find no such change in the current code.

> return sc->nr_scanned >= sc->nr_to_reclaim;
> }
>
> /*
> - * For kswapd, balance_pgdat() will work across all this node's zones until
> - * they are all at high_wmark_pages(zone).
> - *
> - * Returns the highest zone idx kswapd was reclaiming at
> + * For kswapd, balance_pgdat() will reclaim pages across a node from zones
> + * that are eligible for use by the caller until at least one zone is
> + * balanced.
> *
> - * There is special handling here for zones which are full of pinned pages.
> - * This can happen if the pages are all mlocked, or if they are all used by
> - * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
> - * What we do is to detect the case where all pages in the zone have been
> - * scanned twice and there has been zero successful reclaim. Mark the zone as
> - * dead and from now on, only perform a short scan. Basically we're polling
> - * the zone for when the problem goes away.
> + * Returns the order kswapd finished reclaiming at.
> *
> * kswapd scans the zones in the highmem->normal->dma direction. It skips
> * zones which have free_pages > high_wmark_pages(zone), but once a zone is
> - * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
> - * lower zones regardless of the number of free pages in the lower zones. This
> - * interoperates with the page allocator fallback scheme to ensure that aging
> - * of pages is balanced across the zones.
> + * found to have free_pages <= high_wmark_pages(zone), any page is that zone
> + * or lower is eligible for reclaim until at least one usable zone is
> + * balanced.
> */
> static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
> {
> int i;
> - int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
> unsigned long nr_soft_reclaimed;
> unsigned long nr_soft_scanned;
> + struct zone *zone;
> struct scan_control sc = {
> .gfp_mask = GFP_KERNEL,
> - .reclaim_idx = MAX_NR_ZONES - 1,
> .order = order,
> .priority = DEF_PRIORITY,
> .may_writepage = !laptop_mode,
> .may_unmap = 1,
> .may_swap = 1,
> + .reclaim_idx = classzone_idx,
> };
> count_vm_event(PAGEOUTRUN);
>
> @@ -3203,21 +3125,10 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
>
> /* Scan from the highest requested zone to dma */
> for (i = classzone_idx; i >= 0; i--) {
> - struct zone *zone = pgdat->node_zones + i;
> -
> + zone = pgdat->node_zones + i;
> if (!populated_zone(zone))
> continue;
>
> - if (sc.priority != DEF_PRIORITY &&
> - !pgdat_reclaimable(zone->zone_pgdat))
> - continue;
> -
> - /*
> - * Do some background aging of the anon list, to give
> - * pages a chance to be referenced before reclaiming.
> - */
> - age_active_anon(zone, &sc);
> -
> /*
> * If the number of buffer_heads in the machine
> * exceeds the maximum allowed level and this node
> @@ -3225,19 +3136,17 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
> * it to relieve lowmem pressure.
> */
> if (buffer_heads_over_limit && is_highmem_idx(i)) {
> - end_zone = i;
> + classzone_idx = i;
> break;
> }
>
> - if (!zone_balanced(zone, order, false, 0, 0)) {
> - end_zone = i;
> + if (!zone_balanced(zone, order, 0, 0)) {

Do we need to sync order here with the above change to sc->order?

> + classzone_idx = i;
> break;
> } else {
> /*
> - * If balanced, clear the dirty and congested
> - * flags
> - *
> - * TODO: ANOMALY
> + * If any eligible zone is balanced then the
> + * node is not considered congested or dirty.
> */
> clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
> clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
> @@ -3248,51 +3157,34 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
> goto out;
>
> /*
> + * Do some background aging of the anon list, to give
> + * pages a chance to be referenced before reclaiming. All
> + * pages are rotated regardless of classzone as this is
> + * about consistent aging.
> + */
> + age_active_anon(pgdat, &pgdat->node_zones[MAX_NR_ZONES - 1], &sc);
> +
> + /*
> * If we're getting trouble reclaiming, start doing writepage
> * even in laptop mode.
> */
> - if (sc.priority < DEF_PRIORITY - 2)
> + if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
> sc.may_writepage = 1;
>
> + /* Call soft limit reclaim before calling shrink_node. */
> + sc.nr_scanned = 0;
> + nr_soft_scanned = 0;
> + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, sc.order,
> + sc.gfp_mask, &nr_soft_scanned);
> + sc.nr_reclaimed += nr_soft_reclaimed;
> +
> /*
> - * Continue scanning in the highmem->dma direction stopping at
> - * the last zone which needs scanning. This may reclaim lowmem
> - * pages that are not necessary for zone balancing but it
> - * preserves LRU ordering. It is assumed that the bulk of
> - * allocation requests can use arbitrary zones with the
> - * possible exception of big highmem:lowmem configurations.
> + * There should be no need to raise the scanning priority if
> + * enough pages are already being scanned that that high
> + * watermark would be met at 100% efficiency.
> */
> - for (i = end_zone; i >= 0; i--) {
> - struct zone *zone = pgdat->node_zones + i;
> -
> - if (!populated_zone(zone))
> - continue;
> -
> - if (sc.priority != DEF_PRIORITY &&
> - !pgdat_reclaimable(zone->zone_pgdat))
> - continue;
> -
> - sc.nr_scanned = 0;
> - sc.reclaim_idx = i;
> -
> - nr_soft_scanned = 0;
> - /*
> - * Call soft limit reclaim before calling shrink_zone.
> - */
> - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
> - order, sc.gfp_mask,
> - &nr_soft_scanned);
> - sc.nr_reclaimed += nr_soft_reclaimed;
> -
> - /*
> - * There should be no need to raise the scanning
> - * priority if enough pages are already being scanned
> - * that that high watermark would be met at 100%
> - * efficiency.
> - */
> - if (kswapd_shrink_zone(zone, end_zone, &sc))
> - raise_priority = false;
> - }
> + if (kswapd_shrink_node(pgdat, classzone_idx, &sc))
> + raise_priority = false;
>
> /*
> * If the low watermark is met there is no need for processes
> @@ -3308,20 +3200,37 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
> break;
>
> /*
> + * Stop reclaiming if any eligible zone is balanced and clear
> + * node writeback or congested.
> + */
> + for (i = 0; i <= classzone_idx; i++) {
> + zone = pgdat->node_zones + i;
> + if (!populated_zone(zone))
> + continue;
> +
> + if (zone_balanced(zone, sc.order, 0, classzone_idx)) {
> + clear_bit(PGDAT_CONGESTED, &pgdat->flags);
> + clear_bit(PGDAT_DIRTY, &pgdat->flags);
> + goto out;
> + }
> + }
> +
> + /*
> * Raise priority if scanning rate is too low or there was no
> * progress in reclaiming pages
> */
> if (raise_priority || !sc.nr_reclaimed)
> sc.priority--;
> - } while (sc.priority >= 1 &&
> - !pgdat_balanced(pgdat, order, classzone_idx));
> + } while (sc.priority >= 1);
>
> out:
> /*
> - * Return the highest zone idx we were reclaiming at so
> - * prepare_kswapd_sleep() makes the same decisions as here.
> + * Return the order kswapd stopped reclaiming at as
> + * prepare_kswapd_sleep() takes it into account. If another caller
> + * entered the allocator slow path while kswapd was awake, order will
> + * remain at the higher level.
> */
> - return end_zone;
> + return sc.order;
> }
>