[PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_maxpages

From: Rik van Riel
Date: Thu Nov 13 2008 - 17:14:02 EST


Sometimes the VM spends the first few priority rounds rotating back
referenced pages and submitting IO. Once we get to a lower priority,
sometimes the VM ends up freeing way too many pages.

The fix is relatively simple: in shrink_zone() we can check how many
pages we have already freed and break out of the loop.

However, in order to do this we do need to know how many pages we already
freed, so move nr_reclaimed into scan_control.


Signed-off-by: Rik van Riel <riel@xxxxxxxxxx>

---
mm/vmscan.c | 60 ++++++++++++++++++++++++++++++------------------------------
1 file changed, 30 insertions(+), 30 deletions(-)

Index: linux-2.6.28-rc2-mm1/mm/vmscan.c
===================================================================
--- linux-2.6.28-rc2-mm1.orig/mm/vmscan.c 2008-10-30 15:20:06.000000000 -0400
+++ linux-2.6.28-rc2-mm1/mm/vmscan.c 2008-11-13 17:08:35.000000000 -0500
@@ -53,6 +53,9 @@ struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;

+ /* Number of pages that were freed */
+ unsigned long nr_reclaimed;
+
/* This context's GFP mask */
gfp_t gfp_mask;

@@ -1408,16 +1411,14 @@ static void get_scan_ratio(struct zone *
percent[1] = 100 - percent[0];
}

-
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static unsigned long shrink_zone(int priority, struct zone *zone,
+static void shrink_zone(int priority, struct zone *zone,
struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
- unsigned long nr_reclaimed = 0;
unsigned long percent[2]; /* anon @ 0; file @ 1 */
enum lru_list l;

@@ -1458,10 +1459,18 @@ static unsigned long shrink_zone(int pri
(unsigned long)sc->swap_cluster_max);
nr[l] -= nr_to_scan;

- nr_reclaimed += shrink_list(l, nr_to_scan,
+ sc->nr_reclaimed += shrink_list(l, nr_to_scan,
zone, sc, priority);
}
}
+ /*
+ * On large memory systems, scan >> priority can become
+ * really large. This is OK if we need to scan through
+ * that many pages (referenced, dirty, etc), but make
+ * sure to stop if we already freed enough.
+ */
+ if (sc->nr_reclaimed > sc->swap_cluster_max)
+ break;
}

/*
@@ -1474,7 +1483,6 @@ static unsigned long shrink_zone(int pri
shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);

throttle_vm_writeout(sc->gfp_mask);
- return nr_reclaimed;
}

/*
@@ -1488,16 +1496,13 @@ static unsigned long shrink_zone(int pri
* b) The zones may be over pages_high but they must go *over* pages_high to
* satisfy the `incremental min' zone defense algorithm.
*
- * Returns the number of reclaimed pages.
- *
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
+static void shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
- unsigned long nr_reclaimed = 0;
struct zoneref *z;
struct zone *zone;

@@ -1528,10 +1533,8 @@ static unsigned long shrink_zones(int pr
priority);
}

- nr_reclaimed += shrink_zone(priority, zone, sc);
+ shrink_zone(priority, zone, sc);
}
-
- return nr_reclaimed;
}

/*
@@ -1556,7 +1559,6 @@ static unsigned long do_try_to_free_page
int priority;
unsigned long ret = 0;
unsigned long total_scanned = 0;
- unsigned long nr_reclaimed = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long lru_pages = 0;
struct zoneref *z;
@@ -1584,7 +1586,7 @@ static unsigned long do_try_to_free_page
sc->nr_scanned = 0;
if (!priority)
disable_swap_token();
- nr_reclaimed += shrink_zones(priority, zonelist, sc);
+ shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
@@ -1592,13 +1594,13 @@ static unsigned long do_try_to_free_page
if (scan_global_lru(sc)) {
shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
if (reclaim_state) {
- nr_reclaimed += reclaim_state->reclaimed_slab;
+ sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
}
total_scanned += sc->nr_scanned;
- if (nr_reclaimed >= sc->swap_cluster_max) {
- ret = nr_reclaimed;
+ if (sc->nr_reclaimed >= sc->swap_cluster_max) {
+ ret = sc->nr_reclaimed;
goto out;
}

@@ -1621,7 +1623,7 @@ static unsigned long do_try_to_free_page
}
/* top priority shrink_zones still had more to do? don't OOM, then */
if (!sc->all_unreclaimable && scan_global_lru(sc))
- ret = nr_reclaimed;
+ ret = sc->nr_reclaimed;
out:
/*
* Now that we've scanned all the zones at this priority level, note
@@ -1716,7 +1718,6 @@ static unsigned long balance_pgdat(pg_da
int priority;
int i;
unsigned long total_scanned;
- unsigned long nr_reclaimed;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
@@ -1735,7 +1736,7 @@ static unsigned long balance_pgdat(pg_da

loop_again:
total_scanned = 0;
- nr_reclaimed = 0;
+ sc.nr_reclaimed = 0;
sc.may_writepage = !laptop_mode;
count_vm_event(PAGEOUTRUN);

@@ -1821,11 +1822,11 @@ loop_again:
*/
if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
end_zone, 0))
- nr_reclaimed += shrink_zone(priority, zone, &sc);
+ shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
lru_pages);
- nr_reclaimed += reclaim_state->reclaimed_slab;
+ sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
if (zone_is_all_unreclaimable(zone))
continue;
@@ -1839,7 +1840,7 @@ loop_again:
* even in laptop mode
*/
if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
- total_scanned > nr_reclaimed + nr_reclaimed / 2)
+ total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
}
if (all_zones_ok)
@@ -1857,7 +1858,7 @@ loop_again:
* matches the direct reclaim path behaviour in terms of impact
* on zone->*_priority.
*/
- if (nr_reclaimed >= SWAP_CLUSTER_MAX)
+ if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
break;
}
out:
@@ -1879,7 +1880,7 @@ out:
goto loop_again;
}

- return nr_reclaimed;
+ return sc.nr_reclaimed;
}

/*
@@ -2231,7 +2232,6 @@ static int __zone_reclaim(struct zone *z
struct task_struct *p = current;
struct reclaim_state reclaim_state;
int priority;
- unsigned long nr_reclaimed = 0;
struct scan_control sc = {
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -2264,9 +2264,9 @@ static int __zone_reclaim(struct zone *z
priority = ZONE_RECLAIM_PRIORITY;
do {
note_zone_scanning_priority(zone, priority);
- nr_reclaimed += shrink_zone(priority, zone, &sc);
+ shrink_zone(priority, zone, &sc);
priority--;
- } while (priority >= 0 && nr_reclaimed < nr_pages);
+ } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
}

slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -2290,13 +2290,13 @@ static int __zone_reclaim(struct zone *z
* Update nr_reclaimed by the number of slab pages we
* reclaimed from this zone.
*/
- nr_reclaimed += slab_reclaimable -
+ sc.nr_reclaimed += slab_reclaimable -
zone_page_state(zone, NR_SLAB_RECLAIMABLE);
}

p->reclaim_state = NULL;
current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
- return nr_reclaimed >= nr_pages;
+ return sc.nr_reclaimed >= nr_pages;
}

int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/