[RFC][PATCH] page reclaim throttle take2

From: KOSAKI Motohiro
Date: Mon Feb 25 2008 - 21:35:58 EST


Hi,

This patch improves page reclaim by throttling the number of tasks
that may run reclaim against a single zone in parallel.

o previous discussion:
http://marc.info/?l=linux-mm&m=120339997125985&w=2

o test method
$ ./hackbench 120 process 1000
  (120 groups, using processes, 1000 loops)

o test result (average of 5 runs)

limit      hackbench   sys-time    major-fault   max time spent in
           time (s)    (s)                       shrink_zone() (jiffies)
-----------------------------------------------------------------------
3          42.06       378.70      5336          6306


o why parallel reclaim is restricted to 3 tasks per zone

We tested various values for the limit (full results in the table
below; a sketch of the throttle mechanism follows the table):
- a limit of 1 gives the fewest major faults, but the worst maximum
  time spent in shrink_zone().
- a limit of 3 gives the best maximum reclaim time and the best
  hackbench time.

I think a limit of 3 gives the best overall behavior.


limit      hackbench   sys-time    major-fault   max time spent in
           time (s)    (s)                       shrink_zone() (jiffies)
-----------------------------------------------------------------------
1          48.50       283.89      3690          9057
2          44.43       350.94      5245          7159
3          42.06       378.70      5336          6306
4          48.84       401.87      5474          6669
unlimited  282.30      1248.47     29026         -
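
(Aside, for readers who want the throttle in miniature: the mechanism
is just a bounded-concurrency gate per zone. Below is a minimal
userspace sketch, assuming POSIX threads and semaphores; the counting
semaphore stands in for the patch's nr_reclaimers counter plus
reclaim_throttle_waitq, and the printf stands in for the shrink_zone()
work. It is an analogy, not the kernel code. Build: gcc -pthread.)

/*
 * Userspace analogue of the per-zone reclaim throttle.
 * At most RECLAIM_LIMIT "reclaimers" run the critical section at once.
 */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

#define RECLAIM_LIMIT 3         /* same limit the patch uses */
#define NR_TASKS      8

static sem_t reclaim_sem;       /* stands in for counter + waitqueue */

static void *reclaimer(void *arg)
{
        long id = (long)arg;

        /* like wait_event() + atomic_add_unless(): block while full */
        sem_wait(&reclaim_sem);
        printf("task %ld: reclaiming\n", id);   /* shrink_zone() here */
        /* like atomic_dec() + wake_up_all(): let a waiter in */
        sem_post(&reclaim_sem);
        return NULL;
}

int main(void)
{
        pthread_t tasks[NR_TASKS];
        long i;

        sem_init(&reclaim_sem, 0, RECLAIM_LIMIT);
        for (i = 0; i < NR_TASKS; i++)
                pthread_create(&tasks[i], NULL, reclaimer, (void *)i);
        for (i = 0; i < NR_TASKS; i++)
                pthread_join(tasks[i], NULL);
        sem_destroy(&reclaim_sem);
        return 0;
}

One thing a plain semaphore cannot express: after the wait, the patch
re-checks the zone watermark and may give up with -EAGAIN instead of
reclaiming at all.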



Any comments are welcome!



Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx>
CC: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
CC: Balbir Singh <balbir@xxxxxxxxxxxxxxxxxx>
CC: Rik van Riel <riel@xxxxxxxxxx>
CC: Lee Schermerhorn <Lee.Schermerhorn@xxxxxx>
CC: Nick Piggin <npiggin@xxxxxxx>


---
 include/linux/mmzone.h |    3 +
 mm/page_alloc.c        |    4 +
 mm/vmscan.c            |  101 ++++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 99 insertions(+), 9 deletions(-)

Index: b/include/linux/mmzone.h
===================================================================
--- a/include/linux/mmzone.h 2008-02-25 21:37:49.000000000 +0900
+++ b/include/linux/mmzone.h 2008-02-26 10:12:12.000000000 +0900
@@ -335,6 +335,9 @@ struct zone {
 	unsigned long		spanned_pages;	/* total size, including holes */
 	unsigned long		present_pages;	/* amount of memory (excluding holes) */
 
+	/* direct reclaim throttle state (used by mm/vmscan.c) */
+	atomic_t		nr_reclaimers;
+	wait_queue_head_t	reclaim_throttle_waitq;
 	/*
 	 * rarely used fields:
 	 */
Index: b/mm/page_alloc.c
===================================================================
--- a/mm/page_alloc.c 2008-02-25 21:37:49.000000000 +0900
+++ b/mm/page_alloc.c 2008-02-26 10:12:12.000000000 +0900
@@ -3466,6 +3466,10 @@ static void __meminit free_area_init_cor
 		zone->nr_scan_inactive = 0;
 		zap_zone_vm_stats(zone);
 		zone->flags = 0;
+
+		atomic_set(&zone->nr_reclaimers, 0);
+		init_waitqueue_head(&zone->reclaim_throttle_waitq);
+
 		if (!size)
 			continue;

Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c 2008-02-25 21:37:49.000000000 +0900
+++ b/mm/vmscan.c 2008-02-26 10:59:38.000000000 +0900
@@ -1252,6 +1252,57 @@ static unsigned long shrink_zone(int pri
 	return nr_reclaimed;
 }
 
+
+#define RECLAIM_LIMIT (3)
+
+static int do_shrink_zone_throttled(int priority, struct zone *zone,
+				    struct scan_control *sc,
+				    unsigned long *ret_reclaimed)
+{
+	u64 start_time;
+	int ret = 0;
+
+	start_time = get_jiffies_64();
+
+	/* allow at most RECLAIM_LIMIT tasks to reclaim this zone at once */
+	wait_event(zone->reclaim_throttle_waitq,
+		   atomic_add_unless(&zone->nr_reclaimers, 1, RECLAIM_LIMIT));
+
+	/* did other tasks reclaim enough memory while we were throttled? */
+	if (scan_global_lru(sc) &&
+	    !(current->flags & PF_KSWAPD) &&
+	    time_after64(get_jiffies_64(), start_time + HZ/10)) {
+		if (zone_watermark_ok(zone, sc->order, 4*zone->pages_high,
+				      MAX_NR_ZONES-1, 0)) {
+			ret = -EAGAIN;
+			goto out;
+		}
+	}
+
+	*ret_reclaimed += shrink_zone(priority, zone, sc);
+
+out:
+	atomic_dec(&zone->nr_reclaimers);
+	wake_up_all(&zone->reclaim_throttle_waitq);
+
+	return ret;
+}
+
+static unsigned long shrink_zone_throttled(int priority, struct zone *zone,
+					   struct scan_control *sc)
+{
+	unsigned long nr_reclaimed = 0;
+	int ret;
+
+	ret = do_shrink_zone_throttled(priority, zone, sc, &nr_reclaimed);
+
+	/* -EAGAIN: the zone recovered while we waited; count it as progress */
+	if (ret == -EAGAIN)
+		nr_reclaimed = 1;
+
+	return nr_reclaimed;
+}
+
 /*
  * This is the direct reclaim path, for page-allocating processes. We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -1268,12 +1317,11 @@ static unsigned long shrink_zone(int pri
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static unsigned long shrink_zones(int priority, struct zone **zones,
-					struct scan_control *sc)
+static int shrink_zones(int priority, struct zone **zones,
+			struct scan_control *sc, unsigned long *ret_reclaimed)
 {
-	unsigned long nr_reclaimed = 0;
 	int i;
-
+	int ret;
 
 	sc->all_unreclaimable = 1;
 	for (i = 0; zones[i] != NULL; i++) {
@@ -1304,10 +1352,15 @@ static unsigned long shrink_zones(int pr
 							priority);
 		}
 
-		nr_reclaimed += shrink_zone(priority, zone, sc);
+		ret = do_shrink_zone_throttled(priority, zone, sc,
+					       ret_reclaimed);
+		if (ret == -EAGAIN)
+			goto out;
 	}
+	ret = 0;
 
-	return nr_reclaimed;
+out:
+	return ret;
 }

/*
@@ -1333,6 +1386,9 @@ static unsigned long do_try_to_free_page
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long lru_pages = 0;
 	int i;
+	unsigned long start_time = jiffies;
+	unsigned long last_check_time = jiffies;
+	int err;
 
 	if (scan_global_lru(sc))
 		count_vm_event(ALLOCSTALL);
@@ -1356,7 +1412,12 @@ static unsigned long do_try_to_free_page
 		sc->nr_io_pages = 0;
 		if (!priority)
 			disable_swap_token();
-		nr_reclaimed += shrink_zones(priority, zones, sc);
+		err = shrink_zones(priority, zones, sc, &nr_reclaimed);
+		if (err == -EAGAIN) {
+			ret = 1;
+			goto out;
+		}
+
 		/*
 		 * Don't shrink slabs when reclaiming memory from
 		 * over limit cgroups
@@ -1389,8 +1450,26 @@ static unsigned long do_try_to_free_page
 
 		/* Take a nap, wait for some writeback to complete */
 		if (sc->nr_scanned && priority < DEF_PRIORITY - 2 &&
 				sc->nr_io_pages > sc->swap_cluster_max)
 			congestion_wait(WRITE, HZ/10);
+
+		if (scan_global_lru(sc) &&
+		    time_after(jiffies, start_time + HZ) &&
+		    time_after(jiffies, last_check_time + HZ/10)) {
+			last_check_time = jiffies;
+
+			/* did other tasks already reclaim enough memory? */
+			for (i = 0; zones[i] != NULL; i++) {
+				struct zone *zone = zones[i];
+
+				if (zone_watermark_ok(zone, sc->order,
+						      4*zone->pages_high,
+						      zone_idx(zones[0]), 0)) {
+					ret = 1;
+					goto out;
+				}
+			}
+		}
 	}
 	/* top priority shrink_caches still had more to do? don't OOM, then */
 	if (!sc->all_unreclaimable && scan_global_lru(sc))
@@ -1588,7 +1669,9 @@ loop_again:
 			 */
 			if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
 					       end_zone, 0))
-				nr_reclaimed += shrink_zone(priority, zone, &sc);
+				nr_reclaimed += shrink_zone_throttled(priority,
+								      zone,
+								      &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 					      lru_pages);
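
(For clarity, the rule behind both new zone_watermark_ok() checks can
be condensed as below. This is an illustrative userspace sketch with a
hypothetical helper name, not code from the patch;
do_shrink_zone_throttled() uses the 100ms threshold shown, while
do_try_to_free_pages() only begins checking after a full second and
rechecks at most every 100ms.)

#include <stdbool.h>

#define HZ 100  /* assumed jiffies per second, for illustration only */

/*
 * Hypothetical condensation of the bail-out rule: once a task has
 * spent at least ~100ms throttled or reclaiming, and free pages are
 * already above four times the zone's high watermark, other tasks
 * must have reclaimed enough, so give up and report success.
 */
static bool should_bail_out(unsigned long now, unsigned long start,
                            unsigned long free_pages,
                            unsigned long pages_high)
{
        if (now - start < HZ / 10)      /* under ~100ms: keep reclaiming */
                return false;
        return free_pages > 4 * pages_high;
}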



