Track kswapd's maximum requested order per zone instead of per node: move
kswapd_max_order from pg_data_t into struct zone, let the allocation slow
path record the failing order in the zone the request targeted, and make
kswapd scan its node's zones for the largest pending order before deciding
whether to sleep. wakeup_kswapd() and zone_watermark_ok() lose their order
parameter accordingly.

Reported-by: wassim dagash
Signed-off-by: wassim dagash

diff -Nuar linux-2.6.28.orig/include/linux/mmzone.h linux-2.6.28/include/linux/mmzone.h
--- linux-2.6.28.orig/include/linux/mmzone.h	2008-12-25 12:20:10.000000000 +0200
+++ linux-2.6.28/include/linux/mmzone.h	2008-12-31 10:16:03.000000000 +0200
@@ -409,6 +409,7 @@
      * rarely used fields:
      */
     const char *name;
+    unsigned int kswapd_max_order;
 } ____cacheline_internodealigned_in_smp;
 
 typedef enum {
@@ -625,7 +626,6 @@
     int node_id;
     wait_queue_head_t kswapd_wait;
     struct task_struct *kswapd;
-    int kswapd_max_order;
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
@@ -642,8 +642,8 @@
 void get_zone_counts(unsigned long *active, unsigned long *inactive,
             unsigned long *free);
 void build_all_zonelists(void);
-void wakeup_kswapd(struct zone *zone, int order);
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+void wakeup_kswapd(struct zone *zone);
+int zone_watermark_ok(struct zone *z, unsigned long mark,
         int classzone_idx, int alloc_flags);
 enum memmap_context {
     MEMMAP_EARLY,
diff -Nuar linux-2.6.28.orig/Makefile linux-2.6.28/Makefile
--- linux-2.6.28.orig/Makefile	2008-12-25 12:19:10.000000000 +0200
+++ linux-2.6.28/Makefile	2008-12-31 10:51:18.000000000 +0200
@@ -2,7 +2,7 @@
 PATCHLEVEL = 6
 SUBLEVEL = 28
 EXTRAVERSION =
-NAME = Erotic Pickled Herring
+NAME = kswapd_sol
 
 # *DOCUMENTATION*
 # To see a list of typical targets execute "make help"
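Before reading the allocator and kswapd changes below, it may help to see the
new bookkeeping in isolation. The following is a minimal user-space sketch
(plain C; the struct and function names are illustrative stand-ins, not the
kernel's): each zone records the largest order that failed against it, and the
node's daemon takes the maximum across all zones, clearing each field as it
reads it.

/*
 * Stand-alone model of the per-zone kswapd_max_order bookkeeping.
 * Struct names mirror the patch; this is a sketch, not kernel code.
 */
#include <stdio.h>

#define MAX_NR_ZONES 4

struct zone {
    unsigned int kswapd_max_order;  /* largest failed order seen */
};

struct pglist_data {
    struct zone node_zones[MAX_NR_ZONES];
    int nr_zones;
};

/* Allocation slow path: record the order that failed against this zone. */
static void record_failed_alloc(struct zone *zone, unsigned int order)
{
    if (zone->kswapd_max_order < order)
        zone->kswapd_max_order = order;
}

/*
 * kswapd side: take the maximum over all zones, zeroing each field as it
 * is read, exactly as the patched loop in kswapd() does below.
 */
static unsigned int collect_max_order(struct pglist_data *pgdat)
{
    unsigned int max_order = 0;
    int i;

    for (i = pgdat->nr_zones - 1; i >= 0; i--) {
        struct zone *zone = pgdat->node_zones + i;
        unsigned int new_order = zone->kswapd_max_order;

        zone->kswapd_max_order = 0;
        if (max_order < new_order)
            max_order = new_order;
    }
    return max_order;
}

int main(void)
{
    struct pglist_data pgdat = { .nr_zones = MAX_NR_ZONES };

    record_failed_alloc(&pgdat.node_zones[0], 3);  /* e.g. ZONE_DMA */
    record_failed_alloc(&pgdat.node_zones[2], 1);  /* e.g. ZONE_NORMAL */

    printf("kswapd would reclaim at order %u\n", collect_max_order(&pgdat));
    printf("after the scan the fields are reset: order %u\n",
           collect_max_order(&pgdat));
    return 0;
}

Clearing the field during the scan lets a single pass both read the pending
requests and acknowledge them; the same read-and-clear idiom appears in the
patched kswapd() loop in mm/vmscan.c.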
diff -Nuar linux-2.6.28.orig/mm/page_alloc.c linux-2.6.28/mm/page_alloc.c
--- linux-2.6.28.orig/mm/page_alloc.c	2008-12-25 12:20:14.000000000 +0200
+++ linux-2.6.28/mm/page_alloc.c	2008-12-31 10:26:32.000000000 +0200
@@ -1224,10 +1224,11 @@
  * Return 1 if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+int zone_watermark_ok(struct zone *z, unsigned long mark,
               int classzone_idx, int alloc_flags)
 {
     /* free_pages my go negative - that's OK */
+    unsigned int order = z->kswapd_max_order;
     long min = mark;
     long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
     int o;
@@ -1417,7 +1418,7 @@
             mark = zone->pages_low;
         else
             mark = zone->pages_high;
-        if (!zone_watermark_ok(zone, order, mark,
+        if (!zone_watermark_ok(zone, mark,
                 classzone_idx, alloc_flags)) {
             if (!zone_reclaim_mode ||
                 !zone_reclaim(zone, gfp_mask, order))
@@ -1485,9 +1486,18 @@
     page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
             zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+
     if (page)
         goto got_pg;
 
+    /*
+     * The first zonelist entry for gfp_mask is the zone the request
+     * was aimed at; the rest are fallbacks. Keep the largest order
+     * seen so a smaller racing request does not hide a bigger one.
+     */
+    if (z->zone->kswapd_max_order < order)
+        z->zone->kswapd_max_order = order;
+
     /*
      * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
      * __GFP_NOWARN set) should not cause reclaim since the subsystem
@@ -1500,7 +1510,7 @@
         goto nopage;
 
     for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-        wakeup_kswapd(zone, order);
+        wakeup_kswapd(zone);
 
     /*
      * OK, we're below the kswapd watermark and have kicked background
@@ -3448,7 +3458,6 @@
     pgdat_resize_init(pgdat);
     pgdat->nr_zones = 0;
     init_waitqueue_head(&pgdat->kswapd_wait);
-    pgdat->kswapd_max_order = 0;
     pgdat_page_cgroup_init(pgdat);
 
     for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -3488,6 +3497,8 @@
         nr_kernel_pages += realsize;
         nr_all_pages += realsize;
 
+        /* No allocation has recorded a failed order here yet. */
+        zone->kswapd_max_order = 0;
         zone->spanned_pages = size;
         zone->present_pages = realsize;
 #ifdef CONFIG_NUMA
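For context on what zone_watermark_ok() does with the order it is given (now
read from z->kswapd_max_order rather than passed in): the check discounts free
blocks too small to satisfy the request and halves the required mark at each
order. Here is a self-contained model of that loop, with a hypothetical
free_by_order[] standing in for z->free_area[o].nr_free, and the alloc_flags
and lowmem_reserve adjustments left out for brevity:

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

/* Model of the zone_watermark_ok() loop; names are illustrative. */
static bool watermark_ok(const unsigned long free_by_order[MAX_ORDER],
                         unsigned int order, long mark)
{
    long min = mark;
    long free_pages = 0;
    unsigned int o;

    for (o = 0; o < MAX_ORDER; o++)
        free_pages += (long)(free_by_order[o] << o);  /* total free pages */

    /* free_pages may go negative - that's OK (as the kernel comment says) */
    free_pages -= (1 << order) - 1;

    if (free_pages <= min)
        return false;
    for (o = 0; o < order; o++) {
        /* Blocks of this size cannot satisfy an order-'order' request. */
        free_pages -= (long)(free_by_order[o] << o);
        min >>= 1;  /* require progressively less at each higher order */
        if (free_pages <= min)
            return false;
    }
    return true;
}

int main(void)
{
    unsigned long free_by_order[MAX_ORDER] = { 64, 16, 4, 1 };

    printf("order 0: %s\n", watermark_ok(free_by_order, 0, 32) ? "ok" : "low");
    printf("order 3: %s\n", watermark_ok(free_by_order, 3, 32) ? "ok" : "low");
    return 0;
}

With these numbers the zone passes the order-0 check (120 free pages against a
mark of 32) but fails at order 3, because only the single order-3 block (8
pages) could actually serve the request. This is why the order matters to the
watermark test, whichever way it reaches the function.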
diff -Nuar linux-2.6.28.orig/mm/vmscan.c linux-2.6.28/mm/vmscan.c
--- linux-2.6.28.orig/mm/vmscan.c	2008-12-25 12:20:14.000000000 +0200
+++ linux-2.6.28/mm/vmscan.c	2008-12-31 10:29:27.000000000 +0200
@@ -1770,7 +1770,7 @@
                 shrink_active_list(SWAP_CLUSTER_MAX, zone,
                             &sc, priority, 0);
 
-            if (!zone_watermark_ok(zone, order, zone->pages_high,
+            if (!zone_watermark_ok(zone, zone->pages_high,
                            0, 0)) {
                 end_zone = i;
                 break;
@@ -1805,7 +1805,7 @@
                 priority != DEF_PRIORITY)
                 continue;
 
-            if (!zone_watermark_ok(zone, order, zone->pages_high,
+            if (!zone_watermark_ok(zone, zone->pages_high,
                            end_zone, 0))
                 all_zones_ok = 0;
             temp_priority[i] = priority;
@@ -1815,7 +1815,7 @@
              * We put equal pressure on every zone, unless one
              * zone has way too many pages free already.
              */
-            if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
+            if (!zone_watermark_ok(zone, 8*zone->pages_high,
                            end_zone, 0))
                 nr_reclaimed += shrink_zone(priority, zone, &sc);
             reclaim_state->reclaimed_slab = 0;
@@ -1924,22 +1924,35 @@
     order = 0;
     for ( ; ; ) {
         unsigned long new_order;
-
+        int i, max_order;
         prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
-        new_order = pgdat->kswapd_max_order;
-        pgdat->kswapd_max_order = 0;
-        if (order < new_order) {
+
+        /*
+         * Take the largest order any zone of this node has recorded,
+         * clearing each field as it is read.
+         */
+        max_order = 0;
+        for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+            struct zone *zone = pgdat->node_zones + i;
+
+            new_order = zone->kswapd_max_order;
+            zone->kswapd_max_order = 0;
+            if (max_order < new_order)
+                max_order = new_order;
+        }
+
+        if (order < max_order) {
             /*
              * Don't sleep if someone wants a larger 'order'
              * allocation
              */
-            order = new_order;
+            order = max_order;
         } else {
             if (!freezing(current))
                 schedule();
-            order = pgdat->kswapd_max_order;
         }
+
         finish_wait(&pgdat->kswapd_wait, &wait);
 
         if (!try_to_freeze()) {
@@ -1955,7 +1968,7 @@
 /*
  * A zone is low on free memory, so wake its kswapd task to service it.
  */
-void wakeup_kswapd(struct zone *zone, int order)
+void wakeup_kswapd(struct zone *zone)
 {
     pg_data_t *pgdat;
 
@@ -1963,10 +1976,8 @@
         return;
 
     pgdat = zone->zone_pgdat;
-    if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
+    if (zone_watermark_ok(zone, zone->pages_low, 0, 0))
         return;
-    if (pgdat->kswapd_max_order < order)
-        pgdat->kswapd_max_order = order;
     if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
         return;
     if (!waitqueue_active(&pgdat->kswapd_wait))
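Taken together, the wake-up protocol after this patch is: the allocation slow
path records the failing order in the first zone of the zonelist and calls
wakeup_kswapd(zone); kswapd scans its node's zones for the maximum pending
order and only sleeps when nothing larger than its current order is waiting.
A user-space analogue of that handshake, with a pthread condition variable
standing in for kswapd_wait (all names hypothetical, compile with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define NR_ZONES 3

/* Stand-ins for zone->kswapd_max_order and pgdat->kswapd_wait. */
static unsigned int kswapd_max_order[NR_ZONES];
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t kswapd_wait = PTHREAD_COND_INITIALIZER;

/* Allocator side: record the failing order, then wake the daemon. */
static void fail_allocation(int zone, unsigned int order)
{
    pthread_mutex_lock(&lock);
    if (kswapd_max_order[zone] < order)
        kswapd_max_order[zone] = order;
    pthread_cond_signal(&kswapd_wait);
    pthread_mutex_unlock(&lock);
}

/* Daemon side: scan all zones, clear as we read, sleep when nothing new. */
static void *kswapd(void *arg)
{
    unsigned int order = 0;

    (void)arg;
    pthread_mutex_lock(&lock);
    for (;;) {
        unsigned int max_order = 0;
        int i;

        for (i = NR_ZONES - 1; i >= 0; i--) {
            if (max_order < kswapd_max_order[i])
                max_order = kswapd_max_order[i];
            kswapd_max_order[i] = 0;
        }
        if (order < max_order)
            order = max_order;  /* don't sleep: a bigger request is pending */
        else
            pthread_cond_wait(&kswapd_wait, &lock);

        printf("kswapd: balancing at order %u\n", order);  /* ~ balance_pgdat() */
    }
    /* not reached */
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t tid;

    pthread_create(&tid, NULL, kswapd, NULL);
    fail_allocation(1, 2);
    fail_allocation(0, 4);
    sleep(1);  /* let the daemon print before the process exits */
    return 0;
}

As in the patched kswapd() loop, an order recorded while the daemon is awake
is picked up by the next scan rather than re-read immediately after the wait
returns; the old per-node code instead re-read pgdat->kswapd_max_order right
after schedule().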