Re: [PATCH] __alloc_pages - NUMA and lower zone protection

From: Martin Hicks
Date: Fri Feb 13 2004 - 21:19:40 EST




On Sat, Feb 14, 2004 at 11:01:08AM +1100, Nick Piggin wrote:
>
>
> Martin Hicks wrote:
> >
> >The patch seems to do the right thing on my non-NUMA zx1 ia64 machine
> >(which has ZONE_DMA and ZONE_NORMAL) as well as the multi-node Altix.
> >
> >
>
> Could you add a comment or two, please?

Okay. Same patch, with a comment.

mh

--
Martin Hicks Wild Open Source Inc.
mort@xxxxxxxxxxxxxxxxxx 613-266-2296
# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
# ChangeSet 1.1631 -> 1.1632
# mm/page_alloc.c 1.185 -> 1.186
# include/linux/mmzone.h 1.52 -> 1.53
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 04/02/13 mort@xxxxxxxxxxxxxxxx 1.1632
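# Change incremental min in __alloc_pages: recompute min for each zone
# instead of accumulating it, to ensure that min doesn't increase across
# nodes. Lower zone protection is now scaled by (max_zone - zone_type).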
# --------------------------------------------
#
diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h Fri Feb 13 21:13:56 2004
+++ b/include/linux/mmzone.h Fri Feb 13 21:13:56 2004
@@ -70,6 +70,7 @@
spinlock_t lock;
unsigned long free_pages;
unsigned long pages_min, pages_low, pages_high;
+ unsigned long zone_type;

ZONE_PADDING(_pad1_)

diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c Fri Feb 13 21:13:56 2004
+++ b/mm/page_alloc.c Fri Feb 13 21:13:56 2004
@@ -41,6 +41,7 @@
int nr_swap_pages;
int numnodes = 1;
int sysctl_lower_zone_protection = 0;
+static int max_zone; /* Highest zone number that contains pages */

EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);
@@ -559,27 +560,26 @@
return NULL;

/* Go through the zonelist once, looking for a zone with enough free */
- min = 1UL << order;
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];
- unsigned long local_low;
+ unsigned long local_low = z->pages_low;

/*
* This is the fabled 'incremental min'. We let real-time tasks
* dip their real-time paws a little deeper into reserves.
*/
- local_low = z->pages_low;
if (rt_task(p))
local_low >>= 1;
- min += local_low;
-
+ /* Reset min on each iteration so we don't accumulate
+ * the min across multiple nodes */
+ min = (1UL << order) + local_low;
+ min += local_low * sysctl_lower_zone_protection * (max_zone - z->zone_type);
if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) {
page = buffered_rmqueue(z, order, cold);
if (page)
- goto got_pg;
+ goto got_pg;
}
- min += z->pages_low * sysctl_lower_zone_protection;
}

/* we're somewhat low on memory, failed to find what we needed */
@@ -587,24 +587,25 @@
wakeup_kswapd(zones[i]);

/* Go through the zonelist again, taking __GFP_HIGH into account */
- min = 1UL << order;
for (i = 0; zones[i] != NULL; i++) {
- unsigned long local_min;
struct zone *z = zones[i];
+ unsigned long local_min = z->pages_min;

- local_min = z->pages_min;
if (gfp_mask & __GFP_HIGH)
local_min >>= 2;
if (rt_task(p))
local_min >>= 1;
- min += local_min;
+ /* Reset min on each iteration so we don't accumulate
+ * the min across multiple nodes */
+ min = (1UL << order) + local_min;
+ min += local_min * sysctl_lower_zone_protection * (max_zone - z->zone_type);
+
if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) {
page = buffered_rmqueue(z, order, cold);
if (page)
goto got_pg;
}
- min += local_min * sysctl_lower_zone_protection;
}

/* here we're in the low on memory slow path */
@@ -636,18 +637,19 @@
p->flags &= ~PF_MEMALLOC;

/* go through the zonelist yet one more time */
- min = 1UL << order;
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];

- min += z->pages_min;
+ /* Reset min on each iteration so we don't accumulate
+ * the min across multiple nodes */
+ min = (1UL << order) + z->pages_min;
+ min += z->pages_min * sysctl_lower_zone_protection * (max_zone - z->zone_type);
if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) {
page = buffered_rmqueue(z, order, cold);
if (page)
goto got_pg;
}
- min += z->pages_low * sysctl_lower_zone_protection;
}

/*
@@ -1115,7 +1117,11 @@
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);

zonelist->zones[j++] = NULL;
- }
+
+ if (pgdat->node_zones[i].present_pages > 0)
+ if (i > max_zone)
+ max_zone = i;
+ }
}

void __init build_all_zonelists(void)
@@ -1258,6 +1264,7 @@
spin_lock_init(&zone->lru_lock);
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
+ zone->zone_type = j;

zone->temp_priority = zone->prev_priority = DEF_PRIORITY;