[PATCH 6/6] Use one zonelist that is filtered by nodemask

From: Mel Gorman
Date: Tue Sep 11 2007 - 17:32:52 EST



Two zonelists exist so that GFP_THISNODE allocations will be guaranteed
to use memory only from a node local to the CPU. As we can now filter the
zonelist based on a nodemask, we can filter the nodes slightly differently
when GFP_THISNODE is specified.

When GFP_THISNODE is used, a temporary nodemask is created with only the
node local to the CPU set. This allows us to eliminate the second zonelist.

Signed-off-by: Mel Gorman <mel@xxxxxxxxx>
---

drivers/char/sysrq.c | 2 -
fs/buffer.c | 5 +--
include/linux/gfp.h | 23 +++------------
include/linux/mempolicy.h | 2 -
include/linux/mmzone.h | 14 ---------
mm/mempolicy.c | 8 ++---
mm/page_alloc.c | 61 ++++++++++++++++++++++-------------------
mm/slab.c | 2 -
mm/slub.c | 2 -
9 files changed, 50 insertions(+), 69 deletions(-)

diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc4-mm1-030_filter_nodemask/drivers/char/sysrq.c linux-2.6.23-rc4-mm1-040_use_one_zonelist/drivers/char/sysrq.c
--- linux-2.6.23-rc4-mm1-030_filter_nodemask/drivers/char/sysrq.c 2007-09-10 16:06:13.000000000 +0100
+++ linux-2.6.23-rc4-mm1-040_use_one_zonelist/drivers/char/sysrq.c 2007-09-11 13:43:28.000000000 +0100
@@ -270,7 +270,7 @@ static struct sysrq_key_op sysrq_term_op

static void moom_callback(struct work_struct *ignored)
{
- out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0);
+ out_of_memory(node_zonelist(0), GFP_KERNEL, 0);
}

static DECLARE_WORK(moom_work, moom_callback);
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc4-mm1-030_filter_nodemask/fs/buffer.c linux-2.6.23-rc4-mm1-040_use_one_zonelist/fs/buffer.c
--- linux-2.6.23-rc4-mm1-030_filter_nodemask/fs/buffer.c 2007-09-10 16:06:39.000000000 +0100
+++ linux-2.6.23-rc4-mm1-040_use_one_zonelist/fs/buffer.c 2007-09-11 13:43:28.000000000 +0100
@@ -375,11 +375,10 @@ static void free_more_memory(void)
yield();

for_each_online_node(nid) {
- zones = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
+ zones = first_zones_zonelist(node_zonelist(nid),
NULL, gfp_zone(GFP_NOFS));
if (*zones)
- try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
- GFP_NOFS);
+ try_to_free_pages(node_zonelist(nid), 0, GFP_NOFS);
}
}

diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc4-mm1-030_filter_nodemask/include/linux/gfp.h linux-2.6.23-rc4-mm1-040_use_one_zonelist/include/linux/gfp.h
--- linux-2.6.23-rc4-mm1-030_filter_nodemask/include/linux/gfp.h 2007-09-10 16:06:39.000000000 +0100
+++ linux-2.6.23-rc4-mm1-040_use_one_zonelist/include/linux/gfp.h 2007-09-11 13:43:28.000000000 +0100
@@ -150,29 +150,16 @@ static inline gfp_t set_migrateflags(gfp
* virtual kernel addresses to the allocated page(s).
*/

-static inline enum zone_type gfp_zonelist(gfp_t flags)
-{
- int base = 0;
-
-#ifdef CONFIG_NUMA
- if (flags & __GFP_THISNODE)
- base = 1;
-#endif
-
- return base;
-}
-
/*
- * We get the zone list from the current node and the gfp_mask.
- * This zonelist contains two zonelists, one for all zones with memory and
- * one containing just zones from the node the zonelist belongs to
+ * We get the zone list from the current node and the list of zones
+ * is filtered based on the GFP flags
*
* For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets
* optimized to &contig_page_data at compile-time.
*/
-static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
+static inline struct zonelist *node_zonelist(int nid)
{
- return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
+ return &NODE_DATA(nid)->node_zonelist;
}

#ifndef HAVE_ARCH_FREE_PAGE
@@ -199,7 +186,7 @@ static inline struct page *alloc_pages_n
if (nid < 0)
nid = numa_node_id();

- return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
+ return __alloc_pages(gfp_mask, order, node_zonelist(nid));
}

#ifdef CONFIG_NUMA
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc4-mm1-030_filter_nodemask/include/linux/mempolicy.h linux-2.6.23-rc4-mm1-040_use_one_zonelist/include/linux/mempolicy.h
--- linux-2.6.23-rc4-mm1-030_filter_nodemask/include/linux/mempolicy.h 2007-09-10 16:06:39.000000000 +0100
+++ linux-2.6.23-rc4-mm1-040_use_one_zonelist/include/linux/mempolicy.h 2007-09-11 13:43:28.000000000 +0100
@@ -239,7 +239,7 @@ static inline void mpol_fix_fork_child_f
static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags)
{
- return node_zonelist(0, gfp_flags);
+ return node_zonelist(0);
}

static inline int do_migrate_pages(struct mm_struct *mm,
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc4-mm1-030_filter_nodemask/include/linux/mmzone.h linux-2.6.23-rc4-mm1-040_use_one_zonelist/include/linux/mmzone.h
--- linux-2.6.23-rc4-mm1-030_filter_nodemask/include/linux/mmzone.h 2007-09-11 13:43:04.000000000 +0100
+++ linux-2.6.23-rc4-mm1-040_use_one_zonelist/include/linux/mmzone.h 2007-09-11 13:43:28.000000000 +0100
@@ -356,17 +356,6 @@ struct zone {
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)

#ifdef CONFIG_NUMA
-
-/*
- * The NUMA zonelists are doubled becausse we need zonelists that restrict the
- * allocations to a single node for GFP_THISNODE.
- *
- * [0] : Zonelist with fallback
- * [1] : No fallback (GFP_THISNODE)
- */
-#define MAX_ZONELISTS 2
-
-
/*
* We cache key information from each zonelist for smaller cache
* footprint when scanning for free pages in get_page_from_freelist().
@@ -432,7 +421,6 @@ struct zonelist_cache {
unsigned long last_full_zap; /* when last zap'd (jiffies) */
};
#else
-#define MAX_ZONELISTS 1
struct zonelist_cache;
#endif

@@ -488,7 +476,7 @@ extern struct page *mem_map;
struct bootmem_data;
typedef struct pglist_data {
struct zone node_zones[MAX_NR_ZONES];
- struct zonelist node_zonelists[MAX_ZONELISTS];
+ struct zonelist node_zonelist;
int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP
struct page *node_mem_map;
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc4-mm1-030_filter_nodemask/mm/mempolicy.c linux-2.6.23-rc4-mm1-040_use_one_zonelist/mm/mempolicy.c
--- linux-2.6.23-rc4-mm1-030_filter_nodemask/mm/mempolicy.c 2007-09-10 16:06:39.000000000 +0100
+++ linux-2.6.23-rc4-mm1-040_use_one_zonelist/mm/mempolicy.c 2007-09-11 13:43:28.000000000 +0100
@@ -1112,7 +1112,7 @@ static struct zonelist *zonelist_policy(
nd = 0;
BUG();
}
- return node_zonelist(nd, gfp);
+ return node_zonelist(nd);
}

/* Do dynamic interleaving for a process */
@@ -1149,7 +1149,7 @@ unsigned slab_node(struct mempolicy *pol
struct zonelist *zonelist;
unsigned long *z;
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
- zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
+ zonelist = &NODE_DATA(numa_node_id())->node_zonelist;
z = first_zones_zonelist(zonelist, &policy->v.nodes,
highest_zoneidx);
return zone_to_nid(zonelist_zone(*z));
@@ -1215,7 +1215,7 @@ struct zonelist *huge_zonelist(struct vm
unsigned nid;

nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
- return node_zonelist(nid, gfp_flags);
+ return node_zonelist(nid);
}
return zonelist_policy(GFP_HIGHUSER, pol);
}
@@ -1229,7 +1229,7 @@ static struct page *alloc_page_interleav
struct zonelist *zl;
struct page *page;

- zl = node_zonelist(nid, gfp);
+ zl = node_zonelist(nid);
page = __alloc_pages(gfp, order, zl);
if (page && page_zone(page) == zonelist_zone(zl->_zones[0]))
inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc4-mm1-030_filter_nodemask/mm/page_alloc.c linux-2.6.23-rc4-mm1-040_use_one_zonelist/mm/page_alloc.c
--- linux-2.6.23-rc4-mm1-030_filter_nodemask/mm/page_alloc.c 2007-09-10 16:06:39.000000000 +0100
+++ linux-2.6.23-rc4-mm1-040_use_one_zonelist/mm/page_alloc.c 2007-09-11 13:43:28.000000000 +0100
@@ -1730,10 +1730,33 @@ got_pg:
return page;
}

+static nodemask_t *nodemask_thisnode(nodemask_t *nodemask)
+{
+ /* Build a nodemask for just this node */
+ int nid = numa_node_id();
+
+ nodes_clear(*nodemask);
+ node_set(nid, *nodemask);
+
+ return nodemask;
+}
+
struct page * fastcall
__alloc_pages(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist)
{
+ /*
+ * Use a temporary nodemask for __GFP_THISNODE allocations. If the
+ * cost of allocating on the stack or the stack usage becomes
+ * noticeable, allocate the nodemasks per node at boot or compile time
+ */
+ if (unlikely(gfp_mask & __GFP_THISNODE)) {
+ nodemask_t nodemask;
+
+ return __alloc_pages_internal(gfp_mask, order,
+ zonelist, nodemask_thisnode(&nodemask));
+ }
+
return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
}

@@ -1741,6 +1764,9 @@ struct page * fastcall
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
+ /* Specifying both __GFP_THISNODE and nodemask is stupid. Warn user */
+ WARN_ON(gfp_mask & __GFP_THISNODE);
+
return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
}

@@ -1817,7 +1843,7 @@ static unsigned int nr_free_zone_pages(i
/* Just pick one node, since fallback list is circular */
unsigned int sum = 0;

- struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
+ struct zonelist *zonelist = node_zonelist(numa_node_id());

for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
unsigned long size = zone->present_pages;
@@ -2182,7 +2208,7 @@ static void build_zonelists_in_node_orde
int j;
struct zonelist *zonelist;

- zonelist = &pgdat->node_zonelists[0];
+ zonelist = &pgdat->node_zonelist;
for (j = 0; zonelist->_zones[j] != 0; j++)
;
j = build_zonelists_node(NODE_DATA(node), zonelist, j,
@@ -2191,19 +2217,6 @@ static void build_zonelists_in_node_orde
}

/*
- * Build gfp_thisnode zonelists
- */
-static void build_thisnode_zonelists(pg_data_t *pgdat)
-{
- int j;
- struct zonelist *zonelist;
-
- zonelist = &pgdat->node_zonelists[1];
- j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
- zonelist->_zones[j] = 0;
-}
-
-/*
* Build zonelists ordered by zone and nodes within zones.
* This results in conserving DMA zone[s] until all Normal memory is
* exhausted, but results in overflowing to remote node while memory
@@ -2218,7 +2231,7 @@ static void build_zonelists_in_zone_orde
struct zone *z;
struct zonelist *zonelist;

- zonelist = &pgdat->node_zonelists[0];
+ zonelist = &pgdat->node_zonelist;
pos = 0;
for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
for (j = 0; j < nr_nodes; j++) {
@@ -2298,17 +2311,14 @@ static void set_zonelist_order(void)
static void build_zonelists(pg_data_t *pgdat)
{
int j, node, load;
- enum zone_type i;
nodemask_t used_mask;
int local_node, prev_node;
struct zonelist *zonelist;
int order = current_zonelist_order;

/* initialize zonelists */
- for (i = 0; i < MAX_ZONELISTS; i++) {
- zonelist = pgdat->node_zonelists + i;
- zonelist->_zones[0] = 0;
- }
+ zonelist = &pgdat->node_zonelist;
+ zonelist->_zones[0] = 0;

/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
@@ -2350,8 +2360,6 @@ static void build_zonelists(pg_data_t *p
/* calculate node order -- i.e., DMA last! */
build_zonelists_in_zone_order(pgdat, j);
}
-
- build_thisnode_zonelists(pgdat);
}

/* Construct the zonelist performance cache - see further mmzone.h */
@@ -2361,7 +2369,7 @@ static void build_zonelist_cache(pg_data
struct zonelist_cache *zlc;
unsigned long *z;

- zonelist = &pgdat->node_zonelists[0];
+ zonelist = &pgdat->node_zonelist;
zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
for (z = zonelist->_zones; *z; z++)
@@ -2385,7 +2393,7 @@ static void build_zonelists(pg_data_t *p

local_node = pgdat->node_id;

- zonelist = &pgdat->node_zonelists[0];
+ zonelist = &pgdat->node_zonelist;
j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);

/*
@@ -2415,8 +2423,7 @@ static void build_zonelists(pg_data_t *p
/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
static void build_zonelist_cache(pg_data_t *pgdat)
{
- pgdat->node_zonelists[0].zlcache_ptr = NULL;
- pgdat->node_zonelists[1].zlcache_ptr = NULL;
+ pgdat->node_zonelist.zlcache_ptr = NULL;
}

#endif /* CONFIG_NUMA */
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc4-mm1-030_filter_nodemask/mm/slab.c linux-2.6.23-rc4-mm1-040_use_one_zonelist/mm/slab.c
--- linux-2.6.23-rc4-mm1-030_filter_nodemask/mm/slab.c 2007-09-10 16:06:31.000000000 +0100
+++ linux-2.6.23-rc4-mm1-040_use_one_zonelist/mm/slab.c 2007-09-11 13:43:28.000000000 +0100
@@ -3250,7 +3250,7 @@ static void *fallback_alloc(struct kmem_
if (flags & __GFP_THISNODE)
return NULL;

- zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+ zonelist = node_zonelist(slab_node(current->mempolicy));
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);

retry:
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc4-mm1-030_filter_nodemask/mm/slub.c linux-2.6.23-rc4-mm1-040_use_one_zonelist/mm/slub.c
--- linux-2.6.23-rc4-mm1-030_filter_nodemask/mm/slub.c 2007-09-10 16:06:31.000000000 +0100
+++ linux-2.6.23-rc4-mm1-040_use_one_zonelist/mm/slub.c 2007-09-11 13:43:28.000000000 +0100
@@ -1283,7 +1283,7 @@ static struct page *get_any_partial(stru
if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio)
return NULL;

- zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+ zonelist = node_zonelist(slab_node(current->mempolicy));
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
struct kmem_cache_node *n;

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/