Re: [PATCH] mm: fix invalid node in alloc_migrate_target()

From: Andrew Morton
Date: Fri Mar 25 2016 - 15:22:44 EST


On Fri, 25 Mar 2016 14:56:04 +0800 Xishi Qiu <qiuxishi@xxxxxxxxxx> wrote:

> It is incorrect to use next_node to find a target node, it will
> return MAX_NUMNODES or invalid node. This will lead to crash in
> buddy system allocation.
>
> ...
>
> --- a/mm/page_isolation.c
> +++ b/mm/page_isolation.c
> @@ -289,11 +289,11 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
>          * now as a simple work-around, we use the next node for destination.
>          */
>         if (PageHuge(page)) {
> -               nodemask_t src = nodemask_of_node(page_to_nid(page));
> -               nodemask_t dst;
> -               nodes_complement(dst, src);
> +               int node = next_online_node(page_to_nid(page));
> +               if (node == MAX_NUMNODES)
> +                       node = first_online_node;
>                 return alloc_huge_page_node(page_hstate(compound_head(page)),
> -                                           next_node(page_to_nid(page), dst));
> +                                           node);
>         }
> 
>         if (PageHighMem(page))

Indeed. Can you tell us more about the circumstances under which the
kernel will crash? I need to decide which kernel version(s) need the
patch, but the changelog doesn't contain the info needed to make this
decision (it should).



next_node() isn't a very useful interface, really. Just about every
caller does this:


        node = next_node(node, XXX);
        if (node == MAX_NUMNODES)
                node = first_node(XXX);

so how about we write a function which does that, and stop open-coding
the same thing everywhere?
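
Roughly (this is just the above pattern wrapped up; the name below is
only a placeholder):

        /* Return the node after 'node' in 'mask', wrapping to the first node */
        static inline int that_new_function(int node, nodemask_t mask)
        {
                int ret = next_node(node, mask);

                if (ret == MAX_NUMNODES)
                        ret = first_node(mask);
                return ret;     /* MAX_NUMNODES only if 'mask' is empty */
        }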

And I think your fix could then use such a function:

        int node = that_new_function(page_to_nid(page), node_online_map);



Also, mm/mempolicy.c:offset_il_node() worries me:

        do {
                nid = next_node(nid, pol->v.nodes);
                c++;
        } while (c <= target);

Can't `nid' hit MAX_NUMNODES?
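
If it can, such a helper would at least keep it in range; illustration
only, using the placeholder name from above, not part of the patch
below:

        do {
                /* wraps around, so never MAX_NUMNODES unless the mask is empty */
                nid = that_new_function(nid, pol->v.nodes);
                c++;
        } while (c <= target);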


And can someone please explain mem_cgroup_select_victim_node() to me?
How can we hit the "node = numa_node_id()" path? Only if
memcg->scan_nodes is empty? Is that even valid? The comment doesn't
seem to have much to do with the code.

mpol_rebind_nodemask() is similar.



Something like this?


From: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Subject: include/linux/nodemask.h: create next_node_in() helper

Lots of code does

        node = next_node(node, XXX);
        if (node == MAX_NUMNODES)
                node = first_node(XXX);

so create next_node_in() to do this and use it in various places.

Cc: Xishi Qiu <qiuxishi@xxxxxxxxxx>
Cc: Vlastimil Babka <vbabka@xxxxxxx>
Cc: Joonsoo Kim <js1304@xxxxxxxxx>
Cc: David Rientjes <rientjes@xxxxxxxxxx>
Cc: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
Cc: "Laura Abbott" <lauraa@xxxxxxxxxxxxxx>
Cc: Hui Zhu <zhuhui@xxxxxxxxxx>
Cc: Wang Xiaoqiang <wangxq10@xxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 include/linux/nodemask.h |   18 +++++++++++++++++-
 kernel/cpuset.c          |    8 +-------
 mm/hugetlb.c             |    4 +---
 mm/memcontrol.c          |    4 +---
 mm/mempolicy.c           |    8 ++------
 mm/page_isolation.c      |    9 +++------
 mm/slab.c                |   13 +++----------
 7 files changed, 28 insertions(+), 36 deletions(-)

diff -puN include/linux/nodemask.h~include-linux-nodemaskh-create-next_node_in-helper include/linux/nodemask.h
--- a/include/linux/nodemask.h~include-linux-nodemaskh-create-next_node_in-helper
+++ a/include/linux/nodemask.h
@@ -43,8 +43,10 @@
*
* int first_node(mask) Number lowest set bit, or MAX_NUMNODES
* int next_node(node, mask) Next node past 'node', or MAX_NUMNODES
+ * int next_node_in(node, mask) Next node past 'node', or wrap to first,
+ * or MAX_NUMNODES
* int first_unset_node(mask) First node not set in mask, or
- * MAX_NUMNODES.
+ * MAX_NUMNODES
*
* nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set
* NODE_MASK_ALL Initializer - all bits set
@@ -259,6 +261,20 @@ static inline int __next_node(int n, con
        return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}

+/*
+ * Find the next present node in src, starting after node n, wrapping around to
+ * the first node in src if needed. Returns MAX_NUMNODES if src is empty.
+ */
+#define next_node_in(n, src) __next_node_in((n), &(src))
+static inline int __next_node_in(int node, const nodemask_t *srcp)
+{
+        int ret = __next_node(node, srcp);
+
+        if (ret == MAX_NUMNODES)
+                ret = __first_node(srcp);
+        return ret;
+}
+
static inline void init_nodemask_of_node(nodemask_t *mask, int node)
{
        nodes_clear(*mask);
diff -puN kernel/cpuset.c~include-linux-nodemaskh-create-next_node_in-helper kernel/cpuset.c
--- a/kernel/cpuset.c~include-linux-nodemaskh-create-next_node_in-helper
+++ a/kernel/cpuset.c
@@ -2591,13 +2591,7 @@ int __cpuset_node_allowed(int node, gfp_

static int cpuset_spread_node(int *rotor)
{
-        int node;
-
-        node = next_node(*rotor, current->mems_allowed);
-        if (node == MAX_NUMNODES)
-                node = first_node(current->mems_allowed);
-        *rotor = node;
-        return node;
+        return *rotor = next_node_in(*rotor, current->mems_allowed);
}

int cpuset_mem_spread_node(void)
diff -puN mm/hugetlb.c~include-linux-nodemaskh-create-next_node_in-helper mm/hugetlb.c
--- a/mm/hugetlb.c~include-linux-nodemaskh-create-next_node_in-helper
+++ a/mm/hugetlb.c
@@ -937,9 +937,7 @@ err:
*/
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
-        nid = next_node(nid, *nodes_allowed);
-        if (nid == MAX_NUMNODES)
-                nid = first_node(*nodes_allowed);
+        nid = next_node_in(nid, *nodes_allowed);
        VM_BUG_ON(nid >= MAX_NUMNODES);

        return nid;
diff -puN mm/memcontrol.c~include-linux-nodemaskh-create-next_node_in-helper mm/memcontrol.c
--- a/mm/memcontrol.c~include-linux-nodemaskh-create-next_node_in-helper
+++ a/mm/memcontrol.c
@@ -1388,9 +1388,7 @@ int mem_cgroup_select_victim_node(struct
        mem_cgroup_may_update_nodemask(memcg);
        node = memcg->last_scanned_node;

-        node = next_node(node, memcg->scan_nodes);
-        if (node == MAX_NUMNODES)
-                node = first_node(memcg->scan_nodes);
+        node = next_node_in(node, memcg->scan_nodes);
        /*
         * We call this when we hit limit, not when pages are added to LRU.
         * No LRU may hold pages because all pages are UNEVICTABLE or
diff -puN mm/mempolicy.c~include-linux-nodemaskh-create-next_node_in-helper mm/mempolicy.c
--- a/mm/mempolicy.c~include-linux-nodemaskh-create-next_node_in-helper
+++ a/mm/mempolicy.c
@@ -347,9 +347,7 @@ static void mpol_rebind_nodemask(struct
                BUG();

        if (!node_isset(current->il_next, tmp)) {
-                current->il_next = next_node(current->il_next, tmp);
-                if (current->il_next >= MAX_NUMNODES)
-                        current->il_next = first_node(tmp);
+                current->il_next = next_node_in(current->il_next, tmp);
                if (current->il_next >= MAX_NUMNODES)
                        current->il_next = numa_node_id();
        }
@@ -1709,9 +1707,7 @@ static unsigned interleave_nodes(struct
        struct task_struct *me = current;

        nid = me->il_next;
-        next = next_node(nid, policy->v.nodes);
-        if (next >= MAX_NUMNODES)
-                next = first_node(policy->v.nodes);
+        next = next_node_in(nid, policy->v.nodes);
        if (next < MAX_NUMNODES)
                me->il_next = next;
        return nid;
diff -puN mm/page_isolation.c~include-linux-nodemaskh-create-next_node_in-helper mm/page_isolation.c
--- a/mm/page_isolation.c~include-linux-nodemaskh-create-next_node_in-helper
+++ a/mm/page_isolation.c
@@ -288,13 +288,10 @@ struct page *alloc_migrate_target(struct
         * accordance with memory policy of the user process if possible. For
         * now as a simple work-around, we use the next node for destination.
         */
-        if (PageHuge(page)) {
-                int node = next_online_node(page_to_nid(page));
-                if (node == MAX_NUMNODES)
-                        node = first_online_node;
+        if (PageHuge(page))
                return alloc_huge_page_node(page_hstate(compound_head(page)),
-                                            node);
-        }
+                                            next_node_in(page_to_nid(page),
+                                                         node_online_map));

        if (PageHighMem(page))
                gfp_mask |= __GFP_HIGHMEM;
diff -puN mm/slab.c~include-linux-nodemaskh-create-next_node_in-helper mm/slab.c
--- a/mm/slab.c~include-linux-nodemaskh-create-next_node_in-helper
+++ a/mm/slab.c
@@ -519,22 +519,15 @@ static DEFINE_PER_CPU(unsigned long, sla

static void init_reap_node(int cpu)
{
-        int node;
-
-        node = next_node(cpu_to_mem(cpu), node_online_map);
-        if (node == MAX_NUMNODES)
-                node = first_node(node_online_map);
-
-        per_cpu(slab_reap_node, cpu) = node;
+        per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu),
+                                                    node_online_map);
}

static void next_reap_node(void)
{
        int node = __this_cpu_read(slab_reap_node);

-        node = next_node(node, node_online_map);
-        if (unlikely(node >= MAX_NUMNODES))
-                node = first_node(node_online_map);
+        node = next_node_in(node, node_online_map);
        __this_cpu_write(slab_reap_node, node);
}

_