[RFC][PATCH 23/26] sched, numa: Introduce sys_numa_{t,m}bind()

From: Peter Zijlstra
Date: Fri Mar 16 2012 - 10:56:44 EST


Now that we have a NUMA process scheduler, provide a syscall interface
for finer-grained NUMA balancing. In particular, this allows setting up
NUMA groups of threads and vmas within a process.
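
For illustration only (not part of the patch), a minimal userspace sketch
of the intended flow. It assumes the x86-64 syscall numbers added below
(312/313) and the ng_id encodings from kernel/sched/numa.c (-1 creates a
new group, -2 queries the current one); the wrapper names are made up for
the example:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#define __NR_numa_mbind	312	/* from the x86-64 table in this patch */
#define __NR_numa_tbind	313

static long numa_tbind(int tid, int ng_id, unsigned long flags)
{
	return syscall(__NR_numa_tbind, tid, ng_id, flags);
}

static long numa_mbind(void *addr, unsigned long len, int ng_id,
		       unsigned long flags)
{
	return syscall(__NR_numa_mbind, addr, len, ng_id, flags);
}

int main(void)
{
	unsigned long len = 1UL << 20;
	void *mem = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* tid == 0 means the calling thread; ng_id == -1 creates a new
	 * numa group and returns its id. */
	int ng_id = numa_tbind(0, -1, 0);
	if (mem == MAP_FAILED || ng_id < 0)
		return 1;

	/* Bind the region to the same group, so the memory and the
	 * group's threads are migrated between nodes together. */
	if (numa_mbind(mem, len, ng_id, 0))
		return 1;

	printf("region bound to numa group %d\n", ng_id);
	return 0;
}

Other threads can be added to the same group with numa_tbind(tid, ng_id, 0),
per the comment above sys_numa_tbind() below.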

Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
---
arch/x86/syscalls/syscall_32.tbl | 2
arch/x86/syscalls/syscall_64.tbl | 2
include/asm-generic/unistd.h | 6
include/linux/mempolicy.h | 35 ++
include/linux/sched.h | 2
include/linux/syscalls.h | 3
kernel/exit.c | 1
kernel/sched/numa.c | 582 ++++++++++++++++++++++++++++++++++++++-
kernel/sys_ni.c | 4
mm/mempolicy.c | 8
10 files changed, 639 insertions(+), 6 deletions(-)
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -355,3 +355,5 @@
346 i386 setns sys_setns
347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv
348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
+349 i386 numa_mbind sys_numa_mbind compat_sys_numa_mbind
+350 i386 numa_tbind sys_numa_tbind compat_sys_numa_tbind
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -318,6 +318,8 @@
309 common getcpu sys_getcpu
310 64 process_vm_readv sys_process_vm_readv
311 64 process_vm_writev sys_process_vm_writev
+312 64 numa_mbind sys_numa_mbind
+313 64 numa_tbind sys_numa_tbind
#
# x32-specific system call numbers start at 512 to avoid cache impact
# for native 64-bit operation.
--- a/include/asm-generic/unistd.h
+++ b/include/asm-generic/unistd.h
@@ -691,9 +691,13 @@ __SC_COMP(__NR_process_vm_readv, sys_pro
#define __NR_process_vm_writev 271
__SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \
compat_sys_process_vm_writev)
+#define __NR_numa_mbind 272
+__SC_COMP(__NR_numa_mbind, sys_numa_mbind, compat_sys_numa_mbind)
+#define __NR_numa_tbind 273
+__SC_COMP(__NR_numa_tbind, sys_numa_tbind, compat_sys_numa_tbind)

#undef __NR_syscalls
-#define __NR_syscalls 272
+#define __NR_syscalls 274

/*
* All syscalls below here should go away really,
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -78,6 +78,8 @@ enum mpol_rebind_step {
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/migrate.h>
+#include <linux/list.h>
+#include <linux/sched.h>

struct mm_struct;

@@ -109,6 +111,10 @@ struct mempolicy {
atomic_t refcnt;
unsigned short mode; /* See MPOL_* above */
unsigned short flags; /* See set_mempolicy() MPOL_F_* above */
+ struct numa_group *numa_group;
+ struct list_head ng_entry;
+ struct vm_area_struct *vma;
+ struct rcu_head rcu;
union {
short preferred_node; /* preferred */
nodemask_t nodes; /* interleave/bind */
@@ -396,6 +402,35 @@ static inline int mpol_to_str(char *buff
}

#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_NUMA
+
+extern void __numa_task_exit(struct task_struct *);
+extern void numa_vma_link(struct vm_area_struct *, struct vm_area_struct *);
+extern void numa_vma_unlink(struct vm_area_struct *);
+extern void __numa_add_vma_counter(struct vm_area_struct *, int, long);
+
+static inline
+void numa_add_vma_counter(struct vm_area_struct *vma, int member, long value)
+{
+ if (vma->vm_policy && vma->vm_policy->numa_group)
+ __numa_add_vma_counter(vma, member, value);
+}
+
+static inline void numa_task_exit(struct task_struct *p)
+{
+ if (p->numa_group)
+ __numa_task_exit(p);
+}
+
+#else /* CONFIG_NUMA */
+
+static inline void numa_task_exit(struct task_struct *p) { }
+static inline void numa_vma_link(struct vm_area_struct *new, struct vm_area_struct *old) { }
+static inline void numa_vma_unlink(struct vm_area_struct *vma) { }
+
+#endif /* CONFIG_NUMA */
+
#endif /* __KERNEL__ */

#endif
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1548,6 +1548,8 @@ struct task_struct {
short il_next;
short pref_node_fork;
int node;
+ struct numa_group *numa_group;
+ struct list_head ng_entry;
#endif
struct rcu_head rcu;

--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -856,5 +856,8 @@ asmlinkage long sys_process_vm_writev(pi
const struct iovec __user *rvec,
unsigned long riovcnt,
unsigned long flags);
+asmlinkage long sys_numa_mbind(unsigned long addr, unsigned long len,
+ int ng_id, unsigned long flags);
+asmlinkage long sys_numa_tbind(int tid, int ng_id, unsigned long flags);

#endif
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1010,6 +1010,7 @@ void do_exit(long code)
mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL;
task_unlock(tsk);
+ numa_task_exit(tsk);
#endif
#ifdef CONFIG_FUTEX
if (unlikely(current->pi_state_cache))
--- a/kernel/sched/numa.c
+++ b/kernel/sched/numa.c
@@ -14,6 +14,7 @@

#include <linux/mempolicy.h>
#include <linux/kthread.h>
+#include <linux/compat.h>

#include "sched.h"

@@ -302,17 +303,20 @@ static void enqueue_ne(struct numa_entit
spin_unlock(&nq->lock);
}

-static void dequeue_ne(struct numa_entity *ne)
+static int dequeue_ne(struct numa_entity *ne)
{
struct node_queue *nq;
+ int node = ne->node; // XXX serialization

- if (ne->node == -1) // XXX serialization
- return;
+ if (node == -1) // XXX serialization
+ return node;

nq = lock_ne_nq(ne);
ne->node = -1;
__dequeue_ne(nq, ne);
spin_unlock(&nq->lock);
+
+ return node;
}

static void init_ne(struct numa_entity *ne, const struct numa_ops *nops)
@@ -400,6 +404,8 @@ static int find_idlest_node(int this_nod

void select_task_node(struct task_struct *p, struct mm_struct *mm, int sd_flags)
{
+ int node;
+
if (!sched_feat(NUMA_SELECT)) {
p->node = -1;
return;
@@ -424,7 +430,11 @@ void select_task_node(struct task_struct
}
}

- enqueue_ne(&mm->numa, find_idlest_node(p->node));
+ node = find_idlest_node(p->node);
+ if (node == -1)
+ node = numa_node_id();
+
+ enqueue_ne(&mm->numa, node);
}

__init void init_sched_numa(void)
@@ -804,3 +814,567 @@ static __init int numa_init(void)
return 0;
}
early_initcall(numa_init);
+
+
+/*
+ * numa_group bits
+ */
+
+#include <linux/idr.h>
+#include <linux/srcu.h>
+#include <linux/syscalls.h>
+
+struct numa_group {
+ spinlock_t lock;
+ int id;
+
+ struct mm_rss_stat rss;
+
+ struct list_head tasks;
+ struct list_head vmas;
+
+ const struct cred *cred;
+ atomic_t ref;
+
+ struct numa_entity numa_entity;
+
+ struct rcu_head rcu;
+};
+
+static struct srcu_struct ng_srcu;
+
+static DEFINE_MUTEX(numa_group_idr_lock);
+static DEFINE_IDR(numa_group_idr);
+
+static inline struct numa_group *ne_ng(struct numa_entity *ne)
+{
+ return container_of(ne, struct numa_group, numa_entity);
+}
+
+static inline bool ng_tryget(struct numa_group *ng)
+{
+ return atomic_inc_not_zero(&ng->ref);
+}
+
+static inline void ng_get(struct numa_group *ng)
+{
+ atomic_inc(&ng->ref);
+}
+
+static void __ng_put_rcu(struct rcu_head *rcu)
+{
+ struct numa_group *ng = container_of(rcu, struct numa_group, rcu);
+
+ put_cred(ng->cred);
+ kfree(ng);
+}
+
+static void __ng_put(struct numa_group *ng)
+{
+ mutex_lock(&numa_group_idr_lock);
+ idr_remove(&numa_group_idr, ng->id);
+ mutex_unlock(&numa_group_idr_lock);
+
+ WARN_ON(!list_empty(&ng->tasks));
+ WARN_ON(!list_empty(&ng->vmas));
+
+ dequeue_ne(&ng->numa_entity);
+
+ call_rcu(&ng->rcu, __ng_put_rcu);
+}
+
+static inline void ng_put(struct numa_group *ng)
+{
+ if (atomic_dec_and_test(&ng->ref))
+ __ng_put(ng);
+}
+
+/*
+ * numa_ops
+ */
+
+static unsigned long numa_group_mem_load(struct numa_entity *ne)
+{
+ struct numa_group *ng = ne_ng(ne);
+
+ return atomic_long_read(&ng->rss.count[MM_ANONPAGES]);
+}
+
+static unsigned long numa_group_cpu_load(struct numa_entity *ne)
+{
+ struct numa_group *ng = ne_ng(ne);
+ unsigned long load = 0;
+ struct task_struct *p;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(p, &ng->tasks, ng_entry)
+ load += p->numa_contrib;
+ rcu_read_unlock();
+
+ return load;
+}
+
+static void numa_group_mem_migrate(struct numa_entity *ne, int node)
+{
+ struct numa_group *ng = ne_ng(ne);
+ struct vm_area_struct *vma;
+ struct mempolicy *mpol;
+ struct mm_struct *mm;
+ int idx;
+
+ /*
+ * Horrid code this..
+ *
+ * The main problem is that ng->lock nests inside mmap_sem [
+ * numa_vma_{,un}link() gets called under mmap_sem ]. But here we need
+ * to iterate that list and acquire mmap_sem for each entry.
+ *
+ * We get here without serialization. We abuse numa_vma_unlink() to add
+ * an SRCU delayed reference count to the mpols. This allows us to do
+ * lockless iteration of the list.
+ *
+ * Once we have an mpol we need to acquire mmap_sem; this too isn't
+ * straightforward: take ng->lock to pin mpol->vma due to its
+ * serialization against numa_vma_unlink(). While that vma pointer is
+ * stable the vma->vm_mm pointer must be good too, so acquire an extra
+ * reference to the mm.
+ *
+ * This reference keeps mm stable so we can drop ng->lock and acquire
+ * mmap_sem. After which mpol->vma is stable again since the memory map
+ * is stable. So verify ->vma is still good (numa_vma_unlink clears it)
+ * and the mm is still the same (paranoia, can't see how that could
+ * happen).
+ */
+
+ idx = srcu_read_lock(&ng_srcu);
+ list_for_each_entry_rcu(mpol, &ng->vmas, ng_entry) {
+ nodemask_t mask = nodemask_of_node(node);
+
+ spin_lock(&ng->lock); /* pin mpol->vma */
+ vma = mpol->vma;
+ if (!vma) {
+ spin_unlock(&ng->lock);
+ continue;
+ }
+ mm = vma->vm_mm;
+ atomic_inc(&mm->mm_users); /* pin mm */
+ spin_unlock(&ng->lock);
+
+ down_read(&mm->mmap_sem);
+ vma = mpol->vma;
+ if (!vma)
+ goto unlock_next;
+
+ mpol_rebind_policy(mpol, &mask, MPOL_REBIND_ONCE);
+ lazy_migrate_vma(vma, node);
+unlock_next:
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ }
+ srcu_read_unlock(&ng_srcu, idx);
+}
+
+static void numa_group_cpu_migrate(struct numa_entity *ne, int node)
+{
+ struct numa_group *ng = ne_ng(ne);
+ struct task_struct *p;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(p, &ng->tasks, ng_entry)
+ sched_setnode(p, node);
+ rcu_read_unlock();
+}
+
+static bool numa_group_tryget(struct numa_entity *ne)
+{
+ /*
+ * See process_tryget(), similar but against ng_put().
+ */
+ return ng_tryget(ne_ng(ne));
+}
+
+static void numa_group_put(struct numa_entity *ne)
+{
+ ng_put(ne_ng(ne));
+}
+
+static const struct numa_ops numa_group_ops = {
+ .mem_load = numa_group_mem_load,
+ .cpu_load = numa_group_cpu_load,
+
+ .mem_migrate = numa_group_mem_migrate,
+ .cpu_migrate = numa_group_cpu_migrate,
+
+ .tryget = numa_group_tryget,
+ .put = numa_group_put,
+};
+
+void __numa_task_exit(struct task_struct *p)
+{
+ struct numa_group *ng = p->numa_group;
+
+ spin_lock(&ng->lock);
+ list_del_rcu(&p->ng_entry);
+ spin_unlock(&ng->lock);
+
+ p->numa_group = NULL; // XXX serialization ?!
+
+ ng_put(ng);
+}
+
+/*
+ * memory (vma) accounting/tracking
+ *
+ * We assume a 1:1 relation between vmas and mpols and keep a list of mpols in
+ * the numa_group, and a vma backlink in the mpol.
+ */
+
+void numa_vma_link(struct vm_area_struct *new, struct vm_area_struct *old)
+{
+ struct numa_group *ng = NULL;
+
+ if (old && old->vm_policy)
+ ng = old->vm_policy->numa_group;
+
+ if (!ng && new->vm_policy)
+ ng = new->vm_policy->numa_group;
+
+ if (!ng)
+ return;
+
+ ng_get(ng);
+ new->vm_policy->numa_group = ng;
+ new->vm_policy->vma = new;
+
+ spin_lock(&ng->lock);
+ list_add_rcu(&new->vm_policy->ng_entry, &ng->vmas);
+ spin_unlock(&ng->lock);
+}
+
+void __numa_add_vma_counter(struct vm_area_struct *vma, int member, long value)
+{
+ /*
+ * Since the caller passes the vma argument, the caller is responsible
+ * for making sure the vma is stable, hence the ->vm_policy->numa_group
+ * dereference is safe. (caller usually has vma->vm_mm->mmap_sem for
+ * reading).
+ */
+ atomic_long_add(value, &vma->vm_policy->numa_group->rss.count[member]);
+}
+
+static void __mpol_put_rcu(struct rcu_head *rcu)
+{
+ struct mempolicy *mpol = container_of(rcu, struct mempolicy, rcu);
+ mpol_put(mpol);
+}
+
+void numa_vma_unlink(struct vm_area_struct *vma)
+{
+ struct mempolicy *mpol;
+ struct numa_group *ng;
+
+ if (!vma)
+ return;
+
+ mpol = vma->vm_policy;
+ if (!mpol)
+ return;
+
+ ng = mpol->numa_group;
+ if (!ng)
+ return;
+
+ spin_lock(&ng->lock);
+ list_del_rcu(&mpol->ng_entry);
+ /*
+ * Ridiculous; see numa_group_mem_migrate().
+ */
+ mpol->vma = NULL;
+ mpol_get(mpol);
+ call_srcu(&ng_srcu, &mpol->rcu, __mpol_put_rcu);
+ spin_unlock(&ng->lock);
+
+ ng_put(ng);
+}
+
+/*
+ * syscall bits
+ */
+
+#define MS_ID_GET -2
+#define MS_ID_NEW -1
+
+static struct numa_group *ng_create(struct task_struct *p)
+{
+ struct numa_group *ng;
+ int node, err;
+
+ ng = kzalloc(sizeof(*ng), GFP_KERNEL);
+ if (!ng)
+ goto fail;
+
+ err = idr_pre_get(&numa_group_idr, GFP_KERNEL);
+ if (!err)
+ goto fail_alloc;
+
+ mutex_lock(&numa_group_idr_lock);
+ err = idr_get_new(&numa_group_idr, ng, &ng->id);
+ mutex_unlock(&numa_group_idr_lock);
+
+ if (err)
+ goto fail_alloc;
+
+ spin_lock_init(&ng->lock);
+ atomic_set(&ng->ref, 1);
+ ng->cred = get_task_cred(p);
+ INIT_LIST_HEAD(&ng->tasks);
+ INIT_LIST_HEAD(&ng->vmas);
+ init_ne(&ng->numa_entity, &numa_group_ops);
+
+ dequeue_ne(&p->mm->numa); // XXX
+
+ node = find_idlest_node(tsk_home_node(p));
+ enqueue_ne(&ng->numa_entity, node);
+
+ return ng;
+
+fail_alloc:
+ kfree(ng);
+fail:
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * More or less equal to ptrace_may_access(); XXX
+ */
+static int ng_allowed(struct numa_group *ng, struct task_struct *p)
+{
+ const struct cred *cred = ng->cred, *tcred;
+
+ rcu_read_lock();
+ tcred = __task_cred(p);
+ if (cred->user->user_ns == tcred->user->user_ns &&
+ (cred->uid == tcred->euid &&
+ cred->uid == tcred->suid &&
+ cred->uid == tcred->uid &&
+ cred->gid == tcred->egid &&
+ cred->gid == tcred->sgid &&
+ cred->gid == tcred->gid))
+ goto ok;
+ if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
+ goto ok;
+ rcu_read_unlock();
+ return -EPERM;
+
+ok:
+ rcu_read_unlock();
+ return 0;
+}
+
+static struct numa_group *ng_lookup(int ng_id, struct task_struct *p)
+{
+ struct numa_group *ng;
+
+ rcu_read_lock();
+again:
+ ng = idr_find(&numa_group_idr, ng_id);
+ if (!ng) {
+ rcu_read_unlock();
+ return ERR_PTR(-EINVAL);
+ }
+ if (ng_allowed(ng, p)) {
+ rcu_read_unlock();
+ return ERR_PTR(-EPERM);
+ }
+ if (!ng_tryget(ng))
+ goto again;
+ rcu_read_unlock();
+
+ return ng;
+}
+
+static int ng_task_assign(struct task_struct *p, int ng_id)
+{
+ struct numa_group *old_ng, *ng;
+
+ ng = ng_lookup(ng_id, p);
+ if (IS_ERR(ng))
+ return PTR_ERR(ng);
+
+ old_ng = p->numa_group; // XXX racy
+ if (old_ng) {
+ spin_lock(&old_ng->lock);
+ list_del_rcu(&p->ng_entry);
+ spin_unlock(&old_ng->lock);
+
+ /*
+ * We have to wait for the old ng_entry users to go away before
+ * we can re-use the link entry for the new list.
+ */
+ synchronize_rcu();
+ }
+
+ spin_lock(&ng->lock);
+ p->numa_group = ng;
+ list_add_rcu(&p->ng_entry, &ng->tasks);
+ spin_unlock(&ng->lock);
+
+ sched_setnode(p, ng->numa_entity.node);
+
+ if (old_ng)
+ ng_put(old_ng);
+
+ return ng_id;
+}
+
+static struct task_struct *find_get_task(pid_t tid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ if (!tid)
+ p = current;
+ else
+ p = find_task_by_vpid(tid);
+ if (p)
+ get_task_struct(p);
+ rcu_read_unlock();
+
+ if (!p)
+ return ERR_PTR(-ESRCH);
+
+ return p;
+}
+
+/*
+ * Bind a thread to a numa group, query its current binding, or create a new group.
+ *
+ * sys_numa_tbind(tid, -1, 0); // create new group, return new ng_id
+ * sys_numa_tbind(tid, -2, 0); // returns existing ng_id
+ * sys_numa_tbind(tid, ng_id, 0); // set ng_id
+ *
+ * Returns:
+ * -ESRCH tid->task resolution failed
+ * -EINVAL task didn't have an ng_id, ng_id didn't exist, or flags was non-zero
+ * -EPERM insufficient permission to bind the task to the group
+ *
+ */
+SYSCALL_DEFINE3(numa_tbind, int, tid, int, ng_id, unsigned long, flags)
+{
+ struct task_struct *p = find_get_task(tid);
+ struct numa_group *ng = NULL;
+ int orig_ng_id = ng_id;
+
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ if (flags) {
+ ng_id = -EINVAL;
+ goto out;
+ }
+
+ switch (ng_id) {
+ case MS_ID_GET:
+ ng_id = -EINVAL;
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
+ if (ng)
+ ng_id = ng->id;
+ rcu_read_unlock();
+ break;
+
+ case MS_ID_NEW:
+ ng = ng_create(p);
+ if (IS_ERR(ng)) {
+ ng_id = PTR_ERR(ng);
+ break;
+ }
+ ng_id = ng->id;
+ /* fall through */
+
+ default:
+ ng_id = ng_task_assign(p, ng_id);
+ if (ng && orig_ng_id < 0)
+ ng_put(ng);
+ break;
+ }
+
+out:
+ put_task_struct(p);
+ return ng_id;
+}
+
+/*
+ * Bind a memory region to a numa group.
+ *
+ * sys_numa_mbind(addr, len, ng_id, 0);
+ *
+ * create a non-mergeable vma over [addr,addr+len) and assign an mpol binding
+ * it to the numa group identified by ng_id.
+ *
+ */
+SYSCALL_DEFINE4(numa_mbind, unsigned long, addr, unsigned long, len,
+ int, ng_id, unsigned long, flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct mempolicy *mpol;
+ struct numa_group *ng;
+ nodemask_t mask;
+ int node, err = 0;
+
+ if (flags)
+ return -EINVAL;
+
+ if (addr & ~PAGE_MASK)
+ return -EINVAL;
+
+ ng = ng_lookup(ng_id, current);
+ if (IS_ERR(ng))
+ return PTR_ERR(ng);
+
+ mask = nodemask_of_node(ng->numa_entity.node);
+ mpol = mpol_new(MPOL_BIND, 0, &mask);
+ if (!mpol) {
+ ng_put(ng);
+ return -ENOMEM;
+ }
+ mpol->flags |= MPOL_MF_LAZY;
+ mpol->numa_group = ng;
+
+ node = dequeue_ne(&mm->numa); // XXX
+
+ down_write(&mm->mmap_sem);
+ err = mpol_do_mbind(addr, len, mpol, MPOL_BIND,
+ &mask, MPOL_MF_MOVE|MPOL_MF_LAZY);
+ up_write(&mm->mmap_sem);
+ mpol_put(mpol);
+ ng_put(ng);
+
+ if (err && node != -1)
+ enqueue_ne(&mm->numa, node); // XXX
+
+ return err;
+}
+
+#ifdef CONFIG_COMPAT
+
+asmlinkage long compat_sys_numa_mbind(compat_ulong_t addr, compat_ulong_t len,
+ compat_int_t ng_id, compat_ulong_t flags)
+{
+ return sys_numa_mbind(addr, len, ng_id, flags);
+}
+
+asmlinkage long compat_sys_numa_tbind(compat_int_t tid, compat_int_t ng_id,
+ compat_ulong_t flags)
+{
+ return sys_numa_tbind(tid, ng_id, flags);
+}
+
+#endif /* CONFIG_COMPAT */
+
+static __init int numa_group_init(void)
+{
+ init_srcu_struct(&ng_srcu);
+ return 0;
+}
+early_initcall(numa_group_init);
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -103,6 +103,10 @@ cond_syscall(sys_set_mempolicy);
cond_syscall(compat_sys_mbind);
cond_syscall(compat_sys_get_mempolicy);
cond_syscall(compat_sys_set_mempolicy);
+cond_syscall(sys_numa_mbind);
+cond_syscall(compat_sys_numa_mbind);
+cond_syscall(sys_numa_tbind);
+cond_syscall(compat_sys_numa_tbind);
cond_syscall(sys_add_key);
cond_syscall(sys_request_key);
cond_syscall(sys_keyctl);
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -287,12 +287,13 @@ struct mempolicy *mpol_new(unsigned shor
}
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
- policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+ policy = kmem_cache_alloc(policy_cache, GFP_KERNEL | __GFP_ZERO);
if (!policy)
return ERR_PTR(-ENOMEM);
atomic_set(&policy->refcnt, 1);
policy->mode = mode;
policy->flags = flags;
+ INIT_LIST_HEAD(&policy->ng_entry);

return policy;
}
@@ -607,6 +608,9 @@ static int policy_vma(struct vm_area_str
if (!err) {
mpol_get(new);
vma->vm_policy = new;
+ numa_vma_link(vma, NULL);
+ if (old)
+ numa_vma_unlink(old->vma);
mpol_put(old);
}
return err;
@@ -1994,11 +1998,13 @@ int vma_dup_policy(struct vm_area_struct
if (IS_ERR(mpol))
return PTR_ERR(mpol);
vma_set_policy(new, mpol);
+ numa_vma_link(new, old);
return 0;
}

void vma_put_policy(struct vm_area_struct *vma)
{
+ numa_vma_unlink(vma);
mpol_put(vma_policy(vma));
}


