[tip:sched/numa] sched, mm: Introduce tsk_home_node()

From: tip-bot for Peter Zijlstra
Date: Fri May 18 2012 - 06:31:52 EST


Commit-ID: 84213e2b6e2166083c3d06e91dcf54f8e136bd78
Gitweb: http://git.kernel.org/tip/84213e2b6e2166083c3d06e91dcf54f8e136bd78
Author: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
AuthorDate: Sat, 3 Mar 2012 17:05:16 +0100
Committer: Ingo Molnar <mingo@xxxxxxxxxx>
CommitDate: Fri, 18 May 2012 08:16:20 +0200

sched, mm: Introduce tsk_home_node()

Introduce the home-node concept for tasks. In order to keep memory
locality we need to have a something to stay local to, we define the
home-node of a task as the node we prefer to allocate memory from and
prefer to execute on.

These are no hard guarantees, merely preferences. This allows for
optimal resource usage, we can run a task away from the home-node, the
remote memory hit -- while expensive -- is less expensive than not
running at all, or very little, due to severe cpu overload.

Similarly, we can allocate memory from another node if our home-node
is depleted, again, some memory is better than no memory.

This patch merely introduces the basic infrastructure, all policy
comes later.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Cc: Suresh Siddha <suresh.b.siddha@xxxxxxxxx>
Cc: Paul Turner <pjt@xxxxxxxxxx>
Cc: Dan Smith <danms@xxxxxxxxxx>
Cc: Bharata B Rao <bharata.rao@xxxxxxxxx>
Cc: Lee Schermerhorn <Lee.Schermerhorn@xxxxxx>
Cc: Christoph Lameter <cl@xxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Link: http://lkml.kernel.org/n/tip-41h2mswllhkskd4bnxpoi388@xxxxxxxxxxxxxx
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
include/linux/init_task.h | 8 ++++++++
include/linux/sched.h | 10 ++++++++++
kernel/sched/core.c | 32 ++++++++++++++++++++++++++++++++
3 files changed, 50 insertions(+), 0 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e4baff5..d3029fa 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -134,6 +134,13 @@ extern struct cred init_cred;

#define INIT_TASK_COMM "swapper"

+#ifdef CONFIG_NUMA
+# define INIT_TASK_NUMA(tsk) \
+ .node = -1,
+#else
+# define INIT_TASK_NUMA(tsk)
+#endif
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -200,6 +207,7 @@ extern struct cred init_cred;
INIT_TRACE_RECURSION \
INIT_TASK_RCU_PREEMPT(tsk) \
INIT_CPUSET_SEQ \
+ INIT_TASK_NUMA(tsk) \
}


diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3d64480..49378f0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1505,6 +1505,7 @@ struct task_struct {
struct mempolicy *mempolicy; /* Protected by alloc_lock */
short il_next;
short pref_node_fork;
+ int node;
#endif
struct rcu_head rcu;

@@ -1575,6 +1576,15 @@ struct task_struct {
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)

+static inline int tsk_home_node(struct task_struct *p)
+{
+#ifdef CONFIG_NUMA
+ return p->node;
+#else
+ return -1;
+#endif
+}
+
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eb9bee9..8fd0325 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6279,6 +6279,38 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol

#ifdef CONFIG_NUMA

+/*
+ * Requeues a task ensuring its on the right load-balance list so
+ * that it might get migrated to its new home.
+ *
+ * Note that we cannot actively migrate ourselves since our callers
+ * can be from atomic context. We rely on the regular load-balance
+ * mechanisms to move us around -- its all preference anyway.
+ */
+void sched_setnode(struct task_struct *p, int node)
+{
+ unsigned long flags;
+ int on_rq, running;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &flags);
+ on_rq = p->on_rq;
+ running = task_current(rq, p);
+
+ if (on_rq)
+ dequeue_task(rq, p, 0);
+ if (running)
+ p->sched_class->put_prev_task(rq, p);
+
+ p->node = node;
+
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ if (on_rq)
+ enqueue_task(rq, p, 0);
+ task_rq_unlock(rq, p, &flags);
+}
+
static int sched_domains_numa_levels;
static int sched_domains_numa_scale;
static int *sched_domains_numa_distance;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/