[PATCH 4/4] sched,numa: pull workloads towards their preferred nodes

From: riel
Date: Thu May 08 2014 - 13:23:50 EST


From: Rik van Riel <riel@xxxxxxxxxx>

Give a bonus to nodes near a workload's preferred node. This will pull
workloads towards their preferred node.

For workloads that span multiple NUMA nodes, pseudo-interleaving will
even out the memory use between nodes over time, causing the preferred
node to move around.

This movement over time will cause the preferred nodes to be on opposite
sides of the system eventually, untangling workloads that were spread
all over the system, and moving them onto adjacent nodes.

The perturbation introduced by this patch enables the kernel to
reliably untangle two 4-node wide SPECjbb2005 instances on an 8-node
system, improving average performance from 857814 to 931792 bops.

Signed-off-by: Rik van Riel <riel@xxxxxxxxxx>
Tested-by: Chegu Vinod <chegu_vinod@xxxxxx>
---
kernel/sched/fair.c | 25 ++++++++++++++++++++++---
1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 99cc829..cffa829 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -932,7 +932,7 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
* the proximity of those nodes.
*/
static inline unsigned long nearby_nodes_score(struct task_struct *p, int nid,
- bool task)
+ bool task, bool *preferred_nid)
{
int max_distance = max_node_distance();
unsigned long score = 0;
@@ -949,6 +949,15 @@ static inline unsigned long nearby_nodes_score(struct task_struct *p, int nid,
int distance;
unsigned long faults;

+ /*
+ * Pseudo-interleaving balances out the memory use between the
+ * nodes where a workload runs, so the preferred node should
+ * change over time. This helps separate two workloads onto
+ * separate sides of the system.
+ */
+ if (p->numa_group && node == p->numa_group->preferred_nid)
+ *preferred_nid = true;
+
/* Already scored by the calling function. */
if (node == nid)
continue;
@@ -989,6 +998,7 @@ static inline unsigned long nearby_nodes_score(struct task_struct *p, int nid,
static inline unsigned long task_weight(struct task_struct *p, int nid)
{
unsigned long total_faults, score;
+ bool near_preferred_nid = false;

if (!p->numa_faults_memory)
return 0;
@@ -999,7 +1009,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
return 0;

score = 1000 * task_faults(p, nid);
- score += nearby_nodes_score(p, nid, true);
+ score += nearby_nodes_score(p, nid, true, &near_preferred_nid);

score /= total_faults;

@@ -1009,6 +1019,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
static inline unsigned long group_weight(struct task_struct *p, int nid)
{
unsigned long total_faults, score;
+ bool near_preferred_nid = false;

if (!p->numa_group)
return 0;
@@ -1019,7 +1030,15 @@ static inline unsigned long group_weight(struct task_struct *p, int nid)
return 0;

score = 1000 * group_faults(p, nid);
- score += nearby_nodes_score(p, nid, false);
+ score += nearby_nodes_score(p, nid, false, &near_preferred_nid);
+
+ /*
+ * Pull workloads towards their preferred node, with the minimum
+ * multiplier required to be a tie-breaker when two groups of nodes
+ * have the same amount of memory.
+ */
+ if (near_preferred_nid)
+ score *= (max_node_distance() - LOCAL_DISTANCE);

score /= total_faults;

--
1.8.5.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/