Re: [PATCH] numa,sched: only consider less busy nodes as numa balancing destination

From: Peter Zijlstra
Date: Wed May 13 2015 - 02:29:04 EST


On Tue, May 12, 2015 at 11:45:09AM -0400, Rik van Riel wrote:
> I have a few poorly formed ideas on what could be done about that:
>
> 1) have fbq_classify_rq take the current task on the rq into account,
> and adjust the fbq classification if all the runnable-but-queued
> tasks are on the right node

So while looking at this I came up with the below; it treats any node
inside the numa_group's ->active_nodes as a preferred node for balancing
purposes.

Would that make sense?

I'll see what I can do about taking current into account in the
runqueue type classification.

> 2) ensure that rq->nr_numa_running and rq->nr_preferred_running also
> get incremented for kernel threads that are bound to a particular
> CPU - currently CPU-bound kernel threads will cause the NUMA
> statistics to look like a CPU has tasks that do not belong on that
> NUMA node

I'm thinking of accounting those to nr_pinned; let me see how that works
out.

---
include/linux/sched.h | 1 +
kernel/sched/fair.c | 58 ++++++++++++++++++++++++++++++++-------------------
2 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cb734861123a..ffebc2e091ad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1443,6 +1443,7 @@ struct task_struct {
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
+ unsigned sched_preferred:1;

#ifdef CONFIG_MEMCG_KMEM
unsigned memcg_kmem_skip_account:1;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8c1510abeefa..d59adb8e8ef4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -856,18 +856,6 @@ static unsigned int task_scan_max(struct task_struct *p)
return max(smin, smax);
}

-static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
-{
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
- rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
-}
-
-static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
-{
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
- rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
-}
-
struct numa_group {
atomic_t refcount;

@@ -887,6 +875,28 @@ struct numa_group {
unsigned long faults[0];
};

+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+ int node = task_node(p);
+ bool local;
+
+ rq->nr_numa_running += (p->numa_preferred_nid != -1);
+
+ if (p->numa_group)
+ local = node_isset(node, p->numa_group->active_nodes);
+ else
+ local = (p->numa_preferred_nid == node);
+
+ p->sched_preferred = local;
+ rq->nr_preferred_running += local;
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running -= p->sched_preferred;
+}
+
/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2

@@ -1572,9 +1582,10 @@ static void numa_migrate_preferred(struct task_struct *p)
* are added when they cause over 6/16 of the maximum number of faults, but
* only removed when they drop below 3/16.
*/
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static bool update_numa_active_node_mask(struct numa_group *numa_group)
{
unsigned long faults, max_faults = 0;
+ bool update = false;
int nid;

for_each_online_node(nid) {
@@ -1586,11 +1597,17 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
for_each_online_node(nid) {
faults = group_faults_cpu(numa_group, nid);
if (!node_isset(nid, numa_group->active_nodes)) {
- if (faults > max_faults * 6 / 16)
+ if (faults > max_faults * 6 / 16) {
node_set(nid, numa_group->active_nodes);
- } else if (faults < max_faults * 3 / 16)
+ update = true;
+ }
+ } else if (faults < max_faults * 3 / 16) {
node_clear(nid, numa_group->active_nodes);
+ update = true;
+ }
}
+
+ return update;
}

/*
@@ -1884,16 +1901,15 @@ static void task_numa_placement(struct task_struct *p)
update_numa_active_node_mask(p->numa_group);
spin_unlock_irq(group_lock);
max_nid = preferred_group_nid(p, max_group_nid);
- }
-
- if (max_faults) {
+ sched_setnuma(p, max_nid);
+ } else if (max_faults) {
/* Set the new preferred node */
if (max_nid != p->numa_preferred_nid)
sched_setnuma(p, max_nid);
-
- if (task_node(p) != p->numa_preferred_nid)
- numa_migrate_preferred(p);
}
+
+ if (task_node(p) != p->numa_preferred_nid)
+ numa_migrate_preferred(p);
}

static inline int get_numa_group(struct numa_group *grp)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/