[PATCH 39/39] autonuma: NUMA scheduler SMT awareness

From: Andrea Arcangeli
Date: Mon Mar 26 2012 - 14:29:02 EST


Add SMT awareness to the NUMA scheduler so that it will not move load
from fully idle SMT threads to semi-idle SMT threads.

Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
---
include/linux/autonuma_flags.h | 10 ++++++++
kernel/sched/numa.c | 50 +++++++++++++++++++++++++++++++++++++--
mm/autonuma.c | 7 +++++
3 files changed, 64 insertions(+), 3 deletions(-)

diff --git a/include/linux/autonuma_flags.h b/include/linux/autonuma_flags.h
index 9c702fd..d6b34b0 100644
--- a/include/linux/autonuma_flags.h
+++ b/include/linux/autonuma_flags.h
@@ -8,6 +8,7 @@ enum autonuma_flag {
AUTONUMA_SCHED_LOAD_BALANCE_STRICT_FLAG,
AUTONUMA_SCHED_CLONE_RESET_FLAG,
AUTONUMA_SCHED_FORK_RESET_FLAG,
+ AUTONUMA_SCHED_SMT_FLAG,
AUTONUMA_SCAN_PMD_FLAG,
AUTONUMA_SCAN_USE_WORKING_SET_FLAG,
AUTONUMA_MIGRATE_DEFER_FLAG,
@@ -43,6 +44,15 @@ static bool inline autonuma_sched_fork_reset(void)
&autonuma_flags);
}

+static bool inline autonuma_sched_smt(void)
+{
+#ifdef CONFIG_SCHED_SMT
+ return !!test_bit(AUTONUMA_SCHED_SMT_FLAG, &autonuma_flags);
+#else
+ return 0;
+#endif
+}
+
static bool inline autonuma_scan_pmd(void)
{
return !!test_bit(AUTONUMA_SCAN_PMD_FLAG, &autonuma_flags);
diff --git a/kernel/sched/numa.c b/kernel/sched/numa.c
index d51e1ec..4211305 100644
--- a/kernel/sched/numa.c
+++ b/kernel/sched/numa.c
@@ -11,6 +11,30 @@

#include "sched.h"

+static inline bool idle_cpu_avg(int cpu, bool require_avg_idle)
+{
+ struct rq *rq = cpu_rq(cpu);
+ return idle_cpu(cpu) && (!require_avg_idle ||
+ rq->avg_idle > sysctl_sched_migration_cost);
+}
+
+/* A false require_avg_idle param makes it easier for smt_idle() to return true */
+static bool smt_idle(int _cpu, bool require_avg_idle)
+{
+#ifdef CONFIG_SCHED_SMT
+ int cpu;
+
+ for_each_cpu_and(cpu, topology_thread_cpumask(_cpu), cpu_online_mask) {
+ if (cpu == _cpu)
+ continue;
+ if (!idle_cpu_avg(cpu, require_avg_idle))
+ return false;
+ }
+#endif
+
+ return true;
+}
+
#define AUTONUMA_BALANCE_SCALE 1000

/*
@@ -47,6 +71,7 @@ void sched_autonuma_balance(void)
int cpu, nid, selected_cpu, selected_nid;
int cpu_nid = numa_node_id();
int this_cpu = smp_processor_id();
+ int this_smt_idle;
unsigned long p_w, p_t, m_w, m_t;
unsigned long weight_delta_max, weight;
struct cpumask *allowed;
@@ -96,6 +121,7 @@ void sched_autonuma_balance(void)
weight_current[nid] = p_w*AUTONUMA_BALANCE_SCALE/p_t;
}

+ this_smt_idle = smt_idle(this_cpu, false);
bitmap_zero(mm_mask, NR_CPUS);
for_each_online_node(nid) {
if (nid == cpu_nid)
@@ -103,11 +129,24 @@ void sched_autonuma_balance(void)
for_each_cpu_and(cpu, cpumask_of_node(nid), allowed) {
struct mm_struct *mm;
struct rq *rq = cpu_rq(cpu);
+ bool other_smt_idle;
if (!cpu_online(cpu))
continue;
weight_others[cpu] = LONG_MAX;
- if (idle_cpu(cpu) &&
- rq->avg_idle > sysctl_sched_migration_cost) {
+
+ other_smt_idle = smt_idle(cpu, true);
+ if (autonuma_sched_smt() &&
+ this_smt_idle && !other_smt_idle)
+ continue;
+
+ if (idle_cpu_avg(cpu, true)) {
+ if (autonuma_sched_smt() &&
+ !this_smt_idle && other_smt_idle) {
+ /* NUMA affinity override */
+ weight_others[cpu] = -2;
+ continue;
+ }
+
if (weight_current[nid] >
weight_current[cpu_nid] &&
weight_current_mm[nid] >
@@ -115,6 +154,11 @@ void sched_autonuma_balance(void)
weight_others[cpu] = -1;
continue;
}
+
+ if (autonuma_sched_smt() &&
+ this_smt_idle && cpu_rq(this_cpu)->nr_running <= 1)
+ continue;
+
mm = rq->curr->mm;
if (!mm)
continue;
@@ -169,7 +213,7 @@ void sched_autonuma_balance(void)
w_cpu_nid = weight_current_mm[cpu_nid];
}
if (w_nid > weight_others[cpu] &&
- w_nid > w_cpu_nid) {
+ (w_nid > w_cpu_nid || weight_others[cpu] == -2)) {
weight = w_nid -
weight_others[cpu] +
w_nid -
diff --git a/mm/autonuma.c b/mm/autonuma.c
index 7ca4992..4cce6a1 100644
--- a/mm/autonuma.c
+++ b/mm/autonuma.c
@@ -23,6 +23,7 @@ unsigned long autonuma_flags __read_mostly =
(1<<AUTONUMA_SCHED_LOAD_BALANCE_STRICT_FLAG)|
(1<<AUTONUMA_SCHED_CLONE_RESET_FLAG)|
(1<<AUTONUMA_SCHED_FORK_RESET_FLAG)|
+ (1<<AUTONUMA_SCHED_SMT_FLAG)|
#ifdef CONFIG_AUTONUMA_DEFAULT_ENABLED
(1<<AUTONUMA_FLAG)|
#endif
@@ -1089,6 +1090,9 @@ SYSFS_ENTRY(defer, AUTONUMA_MIGRATE_DEFER_FLAG);
SYSFS_ENTRY(load_balance_strict, AUTONUMA_SCHED_LOAD_BALANCE_STRICT_FLAG);
SYSFS_ENTRY(clone_reset, AUTONUMA_SCHED_CLONE_RESET_FLAG);
SYSFS_ENTRY(fork_reset, AUTONUMA_SCHED_FORK_RESET_FLAG);
+#ifdef CONFIG_SCHED_SMT
+SYSFS_ENTRY(smt, AUTONUMA_SCHED_SMT_FLAG);
+#endif

#undef SYSFS_ENTRY

@@ -1205,6 +1209,9 @@ static struct attribute *scheduler_attr[] = {
&clone_reset_attr.attr,
&fork_reset_attr.attr,
&load_balance_strict_attr.attr,
+#ifdef CONFIG_SCHED_SMT
+ &smt_attr.attr,
+#endif
NULL,
};
static struct attribute_group scheduler_attr_group = {
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/