[RFC PATCH] sched: select_idle_core should select least utilized core

From: Subhra Mazumdar
Date: Thu Jun 08 2017 - 15:27:33 EST


Currently, select_idle_core() tries to find a fully idle core; if that fails,
select_idle_cpu() then returns any idle cpu in the LLC domain. This is
suboptimal for architectures with many (more than 2) hyperthreads per core.
This patch changes select_idle_core() to find the core with the fewest busy
hyperthreads and return an idle cpu in that core.

Signed-off-by: Subhra Mazumdar <subhra.mazumdar@xxxxxxxxxx>
---
kernel/sched/fair.c | 113 +++++++++-------------------------------------
kernel/sched/idle_task.c | 1 -
kernel/sched/sched.h | 10 ----
3 files changed, 21 insertions(+), 103 deletions(-)
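
For readers who prefer not to parse the diff, below is a stand-alone,
simplified sketch of the new selection policy (this is illustration only, not
the kernel code: the fixed topology, the busy[][] array and
select_least_loaded() are made up, and the cpumask/rcu handling of the real
select_idle_core() is omitted):

/*
 * Sketch of the policy: for each core, count busy hyperthreads; take a
 * fully idle core immediately, otherwise return an idle cpu in the core
 * with the fewest busy hyperthreads.
 */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CORES	2
#define SMT_WIDTH	4	/* hyperthreads (strands) per core */

/* 1 = busy, 0 = idle; indexed [core][thread] */
static int busy[NR_CORES][SMT_WIDTH] = {
	{ 1, 1, 1, 0 },		/* core 0: three busy strands */
	{ 1, 0, 0, 0 },		/* core 1: one busy strand    */
};

/* Returns the chosen cpu id (core * SMT_WIDTH + thread), or -1. */
static int select_least_loaded(void)
{
	int core, thread;
	int min_util = INT_MAX, min_cpu = -1;

	for (core = 0; core < NR_CORES; core++) {
		bool idle = true;
		int util = 0, cp = -1;

		for (thread = 0; thread < SMT_WIDTH; thread++) {
			if (busy[core][thread]) {
				idle = false;
				util++;		/* count busy strands */
			} else if (cp == -1) {
				cp = core * SMT_WIDTH + thread;
			}
		}

		if (idle)		/* whole core idle: take it */
			return cp;

		if (util < min_util && cp != -1) {
			min_util = util;	/* least loaded core so far */
			min_cpu = cp;
		}
	}
	return min_cpu;
}

int main(void)
{
	/* Expect cpu 5: an idle strand in core 1, which has only one busy strand. */
	printf("selected cpu %d\n", select_least_loaded());
	return 0;
}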

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d711093..eb2c33c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5679,111 +5679,49 @@ static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *

#ifdef CONFIG_SCHED_SMT

-static inline void set_idle_cores(int cpu, int val)
-{
- struct sched_domain_shared *sds;
-
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
- if (sds)
- WRITE_ONCE(sds->has_idle_cores, val);
-}
-
-static inline bool test_idle_cores(int cpu, bool def)
-{
- struct sched_domain_shared *sds;
-
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
- if (sds)
- return READ_ONCE(sds->has_idle_cores);
-
- return def;
-}
-
/*
- * Scans the local SMT mask to see if the entire core is idle, and records this
- * information in sd_llc_shared->has_idle_cores.
- *
- * Since SMT siblings share all cache levels, inspecting this limited remote
- * state should be fairly cheap.
- */
-void __update_idle_core(struct rq *rq)
-{
- int core = cpu_of(rq);
- int cpu;
-
- rcu_read_lock();
- if (test_idle_cores(core, true))
- goto unlock;
-
- for_each_cpu(cpu, cpu_smt_mask(core)) {
- if (cpu == core)
- continue;
-
- if (!idle_cpu(cpu))
- goto unlock;
- }
-
- set_idle_cores(core, 1);
-unlock:
- rcu_read_unlock();
-}
-
-/*
- * Scan the entire LLC domain for idle cores; this dynamically switches off if
- * there are no idle cores left in the system; tracked through
- * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ * Scan the entire LLC domain for idle cores; find the core with the minimum
+ * number of busy strands and return an idle strand in that core
*/
static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
- int core, cpu, wrap;
+ int core, cpu, wrap, min_util = INT_MAX, min_cpu = -1;

if (!static_branch_likely(&sched_smt_present))
return -1;

- if (!test_idle_cores(target, false))
- return -1;
-
cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);

for_each_cpu_wrap(core, cpus, target, wrap) {
bool idle = true;
+ int util = 0;
+ int cp = -1;

for_each_cpu(cpu, cpu_smt_mask(core)) {
cpumask_clear_cpu(cpu, cpus);
- if (!idle_cpu(cpu))
+ if (!idle_cpu(cpu)) {
idle = false;
+ util++;
+ } else if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
+ cp = cpu;
+ }
}

if (idle)
return core;
- }

- /*
- * Failed to find an idle core; stop looking for one.
- */
- set_idle_cores(target, 0);
+ if (util < min_util && cp != -1) {
+ min_util = util;
+ min_cpu = cp;
+ }
+ }

- return -1;
+ return min_cpu;
}

-/*
- * Scan the local SMT mask for idle CPUs.
- */
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
- int cpu;
-
- if (!static_branch_likely(&sched_smt_present))
- return -1;
-
- for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
- continue;
- if (idle_cpu(cpu))
- return cpu;
- }
-
return -1;
}

@@ -5794,13 +5732,6 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
return -1;
}

-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
-{
- return -1;
-}
-
-#endif /* CONFIG_SCHED_SMT */
-
/*
* Scan the LLC domain for idle CPUs; this is dynamically regulated by
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
@@ -5830,8 +5761,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = local_clock();

for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
- continue;
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ continue;
if (idle_cpu(cpu))
break;
}
@@ -5844,6 +5775,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
return cpu;
}

+#endif /* CONFIG_SCHED_SMT */
+
/*
* Try and locate an idle core/thread in the LLC cache domain.
*/
@@ -5873,10 +5806,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if ((unsigned)i < nr_cpumask_bits)
return i;

- i = select_idle_smt(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
-
return target;
}

diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 0c00172..a3d5a7c 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -27,7 +27,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
put_prev_task(rq, prev);
- update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
return rq->idle;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6dda2aa..96ef012 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -772,16 +772,6 @@ static inline int cpu_of(struct rq *rq)

extern struct static_key_false sched_smt_present;

-extern void __update_idle_core(struct rq *rq);
-
-static inline void update_idle_core(struct rq *rq)
-{
- if (static_branch_unlikely(&sched_smt_present))
- __update_idle_core(rq);
-}
-
-#else
-static inline void update_idle_core(struct rq *rq) { }
#endif

DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
--
1.7.1