Re: sched: tweak select_idle_sibling to look for idle threads

From: Chris Mason
Date: Mon Apr 11 2016 - 20:31:21 EST


On Mon, Apr 11, 2016 at 06:54:21AM +0200, Mike Galbraith wrote:
> On Sun, 2016-04-10 at 15:55 -0400, Chris Mason wrote:
> > On Sun, Apr 10, 2016 at 12:04:21PM +0200, Mike Galbraith wrote:
> > > On Sat, 2016-04-09 at 15:05 -0400, Chris Mason wrote:
> > >
> > > > This does preserve the existing logic to prefer idle cores over idle
> > > > CPU threads, and includes some tests to try and avoid the idle scan when we're
> > > > actually better off sharing a non-idle CPU with someone else.
> > >
> > > My box says the "oh nevermind" checks aren't selective enough, tbench
> > > dropped 4% at clients=cores, and 2% at clients=threads.
> >
> > Ok, I was able to reproduce this by stuffing tbench_srv and tbench onto
> > just socket 0. Version 2 below fixes things for me, but I'm hoping
> > someone can suggest a way to get task_hot() buddy checks without the rq
> > lock.
> >
> > I haven't run this on production loads yet, but our 4.0 patch for this
> > uses task_hot(), so I'd expect it to be on par. If this doesn't fix it
> > for you, I'll dig up a similar machine on Monday.
>
> My box stopped caring. I personally would be reluctant to apply it
> without a "you asked for it" button or a large pile of benchmark
> results. Lock banging or not, full scan existing makes me nervous.


We can use a bitmap at the socket level to keep track of which CPUs are
idle. I'm sure there are better places for the array and better ways to
allocate it; this is just a rough cut to make sure the idle tracking works.

-chris

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a10494a..1c3b5e4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1055,6 +1055,8 @@ struct sched_domain {
unsigned int balance_interval; /* initialise to 1. units in ms. */
unsigned int nr_balance_failed; /* initialise to 0 */

+ cpumask_var_t idle_cpus_mask;
+
/* idle_balance() stats */
u64 max_newidle_lb_cost;
unsigned long next_decay_max_lb_cost;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 41f6b22..237d645 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3204,6 +3204,7 @@ again:
static void __sched notrace __schedule(bool preempt)
{
struct task_struct *prev, *next;
+ struct sched_domain *package_sd;
unsigned long *switch_count;
struct rq *rq;
int cpu;
@@ -3270,11 +3270,19 @@ static void __sched notrace __schedule(bool preempt)
update_rq_clock(rq);

next = pick_next_task(rq, prev);
+
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->clock_skip_update = 0;

if (likely(prev != next)) {
+ package_sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (package_sd) {
+ if (prev->policy == SCHED_IDLE && next->policy != SCHED_IDLE)
+ cpumask_clear_cpu(cpu, package_sd->idle_cpus_mask);
+ else if (next->policy == SCHED_IDLE)
+ cpumask_set_cpu(cpu, package_sd->idle_cpus_mask);
+ }
rq->nr_switches++;
rq->curr = next;
++*switch_count;
@@ -6599,7 +6607,6 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
-
#ifdef CONFIG_NUMA
} else if (sd->flags & SD_NUMA) {
sd->cache_nice_tries = 2;
@@ -7041,6 +7048,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
return child;

cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+ zalloc_cpumask_var(&sd->idle_cpus_mask, GFP_NOWAIT);
+ cpumask_and(sd->idle_cpus_mask, cpu_map, tl->mask(cpu));
if (child) {
sd->level = child->level + 1;
sched_domain_level_max = max(sched_domain_level_max, sd->level);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0c76505..cae6bd7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5026,7 +5026,7 @@ next:
* the package.
*/
if (package_sd && should_scan_idle(p, target)) {
- for_each_cpu_and(i, sched_domain_span(package_sd),
+ for_each_cpu_and(i, package_sd->idle_cpus_mask,
tsk_cpus_allowed(p)) {
if (idle_cpu(i)) {
target = i;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 544a713..7e34b42 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -202,6 +202,9 @@ DEFINE_PER_CPU(bool, cpu_dead_idle);
*/
static void cpu_idle_loop(void)
{
+ int cpu;
+ struct sched_domain *package_sd;
+
while (1) {
/*
* If the arch has a polling bit, we maintain an invariant:
@@ -212,10 +215,19 @@ static void cpu_idle_loop(void)
* guaranteed to cause the cpu to reschedule.
*/

+
__current_set_polling();
quiet_vmstat();
tick_nohz_idle_enter();

+ preempt_disable();
+ cpu = smp_processor_id();
+ package_sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (package_sd) {
+ cpumask_set_cpu(cpu, package_sd->idle_cpus_mask);
+ }
+ preempt_enable();
+
while (!need_resched()) {
check_pgt_cache();
rmb();
--
2.8.0.rc2