[PATCH v2] sched: tweak select_idle_sibling to look for idle threads

From: Chris Mason
Date: Fri Apr 08 2016 - 16:18:20 EST


select_task_rq_fair() can leave cpu utilization a little lumpy,
especially as the workload ramps up to the maximum capacity of the
machine. The end result can be high p99 response times as apps
wait to get scheduled, even when boxes are mostly idle.

I wrote schbench to try and measure this:

git://git.kernel.org/pub/scm/linux/kernel/git/mason/schbench.git

The basic idea is to record the latency between when a thread is kicked
and when it actually gets the CPU. For this patch I used a simple model
where a thread thinks for a while and then waits for data from another
thread. The command line below will start two messenger threads with 18
workers per messenger:

./schbench -m 2 -t 18 -s 30000 -c 30000 -r 30
Latency percentiles (usec)
50.0000th: 52
75.0000th: 63
90.0000th: 74
95.0000th: 80
*99.0000th: 118
99.5000th: 707
99.9000th: 5016
Over=0, min=0, max=12941

p99 numbers here are acceptable. But if I use 19 workers per messenger,
p99 goes through the roof. This machine has two sockets, 10 cores each,
so with HT on, this commandline has one pthread on each CPU:

./schbench -m 2 -t 19 -s 30000 -c 30000 -r 30
Latency percentiles (usec)
50.0000th: 51
75.0000th: 63
90.0000th: 76
95.0000th: 89
*99.0000th: 2132
99.5000th: 6920
99.9000th: 12752
Over=0, min=0, max=17079

This commit tries to solve things by doing an extra scan in
select_idle_sibling().select_idle_sibling If it can find an idle
sibling in any core in the package, it will return that:

./schbench -m2 -t 19 -c 30000 -s 30000 -r 30
Latency percentiles (usec)
50.0000th: 65
75.0000th: 104
90.0000th: 115
95.0000th: 118
*99.0000th: 124
99.5000th: 127
99.9000th: 262
Over=0, min=0, max=12987

This basically means the whole fleet can have one more pthread per socket
and still maintain acceptable latencies. I can actually go up to -t 20,
but the curve starts getting steep.

./schbench -m2 -t 20 -c 30000 -s 30000 -r 30
Latency percentiles (usec)
50.0000th: 50
75.0000th: 63
90.0000th: 75
95.0000th: 81
*99.0000th: 111
99.5000th: 975
99.9000th: 12464
Over=0, min=0, max=18317

This does preserve the existing logic to prefer idle cores over idle
CPU threads, and includes some tests to try and avoid the idle scan when we're
actually better off sharing a non-idle CPU with someone else.

Benchmarks in production show average duration of a request is 13%
faster, with overall capacity going up between 2-5% depending on the
metric.

Credits to Arun Sharma <asharma@xxxxxx> for initial versions of this
patch.

Signed-off-by: Chris Mason <clm@xxxxxx>
---
v1 -> v2 call task_hot to decide if we should scan for idle cpus
kernel/sched/fair.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 57 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56b7d4b..0c76505 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -658,6 +658,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
#ifdef CONFIG_SMP
static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);
+static int should_scan_idle(struct task_struct *p, int cpu);

/*
* We choose a half-life close to 1 scheduling period.
@@ -4974,6 +4975,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
static int select_idle_sibling(struct task_struct *p, int target)
{
struct sched_domain *sd;
+ struct sched_domain *package_sd;
struct sched_group *sg;
int i = task_cpu(p);

@@ -4989,7 +4991,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
*/
- sd = rcu_dereference(per_cpu(sd_llc, target));
+ package_sd = rcu_dereference(per_cpu(sd_llc, target));
+ sd = package_sd;
for_each_lower_domain(sd) {
sg = sd->groups;
do {
@@ -4998,7 +5001,12 @@ static int select_idle_sibling(struct task_struct *p, int target)
goto next;

for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
+ /*
+ * we tested target for idle up above,
+ * but don't skip it here because it might
+ * have raced to idle while we were scanning
+ */
+ if (!idle_cpu(i))
goto next;
}

@@ -5009,6 +5017,24 @@ next:
sg = sg->next;
} while (sg != sd->groups);
}
+
+ /*
+ * we're here because we didn't find an idle core, or an idle sibling
+ * in the target core. For message bouncing workloads, we want to
+ * just stick with the target suggestion from the caller, but
+ * otherwise we'd rather have an idle CPU from anywhere else in
+ * the package.
+ */
+ if (package_sd && should_scan_idle(p, target)) {
+ for_each_cpu_and(i, sched_domain_span(package_sd),
+ tsk_cpus_allowed(p)) {
+ if (idle_cpu(i)) {
+ target = i;
+ break;
+ }
+
+ }
+ }
done:
return target;
}
@@ -5714,6 +5740,35 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
return delta < (s64)sysctl_sched_migration_cost;
}

+/*
+ * helper for select_idle_sibling to decide if it should look for idle
+ * threads
+ */
+static int should_scan_idle(struct task_struct *p, int cpu)
+{
+ unsigned long flags;
+ struct lb_env env;
+ int hot;
+
+ /*
+ * as the run queue gets bigger, its more and more likely that
+ * balance will have distributed things for us, and less likely
+ * that scanning all our CPUs for an idle one will find one.
+ * So, if nr_running > 1, just call this CPU good enough
+ */
+ if (cpu_rq(cpu)->cfs.nr_running > 1)
+ return 0;
+
+ env.src_rq = task_rq(p);
+ env.dst_rq = cpu_rq(cpu);
+ raw_spin_lock_irqsave(&env.src_rq->lock, flags);
+ hot = task_hot(p, &env);
+ raw_spin_unlock_irqrestore(&env.src_rq->lock, flags);
+
+ return hot == 0;
+}
+
+
#ifdef CONFIG_NUMA_BALANCING
/*
* Returns 1, if task migration degrades locality
--
2.8.0.rc2