[RFC][PATCH 8/7] sched/fair: Use utilization distance to filter affine sync wakeups

From: Mike Galbraith
Date: Wed May 18 2016 - 01:51:47 EST


On Mon, 2016-05-09 at 12:48 +0200, Peter Zijlstra wrote:
> Hai,

(got some of the frozen variety handy? :)

> here be a semi coherent patch series for the recent select_idle_siblings()
> tinkering. Happy benchmarking..

And tinkering on top of your rewrite series...

sched/fair: Use utilization distance to filter affine sync wakeups

Identifying truly synchronous tasks accurately is annoyingly fragile,
which led to the demise of the old avg_overlap heuristic.  Since then
we schedule high frequency localhost communicating buddies to L3 vs
L2, causing them to take painful cache misses needlessly.

To combat this, track average utilization distance, and when both
waker and wakee are short duration tasks cycling at ~the same
frequency (i.e. they can't have any appreciable reclaimable overlap)
and the sync hint has been passed, take that as a cue that pulling
the wakee to hot L2 is very likely to be a win.  Changes in behavior,
such as taking a long nap, bursts of other activity, or sharing the
rq with tasks that are not cycling rapidly, will quickly encourage
the pair to search for a new home, where they can again find each
other.
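
To put a number on "short duration, cycling at ~the same frequency":
util_dist_us starts out at a full second, and gets pulled toward each
per-update delta by the same 1/8th-weight EWMA that update_avg()
applies, so only a pair that has been cycling every few microseconds
for a while can pass the sum <= 10us / diff <= 2us test in
select_idle_sync() below.  A throwaway userspace sketch of that
arithmetic (not part of the patch, values purely illustrative):

/* throwaway userspace sketch of the util_dist_us math, not kernel code */
#include <stdio.h>
#include <stdint.h>

/* same 1/8th-weight EWMA as update_avg() in the patch */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = sample - *avg;

	*avg += diff >> 3;
}

/* mirrors the eligibility test in select_idle_sync() */
static int sync_pair_fits_l2(uint64_t waker_us, uint64_t wakee_us)
{
	uint64_t max = waker_us > wakee_us ? waker_us : wakee_us;
	uint64_t min = waker_us > wakee_us ? wakee_us : waker_us;

	return !(max + min > 10 || max - min > 2);
}

int main(void)
{
	/* fresh tasks start pessimistic: one full second of "distance" */
	uint64_t waker = 1000000, wakee = 1000000;
	int i;

	/* a localhost ping-pong pair cycling every ~3us converges fast */
	for (i = 0; i < 200; i++) {
		update_avg(&waker, 3);
		update_avg(&wakee, 3);
	}
	printf("waker=%llu wakee=%llu fits L2: %d\n",
	       (unsigned long long)waker, (unsigned long long)wakee,
	       sync_pair_fits_l2(waker, wakee));

	/* one long nap (100ms sample) immediately disqualifies the pair */
	update_avg(&wakee, 100000);
	printf("after nap, wakee=%llu fits L2: %d\n",
	       (unsigned long long)wakee, sync_pair_fits_l2(waker, wakee));

	return 0;
}

Two hundred updates at a ~3us cadence drag both averages down to 3us
and the pair qualifies; a single 100ms nap shoves the wakee's average
back up to ~12.5ms, disqualifying it until it earns its way back
down.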

This only helps really fast movers, but that's ok (if we can get
away with it at all), as these are the ones that need some help.

It's dirt simple, cheap, and seems to work pretty well.  It does
help fast movers, does not wreck the lmbench AF_UNIX/TCP throughput
gains that select_idle_sibling() provided, and didn't change pgbench
numbers one bit on my desktop box, i.e. the tight discrimination
criteria seem to work out ok in light testing, so it's _maybe_ not
completely useless...
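
For A/B testing in place: the filter sits behind the IDLE_SYNC
feature bit (default on), so with CONFIG_SCHED_DEBUG it can be
flipped at runtime via
echo NO_IDLE_SYNC > /sys/kernel/debug/sched_features.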

4 x E7-8890 tbench
Throughput 598.158 MB/sec 1 clients 1 procs max_latency=0.287 ms 1.000
Throughput 1166.26 MB/sec 2 clients 2 procs max_latency=0.076 ms 1.000
Throughput 2214.55 MB/sec 4 clients 4 procs max_latency=0.087 ms 1.000
Throughput 4264.44 MB/sec 8 clients 8 procs max_latency=0.164 ms 1.000
Throughput 7780.58 MB/sec 16 clients 16 procs max_latency=0.109 ms 1.000
Throughput 15199.3 MB/sec 32 clients 32 procs max_latency=0.293 ms 1.000
Throughput 21714.8 MB/sec 64 clients 64 procs max_latency=0.872 ms 1.000
Throughput 44916.1 MB/sec 128 clients 128 procs max_latency=4.821 ms 1.000
Throughput 76294.5 MB/sec 256 clients 256 procs max_latency=7.375 ms 1.000

+IDLE_SYNC
Throughput 737.781 MB/sec 1 clients 1 procs max_latency=0.248 ms 1.233
Throughput 1478.49 MB/sec 2 clients 2 procs max_latency=0.321 ms 1.267
Throughput 2506.98 MB/sec 4 clients 4 procs max_latency=0.413 ms 1.132
Throughput 4359.15 MB/sec 8 clients 8 procs max_latency=0.306 ms 1.022
Throughput 9025.05 MB/sec 16 clients 16 procs max_latency=0.349 ms 1.159
Throughput 18703.1 MB/sec 32 clients 32 procs max_latency=0.290 ms 1.230
Throughput 33600.8 MB/sec 64 clients 64 procs max_latency=6.469 ms 1.547
Throughput 59084.3 MB/sec 128 clients 128 procs max_latency=5.031 ms 1.315
Throughput 75705.8 MB/sec 256 clients 256 procs max_latency=24.113 ms 0.992

1 x i7-4790 lmbench3
*Local* Communication bandwidths in MB/s - bigger is better
-----------------------------------------------------------------------------
Host      OS            Pipe AF   TCP  File   Mmap   Bcopy  Bcopy  Mem  Mem
                             UNIX      reread reread (libc) (hand) read write
--------- ------------- ---- ---- ---- ------ ------ ------ ------ ---- -----
IDLE_CORE+IDLE_CPU+IDLE_SMT
homer     4.6.0-masterx 6027 14.K 9773 8905.2  15.2K  10.1K 6775.0 15.K 10.0K
homer     4.6.0-masterx 5962 14.K 9881 8900.7  15.0K  10.1K 6785.2 15.K 10.0K
homer     4.6.0-masterx 5935 14.K 9917 8946.2  15.0K  10.1K 6761.8 15.K 9826.
+IDLE_SYNC
homer     4.6.0-masterx 8865 14.K 9807 8880.6  14.7K  10.1K 6777.9 15.K 9966.
homer     4.6.0-masterx 8855 13.K 9856 8844.5  15.2K  10.1K 6752.1 15.K 10.0K
homer     4.6.0-masterx 8896 14.K 9836 8880.1  15.0K  10.2K 6771.6 15.K 9941.
                        ^++  ^+-  ^+-
select_idle_sibling() completely disabled
homer     4.6.0-masterx 8810 9807 7109 8982.8  15.4K  10.2K 6831.7 15.K 10.1K
homer     4.6.0-masterx 8877 9757 6864 8970.1  15.3K  10.2K 6826.6 15.K 10.1K
homer     4.6.0-masterx 8779 9736 10.K 8975.6  15.4K  10.1K 6830.2 15.K 10.1K
                        ^++  ^--  ^--

Signed-off-by: Mike Galbraith <umgwanakikbuti@xxxxxxxxx>
---
 include/linux/sched.h   |    2 -
 kernel/sched/core.c     |    6 -----
 kernel/sched/fair.c     |   49 +++++++++++++++++++++++++++++++++++++++++-------
 kernel/sched/features.h |    1
 kernel/sched/sched.h    |    7 ++++++
 5 files changed, 51 insertions(+), 14 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1302,7 +1302,7 @@ struct load_weight {
  * issues.
  */
 struct sched_avg {
-	u64 last_update_time, load_sum;
+	u64 last_update_time, load_sum, util_dist_us;
 	u32 util_sum, period_contrib;
 	unsigned long load_avg, util_avg;
 };
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1706,12 +1706,6 @@ int select_task_rq(struct task_struct *p
 	return cpu;
 }
 
-static void update_avg(u64 *avg, u64 sample)
-{
-	s64 diff = sample - *avg;
-	*avg += diff >> 3;
-}
-
 #else
 
 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -658,7 +658,7 @@ static u64 sched_vslice(struct cfs_rq *c
 }
 
 #ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int cpu, int sync);
 static unsigned long task_h_load(struct task_struct *p);
 
 /*
@@ -689,6 +689,7 @@ void init_entity_runnable_average(struct
 	 */
 	sa->util_avg = 0;
 	sa->util_sum = 0;
+	sa->util_dist_us = USEC_PER_SEC;
 	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }

@@ -1509,7 +1510,7 @@ static void task_numa_compare(struct tas
 		 * can be used from IRQ context.
 		 */
 		local_irq_disable();
-		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu, 0);
 		local_irq_enable();
 	}

@@ -2734,6 +2735,13 @@ __update_load_avg(u64 now, int cpu, stru
 		return 0;
 	sa->last_update_time = now;
 
+	/*
+	 * Track avg utilization distance for select_idle_sibling() to try
+	 * to identify short running synchronous tasks that will perform
+	 * much better when migrated to tasty hot L2 data.
+	 */
+	update_avg(&sa->util_dist_us, min(delta, (u64)USEC_PER_SEC));
+
 	scale_freq = arch_scale_freq_capacity(NULL, cpu);
 	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);

@@ -5408,17 +5416,43 @@ static int select_idle_cpu(struct task_s
 	return cpu;
 }
 
+static int select_idle_sync(struct task_struct *p, int target)
+{
+	u64 max = current->se.avg.util_dist_us;
+	u64 min = p->se.avg.util_dist_us;
+
+	if (min > max)
+		swap(min, max);
+
+	/*
+	 * If waker/wakee are both short duration and very similar,
+	 * prefer L2.  If either spends much time waiting or changes
+	 * behavior, utilization distances should quickly increase,
+	 * rendering them ineligible.
+	 */
+	if (max + min > 10 || max - min > 2)
+		return -1;
+	return target;
+}
+
 /*
  * Try and locate an idle core/thread in the LLC cache domain.
  */
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int target, int sync)
 {
 	struct sched_domain *sd;
-	int start, i = task_cpu(p);
+	int start, i;
 
 	if (idle_cpu(target))
 		return target;
 
+	if (sched_feat(IDLE_SYNC) && sync) {
+		i = select_idle_sync(p, target);
+		if (i != -1)
+			return i;
+	}
+
+	i = task_cpu(p);
 	/*
 	 * If the previous cpu is cache affine and idle, don't be stupid.
 	 */
@@ -5573,9 +5607,10 @@ select_task_rq_fair(struct task_struct *
 	}
 
 	if (!sd) {
-		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
-			new_cpu = select_idle_sibling(p, new_cpu);
-
+		if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
+			sync &= want_affine;
+			new_cpu = select_idle_sibling(p, new_cpu, sync);
+		}
 	} else while (sd) {
 		struct sched_group *group;
 		int weight;
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -78,4 +78,5 @@ SCHED_FEAT(AVG_CPU, true)
 SCHED_FEAT(PRINT_AVG, false)
 
 SCHED_FEAT(IDLE_SMT, true)
+SCHED_FEAT(IDLE_SYNC, true)

--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1848,3 +1848,10 @@ static inline void update_idle_core(stru
 #else
 static inline void update_idle_core(struct rq *rq) { }
 #endif
+
+static inline void update_avg(u64 *avg, u64 sample)
+{
+	s64 diff = sample - *avg;
+	*avg += diff >> 3;
+}
+