[patch] sched/fair: Add SIS_MIN_LAT feature to mitigate PELT lag induced stacking

From: Mike Galbraith
Date: Mon Jan 01 2018 - 05:38:19 EST


The problem: the scheduler wanting to stack tasks way too deeply has
become far too common a behavior, and IMHO a very, very bad one.

While booting...
systemd-udevd-529 [001] d... 17.425644: select_task_rq_fair: NAK systemd-udevd:818 rejects cpu:7:w3072:r:3 -> cpu:1:w4096:r:4
systemd-udevd-529 [001] d... 17.425657: select_task_rq_fair: NAK systemd-udevd:585 rejects cpu:4:w4096:r:4 -> cpu:1:w6144:r:6
systemd-udevd-529 [001] d... 17.425663: select_task_rq_fair: NAK systemd-udevd:638 rejects cpu:4:w5120:r:5 -> cpu:1:w6144:r:6
systemd-udevd-529 [001] d... 17.425665: select_task_rq_fair: NAK systemd-udevd:602 rejects cpu:2:w2048:r:2 -> cpu:1:w6144:r:6
systemd-udevd-529 [001] d... 17.425674: select_task_rq_fair: NAK systemd-udevd:709 rejects cpu:6:w5120:r:5 -> cpu:1:w6144:r:6
systemd-udevd-529 [001] d... 17.425686: select_task_rq_fair: NAK systemd-udevd:787 rejects cpu:4:w7168:r:7 -> cpu:1:w8192:r:8
systemd-udevd-529 [001] d... 17.425691: select_task_rq_fair: NAK systemd-udevd:595 rejects cpu:2:w3072:r:3 -> cpu:1:w8192:r:8
systemd-udevd-529 [001] d... 17.425702: select_task_rq_fair: NAK systemd-udevd:735 rejects cpu:7:w5120:r:5 -> cpu:1:w9216:r:9
systemd-udevd-529 [001] d... 17.425708: select_task_rq_fair: NAK systemd-udevd:697 rejects cpu:6:w6144:r:6 -> cpu:1:w10240:r:10
systemd-udevd-529 [001] d... 17.425720: select_task_rq_fair: NAK systemd-udevd:772 rejects cpu:7:w6144:r:6 -> cpu:1:w12288:r:12
^^^^^^^^^^^^^^^^^ wheee!

A git workload in an nfs mount while make -j4 runs and I watch a youtube
clip...

git-14582 [004] d... 1935.676996: select_task_rq_fair: NAK git:14568 rejects cpu:0:w2048:r:2 -> cpu:4:w5120:r:5
MediaPl~back #4-8585 [002] d... 1935.676998: select_task_rq_fair: NAK MediaPl~back #1:8582 rejects cpu:5:w1024:r:1 -> cpu:2:w3072:r:3
git-14571 [004] d... 1935.677006: select_task_rq_fair: NAK git:14568 rejects cpu:0:w2048:r:2 -> cpu:4:w4096:r:4
nfsd-1887 [004] d... 1935.677316: select_task_rq_fair: NAK nfsd:1888 rejects cpu:0:w2048:r:2 -> cpu:4:w5120:r:5
git-14585 [000] d... 1935.678344: select_task_rq_fair: NAK git:14571 rejects cpu:4:w2048:r:2 -> cpu:0:w6144:r:6
git-14573 [000] d... 1935.678458: select_task_rq_fair: NAK git:14580 rejects cpu:4:w2048:r:2 -> cpu:0:w4096:r:4
git-14570 [000] d... 1935.681195: select_task_rq_fair: NAK git:14585 rejects cpu:4:w3072:r:3 -> cpu:0:w5120:r:5
git-14585 [000] d... 1935.682484: select_task_rq_fair: NAK git:14583 rejects cpu:4:w2048:r:2 -> cpu:0:w4096:r:4
git-14581 [000] d... 1935.682745: select_task_rq_fair: NAK git:14573 rejects cpu:4:w1024:r:1 -> cpu:0:w4096:r:4
git-14566 [004] d... 1935.683435: select_task_rq_fair: NAK git:14581 rejects cpu:0:w2048:r:2 -> cpu:4:w5120:r:5
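
(For decoding the NAK lines: the instrumentation emitting them isn't in
tree; it's roughly the hypothetical trace_printk() sketched below, where
w looks like the runqueue's scale_load_down() weight, i.e. 1024 per
nice-0 task, r its nr_running, with the wakee's previous CPU left of the
arrow and the wake_affine() target on the right.)

	/*
	 * Hypothetical debug sketch, not part of the patch below: log the
	 * wake_affine() approved placements that the weight check objects to.
	 */
	if (w_dst >= w_src)
		trace_printk("NAK %s:%d rejects cpu:%d:w%lu:r:%u -> cpu:%d:w%lu:r:%u\n",
			     p->comm, p->pid,
			     prev, w_src, cpu_rq(prev)->nr_running,
			     target, w_dst, cpu_rq(target)->nr_running);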

None of that is particularly wonderful. I could show much more, and
piles of numbers, but I'll refrain. My intended audience isn't dense,
and besides, I'm fishing.

Here 0-day bot, here boy.. tasty patch just below.. yum yum :)

----- dinky, trivial, effective, and self-explanatory patch below -----

Mitigate PELT lag induced latencies: when wake_affine() approves a
migration but there is no idle CPU near the target, do not blindly migrate
the wakee to a CPU that is busier than the one it is already on, as doing
so leads to excessive stacking depth for userspace AND kernel threads.
Serializing anything unnecessarily is bad; doing it to our own threads is
dainbramaged.

select_idle_sibling() is intended to be a utilization optimization; let
it take an active role in avoiding utilization pessimization as well.
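
To put numbers on that (illustration only: reading the w fields in the
traces above as scale_load_down() runqueue weights, 1024 per nice-0 task,
with the wakee's previous CPU left of the arrow and the wake_affine()
target on the right), the core checks added below boil down to:

	/*
	 * Throwaway user-space illustration, NOT kernel code: the weight
	 * comparisons the patch adds to select_idle_sibling(), fed with the
	 * numbers from the "rejects cpu:4:w4096:r:4 -> cpu:1:w6144:r:6"
	 * line above.  The cross-node and heavily-loaded escape hatches
	 * are omitted for brevity.
	 */
	#include <stdio.h>

	int main(void)
	{
		unsigned long w_src = 4096;	/* prev cpu: 4 nice-0 tasks */
		unsigned long w_dst = 6144;	/* wake_affine() target: 6 tasks */
		unsigned long w_p   = 1024;	/* the wakee itself, nice-0 */

		if (w_dst < w_src)
			printf("ACK: target is the lighter queue, migrate\n");
		else if (w_dst > w_src + w_p)
			printf("NAK: stay on prev, hold LB off the wakee briefly\n");
		else
			printf("wash: stay on prev, no LB hold-off\n");
		return 0;
	}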

Signed-off-by: Mike Galbraith <efault@xxxxxx>
---
include/linux/sched.h | 3 +-
kernel/sched/fair.c | 70 +++++++++++++++++++++++++++++++++++++++++++++---
kernel/sched/features.h | 4 ++
3 files changed, 72 insertions(+), 5 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -541,7 +541,6 @@ struct task_struct {
unsigned int ptrace;

#ifdef CONFIG_SMP
- struct llist_node wake_entry;
int on_cpu;
#ifdef CONFIG_THREAD_INFO_IN_TASK
/* Current CPU: */
@@ -549,8 +548,10 @@ struct task_struct {
#endif
unsigned int wakee_flips;
unsigned long wakee_flip_decay_ts;
+ u64 wakee_placed;
struct task_struct *last_wakee;

+ struct llist_node wake_entry;
int wake_cpu;
#endif
int on_rq;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6185,13 +6185,23 @@ static int select_idle_cpu(struct task_s
return cpu;
}

+static int task_recently_placed(struct task_struct *p)
+{
+ s64 cold = sysctl_sched_migration_cost;
+
+ if (!sched_feat(SIS_MIN_LAT) || cold <= 0)
+ return 0;
+ return task_rq(p)->clock_task - p->wakee_placed < cold;
+}
+
/*
* Try and locate an idle core/thread in the LLC cache domain.
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
- int i;
+ unsigned long w_src, w_dst, w_p;
+ int i, llc_affine = cpus_share_cache(prev, target);

if (idle_cpu(target))
return target;
@@ -6199,7 +6209,7 @@ static int select_idle_sibling(struct ta
/*
* If the previous cpu is cache affine and idle, don't be stupid.
*/
- if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+ if (prev != target && llc_affine && idle_cpu(prev))
return prev;

sd = rcu_dereference(per_cpu(sd_llc, target));
@@ -6218,7 +6228,55 @@ static int select_idle_sibling(struct ta
if ((unsigned)i < nr_cpumask_bits)
return i;

- return target;
+ if (prev == target || !sched_feat(SIS_MIN_LAT))
+ return target;
+
+ /*
+ * Given there is zero temporal information encoded in PELT data,
+ * placement based solely upon laggy util averages will inevitably
+ * lead to some latency-inducing task stacking; the less idle time
+ * available for distribution to sleepers, the more they will stack
+ * if we blindly return target simply because wake_affine() ACKs.
+ * Do some sanity checking, and migrate IFF that benefits the wakee.
+ */
+
+ w_src = scale_load_down(cpu_rq(prev)->load.weight);
+ w_dst = scale_load_down(cpu_rq(target)->load.weight);
+
+ /* Cool, PELT approved a likely latency winner, take it. */
+ if (w_dst < w_src)
+ return target;
+
+ w_p = scale_load_down(p->se.load.weight);
+
+ /*
+ * Be a bit liberal with cross-node migrations, but keep it reasonable.
+ * We can't scale without using the interconnect, so don't let laggy
+ * PELT data cause tasks to migrate too madly and stack too deeply.
+ */
+ if (!llc_affine && w_dst <= 2 * w_p + w_src)
+ return target;
+
+ /*
+ * When heavily loaded, stacking may be the best/only latency option,
+ * and LAST_BUDDY the only hope of preventing throughput collapse.
+ */
+ if (unlikely(task_cfs_rq(current)->nr_running >= sched_nr_latency) &&
+ task_cfs_rq(p)->nr_running >= sched_nr_latency)
+ return target;
+
+ /*
+ * Reject harmful migrations, and ask LB to please leave the wakee
+ * alone for a bit so it can perhaps grow some utilization that may
+ * bring PELT numbers closer to reflecting reality. We have to nip
+ * stack building in the bud, before it gets out of hand. Once built,
+ * LB is the only way out, but it wants to stack light tasks, which
+ * stifles their utilization growth. Tap the LB brake pedal.
+ */
+ if (w_dst > w_src + w_p)
+ p->wakee_placed = cpu_rq(prev)->clock_task;
+
+ return prev;
}

/*
@@ -6996,6 +7054,9 @@ static int task_hot(struct task_struct *
if (sysctl_sched_migration_cost == 0)
return 0;

+ if (task_recently_placed(p) && env->idle != CPU_NEWLY_IDLE)
+ return 1;
+
delta = rq_clock_task(env->src_rq) - p->se.exec_start;

return delta < (s64)sysctl_sched_migration_cost;
@@ -7100,7 +7161,8 @@ int can_migrate_task(struct task_struct

/* Prevent to re-select dst_cpu via env's cpus */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
+ if (cpumask_test_cpu(cpu, &p->cpus_allowed) &&
+ !task_recently_placed(p)) {
env->flags |= LBF_DST_PINNED;
env->new_dst_cpu = cpu;
break;
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -57,6 +57,10 @@ SCHED_FEAT(TTWU_QUEUE, true)
*/
SCHED_FEAT(SIS_AVG_CPU, false)
SCHED_FEAT(SIS_PROP, true)
+/*
+ * When doing wakeups, attempt to mitigate PELT util lag induced latencies.
+ */
+SCHED_FEAT(SIS_MIN_LAT, true)

/*
* Issue a WARN when we do multiple update_rq_clock() calls