[RFC PATCH 05/11] sched: Rewrite select_idle_siblings()

From: Yuyang Du
Date: Thu Jun 16 2016 - 05:46:46 EST


From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>

select_idle_siblings() is a known pain point for a number of
workloads; it either does too much or not enough and sometimes just
does plain wrong.

This rewrite attempts to address a number of issues (but sadly not
all).

The current code does an unconditional sched_domain iteration; with
the intent of finding an idle core (on SMT hardware). The problems
which this patch tries to address are:

- its pointless to look for idle cores if the machine is real busy;
at which point you're just wasting cycles.

- it's behaviour is inconsistent between SMT and !SMT hardware in
that !SMT hardware ends up doing a scan for any idle CPU in the LLC
domain, while SMT hardware does a scan for idle cores and if that
fails, falls back to a scan for idle threads on the 'target' core.

The new code replaces the sched_domain scan with 3 explicit scans:

1) search for an idle core in the LLC
2) search for an idle CPU in the LLC
3) search for an idle thread in the 'target' core

where 1 and 3 are conditional on SMT support and 1 and 2 have runtime
heuristics to skip the step.

Step 1) is conditional on sd_llc_shared->has_idle_cores; when a cpu
goes idle and sd_llc_shared->has_idle_cores is false, we scan all SMT
siblings of the CPU going idle. Similarly, we clear
sd_llc_shared->has_idle_cores when we fail to find an idle core.

Step 2) tracks the average cost of the scan and compares this to the
average idle time guestimate for the CPU doing the wakeup. There is a
significant fudge factor involved to deal with the variability of the
averages. Esp. hackbench was sensitive to this.

Step 3) is unconditional; we assume (also per step 1) that scanning
all SMT siblings in a core is 'cheap'.

With this; SMT systems gain step 2, which cures a few benchmarks --
notably one from Facebook.

One 'feature' of the sched_domain iteration, which we preserve in the
new code, is that it would start scanning from the 'target' CPU,
instead of scanning the cpumask in cpu id order. This avoids multiple
CPUs in the LLC scanning for idle to gang up and find the same CPU
quite as much. The down side is that tasks can end up hopping across
the LLC for no apparent reason.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
include/linux/sched.h | 3 +
kernel/sched/core.c | 3 +
kernel/sched/fair.c | 270 ++++++++++++++++++++++++++++++++++++++--------
kernel/sched/idle_task.c | 4 +-
4 files changed, 233 insertions(+), 47 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 96b371c..d74e757 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1060,6 +1060,7 @@ struct sched_group;
struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
+ int has_idle_cores;
};

struct sched_domain {
@@ -1092,6 +1093,8 @@ struct sched_domain {
u64 max_newidle_lb_cost;
unsigned long next_decay_max_lb_cost;

+ u64 avg_scan_cost; /* select_idle_sibling */
+
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
unsigned int lb_count[CPU_MAX_IDLE_TYPES];
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bd313b2..e224581 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7342,6 +7342,7 @@ static struct kmem_cache *task_group_cache __read_mostly;
#endif

DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
+DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);

void __init sched_init(void)
{
@@ -7378,6 +7379,8 @@ void __init sched_init(void)
for_each_possible_cpu(i) {
per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+ per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
+ cpumask_size(), GFP_KERNEL, cpu_to_node(i));
}
#endif /* CONFIG_CPUMASK_OFFSTACK */

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0d8dad2..8335ed5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1501,8 +1501,15 @@ balance:
* One idle CPU per node is evaluated for a task numa move.
* Call select_idle_sibling to maybe find a better one.
*/
- if (!cur)
+ if (!cur) {
+ /*
+ * select_idle_siblings() uses an per-cpu cpumask that
+ * can be used from IRQ context.
+ */
+ local_irq_disable();
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+ local_irq_enable();
+ }

assign:
assigned = true;
@@ -4532,6 +4539,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}

#ifdef CONFIG_SMP
+
+/* Working cpumask for: load_balance, load_balance_newidle. */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+
#ifdef CONFIG_NO_HZ_COMMON
/*
* per rq 'load' arrray crap; XXX kill this.
@@ -5187,64 +5199,233 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
}

/*
- * Try and locate an idle CPU in the sched_domain.
+ * Implement a for_each_cpu() variant that starts the scan at a given cpu
+ * (@start), and wraps around.
+ *
+ * This is used to scan for idle CPUs; such that not all CPUs looking for an
+ * idle CPU find the same CPU. The down-side is that tasks tend to cycle
+ * through the LLC domain.
+ *
+ * Especially tbench is found sensitive to this.
+ */
+
+static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
+{
+ int next;
+
+again:
+ next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
+
+ if (*wrapped) {
+ if (next >= start)
+ return nr_cpumask_bits;
+ } else {
+ if (next >= nr_cpumask_bits) {
+ *wrapped = 1;
+ n = -1;
+ goto again;
+ }
+ }
+
+ return next;
+}
+
+#define for_each_cpu_wrap(cpu, mask, start, wrap) \
+ for ((wrap) = 0, (cpu) = (start)-1; \
+ (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
+ (cpu) < nr_cpumask_bits; )
+
+#ifdef CONFIG_SCHED_SMT
+
+static inline void set_idle_cores(int cpu, int val)
+{
+ struct sched_domain_shared *sds;
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds)
+ WRITE_ONCE(sds->has_idle_cores, val);
+}
+
+static inline bool test_idle_cores(int cpu, bool def)
+{
+ struct sched_domain_shared *sds;
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds)
+ return READ_ONCE(sds->has_idle_cores);
+
+ return def;
+}
+
+/*
+ * Scans the local SMT mask to see if the entire core is idle, and records this
+ * information in sd_llc_shared->has_idle_cores.
+ *
+ * Since SMT siblings share all cache levels, inspecting this limited remote
+ * state should be fairly cheap.
+ */
+void update_idle_core(struct rq *rq)
+{
+ int core = cpu_of(rq);
+ int cpu;
+
+ rcu_read_lock();
+ if (test_idle_cores(core, true))
+ goto unlock;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ if (cpu == core)
+ continue;
+
+ if (!idle_cpu(cpu))
+ goto unlock;
+ }
+
+ set_idle_cores(core, 1);
+unlock:
+ rcu_read_unlock();
+}
+
+/*
+ * Scan the entire LLC domain for idle cores; this dynamically switches off if there are
+ * no idle cores left in the system; tracked through sd_llc->shared->has_idle_cores
+ * and enabled through update_idle_core() above.
+ */
+static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ int core, cpu, wrap;
+
+ if (!test_idle_cores(target, false))
+ return -1;
+
+ cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+
+ for_each_cpu_wrap(core, cpus, target, wrap) {
+ bool idle = true;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ cpumask_clear_cpu(cpu, cpus);
+ if (!idle_cpu(cpu))
+ idle = false;
+ }
+
+ if (idle)
+ return core;
+ }
+
+ /*
+ * Failed to find an idle core; stop looking for one.
+ */
+ set_idle_cores(target, 0);
+
+ return -1;
+}
+
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ int cpu;
+
+ for_each_cpu(cpu, cpu_smt_mask(target)) {
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(cpu))
+ return cpu;
+ }
+
+ return -1;
+}
+
+#else /* CONFIG_SCHED_SMT */
+
+void update_idle_core(struct rq *rq) { }
+
+static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ return -1;
+}
+
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ return -1;
+}
+
+#endif /* CONFIG_SCHED_SMT */
+
+/*
+ * Scan the LLC domain for idle CPUs; this is dynamically regulated by
+ * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
+ * average idle time for this rq (as found in rq->avg_idle).
+ */
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ u64 avg_idle = this_rq()->avg_idle;
+ u64 avg_cost = this_sd->avg_scan_cost;
+ u64 time, cost;
+ s64 delta;
+ int cpu, wrap;
+
+ /*
+ * Due to large variance we need a large fuzz factor; hackbench in
+ * particularly is sensitive here.
+ */
+ if ((avg_idle / 512) < avg_cost)
+ return -1;
+
+ time = local_clock();
+
+ for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(cpu))
+ break;
+ }
+
+ time = local_clock() - time;
+ cost = this_sd->avg_scan_cost;
+ delta = (s64)(time - cost) / 8;
+ this_sd->avg_scan_cost += delta;
+
+ return cpu;
+}
+
+/*
+ * Try and locate an idle core/thread in the LLC cache domain.
*/
static int select_idle_sibling(struct task_struct *p, int target)
{
struct sched_domain *sd;
- struct sched_group *sg;
int i = task_cpu(p);

if (idle_cpu(target))
return target;

/*
- * If the prevous cpu is cache affine and idle, don't be stupid.
+ * If the previous cpu is cache affine and idle, don't be stupid.
*/
if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
return i;

- /*
- * Otherwise, iterate the domains and find an eligible idle cpu.
- *
- * A completely idle sched group at higher domains is more
- * desirable than an idle group at a lower level, because lower
- * domains have smaller groups and usually share hardware
- * resources which causes tasks to contend on them, e.g. x86
- * hyperthread siblings in the lowest domain (SMT) can contend
- * on the shared cpu pipeline.
- *
- * However, while we prefer idle groups at higher domains
- * finding an idle cpu at the lowest domain is still better than
- * returning 'target', which we've already established, isn't
- * idle.
- */
sd = rcu_dereference(per_cpu(sd_llc, target));
- for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
-
- /* Ensure the entire group is idle */
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
- goto next;
- }
+ if (!sd)
+ return target;
+
+ i = select_idle_core(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ i = select_idle_cpu(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ i = select_idle_smt(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;

- /*
- * It doesn't matter which cpu we pick, the
- * whole group is idle.
- */
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
- }
-done:
return target;
}

@@ -7276,9 +7457,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
#define MAX_PINNED_INTERVAL 512

-/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-
static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 2ce5458..5baf75c 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -23,11 +23,13 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
resched_curr(rq);
}

+extern void update_idle_core(struct rq *rq);
+
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
put_prev_task(rq, prev);
-
+ update_idle_core(rq);
schedstat_inc(rq, sched_goidle);
return rq->idle;
}
--
1.7.9.5