[RFC PATCH 11/11] sched/fair: Refactor select_task_rq_fair()
From: Yuyang Du
Date: Thu Jun 16 2016 - 05:47:01 EST
This refactoring attempts to achieve:
- Decouple waker/wakee with the three kinds of wakeup SD_* flags.
- Form a complete topology view in the select
- Determine fast idle select vs. slow avg select with more info
To enable this refactoring:
echo NEW_SELECT > sched_features
Signed-off-by: Yuyang Du <yuyang.du@xxxxxxxxx>
---
include/linux/sched.h | 1 +
kernel/sched/fair.c | 284 ++++++++++++++++++++++++++++++++++++++++++++++-
kernel/sched/features.h | 1 +
3 files changed, 285 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4b2a0fa..c4b4b90 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1475,6 +1475,7 @@ struct task_struct {
struct llist_node wake_entry;
int on_cpu;
unsigned int wakee_flips;
+ unsigned int wakee_count;
unsigned long wakee_flip_decay_ts;
struct task_struct *last_wakee;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f15461f..1ab41b8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4986,12 +4986,14 @@ static void record_wakee(struct task_struct *p)
*/
if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
current->wakee_flips >>= 1;
+ current->wakee_count >>= 1;
current->wakee_flip_decay_ts = jiffies;
}
if (current->last_wakee != p) {
current->last_wakee = p;
current->wakee_flips++;
+ current->wakee_count++;
}
}
@@ -5025,6 +5027,45 @@ static int wake_wide(struct task_struct *p)
return 1;
}
+#define TP_IDENT 0x1 /* waker is wakee */
+#define TP_SMT 0x2 /* SMT sibling */
+#define TP_LLC 0x4 /* cpus_share_cache(waker, wakee) */
+#define TP_SHARE_CACHE 0x7 /* waker and wakee share cache */
+#define TP_LLC_SOCK 0x10 /* all CPUs have complete LLC coverage */
+#define TP_NOLLC_SOCK 0x20 /* !TP_LLC_SOCK */
+#define TP_NOLLC_XSOCK 0x40 /* cross socket */
+
+static int wake_wide2(int topology, struct task_struct *p)
+{
+ unsigned int master = current->wakee_count;
+ unsigned int slave = p->wakee_count;
+ unsigned int master_flips = current->wakee_flips;
+ unsigned int slave_flips = p->wakee_flips;
+ int factor = this_cpu_read(sd_llc_size);
+
+ if (!master_flips)
+ master /= master_flips;
+ else
+ master = !!master;
+
+ if (!slave_flips)
+ slave /= slave_flips;
+ else
+ slave = !!slave;
+
+ if (master < slave)
+ swap(master, slave);
+
+ /*
+ * The system is either TP_NOLLC_SOCK or TP_NOLLC_XSOCK, the waker
+ * and wakee may still share some cache though.
+ */
+ if (topology | TP_SHARE_CACHE)
+ return master > factor;
+
+ return slave > factor;
+}
+
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
s64 this_load, load;
@@ -5399,6 +5440,20 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
return cpu;
}
+static int select_idle_cpu2(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ int cpu, wrap;
+
+ for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(cpu))
+ break;
+ }
+
+ return cpu;
+}
+
/*
* Try and locate an idle core/thread in the LLC cache domain.
*/
@@ -5424,7 +5479,10 @@ static int select_idle_sibling(struct task_struct *p, int target)
if ((unsigned)i < nr_cpumask_bits)
return i;
- i = select_idle_cpu(p, sd, target);
+ if (sched_feat(NEW_SELECT))
+ i = select_idle_cpu2(p, sd, target);
+ else
+ i = select_idle_cpu(p, sd, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
@@ -5470,6 +5528,227 @@ static int cpu_util(int cpu)
}
/*
+ * XXX just copied from waker_affine().
+ * use util other than load for some topologes?
+ * real-time vs. avg
+ */
+static int
+wake_waker(int topology, struct sched_domain *sd, struct task_struct *p, int sync)
+{
+ s64 this_load, load;
+ s64 this_eff_load, prev_eff_load;
+ int idx, this_cpu, prev_cpu;
+ struct task_group *tg;
+ unsigned long weight;
+ int balanced;
+
+ idx = sd->wake_idx;
+ this_cpu = smp_processor_id();
+ prev_cpu = task_cpu(p);
+ load = source_load(prev_cpu, idx);
+ this_load = target_load(this_cpu, idx);
+
+ /*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
+ */
+ if (sync) {
+ tg = task_group(current);
+ weight = current->se.avg.load_avg;
+
+ this_load += effective_load(tg, this_cpu, -weight, -weight);
+ load += effective_load(tg, prev_cpu, 0, -weight);
+ }
+
+ tg = task_group(p);
+ weight = p->se.avg.load_avg;
+
+ /*
+ * In low-load situations, where prev_cpu is idle and this_cpu is idle
+ * due to the sync cause above having dropped this_load to 0, we'll
+ * always have an imbalance, but there's really nothing you can do
+ * about that, so that's good too.
+ *
+ * Otherwise check if either cpus are near enough in load to allow this
+ * task to be woken on this_cpu.
+ */
+ this_eff_load = 100;
+ this_eff_load *= capacity_of(prev_cpu);
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= capacity_of(this_cpu);
+
+ if (this_load > 0) {
+ this_eff_load *= this_load +
+ effective_load(tg, this_cpu, weight, weight);
+
+ prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+ }
+
+ balanced = this_eff_load <= prev_eff_load;
+
+ schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
+
+ if (!balanced)
+ return 0;
+
+ schedstat_inc(sd, ttwu_move_affine);
+ schedstat_inc(p, se.statistics.nr_wakeups_affine);
+
+ return 1;
+}
+static int wake_idle(struct sched_domain *sd)
+{
+ u64 avg_cost = sd->avg_scan_cost, avg_idle = this_rq()->avg_idle;
+
+ /*
+ * Due to large variance we need a large fuzz factor; hackbench in
+ * particularly is sensitive here.
+ */
+ if ((avg_idle / 512) < avg_cost)
+ return 0;
+
+ return 1;
+}
+
+static inline int
+waker_wakee_topology(int waker, int wakee, int *allowed, struct task_struct *p)
+{
+ int topology;
+
+ if (static_branch_likely(&sched_llc_complete))
+ topology = TP_LLC_SOCK;
+
+ if (waker == wakee)
+ return TP_IDENT | topology;
+
+ *allowed = cpumask_test_cpu(waker, tsk_cpus_allowed(p));
+
+#ifdef CONFIG_SCHED_SMT
+ if (static_branch_likely(&sched_smt_present) &&
+ cpumask_test_cpu(waker, cpu_smt_mask(wakee)))
+ return TP_SMT | topology;
+#endif
+ if (cpus_share_cache(wakee, waker))
+ return TP_LLC | topology;
+
+ /* We are here without TP_LLC_SOCK for sure */
+ if (unlikely(cpus_share_socket(wakee, waker)))
+ return TP_NOLLC_SOCK;
+
+ return TP_NOLLC_XSOCK;
+}
+
+/*
+ * Notes:
+ * - how one-time local suboptimal select approaches to overall global optimal selects
+ * - certain randomness
+ * - spread vs. consolidate
+ * - load vs. util, real-time vs. avg
+ * - use topology more
+ */
+static int
+select_task_rq_fair2(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+{
+ struct sched_domain *tmp, *sd = NULL, *this_sd_llc = NULL;
+ int waker_allowed = 1, select_idle = 0;
+ int cpu = smp_processor_id(), new_cpu = prev_cpu;
+ int sync = wake_flags & WF_SYNC;
+ int topology = waker_wakee_topology(cpu, prev_cpu, &waker_allowed, p);
+
+ record_wakee(p);
+
+ rcu_read_lock();
+
+ for_each_domain(cpu, tmp) {
+ /* Stop if domain flags says no */
+ if (!(tmp->flags & SD_LOAD_BALANCE) || !(tmp->flags & sd_flag))
+ break;
+
+ sd = tmp;
+ }
+
+ if (!sd)
+ goto out_unlock;
+
+ this_sd_llc = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ if (!this_sd_llc || (sd->span_weight < this_sd_llc->span_weight))
+ goto select_avg;
+
+ /* Now we can attempt to fast select an idle CPU */
+ if (topology & TP_LLC_SOCK) {
+
+ if (wake_idle(this_sd_llc))
+ select_idle = 1;
+
+ } else if (!wake_wide2(topology, p))
+ select_idle = 1;
+
+ if (topology > TP_IDENT && waker_allowed &&
+ wake_waker(topology, this_sd_llc, p, sync))
+ new_cpu = cpu;
+
+ if (select_idle) {
+ /*
+ * Scan the LLC domain for idle CPUs; this is dynamically
+ * regulated by comparing the average scan cost (tracked in
+ * this_sd_llc->avg_scan_cost) against the average idle time
+ * for this rq (as found in this_rq->avg_idle).
+ */
+ u64 time = local_clock();
+
+ new_cpu = select_idle_sibling(p, new_cpu);
+ time = local_clock() - time;
+ this_sd_llc->avg_scan_cost +=
+ (s64)(time - this_sd_llc->avg_scan_cost) / 8;
+
+ goto out_unlock;
+ }
+
+select_avg:
+ while (sd) {
+ struct sched_group *group;
+ int weight;
+
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ group = find_idlest_group(sd, p, cpu, sd_flag);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = find_idlest_cpu(group, p, cpu);
+ if (new_cpu == -1 || new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = new_cpu;
+ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
+ }
+
+out_unlock:
+ rcu_read_unlock();
+
+ return new_cpu;
+}
+
+/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
@@ -5489,6 +5768,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
+ if (sched_feat(NEW_SELECT))
+ return select_task_rq_fair2(p, prev_cpu, sd_flag, wake_flags);
+
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 69631fa..ea41750 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -69,3 +69,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
+SCHED_FEAT(NEW_SELECT, false)
--
1.7.9.5