[PATCH] sched: Make select_idle_sibling search domain configurable

From: Xi Wang
Date: Tue Jul 28 2020 - 03:03:12 EST


The scope of select_idle_sibling idle cpu search is LLC. This
becomes a problem for the AMD CCX architecture, as the sd_llc is only
4 cores. On a many core machine, the range of search is too small to
reach a satisfactory level of statistical multiplexing / efficient
utilization of short idle time slices.

With this patch idle sibling search is detached from LLC and it
becomes run time configurable. To reduce search and migration
overheads, a presearch domain is added. The presearch domain will be
searched first before the "main search" domain, e.g.:

sysctl_sched_wake_idle_domain == 2 ("DIE" domain)
sysctl_sched_wake_idle_presearch_domain == 1 ("MC" domain)

Presearch will go through 4 cores of a CCX. If no idle cpu is found
during presearch, full search will go through the remaining cores of
a cpu socket.

Heuristics including sd->avg_scan_cost and sds->has_idle_cores
are only active for the main search.

On a 128 core (2 socket * 64 core, 256 hw threads) AMD machine, hackbench
was run as

hackbench -g 20 -f 20 --loops 10000

A snapshot of run time was

Baseline: 11.8
With the patch: 7.6 (configured as in the example above)

Signed-off-by: Xi Wang <xii@xxxxxxxxxx>
---
block/blk-mq.c | 2 +-
block/blk-softirq.c | 2 +-
include/linux/cpuset.h | 10 +-
include/linux/sched/topology.h | 11 +-
kernel/cgroup/cpuset.c | 32 ++++--
kernel/sched/core.c | 10 +-
kernel/sched/fair.c | 191 +++++++++++++++++++++------------
kernel/sched/sched.h | 9 +-
kernel/sched/topology.c | 87 ++++++++++-----
kernel/sysctl.c | 25 +++++
10 files changed, 256 insertions(+), 123 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e0d173beaa3..20aee9f047e2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -626,7 +626,7 @@ void blk_mq_force_complete_rq(struct request *rq)

cpu = get_cpu();
if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
- shared = cpus_share_cache(cpu, ctx->cpu);
+ shared = cpus_share_sis(cpu, ctx->cpu);

if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
rq->csd.func = __blk_mq_complete_request_remote;
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 6e7ec87d49fa..dd38ac0e1f2e 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -108,7 +108,7 @@ void __blk_complete_request(struct request *req)
*/
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) {
if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
- shared = cpus_share_cache(cpu, ccpu);
+ shared = cpus_share_sis(cpu, ccpu);
} else
ccpu = cpu;

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 04c20de66afc..8b243aa8462e 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -117,6 +117,7 @@ static inline int cpuset_do_slab_mem_spread(void)
extern bool current_cpuset_is_being_rebound(void);

extern void rebuild_sched_domains(void);
+extern void rebuild_sched_domains_force(void);

extern void cpuset_print_current_mems_allowed(void);

@@ -173,7 +174,7 @@ static inline void cpuset_force_rebuild(void) { }

static inline void cpuset_update_active_cpus(void)
{
- partition_sched_domains(1, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL, 0);
}

static inline void cpuset_wait_for_hotplug(void) { }
@@ -259,7 +260,12 @@ static inline bool current_cpuset_is_being_rebound(void)

static inline void rebuild_sched_domains(void)
{
- partition_sched_domains(1, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL, 0);
+}
+
+static inline void rebuild_sched_domains_force(void)
+{
+ partition_sched_domains(1, NULL, NULL, 1);
}

static inline void cpuset_print_current_mems_allowed(void)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index fb11091129b3..aff9739cf516 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -151,16 +151,17 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)

extern void partition_sched_domains_locked(int ndoms_new,
cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new);
+ struct sched_domain_attr *dattr_new,
+ int force_update);

extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new);
+ struct sched_domain_attr *dattr_new, int force_update);

/* Allocate an array of sched domains, for partition_sched_domains(). */
cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);

-bool cpus_share_cache(int this_cpu, int that_cpu);
+bool cpus_share_sis(int this_cpu, int that_cpu);

typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
typedef int (*sched_domain_flags_f)(void);
@@ -199,7 +200,7 @@ struct sched_domain_attr;

static inline void
partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
+ struct sched_domain_attr *dattr_new, int force_update)
{
}

@@ -209,7 +210,7 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
{
}

-static inline bool cpus_share_cache(int this_cpu, int that_cpu)
+static inline bool cpus_share_sis(int this_cpu, int that_cpu)
{
return true;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 642415b8c3c9..5087b90c4c47 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -962,10 +962,10 @@ static void rebuild_root_domains(void)

static void
partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
+ struct sched_domain_attr *dattr_new, int force_update)
{
mutex_lock(&sched_domains_mutex);
- partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
+ partition_sched_domains_locked(ndoms_new, doms_new, dattr_new, force_update);
rebuild_root_domains();
mutex_unlock(&sched_domains_mutex);
}
@@ -981,7 +981,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
*
* Call with cpuset_mutex held. Takes get_online_cpus().
*/
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_locked(int force_update)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
@@ -1007,23 +1007,33 @@ static void rebuild_sched_domains_locked(void)
ndoms = generate_sched_domains(&doms, &attr);

/* Have scheduler rebuild the domains */
- partition_and_rebuild_sched_domains(ndoms, doms, attr);
+ partition_and_rebuild_sched_domains(ndoms, doms, attr, force_update);
}
#else /* !CONFIG_SMP */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_locked(int force_update)
{
}
#endif /* CONFIG_SMP */

-void rebuild_sched_domains(void)
+void __rebuild_sched_domains(int force_update)
{
get_online_cpus();
percpu_down_write(&cpuset_rwsem);
- rebuild_sched_domains_locked();
+ rebuild_sched_domains_locked(force_update);
percpu_up_write(&cpuset_rwsem);
put_online_cpus();
}

+void rebuild_sched_domains(void)
+{
+ __rebuild_sched_domains(0);
+}
+
+void rebuild_sched_domains_force(void)
+{
+ __rebuild_sched_domains(1);
+}
+
/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -1437,7 +1447,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
rcu_read_unlock();

if (need_rebuild_sched_domains)
- rebuild_sched_domains_locked();
+ rebuild_sched_domains_locked(0);
}

/**
@@ -1837,7 +1847,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
- rebuild_sched_domains_locked();
+ rebuild_sched_domains_locked(0);
}

return 0;
@@ -1903,7 +1913,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
spin_unlock_irq(&callback_lock);

if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
- rebuild_sched_domains_locked();
+ rebuild_sched_domains_locked(0);

if (spread_flag_changed)
update_tasks_flags(cs);
@@ -1994,7 +2004,7 @@ static int update_prstate(struct cpuset *cs, int val)
if (parent->child_ecpus_count)
update_sibling_cpumasks(parent, cs, &tmp);

- rebuild_sched_domains_locked();
+ rebuild_sched_domains_locked(0);
out:
free_cpumasks(NULL, &tmp);
return err;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e15543cb8481..e28548fc63f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2350,9 +2350,9 @@ void wake_up_if_idle(int cpu)
rcu_read_unlock();
}

-bool cpus_share_cache(int this_cpu, int that_cpu)
+bool cpus_share_sis(int this_cpu, int that_cpu)
{
- return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
+ return per_cpu(sd_sis_id, this_cpu) == per_cpu(sd_sis_id, that_cpu);
}

static inline bool ttwu_queue_cond(int cpu, int wake_flags)
@@ -2361,7 +2361,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
* If the CPU does not share cache, then queue the task on the
* remote rqs wakelist to avoid accessing remote data.
*/
- if (!cpus_share_cache(smp_processor_id(), cpu))
+ if (!cpus_share_sis(smp_processor_id(), cpu))
return true;

/*
@@ -6501,7 +6501,7 @@ static void cpuset_cpu_active(void)
* operation in the resume sequence, just build a single sched
* domain, ignoring cpusets.
*/
- partition_sched_domains(1, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL, 0);
if (--num_cpus_frozen)
return;
/*
@@ -6522,7 +6522,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
- partition_sched_domains(1, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL, 0);
}
return 0;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04fa8dbcfa4d..0ed71f2f3a81 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5736,8 +5736,8 @@ static void record_wakee(struct task_struct *p)
* at a frequency roughly N times higher than one of its wakees.
*
* In order to determine whether we should let the load spread vs consolidating
- * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
- * partner, and a factor of lls_size higher frequency in the other.
+ * to the sis domain, we look for a minimum 'flip' frequency of sis_size in one
+ * partner, and a factor of sis_size higher frequency in the other.
*
* With both conditions met, we can be relatively sure that the relationship is
* non-monogamous, with partner count exceeding socket size.
@@ -5750,7 +5750,7 @@ static int wake_wide(struct task_struct *p)
{
unsigned int master = current->wakee_flips;
unsigned int slave = p->wakee_flips;
- int factor = __this_cpu_read(sd_llc_size);
+ int factor = __this_cpu_read(sd_sis_size);

if (master < slave)
swap(master, slave);
@@ -5786,7 +5786,7 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
* a cpufreq perspective, it's better to have higher utilisation
* on one CPU.
*/
- if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+ if (available_idle_cpu(this_cpu) && cpus_share_sis(this_cpu, prev_cpu))
return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;

if (sync && cpu_rq(this_cpu)->nr_running == 1)
@@ -5978,7 +5978,7 @@ static inline void set_idle_cores(int cpu, int val)
{
struct sched_domain_shared *sds;

- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference(per_cpu(sd_sis_shared, cpu));
if (sds)
WRITE_ONCE(sds->has_idle_cores, val);
}
@@ -5987,7 +5987,7 @@ static inline bool test_idle_cores(int cpu, bool def)
{
struct sched_domain_shared *sds;

- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference(per_cpu(sd_sis_shared, cpu));
if (sds)
return READ_ONCE(sds->has_idle_cores);

@@ -5996,7 +5996,7 @@ static inline bool test_idle_cores(int cpu, bool def)

/*
* Scans the local SMT mask to see if the entire core is idle, and records this
- * information in sd_llc_shared->has_idle_cores.
+ * information in sd_sis_shared->has_idle_cores.
*
* Since SMT siblings share all cache levels, inspecting this limited remote
* state should be fairly cheap.
@@ -6024,13 +6024,12 @@ void __update_idle_core(struct rq *rq)
}

/*
- * Scan the entire LLC domain for idle cores; this dynamically switches off if
+ * Scan the entire sis domain for idle cores; this dynamically switches off if
* there are no idle cores left in the system; tracked through
- * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ * sd_sis->shared->has_idle_cores and enabled through update_idle_core() above.
*/
-static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_core(struct task_struct *p, struct cpumask *cpus, int target)
{
- struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int core, cpu;

if (!static_branch_likely(&sched_smt_present))
@@ -6039,18 +6038,18 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
if (!test_idle_cores(target, false))
return -1;

- cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
-
for_each_cpu_wrap(core, cpus, target) {
bool idle = true;

+ if (core != cpumask_first(cpu_smt_mask(core)))
+ continue;
+
for_each_cpu(cpu, cpu_smt_mask(core)) {
if (!available_idle_cpu(cpu)) {
idle = false;
break;
}
}
- cpumask_andnot(cpus, cpus, cpu_smt_mask(core));

if (idle)
return core;
@@ -6099,45 +6098,45 @@ static inline int select_idle_smt(struct task_struct *p, int target)
#endif /* CONFIG_SCHED_SMT */

/*
- * Scan the LLC domain for idle CPUs; this is dynamically regulated by
+ * Scan the sis domain for idle CPUs; this is dynamically regulated by
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
* average idle time for this rq (as found in rq->avg_idle).
*/
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_cpu(struct task_struct *p, struct cpumask *cpus,
+ bool main_search, unsigned int span_weight, int target)
{
- struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
struct sched_domain *this_sd;
u64 avg_cost, avg_idle;
u64 time;
int this = smp_processor_id();
int cpu, nr = INT_MAX;

- this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
- if (!this_sd)
- return -1;
+ if (main_search) {
+ this_sd = rcu_dereference(*this_cpu_ptr(&sd_sis));
+ if (!this_sd)
+ return -1;

- /*
- * Due to large variance we need a large fuzz factor; hackbench in
- * particularly is sensitive here.
- */
- avg_idle = this_rq()->avg_idle / 512;
- avg_cost = this_sd->avg_scan_cost + 1;
+ /*
+ * Due to large variance we need a large fuzz factor; hackbench in
+ * particularly is sensitive here.
+ */
+ avg_idle = this_rq()->avg_idle / 512;
+ avg_cost = this_sd->avg_scan_cost + 1;

- if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
- return -1;
+ if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
+ return -1;

- if (sched_feat(SIS_PROP)) {
- u64 span_avg = sd->span_weight * avg_idle;
- if (span_avg > 4*avg_cost)
- nr = div_u64(span_avg, avg_cost);
- else
- nr = 4;
+ if (sched_feat(SIS_PROP)) {
+ u64 span_avg = span_weight * avg_idle;
+ if (span_avg > 4*avg_cost)
+ nr = div_u64(span_avg, avg_cost);
+ else
+ nr = 4;
+ }
}

time = cpu_clock(this);

- cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
-
for_each_cpu_wrap(cpu, cpus, target) {
if (!--nr)
return -1;
@@ -6145,8 +6144,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
break;
}

- time = cpu_clock(this) - time;
- update_avg(&this_sd->avg_scan_cost, time);
+ if (main_search) {
+ time = cpu_clock(this) - time;
+ update_avg(&this_sd->avg_scan_cost, time);
+ }

return cpu;
}
@@ -6186,19 +6187,21 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
}

/*
- * Try and locate an idle core/thread in the LLC cache domain.
+ * Try and locate an idle core/thread in the sis domain.
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
- struct sched_domain *sd;
- int i, recent_used_cpu;
+ struct sched_domain *sd_asym;
+ struct sched_domain *sd[2];
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ int i, r, recent_used_cpu;

/*
* For asymmetric CPU capacity systems, our domain of interest is
- * sd_asym_cpucapacity rather than sd_llc.
+ * sd_asym_cpucapacity rather than sd_sis.
*/
if (static_branch_unlikely(&sched_asym_cpucapacity)) {
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+ sd_asym = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
/*
* On an asymmetric CPU capacity system where an exclusive
* cpuset defines a symmetric island (i.e. one unique
@@ -6207,10 +6210,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
* capacity path.
*/
- if (!sd)
+ if (!sd_asym)
goto symmetric;

- i = select_idle_capacity(p, sd, target);
+ i = select_idle_capacity(p, sd_asym, target);
return ((unsigned)i < nr_cpumask_bits) ? i : target;
}

@@ -6221,7 +6224,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
/*
* If the previous CPU is cache affine and idle, don't be stupid:
*/
- if (prev != target && cpus_share_cache(prev, target) &&
+ if (prev != target && cpus_share_sis(prev, target) &&
(available_idle_cpu(prev) || sched_idle_cpu(prev)))
return prev;

@@ -6243,7 +6246,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
recent_used_cpu = p->recent_used_cpu;
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
- cpus_share_cache(recent_used_cpu, target) &&
+ cpus_share_sis(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
/*
@@ -6254,21 +6257,35 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return recent_used_cpu;
}

- sd = rcu_dereference(per_cpu(sd_llc, target));
- if (!sd)
- return target;
+ for (i = 0; ; i++) {
+ if (i == 0) {
+ sd[0] = rcu_dereference(per_cpu(sd_sis_pre, target));
+ if (!sd[0])
+ continue;
+ cpumask_and(cpus, sched_domain_span(sd[0]), p->cpus_ptr);
+ } else if (i == 1) {
+ sd[1] = rcu_dereference(per_cpu(sd_sis, target));
+ if (!sd[1])
+ continue;
+ cpumask_and(cpus, sched_domain_span(sd[1]), p->cpus_ptr);
+ if (sd[0])
+ cpumask_andnot(cpus, cpus, sched_domain_span(sd[0]));
+ } else {
+ break;
+ }

- i = select_idle_core(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
+ r = select_idle_core(p, cpus, target);
+ if ((unsigned)r < nr_cpumask_bits)
+ return r;

- i = select_idle_cpu(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
+ r = select_idle_cpu(p, cpus, (i == 1), sd[i]->span_weight, target);
+ if ((unsigned)r < nr_cpumask_bits)
+ return r;

- i = select_idle_smt(p, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
+ r = select_idle_smt(p, target);
+ if ((unsigned)r < nr_cpumask_bits)
+ return r;
+ }

return target;
}
@@ -6718,6 +6735,46 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
return new_cpu;
}

+
+#ifdef CONFIG_SMP
+
+extern int sysctl_sched_wake_idle_domain;
+extern int sysctl_sched_wake_idle_presearch_domain;
+
+DEFINE_MUTEX(wake_idle_domain_mutex);
+
+int proc_sched_wake_idle_domain_handler(struct ctl_table *table,
+ int write, void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tmp = *table;
+ int *sysctl = tmp.data;
+ int val = *sysctl;
+ int min = -1, max = INT_MAX; /* a negative value leaves the sysctl not in effect */
+ int rc;
+
+ tmp.extra1 = &min;
+ tmp.extra2 = &max;
+ tmp.data = &val; /* parse into a local; *sysctl is only written under the mutex */
+
+ rc = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (rc || !write)
+ return rc;
+
+ mutex_lock(&wake_idle_domain_mutex);
+ *sysctl = val;
+ rebuild_sched_domains_force(); /* force a sched-domain rebuild after the change */
+ mutex_unlock(&wake_idle_domain_mutex);
+
+ pr_info("Idle cpu search (select_idle_sibling) domains changed to: "
+ "sched_wake_idle_domain %d sched_wake_idle_presearch_domain %d\n",
+ sysctl_sched_wake_idle_domain, sysctl_sched_wake_idle_presearch_domain);
+
+ return 0;
+}
+
+#endif
+
static void detach_entity_cfs_rq(struct sched_entity *se);

/*
@@ -10136,21 +10193,21 @@ static void nohz_balancer_kick(struct rq *rq)
* cache use, instead we want to embrace asymmetry and only
* ensure tasks have enough CPU capacity.
*
- * Skip the LLC logic because it's not relevant in that case.
+ * Skip the sis logic because it's not relevant in that case.
*/
goto unlock;
}

- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference(per_cpu(sd_sis_shared, cpu));
if (sds) {
/*
- * If there is an imbalance between LLC domains (IOW we could
- * increase the overall cache use), we need some less-loaded LLC
+ * If there is an imbalance between sis domains (IOW we could
+ * increase the overall cache use), we need some less-loaded sis
* domain to pull some load. Likewise, we may need to spread
- * load within the current LLC domain (e.g. packed SMT cores but
+ * load within the current sis domain (e.g. packed SMT cores but
* other CPUs are idle). We can't really know from here how busy
* the others are - so just get a nohz balance going if it looks
- * like this LLC domain has tasks we could move.
+ * like this sis domain has tasks we could move.
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
@@ -10170,7 +10227,7 @@ static void set_cpu_sd_state_busy(int cpu)
struct sched_domain *sd;

rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ sd = rcu_dereference(per_cpu(sd_sis, cpu));

if (!sd || !sd->nohz_idle)
goto unlock;
@@ -10200,7 +10257,7 @@ static void set_cpu_sd_state_idle(int cpu)
struct sched_domain *sd;

rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ sd = rcu_dereference(per_cpu(sd_sis, cpu));

if (!sd || sd->nohz_idle)
goto unlock;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 877fb08eb1b0..641a5bacdf77 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1415,10 +1415,11 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
return sd;
}

-DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
-DECLARE_PER_CPU(int, sd_llc_size);
-DECLARE_PER_CPU(int, sd_llc_id);
-DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_sis);
+DECLARE_PER_CPU(int, sd_sis_size);
+DECLARE_PER_CPU(int, sd_sis_id);
+DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_sis_shared);
+DECLARE_PER_CPU(struct sched_domain *, sd_sis_pre);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ba81187bb7af..bdda783c5148 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -605,41 +605,75 @@ static void destroy_sched_domains(struct sched_domain *sd)
}

/*
- * Keep a special pointer to the highest sched_domain that has
- * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
- * allows us to avoid some pointer chasing select_idle_sibling().
- *
- * Also keep a unique ID per domain (we use the first CPU number in
- * the cpumask of the domain), this allows us to quickly tell if
- * two CPUs are in the same cache domain, see cpus_share_cache().
+ * sd_sis is the select_idle_sibling search domain. It is a generalization of
+ * sd_llc that is not limited by the SD_SHARE_PKG_RESOURCES flag. With the
+ * sysctls, sd_sis is also run time configurable.
+ * To limit overheads from searching / migrating among cores that don't share
+ * llc, a presearch domain can be enabled such that most searches / migrations
+ * still happen inside a smaller domain when the machine is lightly loaded.
+ *
+ * Keeping a special pointer for this allows us to avoid some pointer chasing in
+ * select_idle_sibling(). Also keep a unique ID per domain (we use the first CPU
+ * number in the cpumask of the domain); this allows us to quickly tell if
+ * two CPUs are in the same sis domain, see cpus_share_sis().
*/
-DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
-DEFINE_PER_CPU(int, sd_llc_size);
-DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_sis);
+DEFINE_PER_CPU(int, sd_sis_size);
+DEFINE_PER_CPU(int, sd_sis_id);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_sis_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);

+int sysctl_sched_wake_idle_domain = -1;
+int sysctl_sched_wake_idle_presearch_domain = -1;
+DEFINE_PER_CPU(struct sched_domain *, sd_sis_pre);
+
static void update_top_cache_domain(int cpu)
{
struct sched_domain_shared *sds = NULL;
- struct sched_domain *sd;
+ struct sched_domain *sd, *sdp;
int id = cpu;
int size = 1;
+ int level;
+
+ if (sysctl_sched_wake_idle_domain < 0) {
+ sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+ } else {
+ level = 0;
+ for_each_domain(cpu, sd) {
+ if (level == sysctl_sched_wake_idle_domain)
+ break;
+ level++;
+ }
+ }

- sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
sds = sd->shared;
}

- rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
- per_cpu(sd_llc_size, cpu) = size;
- per_cpu(sd_llc_id, cpu) = id;
- rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
+ rcu_assign_pointer(per_cpu(sd_sis, cpu), sd);
+ per_cpu(sd_sis_size, cpu) = size;
+ per_cpu(sd_sis_id, cpu) = id;
+ rcu_assign_pointer(per_cpu(sd_sis_shared, cpu), sds);
+
+ sdp = NULL;
+ if (sd && sysctl_sched_wake_idle_presearch_domain >= 0) {
+ level = 0;
+ for_each_domain(cpu, sdp) {
+ if (sdp == sd) {
+ sdp = NULL;
+ break;
+ }
+ if (level == sysctl_sched_wake_idle_presearch_domain)
+ break;
+ level++;
+ }
+ }
+ rcu_assign_pointer(per_cpu(sd_sis_pre, cpu), sdp);

sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
@@ -1400,14 +1434,12 @@ sd_init(struct sched_domain_topology_level *tl,
}

/*
- * For all levels sharing cache; connect a sched_domain_shared
- * instance.
+ * Connect sched_domain_shared instances. As sd_sis can be changed at run
+ * time, link all domains.
*/
- if (sd->flags & SD_SHARE_PKG_RESOURCES) {
- sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
- atomic_inc(&sd->shared->ref);
- atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
- }
+ sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+ atomic_inc(&sd->shared->ref);
+ atomic_set(&sd->shared->nr_busy_cpus, sd_weight);

sd->private = sdd;

@@ -2204,7 +2236,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
* Call with hotplug lock and sched_domains_mutex held
*/
void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
+ struct sched_domain_attr *dattr_new, int force_update)
{
bool __maybe_unused has_eas = false;
int i, j, n;
@@ -2217,6 +2249,7 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],

/* Let the architecture update CPU core mappings: */
new_topology = arch_update_cpu_topology();
+ new_topology |= force_update;

if (!doms_new) {
WARN_ON_ONCE(dattr_new);
@@ -2310,9 +2343,9 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
* Call with hotplug lock held
*/
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
+ struct sched_domain_attr *dattr_new, int force_update)
{
mutex_lock(&sched_domains_mutex);
- partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
+ partition_sched_domains_locked(ndoms_new, doms_new, dattr_new, force_update);
mutex_unlock(&sched_domains_mutex);
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index db1ce7af2563..b474851e1a66 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -144,6 +144,10 @@ static const int cap_last_cap = CAP_LAST_CAP;
#ifdef CONFIG_DETECT_HUNG_TASK
static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
#endif
+#ifdef CONFIG_SMP
+extern int sysctl_sched_wake_idle_domain;
+extern int sysctl_sched_wake_idle_presearch_domain;
+#endif

#ifdef CONFIG_INOTIFY_USER
#include <linux/inotify.h>
@@ -202,6 +206,11 @@ static int max_extfrag_threshold = 1000;

#endif /* CONFIG_SYSCTL */

+#ifdef CONFIG_SMP
+int proc_sched_wake_idle_domain_handler(struct ctl_table *table,
+ int write, void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL)
static int bpf_stats_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -1834,6 +1843,22 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif
+#ifdef CONFIG_SMP
+ {
+ .procname = "sched_wake_idle_domain",
+ .data = &sysctl_sched_wake_idle_domain,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_sched_wake_idle_domain_handler,
+ },
+ {
+ .procname = "sched_wake_idle_presearch_domain",
+ .data = &sysctl_sched_wake_idle_presearch_domain,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_sched_wake_idle_domain_handler,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
--
2.28.0.rc0.142.g3c755180ce-goog