[PATCH 09/10] sched/topology: Introduce fallback LLC

From: Srikar Dronamraju
Date: Thu Apr 22 2021 - 06:24:43 EST


On some systems, LLC sizes may be too small. Some of these systems may
also support multiple cache access latency levels, i.e. between the
previous LLC and the waker LLC, there could be other LLCs that have a
lower cache access latency to the waker LLC. If the waker LLC is busy,
then the scheduler could choose to schedule a task on such an LLC.

Here is one approach to identify a static fallback LLC for each LLC on
systems that support multiple cache access latency levels. In this
approach, the fallback LLCs are decided at boot/CPU bring-up time. There
is a one-to-one mapping between an LLC and its fallback LLC. The fallback
LLC will only be used if the wakeup is a sync wakeup and the current LLC
is busier than the fallback LLC. Also, the scheduler will not choose the
fallback LLC if the previous LLC has the same cache access latency as the
fallback LLC.

The fallback LLC is expected to be part of the parent domain of the
LLC domain. Archs can choose to use a fallback LLC by setting the
SD_FALLBACK_LLC flag on that parent domain.

Cc: LKML <linux-kernel@xxxxxxxxxxxxxxx>
Cc: Michael Ellerman <mpe@xxxxxxxxxxxxxx>
Cc: Gautham R Shenoy <ego@xxxxxxxxxxxxxxxxxx>
Cc: Parth Shah <parth@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Valentin Schneider <valentin.schneider@xxxxxxx>
Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxxx>
Signed-off-by: Srikar Dronamraju <srikar@xxxxxxxxxxxxxxxxxx>
---
include/linux/sched/sd_flags.h | 7 ++++++
include/linux/sched/topology.h | 1 +
kernel/sched/fair.c | 43 +++++++++++++++++++++++++++++---
kernel/sched/topology.c | 45 ++++++++++++++++++++++++++++++++--
4 files changed, 90 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 34b21e971d77..3ca44dd421e4 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -129,6 +129,13 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
*/
SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

+/*
+ * Consider waking task on near-by idle LLC.
+ *
+ * NEEDS_GROUPS: Load balancing flag.
+ */
+SD_FLAG(SD_FALLBACK_LLC, SDF_NEEDS_GROUPS)
+
/*
* Prefer to place tasks in a sibling domain
*
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 285165a35f21..b0446191319a 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -74,6 +74,7 @@ struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int idle_core;
+ int fallback_llc_id;
};

struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index db5dc9875e4c..8ea6d0183fc8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5873,7 +5873,8 @@ static int wake_affine_idler_llc(struct task_struct *p, int this_cpu, int prev_c
{
int pnr_busy, pllc_size, tnr_busy, tllc_size;
struct sched_domain_shared *tsds, *psds;
- int diff;
+ bool try_fallback = false;
+ int diff, fcpu = -1;

tsds = rcu_dereference(per_cpu(sd_llc_shared, this_cpu));
psds = rcu_dereference(per_cpu(sd_llc_shared, prev_cpu));
@@ -5890,6 +5891,43 @@ static int wake_affine_idler_llc(struct task_struct *p, int this_cpu, int prev_c
}
}

+ tnr_busy = atomic_read(&tsds->nr_busy_cpus);
+ tllc_size = per_cpu(sd_llc_size, this_cpu);
+
+ if (sync) {
+ struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, this_cpu));
+
+ /*
+ * task is a target of *sync* wakeup. However there are no
+ * idle cores in the waking CPU. Ignore fallback LLC if the
+ * previous CPU is part of the LLC's parent domain.
+ */
+ try_fallback = !cpumask_test_cpu(prev_cpu, sched_domain_span(sd->parent));
+ fcpu = tsds->fallback_llc_id;
+ }
+
+ if (try_fallback && fcpu != -1 && cpumask_test_cpu(fcpu, p->cpus_ptr)) {
+ struct sched_domain_shared *fsds;
+ int fnr_busy, fllc_size;
+
+ fsds = rcu_dereference(per_cpu(sd_llc_shared, fcpu));
+ if (fsds && fsds != psds) {
+ if (fsds->idle_core != -1) {
+ if (cpumask_test_cpu(fsds->idle_core, p->cpus_ptr))
+ return fsds->idle_core;
+ return fcpu;
+ }
+
+ fnr_busy = atomic_read(&fsds->nr_busy_cpus);
+ fllc_size = per_cpu(sd_llc_size, fcpu);
+ if (fnr_busy * tllc_size < tnr_busy * fllc_size) {
+ tnr_busy = fnr_busy;
+ tllc_size = fllc_size;
+ this_cpu = fcpu;
+ }
+ }
+ }
+
if (available_idle_cpu(prev_cpu) || sched_idle_cpu(prev_cpu))
return prev_cpu;
if (psds->idle_core != -1) {
@@ -5908,10 +5946,7 @@ static int wake_affine_idler_llc(struct task_struct *p, int this_cpu, int prev_c
}
}

- tnr_busy = atomic_read(&tsds->nr_busy_cpus);
pnr_busy = atomic_read(&psds->nr_busy_cpus);
-
- tllc_size = per_cpu(sd_llc_size, this_cpu);
pllc_size = per_cpu(sd_llc_size, prev_cpu);

if (pnr_busy == pllc_size && tnr_busy == tllc_size) {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 00e4669bb241..89aa8986c58b 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -603,6 +603,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)

static void destroy_sched_domain(struct sched_domain *sd)
{
+ struct sched_domain_shared *sds = sd->shared;
+
/*
* A normal sched domain may have multiple group references, an
* overlapping domain, having private groups, only one. Iterate,
@@ -610,8 +612,18 @@ static void destroy_sched_domain(struct sched_domain *sd)
*/
free_sched_groups(sd->groups, 1);

- if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
- kfree(sd->shared);
+ if (sds && atomic_dec_and_test(&sds->ref)) {
+ struct sched_domain_shared *next_sds;
+
+ if (sds->fallback_llc_id != -1) {
+ next_sds = rcu_dereference(per_cpu(sd_llc_shared, sds->fallback_llc_id));
+ if (next_sds && next_sds->fallback_llc_id != -1)
+ next_sds->fallback_llc_id = -1;
+
+ sds->fallback_llc_id = -1;
+ }
+ kfree(sds);
+ }
kfree(sd);
}

@@ -663,9 +675,36 @@ static void update_top_cache_domain(int cpu)

sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
if (sd) {
+ struct sched_domain *sd_parent = sd->parent;
+
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
sds = sd->shared;
+
+ if (sds->fallback_llc_id == -1 && sd_parent &&
+ sd_parent->flags & SD_FALLBACK_LLC) {
+ const struct cpumask *parent_span = sched_domain_span(sd->parent);
+ struct cpumask *span = sched_domains_tmpmask;
+ int fcpu;
+
+ /*
+ * If LLC's parent domain has SD_FALLBACK_LLC flag
+ * set and this LLC's fallback_llc_id is not yet
+ * set, then walk through the LLC parent's domain to
+ * find a fallback_llc.
+ */
+ cpumask_andnot(span, parent_span, sched_domain_span(sd));
+ for_each_cpu_wrap(fcpu, span, cpu) {
+ struct sched_domain_shared *next_sds;
+
+ next_sds = rcu_dereference(per_cpu(sd_llc_shared, fcpu));
+ if (next_sds && next_sds->fallback_llc_id == -1) {
+ sds->fallback_llc_id = fcpu;
+ next_sds->fallback_llc_id = cpu;
+ break;
+ }
+ }
+ }
}

rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
@@ -1370,6 +1409,7 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
#define TOPOLOGY_SD_FLAGS \
(SD_SHARE_CPUCAPACITY | \
SD_SHARE_PKG_RESOURCES | \
+ SD_FALLBACK_LLC | \
SD_NUMA | \
SD_ASYM_PACKING)

@@ -1475,6 +1515,7 @@ sd_init(struct sched_domain_topology_level *tl,
atomic_inc(&sd->shared->ref);
atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
sd->shared->idle_core = -1;
+ sd->shared->fallback_llc_id = -1;
}

sd->private = sdd;
--
2.18.2