[PATCH] sched/fair: Prefer idle CPU to cache affinity

From: Srikar Dronamraju
Date: Fri Feb 26 2021 - 11:42:52 EST


On POWER8 and POWER9, the last level cache (L2) has been at the level of
a group of 8 threads (SMT8 on POWER8, a big-core comprising a pair of
SMT4 cores on POWER9). However, on POWER10, the LLC domain is at the
level of a group of SMT4 threads within the SMT8 core. Because the LLC
domain has shrunk, the probability of finding an idle CPU in the LLC
domain of the target is lower on POWER10 than on the previous
generation processors.

With commit 9538abee18cc ("powerpc/smp: Add support detecting
thread-groups sharing L2 cache"), benchmarks such as Daytrader
(https://github.com/WASdev/sample.daytrader7) show a drop in throughput
in a configuration consisting of 1 JVM spanning 6-8 Bigcores on
POWER10. Analysis showed that this was because more wakeups were
happening on busy CPUs when the utilization was 60-70%. This drop in
throughput also shows up as a drop in CPU utilization. However, most
other benchmarks benefit from detecting the thread-groups that share L2
cache.

Current order of preference to pick an LLC while waking a wake-affine
task:
1. Between the waker CPU and previous CPU, prefer the LLC of the CPU
that is idle.

2. Between the waker CPU and previous CPU, prefer the LLC of the CPU
that is more lightly loaded.

In the current situation, where the waker and previous CPUs are both
busy but only one of their LLCs has an idle CPU, the scheduler may end
up picking an LLC with no idle CPUs. To mitigate this, add a new step
between 1 and 2 where the scheduler compares the idle CPUs in the waker
and previous LLCs and picks the appropriate one.

The other alternative is to search for an idle CPU in the other LLC, if
the current select_idle_sibling is unable to find an idle CPU in the
preferred LLC. But that may increase the time to select a CPU.


                                   5.11-rc6      5.11-rc6+revert   5.11-rc6+patch
8CORE/1JVM 80USERS  throughput     6651.6        6716.3 (0.97%)    6940 (4.34%)
                    sys/user:time  59.75/23.86   61.77/24.55       60/24

8CORE/2JVM 80USERS  throughput     6425.4        6446.8 (0.33%)    6473.2 (0.74%)
                    sys/user:time  70.59/24.25   72.28/23.77       70/24

8CORE/4JVM 80USERS  throughput     5355.3        5551.2 (3.66%)    5586.6 (4.32%)
                    sys/user:time  76.74/21.79   76.54/22.73       76/22

8CORE/8JVM 80USERS  throughput     4420.6        4553.3 (3.00%)    4405.8 (-0.33%)
                    sys/user:time  79.13/20.32   78.76/21.01       79/20

Cc: LKML <linux-kernel@xxxxxxxxxxxxxxx>
Cc: Michael Ellerman <mpe@xxxxxxxxxxxxxx>
Cc: Michael Neuling <mikey@xxxxxxxxxxx>
Cc: Gautham R Shenoy <ego@xxxxxxxxxxxxxxxxxx>
Cc: Parth Shah <parth@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Valentin Schneider <valentin.schneider@xxxxxxx>
Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Co-developed-by: Gautham R Shenoy <ego@xxxxxxxxxxxxxxxxxx>
Signed-off-by: Gautham R Shenoy <ego@xxxxxxxxxxxxxxxxxx>
Co-developed-by: Parth Shah <parth@xxxxxxxxxxxxx>
Signed-off-by: Parth Shah <parth@xxxxxxxxxxxxx>
Signed-off-by: Srikar Dronamraju <srikar@xxxxxxxxxxxxxxxxxx>
---
kernel/sched/fair.c | 41 +++++++++++++++++++++++++++++++++++++++--
kernel/sched/features.h | 2 ++
2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8a8bd7b13634..d49bfcdc4a19 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5869,6 +5869,36 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
}

+/* Pick the CPU whose LLC is idler; nr_cpumask_bits means no preference. */
+static int prefer_idler_llc(int this_cpu, int prev_cpu, int sync)
+{
+	struct sched_domain_shared *tsds, *psds;
+	int pnr_busy, pllc_size, tnr_busy, tllc_size, diff;
+
+	tsds = rcu_dereference(per_cpu(sd_llc_shared, this_cpu));
+	tnr_busy = atomic_read(&tsds->nr_busy_cpus);
+	tllc_size = per_cpu(sd_llc_size, this_cpu);
+
+	psds = rcu_dereference(per_cpu(sd_llc_shared, prev_cpu));
+	pnr_busy = atomic_read(&psds->nr_busy_cpus);
+	pllc_size = per_cpu(sd_llc_size, prev_cpu);
+
+	/* No need to compare if both LLCs are fully busy (each vs its own size) */
+	if (pnr_busy == pllc_size && tnr_busy == tllc_size)
+		return nr_cpumask_bits;
+
+	/* WA_WAKER: take the waker's LLC whenever it has a non-busy CPU */
+	if (sched_feat(WA_WAKER) && tnr_busy < tllc_size)
+		return this_cpu;
+
+	/* Cross-multiplied busy ratios; sync tilts the choice towards this_cpu */
+	diff = tnr_busy * pllc_size - sync - pnr_busy * tllc_size;
+	if (!diff)
+		return nr_cpumask_bits;
+
+	return diff < 0 ? this_cpu : prev_cpu;
+}
+
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int prev_cpu, int sync)
{
@@ -5877,6 +5907,10 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
if (sched_feat(WA_IDLE))
target = wake_affine_idle(this_cpu, prev_cpu, sync);

+ if (sched_feat(WA_IDLER_LLC) && target == nr_cpumask_bits &&
+ !cpus_share_cache(this_cpu, prev_cpu))
+ target = prefer_idler_llc(this_cpu, prev_cpu, sync);
+
if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);

@@ -5884,8 +5918,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
if (target == nr_cpumask_bits)
return prev_cpu;

- schedstat_inc(sd->ttwu_move_affine);
- schedstat_inc(p->se.statistics.nr_wakeups_affine);
+ if (target == this_cpu) {
+ schedstat_inc(sd->ttwu_move_affine);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine);
+ }
+
return target;
}

diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1bc2b158fc51..e2de3ba8d5b1 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -83,6 +83,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)

SCHED_FEAT(WA_IDLE, true)
SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_IDLER_LLC, true)
+SCHED_FEAT(WA_WAKER, false)
SCHED_FEAT(WA_BIAS, true)

/*
--
2.18.4