[PATCH] sched/fair: Introduce SIS_FILTER to skip overloaded CPUs during SIS

From: Abel Wu
Date: Thu Jul 07 2022 - 14:16:47 EST


Currently SIS_UTIL is used to limit the scan depth of idle CPUs in
select_idle_cpu(). A further optimization is to filter out overloaded
CPUs so that select_idle_cpu() can be sped up even more.

Perform the CPU overload check in the periodic tick, taking the
nr_running, util_avg and runnable_avg of that CPU into consideration.
If the CPU is overloaded, set it in a per-LLC overloaded cpumask so
that select_idle_cpu() can skip those overloaded CPUs. Although the
detection only runs in the periodic tick, basing it on the PELT
signals of the CPU keeps the 'overloaded' state stable and reduces
how often the LLC shared mask is updated, which mitigates cache
contention within the LLC.
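
As a rough illustration of the thresholds used by rq_overloaded()
below (assuming the LLC domain keeps the imbalance_pct of 117 that
sd_init() assigns to SD_SHARE_PKG_RESOURCES levels), a CPU running
more than one non-idle task is treated as overloaded once either

        cpu_util_cfs(cpu) * 117 > 1024 * 100    /* util above ~85% of capacity */

or

        cpu_runnable(rq) * 100 > 1024 * 117     /* runnable above ~117% of capacity */

holds, which mirrors the checks in group_is_overloaded().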

The following results were obtained on top of the latest sched/core
tip. The baseline has SIS_UTIL enabled, and is compared against a
kernel with both SIS_FILTER and SIS_UTIL enabled. A positive compare%
stands for better performance.
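
Note that, with CONFIG_SCHED_DEBUG enabled, the new feature bit should
be togglable at runtime like any other sched feature, e.g. (paths as
in current kernels, shown for illustration):

        echo NO_SIS_FILTER > /sys/kernel/debug/sched/features
        echo SIS_FILTER    > /sys/kernel/debug/sched/features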

hackbench
=========
case                load         baseline(std%)   compare%( std%)
process-pipe        1 group       1.00 (  0.59)    -1.35 (  0.88)
process-pipe        2 groups      1.00 (  0.38)    -1.49 (  0.04)
process-pipe        4 groups      1.00 (  0.45)    +0.10 (  0.91)
process-pipe        8 groups      1.00 (  0.11)    +0.03 (  0.38)
process-sockets     1 group       1.00 (  3.48)    +2.88 (  7.07)
process-sockets     2 groups      1.00 (  2.38)    -3.78 (  2.81)
process-sockets     4 groups      1.00 (  0.26)    -1.79 (  0.82)
process-sockets     8 groups      1.00 (  0.07)    -0.35 (  0.07)
threads-pipe        1 group       1.00 (  0.87)    -0.21 (  0.71)
threads-pipe        2 groups      1.00 (  0.63)    +0.34 (  0.45)
threads-pipe        4 groups      1.00 (  0.18)    -0.02 (  0.50)
threads-pipe        8 groups      1.00 (  0.08)    +0.46 (  0.05)
threads-sockets     1 group       1.00 (  0.80)    -0.08 (  1.06)
threads-sockets     2 groups      1.00 (  0.55)    +0.06 (  0.85)
threads-sockets     4 groups      1.00 (  1.00)    -2.13 (  0.18)
threads-sockets     8 groups      1.00 (  0.07)    -0.41 (  0.08)

netperf
=======
case      load           baseline(std%)   compare%( std%)
TCP_RR    28  threads     1.00 (  0.50)     +0.19 (  0.53)
TCP_RR    56  threads     1.00 (  0.33)     +0.31 (  0.35)
TCP_RR    84  threads     1.00 (  0.23)     +0.15 (  0.28)
TCP_RR    112 threads     1.00 (  0.20)     +0.03 (  0.21)
TCP_RR    140 threads     1.00 (  0.17)     +0.20 (  0.18)
TCP_RR    168 threads     1.00 (  0.17)   +112.84 ( 40.35)
TCP_RR    196 threads     1.00 ( 16.66)     +0.39 ( 15.72)
TCP_RR    224 threads     1.00 ( 10.28)     +0.05 (  9.97)
UDP_RR    28  threads     1.00 ( 16.15)     -0.13 (  0.93)
UDP_RR    56  threads     1.00 (  7.76)     +1.24 (  0.44)
UDP_RR    84  threads     1.00 ( 11.68)     -0.49 (  6.33)
UDP_RR    112 threads     1.00 (  8.49)     -0.21 (  7.77)
UDP_RR    140 threads     1.00 (  8.49)     +2.05 ( 19.88)
UDP_RR    168 threads     1.00 (  8.91)     +1.67 ( 11.74)
UDP_RR    196 threads     1.00 ( 19.96)     +4.35 ( 21.37)
UDP_RR    224 threads     1.00 ( 19.44)     +4.38 ( 16.61)

tbench
======
case        load           baseline(std%)   compare%( std%)
loopback    28  threads     1.00 (  0.12)    +0.57 (  0.12)
loopback    56  threads     1.00 (  0.11)    +0.42 (  0.11)
loopback    84  threads     1.00 (  0.09)    +0.71 (  0.03)
loopback    112 threads     1.00 (  0.03)    -0.13 (  0.08)
loopback    140 threads     1.00 (  0.29)    +0.59 (  0.01)
loopback    168 threads     1.00 (  0.01)    +0.86 (  0.03)
loopback    196 threads     1.00 (  0.02)    +0.97 (  0.21)
loopback    224 threads     1.00 (  0.04)    +0.83 (  0.22)

schbench
========
case      load          baseline(std%)   compare%( std%)
normal    1 mthread      1.00 (  0.00)    -8.82 (  0.00)
normal    2 mthreads     1.00 (  0.00)    +0.00 (  0.00)
normal    4 mthreads     1.00 (  0.00)   +17.02 (  0.00)
normal    8 mthreads     1.00 (  0.00)    -4.84 (  0.00)

Signed-off-by: Abel Wu <wuyun.abel@xxxxxxxxxxxxx>
---
 include/linux/sched/topology.h |  6 +++++
 kernel/sched/core.c            |  1 +
 kernel/sched/fair.c            | 47 ++++++++++++++++++++++++++++++++++
 kernel/sched/features.h        |  1 +
 kernel/sched/sched.h           |  2 ++
 kernel/sched/topology.c        |  3 ++-
 6 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 816df6cc444e..c03076850a67 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -82,8 +82,14 @@ struct sched_domain_shared {
         atomic_t nr_busy_cpus;
         int has_idle_cores;
         int nr_idle_scan;
+        unsigned long overloaded_cpus[];
 };

+static inline struct cpumask *sdo_mask(struct sched_domain_shared *sds)
+{
+        return to_cpumask(sds->overloaded_cpus);
+}
+
 struct sched_domain {
         /* These fields must be setup */
         struct sched_domain __rcu *parent; /* top domain must be null terminated */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d3e2c5a7c1b7..452eb63ee6f6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5395,6 +5395,7 @@ void scheduler_tick(void)
                 resched_latency = cpu_resched_latency(rq);
         calc_global_load_tick(rq);
         sched_core_tick(rq);
+        update_overloaded_rq(rq);

         rq_unlock(rq, &rf);

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f80ae86bb404..34b1650f85f6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6323,6 +6323,50 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd

 #endif /* CONFIG_SCHED_SMT */

+/* derived from group_is_overloaded() */
+static inline bool rq_overloaded(struct rq *rq, int cpu, unsigned int imbalance_pct)
+{
+        if (rq->nr_running - rq->cfs.idle_h_nr_running <= 1)
+                return false;
+
+        if ((SCHED_CAPACITY_SCALE * 100) <
+            (cpu_util_cfs(cpu) * imbalance_pct))
+                return true;
+
+        if ((SCHED_CAPACITY_SCALE * imbalance_pct) <
+            (cpu_runnable(rq) * 100))
+                return true;
+
+        return false;
+}
+
+void update_overloaded_rq(struct rq *rq)
+{
+        struct sched_domain_shared *sds;
+        struct sched_domain *sd;
+        int cpu;
+
+        if (!sched_feat(SIS_FILTER))
+                return;
+
+        cpu = cpu_of(rq);
+        sd = rcu_dereference(per_cpu(sd_llc, cpu));
+        if (unlikely(!sd))
+                return;
+
+        sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+        if (unlikely(!sds))
+                return;
+
+        if (rq_overloaded(rq, cpu, sd->imbalance_pct)) {
+                /* avoid duplicated write, mitigate cache contention */
+                if (!cpumask_test_cpu(cpu, sdo_mask(sds)))
+                        cpumask_set_cpu(cpu, sdo_mask(sds));
+        } else {
+                if (cpumask_test_cpu(cpu, sdo_mask(sds)))
+                        cpumask_clear_cpu(cpu, sdo_mask(sds));
+        }
+}
 /*
  * Scan the LLC domain for idle CPUs; this is dynamically regulated by
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
@@ -6383,6 +6427,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
                 }
         }

+        if (sched_feat(SIS_FILTER) && !has_idle_core && sd->shared)
+                cpumask_andnot(cpus, cpus, sdo_mask(sd->shared));
+
         for_each_cpu_wrap(cpu, cpus, target + 1) {
                 if (has_idle_core) {
                         i = select_idle_core(p, cpu, cpus, &idle_cpu);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ee7f23c76bd3..1bebdb87c2f4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -62,6 +62,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
  */
 SCHED_FEAT(SIS_PROP, false)
 SCHED_FEAT(SIS_UTIL, true)
+SCHED_FEAT(SIS_FILTER, true)

 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 02c970501295..316127ab1ec7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1812,6 +1812,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)

 extern int group_balance_cpu(struct sched_group *sg);

+void update_overloaded_rq(struct rq *rq);
+
 #ifdef CONFIG_SCHED_DEBUG
 void update_sched_domain_debugfs(void);
 void dirty_sched_domain_sysctl(int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8739c2a5a54e..0d149e76a3b3 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1641,6 +1641,7 @@ sd_init(struct sched_domain_topology_level *tl,
                 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
                 atomic_inc(&sd->shared->ref);
                 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
+                cpumask_clear(sdo_mask(sd->shared));
         }

         sd->private = sdd;
@@ -2106,7 +2107,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)

                         *per_cpu_ptr(sdd->sd, j) = sd;

-                        sds = kzalloc_node(sizeof(struct sched_domain_shared),
+                        sds = kzalloc_node(sizeof(struct sched_domain_shared) + cpumask_size(),
                                         GFP_KERNEL, cpu_to_node(j));
                         if (!sds)
                                 return -ENOMEM;
--
2.25.1