[RFC PATCH 3/3] sched/fair: Traverse cpufreq policies to detect capacity inversion

From: Qais Yousef
Date: Sun Nov 27 2022 - 09:18:22 EST


We used performance domains (each covering the CPUs that share a cpufreq
policy) to detect when a domain is so severely impacted by thermal
pressure that its remaining capacity drops below the original capacity
of another domain in the system - capacity inversion. For example, a
big CPU throttled hard enough that its capacity falls below a medium
CPU's capacity_orig is inverted.

Since performance domains are only available when the Energy Model or
schedutil is in use, this detection mechanism is unavailable for
Capacity Aware Scheduling (CAS).

Since we only care about reading the capacity_orig() of any CPU within
such a domain, export for_each_active_policy() so that we can traverse
the cpufreq policies instead of the performance domains.

Introduce a new for_each_active_policy_safe() to protect against races
with deletion. Races against additions are fine: we can't eliminate
them without heavy-handed locking, which is unacceptable in this path,
and a policy we miss should be visible on the next tick.
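
For example, with the exported iterator the consumer side of the check
reduces to roughly the following shape (illustrative sketch only; the
actual change is in the fair.c hunk below, which keeps the rest of the
inversion logic intact):

	struct cpufreq_policy *policy, __maybe_unused *policy_n;

	for_each_active_policy_safe(policy, policy_n) {
		unsigned long pd_cap_orig;

		/* We can't be inverted against our own policy */
		if (cpumask_test_cpu(cpu_of(rq), policy->cpus))
			continue;

		/* capacity_orig is uniform within a policy, any CPU will do */
		pd_cap_orig = arch_scale_cpu_capacity(cpumask_any(policy->cpus));

		/*
		 * ... compare pd_cap_orig (and its thermally capped value)
		 * against this CPU's inv_cap as before ...
		 */
	}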

Fixes: 44c7b80bffc3 ("sched/fair: Detect capacity inversion")
Signed-off-by: Qais Yousef (Google) <qyousef@xxxxxxxxxxx>
---

Rafael, Viresh, I hope it's okay to export these macros in the public header,
and that my usage is correct; I'm not sure whether I've missed any important
locking rules.
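
For reference, with CONFIG_CPU_FREQ=y, for_each_active_policy_safe(policy, tmp)
expands to:

	list_for_each_entry_safe(policy, tmp, &cpufreq_policy_list, policy_list)
		if ((true) == !policy_is_inactive(policy))

i.e. the same active-policy filtering the driver-internal iterator already
did, with the next entry cached in 'tmp' before each run of the body.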


 drivers/cpufreq/cpufreq.c | 12 +-----------
 include/linux/cpufreq.h   | 26 ++++++++++++++++++++++++++
 kernel/sched/fair.c       | 13 +++++--------
 3 files changed, 32 insertions(+), 19 deletions(-)
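
A minimal usage sketch from outside the scheduler (illustrative only, not
part of this patch):

	struct cpufreq_policy *policy, *tmp;

	for_each_active_policy_safe(policy, tmp)
		pr_debug("active policy for CPUs %*pbl\n",
			 cpumask_pr_args(policy->cpus));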

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 69b3d61852ac..b11e7c545fc1 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -31,17 +31,7 @@
 #include <linux/units.h>
 #include <trace/events/power.h>
 
-static LIST_HEAD(cpufreq_policy_list);
-
-/* Macros to iterate over CPU policies */
-#define for_each_suitable_policy(__policy, __active)			 \
-	list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \
-		if ((__active) == !policy_is_inactive(__policy))
-
-#define for_each_active_policy(__policy)		\
-	for_each_suitable_policy(__policy, true)
-#define for_each_inactive_policy(__policy)		\
-	for_each_suitable_policy(__policy, false)
+LIST_HEAD(cpufreq_policy_list);
 
 /* Iterate over governors */
 static LIST_HEAD(cpufreq_governor_list);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index d5595d57f4e5..c3c79d4ad821 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -780,6 +780,32 @@ static inline void dev_pm_opp_free_cpufreq_table(struct device *dev,
 			continue;					\
 		else
 
+#ifdef CONFIG_CPU_FREQ
+extern struct list_head cpufreq_policy_list;
+
+/* Macros to iterate over CPU policies */
+#define for_each_suitable_policy(__policy, __active)			 \
+	list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \
+		if ((__active) == !policy_is_inactive(__policy))
+
+#define for_each_suitable_policy_safe(__policy, __n, __active)		\
+	list_for_each_entry_safe(__policy, __n, &cpufreq_policy_list, policy_list) \
+		if ((__active) == !policy_is_inactive(__policy))
+#else
+#define for_each_suitable_policy(__policy, __active) while (0)
+#define for_each_suitable_policy_safe(__policy, __n, __active) while (0)
+#endif
+
+#define for_each_active_policy(__policy) \
+	for_each_suitable_policy(__policy, true)
+#define for_each_inactive_policy(__policy) \
+	for_each_suitable_policy(__policy, false)
+
+#define for_each_active_policy_safe(__policy, __n) \
+	for_each_suitable_policy_safe(__policy, __n, true)
+#define for_each_inactive_policy_safe(__policy, __n) \
+	for_each_suitable_policy_safe(__policy, __n, false)
+
 
 int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
 				    struct cpufreq_frequency_table *table);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c0dd57e562a..4bbbca85134b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8856,23 +8856,20 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 	 *   * Thermal pressure will impact all cpus in this perf domain
 	 *     equally.
 	 */
-	if (sched_energy_enabled()) {
+	if (static_branch_unlikely(&sched_asym_cpucapacity)) {
 		unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
-		struct perf_domain *pd = rcu_dereference(rq->rd->pd);
+		struct cpufreq_policy *policy, __maybe_unused *policy_n;
 
 		rq->cpu_capacity_inverted = 0;
 
-		SCHED_WARN_ON(!rcu_read_lock_held());
-
-		for (; pd; pd = pd->next) {
-			struct cpumask *pd_span = perf_domain_span(pd);
+		for_each_active_policy_safe(policy, policy_n) {
 			unsigned long pd_cap_orig, pd_cap;
 
 			/* We can't be inverted against our own pd */
-			if (cpumask_test_cpu(cpu_of(rq), pd_span))
+			if (cpumask_test_cpu(cpu_of(rq), policy->cpus))
 				continue;
 
-			cpu = cpumask_any(pd_span);
+			cpu = cpumask_any(policy->cpus);
 			pd_cap_orig = arch_scale_cpu_capacity(cpu);
 
 			if (capacity_orig < pd_cap_orig)
--
2.25.1