Re: [PATCH] PM / EM: Inefficient OPPs detection

From: Quentin Perret
Date: Thu May 20 2021 - 08:19:56 EST


Hi Vincent,

Apologies for the late reply.

On Wednesday 28 Apr 2021 at 15:46:10 (+0100), Vincent Donnefort wrote:
> I do, it shows that the distribution is quite wide. I also have a 95% confidence
> interval, as follow:
> w/o LUT W/ LUT
>
> Mean std Mean std
>
> Phase0: 10791+/-79 2262 10657+/-71 2240 [1]
> Phase1: 2924 +/-19 529 2894 +/-16 513 [1]
> Phase2: 2207 +/-19 535 2162 +/-17 515
> Phase3: 1897 +/-18 504 1864 +/-17 515 [1]
> Phase4: Mid CPU 1700 +/-46 463 1609 +/-26 458
> Phase4: Big CPU 1187 +/-15 407 1129 +/-15 385
> Phase5: 987 +/-14 395 900 +/-12 365
>
>
> [1] I included these results originally as the p-value for the test I used
> showed we can reject confidently the null hypothesis that the 2 samples are
> coming from the same distribution... However, the confidence intervals for
> the mean overlaps. It is then complicated to conclude for those phases.

Aha, thanks for sharing. So yes, it's not all that obvious that we need
the extra complexity of the LUT there :/. In this case I'd be inclined
to suggest a simpler version first, and we can always optimize later.

How much would you hate something like the below (totally untested)? We
still end up with the double lookup in sugov, but as you've pointed out
in another reply we can't easily workaround, and as per the above the
linear search isn't that bad compared to the LUT. Maybe we could try a
few micro-optimizations on top (e.g. binary searching the EM table), but
IIRC that wasn't making much of difference last time I tried.

Thoughts?

Thanks,
Quentin

---
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 757fc60658fa..8320f5e87992 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -22,6 +22,7 @@ struct em_perf_state {
unsigned long frequency;
unsigned long power;
unsigned long cost;
+ unsigned int inefficient;
};

/**
@@ -85,6 +86,25 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
bool milliwatts);
void em_dev_unregister_perf_domain(struct device *dev);

+static inline struct em_perf_state *__em_find_ps(struct em_perf_domain *pd, unsigned long freq)
+{
+ struct em_perf_state *ps;
+ int i;
+
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ ps = &pd->table[i];
+ if (ps->frequency >= freq && !ps->inefficient)
+ break;
+ }
+
+ return ps;
+}
+
+static inline unsigned long em_cpu_find_freq(struct em_perf_domain *pd, unsigned long freq)
+{
+ return pd ? __em_find_ps(pd, freq)->frequency : freq;
+}
+
/**
* em_cpu_energy() - Estimates the energy consumed by the CPUs of a
performance domain
@@ -104,7 +124,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
{
unsigned long freq, scale_cpu;
struct em_perf_state *ps;
- int i, cpu;
+ int cpu;

if (!sum_util)
return 0;
@@ -118,16 +138,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
scale_cpu = arch_scale_cpu_capacity(cpu);
ps = &pd->table[pd->nr_perf_states - 1];
freq = map_util_freq(max_util, ps->frequency, scale_cpu);
-
- /*
- * Find the lowest performance state of the Energy Model above the
- * requested frequency.
- */
- for (i = 0; i < pd->nr_perf_states; i++) {
- ps = &pd->table[i];
- if (ps->frequency >= freq)
- break;
- }
+ ps = __em_find_ps(pd, freq);

/*
* The capacity of a CPU in the domain at the performance state (ps)
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 0f4530b3a8cd..990048e9ec79 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -161,7 +161,8 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
* power efficient than a lower one.
*/
opp_eff = freq / power;
- if (opp_eff >= prev_opp_eff)
+ table[i].inefficient = (opp_eff >= prev_opp_eff);
+ if (table[i].inefficient)
dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n",
i, i - 1);
prev_opp_eff = opp_eff;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 4f09afd2f321..9186d52d8660 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -10,6 +10,7 @@

#include "sched.h"

+#include <linux/energy_model.h>
#include <linux/sched/cpufreq.h>
#include <trace/events/power.h>

@@ -42,6 +43,8 @@ struct sugov_policy {

bool limits_changed;
bool need_freq_update;
+
+ struct em_perf_domain *pd;
};

struct sugov_cpu {
@@ -152,6 +155,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
policy->cpuinfo.max_freq : policy->cur;

freq = map_util_freq(util, freq, max);
+ freq = em_cpu_find_freq(sg_policy->pd, freq);

if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
return sg_policy->next_freq;
@@ -756,6 +760,7 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->cached_raw_freq = 0;

sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
+ sg_policy->pd = em_cpu_get(policy->cpu);

for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);