[PATCH v3] sched/topology: remove sysctl_sched_energy_aware depending on the architecture

From: Shrikanth Hegde
Date: Wed Sep 13 2023 - 07:49:11 EST


sysctl_sched_energy_aware is available for the admin to disable/enable
energy aware scheduling(EAS). EAS is enabled only if few conditions are
met by the platform. They are, asymmetric CPU capacity, no SMT,
valid cpufreq policy, frequency invariant load tracking. It is possible
platform when booting may not have EAS capability, but can do that after.
For example, changing/registering the cpufreq policy.

At present, though platform doesn't support EAS, this sysctl is still
present and it ends up calling rebuild of sched domain on write to 1 and
NOP when writing to 0. That is confusing and un-necessary.

Desired behaviour can be, have the sysctl only when the platform can do
EAS. i.e when platform becomes capable enable the sysctl and when it can't
remove the sysctl. On Supported platform using this sysctl, admin should be
able to enable/disable EAS.

Update the sysctl guide as well.

Different Scenarios:
Scenario 1: System while booting has EAS capability.
Expected: sysctl will be present and admin can enable/disable EAS by writing
1 or 0 respectively. This operation shouldn't remove the sysctl specially when
disabling as sysctl would be needed to enable it later.
Scenario 2: System becomes capable of EAS later
Expected: At boot, sysctl will not be present. Once eas is enabled by passing
all the checks, perf domains will be built and sysctl will be enabled. Any
further change to sysctl should be treated same as Scenario 1.
Scenario 3: system becomes not capable of EAS.
Expected: Since EAS is going to be disabled now, remove the sysctl in this
scenario. If it becomes capable of EAS later again, that would be Scenario 2.

v2->v3:
Chen Yu and Pierre Gondois both pointed out that if platform becomes
capable of EAS later, this patch was not allowing that to happen.
Addressed that by using a variable to indicate the sysctl change
and re-worded the commit message with desired behaviour,
v1->v2:
Chen Yu had pointed out that this will not destroy the perf domains on
architectures where EAS is supported by changing the sysctl.
[v1] Link: https://lore.kernel.org/lkml/20230829065040.920629-1-sshegde@xxxxxxxxxxxxxxxxxx/
[v2] Link: https://lore.kernel.org/lkml/20230901065249.137242-1-sshegde@xxxxxxxxxxxxxxxxxx/

Signed-off-by: Shrikanth Hegde <sshegde@xxxxxxxxxxxxxxxxxx>
---
Documentation/admin-guide/sysctl/kernel.rst | 3 +-
kernel/sched/topology.c | 49 +++++++++++++++++----
2 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 3800fab1619b..455e12f1331b 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -1134,7 +1134,8 @@ automatically on platforms where it can run (that is,
platforms with asymmetric CPU topologies and having an Energy
Model available). If your platform happens to meet the
requirements for EAS but you do not want to use it, change
-this value to 0.
+this value to 0. If platform doesn't support EAS at this moment,
+this would be removed.

task_delayacct
===============
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 05a5bc678c08..57df938d5ec0 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -208,9 +208,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)

#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
DEFINE_STATIC_KEY_FALSE(sched_energy_present);
-static unsigned int sysctl_sched_energy_aware = 1;
+static unsigned int sysctl_sched_energy_aware;
+static struct ctl_table_header *sysctl_eas_header;
static DEFINE_MUTEX(sched_energy_mutex);
static bool sched_energy_update;
+static bool is_sysctl_eas_changing;

void rebuild_sched_domains_energy(void)
{
@@ -226,6 +228,7 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret, state;
+ int prev_val = sysctl_sched_energy_aware;

if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -233,8 +236,13 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write,
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
state = static_branch_unlikely(&sched_energy_present);
- if (state != sysctl_sched_energy_aware)
+ if (state != sysctl_sched_energy_aware && prev_val != sysctl_sched_energy_aware) {
+ is_sysctl_eas_changing = true;
+ if (sysctl_sched_energy_aware && !state)
+ pr_warn("Attempt to build energy domains when EAS is disabled\n");
rebuild_sched_domains_energy();
+ is_sysctl_eas_changing = false;
+ }
}

return ret;
@@ -255,7 +263,14 @@ static struct ctl_table sched_energy_aware_sysctls[] = {

static int __init sched_energy_aware_sysctl_init(void)
{
- register_sysctl_init("kernel", sched_energy_aware_sysctls);
+ int cpu = cpumask_first(cpu_active_mask);
+
+ if (sched_smt_active() || !per_cpu(sd_asym_cpucapacity, cpu) ||
+ !arch_scale_freq_invariant())
+ return 0;
+
+ sysctl_eas_header = register_sysctl("kernel", sched_energy_aware_sysctls);
+ sysctl_sched_energy_aware = 1;
return 0;
}

@@ -336,10 +351,29 @@ static void sched_energy_set(bool has_eas)
if (sched_debug())
pr_info("%s: stopping EAS\n", __func__);
static_branch_disable_cpuslocked(&sched_energy_present);
+#ifdef CONFIG_PROC_SYSCTL
+ /*
+ * if the architecture supports EAS and forcefully
+ * perf domains are destroyed, there should be a sysctl
+ * to enable it later. If this was due to dynamic system
+ * change such as SMT<->NON_SMT then remove sysctl.
+ */
+ if (sysctl_eas_header && !is_sysctl_eas_changing) {
+ unregister_sysctl_table(sysctl_eas_header);
+ sysctl_eas_header = NULL;
+ sysctl_sched_energy_aware = 0;
+ }
+#endif
} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
if (sched_debug())
pr_info("%s: starting EAS\n", __func__);
static_branch_enable_cpuslocked(&sched_energy_present);
+#ifdef CONFIG_PROC_SYSCTL
+ if (!sysctl_eas_header) {
+ sysctl_eas_header = register_sysctl("kernel", sched_energy_aware_sysctls);
+ sysctl_sched_energy_aware = 1;
+ }
+#endif
}
}

@@ -380,15 +414,14 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
struct cpufreq_policy *policy;
struct cpufreq_governor *gov;

- if (!sysctl_sched_energy_aware)
+ if (!sysctl_sched_energy_aware && is_sysctl_eas_changing)
goto free;

/* EAS is enabled for asymmetric CPU capacity topologies. */
if (!per_cpu(sd_asym_cpucapacity, cpu)) {
- if (sched_debug()) {
- pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
- cpumask_pr_args(cpu_map));
- }
+ if (sched_debug())
+ pr_info("rd %*pbl: Disabling EAS, CPUs do not have asymmetric capacities\n",
+ cpumask_pr_args(cpu_map));
goto free;
}

--
2.31.1