[PATCH] cpufreq: intel_pstate: Generic governors support

From: Rafael J. Wysocki
Date: Sun Nov 13 2016 - 19:00:18 EST


From: Rafael J. Wysocki <rafael.j.wysocki@xxxxxxxxx>

There may be reasons to use generic cpufreq governors (e.g. schedutil)
on Intel platforms instead of the intel_pstate driver's internal
governor. However, that currently can only be done by disabling
intel_pstate altogether and using the acpi-cpufreq driver instead
of it, which is subject to limitations.

First of all, acpi-cpufreq only works on systems where the _PSS
object is present in the ACPI tables for all logical CPUs. Second,
on those systems acpi-cpufreq will only use frequencies listed by
_PSS, which may be suboptimal. In particular, by convention, the
whole turbo range is represented in _PSS as a single P-state and
the frequency assigned to it is greater by 1 MHz than the greatest
non-turbo frequency listed by _PSS. That may mislead governors into
using turbo frequencies less often than they should, which may lead
to suboptimal performance.

For this reason, make it possible to use the intel_pstate driver
with generic cpufreq governors as a "normal" cpufreq driver. That
mode is enforced by adding intel_pstate=passive to the kernel
command line and cannot be disabled at run time. In that mode,
intel_pstate provides a cpufreq driver interface including
the ->target() and ->fast_switch() callbacks and is listed in
scaling_driver as "intel_cpufreq".

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@xxxxxxxxx>
---

Rebased on top of my linux-next branch, which in turn is based on 4.9-rc5 now.

I'm running this on my IVB laptop w/ the schedutil governor, no problems so
far (fingers crossed).

Thanks,
Rafael

---
Documentation/kernel-parameters.txt | 6 +
drivers/cpufreq/intel_pstate.c | 169 ++++++++++++++++++++++++++++++------
2 files changed, 151 insertions(+), 24 deletions(-)

Index: linux-pm/drivers/cpufreq/intel_pstate.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/intel_pstate.c
+++ linux-pm/drivers/cpufreq/intel_pstate.c
@@ -37,6 +37,8 @@
#include <asm/cpufeature.h>
#include <asm/intel-family.h>

+#define INTEL_CPUFREQ_TRANSITION_LATENCY 20000
+
#define ATOM_RATIOS 0x66a
#define ATOM_VIDS 0x66b
#define ATOM_TURBO_RATIOS 0x66c
@@ -122,6 +124,8 @@ struct sample {
* @scaling: Scaling factor to convert frequency to cpufreq
* frequency units
* @turbo_pstate: Max Turbo P state possible for this platform
+ * @max_freq: @max_pstate frequency in cpufreq units
+ * @turbo_freq: @turbo_pstate frequency in cpufreq units
*
* Stores the per cpu model P state limits and current P state.
*/
@@ -132,6 +136,8 @@ struct pstate_data {
int max_pstate_physical;
int scaling;
int turbo_pstate;
+ unsigned int max_freq;
+ unsigned int turbo_freq;
};

/**
@@ -470,7 +476,7 @@ static void intel_pstate_init_acpi_perf_
{
}

-static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
+static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
{
}
#endif
@@ -1225,6 +1231,8 @@ static void intel_pstate_get_cpu_pstates
cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical();
cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
cpu->pstate.scaling = pstate_funcs.get_scaling();
+ cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling;
+ cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling;

if (pstate_funcs.get_vid)
pstate_funcs.get_vid(cpu);
@@ -1367,8 +1375,6 @@ static inline void intel_pstate_update_p
{
int max_perf, min_perf;

- update_turbo_state();
-
intel_pstate_get_min_max(cpu, &min_perf, &max_perf);
pstate = clamp_t(int, pstate, min_perf, max_perf);
trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);
@@ -1389,6 +1395,8 @@ static inline void intel_pstate_adjust_b
target_pstate = cpu->policy == CPUFREQ_POLICY_PERFORMANCE ?
cpu->pstate.turbo_pstate : pstate_funcs.get_target_pstate(cpu);

+ update_turbo_state();
+
intel_pstate_update_pstate(cpu, target_pstate);

sample = &cpu->sample;
@@ -1670,22 +1678,21 @@ static int intel_pstate_verify_policy(st
return 0;
}

-static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
+static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy)
{
- int cpu_num = policy->cpu;
- struct cpudata *cpu = all_cpu_data[cpu_num];
-
- pr_debug("CPU %d exiting\n", cpu_num);
-
- intel_pstate_clear_update_util_hook(cpu_num);
+ intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
+}

- if (hwp_active)
- return;
+static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
+{
+ pr_debug("CPU %d exiting\n", policy->cpu);

- intel_pstate_set_min_pstate(cpu);
+ intel_pstate_clear_update_util_hook(policy->cpu);
+ if (!hwp_active)
+ intel_cpufreq_stop_cpu(policy);
}

-static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
+static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
{
struct cpudata *cpu;
int rc;
@@ -1696,11 +1703,6 @@ static int intel_pstate_cpu_init(struct

cpu = all_cpu_data[policy->cpu];

- if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100)
- policy->policy = CPUFREQ_POLICY_PERFORMANCE;
- else
- policy->policy = CPUFREQ_POLICY_POWERSAVE;
-
/*
* We need sane value in the cpu->perf_limits, so inherit from global
* perf_limits limits, which are seeded with values based on the
@@ -1720,9 +1722,11 @@ static int intel_pstate_cpu_init(struct
policy->cpuinfo.max_freq *= cpu->pstate.scaling;

intel_pstate_init_acpi_perf_limits(policy);
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
+ policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
cpumask_set_cpu(policy->cpu, policy->cpus);

+ policy->fast_switch_possible = true;
+
return 0;
}

@@ -1730,10 +1734,27 @@ static int intel_pstate_cpu_exit(struct
{
intel_pstate_exit_perf_limits(policy);

+ policy->fast_switch_possible = false;
+
return 0;
}

-static struct cpufreq_driver intel_pstate_driver = {
+static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
+{
+ int ret = intel_cpufreq_cpu_init(policy);
+
+ if (ret)
+ return ret;
+
+ if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100)
+ policy->policy = CPUFREQ_POLICY_PERFORMANCE;
+ else
+ policy->policy = CPUFREQ_POLICY_POWERSAVE;
+
+ return 0;
+}
+
+static struct cpufreq_driver intel_pstate = {
.flags = CPUFREQ_CONST_LOOPS,
.verify = intel_pstate_verify_policy,
.setpolicy = intel_pstate_set_policy,
@@ -1745,6 +1766,99 @@ static struct cpufreq_driver intel_pstat
.name = "intel_pstate",
};

+static int intel_cpufreq_verify_policy(struct cpufreq_policy *policy)
+{
+ struct cpudata *cpu = all_cpu_data[policy->cpu];
+ struct perf_limits *perf_limits = limits;
+
+ update_turbo_state();
+ policy->cpuinfo.max_freq = limits->turbo_disabled ?
+ cpu->pstate.max_freq : cpu->pstate.turbo_freq;
+
+ cpufreq_verify_within_cpu_limits(policy);
+
+ if (per_cpu_limits)
+ perf_limits = cpu->perf_limits;
+
+ intel_pstate_update_perf_limits(policy, perf_limits);
+
+ return 0;
+}
+
+static unsigned int intel_cpufreq_turbo_update(struct cpudata *cpu,
+ struct cpufreq_policy *policy,
+ unsigned int target_freq)
+{
+ unsigned int max_freq;
+
+ update_turbo_state();
+
+ max_freq = limits->no_turbo || limits->turbo_disabled ?
+ cpu->pstate.max_freq : cpu->pstate.turbo_freq;
+ policy->cpuinfo.max_freq = max_freq;
+ if (policy->max > max_freq)
+ policy->max = max_freq;
+
+ if (target_freq > max_freq)
+ target_freq = max_freq;
+
+ return target_freq;
+}
+
+static int intel_cpufreq_target(struct cpufreq_policy *policy,
+ unsigned int target_freq,
+ unsigned int relation)
+{
+ struct cpudata *cpu = all_cpu_data[policy->cpu];
+ struct cpufreq_freqs freqs;
+ int target_pstate;
+
+ freqs.old = policy->cur;
+ freqs.new = intel_cpufreq_turbo_update(cpu, policy, target_freq);
+
+ cpufreq_freq_transition_begin(policy, &freqs);
+ switch (relation) {
+ case CPUFREQ_RELATION_L:
+ target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling);
+ break;
+ case CPUFREQ_RELATION_H:
+ target_pstate = freqs.new / cpu->pstate.scaling;
+ break;
+ default:
+ target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling);
+ break;
+ }
+ intel_pstate_update_pstate(cpu, target_pstate);
+ cpufreq_freq_transition_end(policy, &freqs, false);
+
+ return 0;
+}
+
+static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy,
+ unsigned int target_freq)
+{
+ struct cpudata *cpu = all_cpu_data[policy->cpu];
+ int target_pstate;
+
+ target_freq = intel_cpufreq_turbo_update(cpu, policy, target_freq);
+ target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
+ intel_pstate_update_pstate(cpu, target_pstate);
+ return target_freq;
+}
+
+static struct cpufreq_driver intel_cpufreq = {
+ .flags = CPUFREQ_CONST_LOOPS,
+ .verify = intel_cpufreq_verify_policy,
+ .target = intel_cpufreq_target,
+ .fast_switch = intel_cpufreq_fast_switch,
+ .init = intel_cpufreq_cpu_init,
+ .exit = intel_pstate_cpu_exit,
+ .stop_cpu = intel_cpufreq_stop_cpu,
+ .name = "intel_cpufreq",
+};
+
+static struct cpufreq_driver *intel_pstate_driver = &intel_pstate;
+
static int no_load __initdata;
static int no_hwp __initdata;
static int hwp_only __initdata;
@@ -1963,7 +2077,7 @@ hwp_cpu_matched:
if (!hwp_active && hwp_only)
goto out;

- rc = cpufreq_register_driver(&intel_pstate_driver);
+ rc = cpufreq_register_driver(intel_pstate_driver);
if (rc)
goto out;

@@ -1978,7 +2092,9 @@ out:
get_online_cpus();
for_each_online_cpu(cpu) {
if (all_cpu_data[cpu]) {
- intel_pstate_clear_update_util_hook(cpu);
+ if (intel_pstate_driver == &intel_pstate)
+ intel_pstate_clear_update_util_hook(cpu);
+
kfree(all_cpu_data[cpu]);
}
}
@@ -1994,8 +2110,13 @@ static int __init intel_pstate_setup(cha
if (!str)
return -EINVAL;

- if (!strcmp(str, "disable"))
+ if (!strcmp(str, "disable")) {
no_load = 1;
+ } else if (!strcmp(str, "passive")) {
+ pr_info("Passive mode enabled\n");
+ intel_pstate_driver = &intel_cpufreq;
+ no_hwp = 1;
+ }
if (!strcmp(str, "no_hwp")) {
pr_info("HWP disabled\n");
no_hwp = 1;
Index: linux-pm/Documentation/kernel-parameters.txt
===================================================================
--- linux-pm.orig/Documentation/kernel-parameters.txt
+++ linux-pm/Documentation/kernel-parameters.txt
@@ -1760,6 +1760,12 @@ bytes respectively. Such letter suffixes
disable
Do not enable intel_pstate as the default
scaling driver for the supported processors
+ passive
+ Use intel_pstate as a scaling driver, but configure it
+ to work with generic cpufreq governors (instead of
+ enabling its internal governor). This mode cannot be
+ used along with the hardware-managed P-states (HWP)
+ feature.
force
Enable intel_pstate on systems that prohibit it by default
in favor of acpi-cpufreq. Forcing the intel_pstate driver