RE: [PATCH V4 2/2] cpufreq: intel_pstate: Implement QoS supported freq constraints

From: Doug Smythies
Date: Thu Aug 08 2019 - 12:25:42 EST


On 2019.08.07 00:06 Viresh Kumar wrote:

Thanks for your work on this.

> Intel pstate driver exposes min_perf_pct and max_perf_pct sysfs files,
> which can be used to force a limit on the min/max P state of the driver.
> Though these files eventually control the min/max frequencies that the
> CPUs will run at, they don't make a change to policy->min/max values.
>
> When the values of these files are changed (in passive mode of the
> driver), it leads to calling ->limits() callback of the cpufreq
> governors, like schedutil. On a call to it the governors shall
> forcefully update the frequency to come within the limits. Since the
> limits, i.e. policy->min/max, aren't updated by the driver, the
> governors fail to get the target freq within limit and sometimes abort
> the update believing that the frequency is already set to the target
> value.
>
> This patch implements the QoS supported frequency constraints to update
> policy->min/max values whenever min_perf_pct or max_perf_pct files are
> updated. This is only done for the passive mode as of now, as the driver
> is already working fine in active mode.
>
> Fixes: ecd288429126 ("cpufreq: schedutil: Don't set next_freq to UINT_MAX")
> Reported-by: Doug Smythies <dsmythies@xxxxxxxxx>
> Signed-off-by: Viresh Kumar <viresh.kumar@xxxxxxxxxx>

Tested-by: Doug Smythies <dsmythies@xxxxxxxxx>
Thermald seems to now be working O.K. for all the governors.

I do note that if one sets
/sys/devices/system/cpu/cpufreq/policy*/scaling_max_freq
It seems to override subsequent attempts via
/sys/devices/system/cpu/intel_pstate/max_perf_pct.
Myself, I find this confusing.

So the question becomes which one is the "master"?

Example:

# for file in /sys/devices/system/cpu/cpufreq/policy*/scaling_max_freq; do echo "2200000" > $file; done
# cat /sys/devices/system/cpu/cpufreq/policy*/scaling_max_freq
2200000
2200000
2200000
2200000
2200000
2200000
2200000
2200000
root@s15:/home/doug/temp# cat /sys/devices/system/cpu/intel_pstate/max_perf_pct
... (Note: 50% = 1900000)
root@s15:/home/doug/temp# cat /sys/devices/system/cpu/cpufreq/policy*/scaling_max_freq
1900000
1900000
1900000
1900000
1900000
1900000
1900000
1900000
root@s15:/home/doug/temp# echo 100 > /sys/devices/system/cpu/intel_pstate/max_perf_pct
... (Note: 100% = 3800000, and my expectation is 3.8 GHz below)
root@s15:/home/doug/temp# cat /sys/devices/system/cpu/cpufreq/policy*/scaling_max_freq
2200000
2200000
2200000
2200000
2200000
2200000
2200000
2200000

Similarly for the minimum side of things:

root@s15:/home/doug/temp# for file in /sys/devices/system/cpu/cpufreq/policy*/scaling_min_freq; do echo "3200000" > $file; done
root@s15:/home/doug/temp# cat /sys/devices/system/cpu/cpufreq/policy*/scaling_min_freq
3200000
3200000
3200000
3200000
3200000
3200000
3200000
3200000
root@s15:/home/doug/temp# echo 42 > /sys/devices/system/cpu/intel_pstate/min_perf_pct
root@s15:/home/doug/temp# cat /sys/devices/system/cpu/intel_pstate/min_perf_pct
42 ... (note 42% = 1600000 = processor minimum, and that is my expectation below.)
root@s15:/home/doug/temp# cat /sys/devices/system/cpu/cpufreq/policy*/scaling_min_freq
3200000
3200000
3200000
3200000
3200000
3200000
3200000
3200000

I thought these minimum anomalies would cause problems for thermald, but
for whatever reason, it seems to work properly.

> ---
> V3->V4:
> - Reimplemented the solution using QoS constraints instead of
> resolve_freq() callback.
>
> drivers/cpufreq/intel_pstate.c | 120 +++++++++++++++++++++++++++++++--
> 1 file changed, 116 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index cc27d4c59dca..e9fbd6c36822 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -24,6 +24,7 @@
> #include <linux/fs.h>
> #include <linux/acpi.h>
> #include <linux/vmalloc.h>
> +#include <linux/pm_qos.h>
> #include <trace/events/power.h>
>
> #include <asm/div64.h>
> @@ -1085,6 +1086,47 @@ static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b,
> return count;
> }
>
> +static struct cpufreq_driver intel_pstate;
> +
> +static void update_qos_request(enum dev_pm_qos_req_type type)
> +{
> + int max_state, turbo_max, freq, i, perf_pct;
> + struct dev_pm_qos_request *req;
> + struct cpufreq_policy *policy;
> +
> + for_each_possible_cpu(i) {
> + struct cpudata *cpu = all_cpu_data[i];
> +
> + policy = cpufreq_cpu_get(i);
> + if (!policy)
> + continue;
> +
> + req = policy->driver_data;
> + cpufreq_cpu_put(policy);
> +
> + if (!req)
> + continue;
> +
> + if (hwp_active)
> + intel_pstate_get_hwp_max(i, &turbo_max, &max_state);
> + else
> + turbo_max = cpu->pstate.turbo_pstate;
> +
> + if (type == DEV_PM_QOS_MIN_FREQUENCY) {

Is it O.K. to assume that if the passed request type is
not DEV_PM_QOS_MIN_FREQUENCY
then it must be
DEV_PM_QOS_MAX_FREQUENCY?

That holds within this patch, but what about in the future?

> + perf_pct = global.min_perf_pct;
> + } else {
> + req++;
> + perf_pct = global.max_perf_pct;
> + }
> +
> + freq = DIV_ROUND_UP(turbo_max * perf_pct, 100);
> + freq *= cpu->pstate.scaling;
> +
> + if (dev_pm_qos_update_request(req, freq))
> + pr_warn("Failed to update freq constraint: CPU%d\n", i);

I get many of these messages (4520 so far, always in groups of 8 (I have 8 CPUs)),
and have yet to figure out exactly why. It seems to actually be working fine.

> + }
> +}
> +
> static ssize_t store_max_perf_pct(struct kobject *a, struct kobj_attribute *b,
> const char *buf, size_t count)
> {
> @@ -1108,7 +1150,10 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct kobj_attribute *b,
>
> mutex_unlock(&intel_pstate_limits_lock);
>
> - intel_pstate_update_policies();
> + if (intel_pstate_driver == &intel_pstate)
> + intel_pstate_update_policies();
> + else
> + update_qos_request(DEV_PM_QOS_MAX_FREQUENCY);
>
> mutex_unlock(&intel_pstate_driver_lock);
>
> @@ -1139,7 +1184,10 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct kobj_attribute *b,
>
> mutex_unlock(&intel_pstate_limits_lock);
>
> - intel_pstate_update_policies();
> + if (intel_pstate_driver == &intel_pstate)
> + intel_pstate_update_policies();
> + else
> + update_qos_request(DEV_PM_QOS_MIN_FREQUENCY);
>
> mutex_unlock(&intel_pstate_driver_lock);
>
> @@ -2332,8 +2380,16 @@ static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy,
>
> static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
> {
> - int ret = __intel_pstate_cpu_init(policy);
> + int max_state, turbo_max, min_freq, max_freq, ret;
> + struct dev_pm_qos_request *req;
> + struct cpudata *cpu;
> + struct device *dev;
> +
> + dev = get_cpu_device(policy->cpu);
> + if (!dev)
> + return -ENODEV;
>
> + ret = __intel_pstate_cpu_init(policy);
> if (ret)
> return ret;
>
> @@ -2342,7 +2398,63 @@ static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
> /* This reflects the intel_pstate_get_cpu_pstates() setting. */
> policy->cur = policy->cpuinfo.min_freq;
>
> + req = kcalloc(2, sizeof(*req), GFP_KERNEL);
> + if (!req) {
> + ret = -ENOMEM;
> + goto pstate_exit;
> + }
> +
> + cpu = all_cpu_data[policy->cpu];
> +
> + if (hwp_active)
> + intel_pstate_get_hwp_max(policy->cpu, &turbo_max, &max_state);
> + else
> + turbo_max = cpu->pstate.turbo_pstate;
> +
> + min_freq = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100);
> + min_freq *= cpu->pstate.scaling;
> + max_freq = DIV_ROUND_UP(turbo_max * global.max_perf_pct, 100);
> + max_freq *= cpu->pstate.scaling;
> +
> + ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_MIN_FREQUENCY,
> + min_freq);
> + if (ret < 0) {
> + dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret);
> + goto free_req;
> + }
> +
> + ret = dev_pm_qos_add_request(dev, req + 1, DEV_PM_QOS_MAX_FREQUENCY,
> + max_freq);
> + if (ret < 0) {
> + dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret);
> + goto remove_min_req;
> + }
> +
> + policy->driver_data = req;
> +
> return 0;
> +
> +remove_min_req:
> + dev_pm_qos_remove_request(req);
> +free_req:
> + kfree(req);
> +pstate_exit:
> + intel_pstate_exit_perf_limits(policy);
> +
> + return ret;
> +}
> +
> +static int intel_cpufreq_cpu_exit(struct cpufreq_policy *policy)
> +{
> + struct dev_pm_qos_request *req;
> +
> + req = policy->driver_data;
> +
> + dev_pm_qos_remove_request(req + 1);
> + dev_pm_qos_remove_request(req);
> + kfree(req);
> +
> + return intel_pstate_cpu_exit(policy);
> }
>
> static struct cpufreq_driver intel_cpufreq = {
> @@ -2351,7 +2463,7 @@ static struct cpufreq_driver intel_cpufreq = {
> .target = intel_cpufreq_target,
> .fast_switch = intel_cpufreq_fast_switch,
> .init = intel_cpufreq_cpu_init,
> - .exit = intel_pstate_cpu_exit,
> + .exit = intel_cpufreq_cpu_exit,
> .stop_cpu = intel_cpufreq_stop_cpu,
> .update_limits = intel_pstate_update_limits,
> .name = "intel_cpufreq",
> --
> 2.21.0.rc0.269.g1a574e7a288b