Re: [PATCH 2/5] x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF

From: Rafael J. Wysocki
Date: Fri Jun 16 2017 - 20:37:36 EST


On Wednesday, June 07, 2017 07:39:13 PM Len Brown wrote:
> From: Len Brown <len.brown@xxxxxxxxx>
>
> The goal of this change is to give users a uniform and meaningful
> result when they read /sys/...cpufreq/scaling_cur_freq
> on modern x86 hardware, as compared to what they get today.
>
> Modern x86 processors include the hardware needed
> to accurately calculate frequency over an interval --
> APERF, MPERF, and the TSC.
>
> Here we provide an x86 routine to make this calculation
> on supported hardware, and use it in preference to any
> driver driver-specific cpufreq_driver.get() routine.
>
> MHz is computed like so:
>
> MHz = base_MHz * delta_APERF / delta_MPERF
>
> MHz is the average frequency of the busy processor
> over a measurement interval. The interval is
> defined to be the time between successive invocations
> of aperfmperf_khz_on_cpu(), which are expected to to
> happen on-demand when users read sysfs attribute
> cpufreq/scaling_cur_freq.
>
> As with previous methods of calculating MHz,
> idle time is excluded.
>
> base_MHz above is from TSC calibration global "cpu_khz".
>
> This x86 native method to calculate MHz returns a meaningful result
> no matter if P-states are controlled by hardware or firmware
> and/or if the Linux cpufreq sub-system is or is-not installed.
>
> When this routine is invoked more frequently, the measurement
> interval becomes shorter. However, the code limits re-computation
> to 10ms intervals so that average frequency remains meaningful.
>
> Discerning users are encouraged to take advantage of
> the turbostat(8) utility, which can gracefully handle
> concurrent measurement intervals of arbitrary length.
>
> Signed-off-by: Len Brown <len.brown@xxxxxxxxx>
> ---
> arch/x86/kernel/cpu/Makefile | 1 +
> arch/x86/kernel/cpu/aperfmperf.c | 82 ++++++++++++++++++++++++++++++++++++++++
> drivers/cpufreq/cpufreq.c | 7 +++-
> include/linux/cpufreq.h | 13 +++++++
> 4 files changed, 102 insertions(+), 1 deletion(-)
> create mode 100644 arch/x86/kernel/cpu/aperfmperf.c
>
> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
> index 5200001..cdf8249 100644
> --- a/arch/x86/kernel/cpu/Makefile
> +++ b/arch/x86/kernel/cpu/Makefile
> @@ -21,6 +21,7 @@ obj-y += common.o
> obj-y += rdrand.o
> obj-y += match.o
> obj-y += bugs.o
> +obj-$(CONFIG_CPU_FREQ) += aperfmperf.o
>
> obj-$(CONFIG_PROC_FS) += proc.o
> obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
> diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
> new file mode 100644
> index 0000000..5ccf63a
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/aperfmperf.c
> @@ -0,0 +1,82 @@
> +/*
> + * x86 APERF/MPERF KHz calculation for
> + * /sys/.../cpufreq/scaling_cur_freq
> + *
> + * Copyright (C) 2017 Intel Corp.
> + * Author: Len Brown <len.brown@xxxxxxxxx>
> + *
> + * This file is licensed under GPLv2.
> + */
> +
> +#include <linux/jiffies.h>
> +#include <linux/math64.h>
> +#include <linux/percpu.h>
> +#include <linux/smp.h>
> +
> +struct aperfmperf_sample {
> + unsigned int khz;
> + unsigned long jiffies;
> + u64 aperf;
> + u64 mperf;
> +};
> +
> +static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
> +
> +/*
> + * aperfmperf_snapshot_khz()
> + * On the current CPU, snapshot APERF, MPERF, and jiffies
> + * unless we already did it within 10ms
> + * calculate kHz, save snapshot
> + */
> +static void aperfmperf_snapshot_khz(void *dummy)
> +{
> + u64 aperf, aperf_delta;
> + u64 mperf, mperf_delta;
> + struct aperfmperf_sample *s = &get_cpu_var(samples);
> +
> + /* Don't bother re-computing within 10 ms */
> + if (time_before(jiffies, s->jiffies + HZ/100))
> + goto out;
> +
> + rdmsrl(MSR_IA32_APERF, aperf);
> + rdmsrl(MSR_IA32_MPERF, mperf);
> +
> + aperf_delta = aperf - s->aperf;
> + mperf_delta = mperf - s->mperf;
> +
> + /*
> + * There is no architectural guarantee that MPERF
> + * increments faster than we can read it.
> + */
> + if (mperf_delta == 0)
> + goto out;
> +
> + /*
> + * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then
> + * khz = (cpu_khz * aperf_delta) / mperf_delta
> + */
> + if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta)
> + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
> + else /* khz = aperf_delta / (mperf_delta / cpu_khz) */
> + s->khz = div64_u64(aperf_delta,
> + div64_u64(mperf_delta, cpu_khz));
> + s->jiffies = jiffies;
> + s->aperf = aperf;
> + s->mperf = mperf;
> +
> +out:
> + put_cpu_var(samples);
> +}
> +
> +unsigned int aperfmperf_khz_on_cpu(unsigned int cpu)
> +{
> + if (!cpu_khz)
> + return 0;
> +
> + if (!static_cpu_has(X86_FEATURE_APERFMPERF))
> + return 0;
> +
> + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
> +
> + return per_cpu(samples.khz, cpu);
> +}
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 26b643d..a667fac 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -635,8 +635,13 @@ show_one(scaling_max_freq, max);
> static ssize_t show_scaling_cur_freq(struct cpufreq_policy *policy, char *buf)
> {
> ssize_t ret;
> + unsigned int freq;
>
> - if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get)
> + freq = arch_freq_get_on_cpu(policy->cpu);
> + if (freq)
> + ret = sprintf(buf, "%u\n", freq);
> + else if (cpufreq_driver && cpufreq_driver->setpolicy &&
> + cpufreq_driver->get)
> ret = sprintf(buf, "%u\n", cpufreq_driver->get(policy->cpu));
> else
> ret = sprintf(buf, "%u\n", policy->cur);

I wonder if we could change intel_pstate_get() to simply return
aperfmperf_khz_on_cpu(cpu_num)?

That would allow us to avoid the extra branch here and get rid of the
#ifdef x86 from the header.

> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index a5ce0bbe..cfc6220 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -883,6 +883,19 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy)
> }
> #endif
>
> +#ifdef CONFIG_X86
> +extern unsigned int aperfmperf_khz_on_cpu(unsigned int cpu);
> +static inline unsigned int arch_freq_get_on_cpu(int cpu)
> +{
> + return aperfmperf_khz_on_cpu(cpu);
> +}
> +#else
> +static inline unsigned int arch_freq_get_on_cpu(int cpu)
> +{
> + return 0;
> +}
> +#endif
> +
> /* the following are really really optional */
> extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs;
> extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs;
>

Thanks,
Rafael