Re: [PATCH v3] cpufreq: powernv: Add support of frequency domain

From: Rafael J. Wysocki
Date: Fri Feb 09 2018 - 07:20:53 EST


On Monday, January 22, 2018 9:17:34 AM CET Abhishek Goel wrote:
> Frequency-domain indicates group of CPUs that would share same frequency.
> It is detected using device-tree node "frequency-domain-indicator".
> frequency-domain-indicator is a bitmask which will have different value
> depending upon the generation of the processor.
>
> CPUs of the same chip for which the result of a bitwise AND between
> their PIR and the frequency-domain-indicator is the same share the same
> frequency.
>
> In this patch, we define hash-table indexed by the aforementioned
> bitwise ANDed value to store the cpumask of the CPUs sharing the same
> frequency domain. Further, the cpufreq policy will be created per
> frequency-domain
>
> So for POWER9, a cpufreq policy is created per quad while for POWER8 it
> is created per core. Governor decides frequency for each policy but
> multiple cores may come under same policy. In such case frequency needs
> to be set on each core sharing that policy.
>
> Signed-off-by: Abhishek Goel <huntbag@xxxxxxxxxxxxxxxxxx>
> ---
>
> Skiboot patch required for the corresponding device-tree changes have been
> posted here : http://patchwork.ozlabs.org/patch/862256/
>
> drivers/cpufreq/powernv-cpufreq.c | 104 ++++++++++++++++++++++++++++++++++----
> 1 file changed, 95 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
> index b6d7c4c..aab23a4 100644
> --- a/drivers/cpufreq/powernv-cpufreq.c
> +++ b/drivers/cpufreq/powernv-cpufreq.c
> @@ -37,6 +37,7 @@
> #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
> #include <asm/opal.h>
> #include <linux/timer.h>
> +#include <linux/hashtable.h>
>
> #define POWERNV_MAX_PSTATES 256
> #define PMSR_PSAFE_ENABLE (1UL << 30)
> @@ -130,6 +131,9 @@ enum throttle_reason_type {
> static int nr_chips;
> static DEFINE_PER_CPU(struct chip *, chip_info);
>
> +static u32 freq_domain_indicator;
> +static bool p9_occ_quirk;
> +
> /*
> * Note:
> * The set of pstates consists of contiguous integers.
> @@ -194,6 +198,38 @@ static inline void reset_gpstates(struct cpufreq_policy *policy)
> gpstates->last_gpstate_idx = 0;
> }
>
> +#define SIZE NR_CPUS
> +#define ORDER_FREQ_MAP ilog2(SIZE)
> +
> +static DEFINE_HASHTABLE(freq_domain_map, ORDER_FREQ_MAP);
> +
> +struct hashmap {
> + cpumask_t mask;
> + int chip_id;
> + u32 pir_key;
> + struct hlist_node hash_node;
> +};
> +
> +static void insert(u32 key, int cpu)
> +{
> + struct hashmap *data;
> +
> + hash_for_each_possible(freq_domain_map, data, hash_node, key%SIZE) {
> + if (data->chip_id == cpu_to_chip_id(cpu) &&
> + data->pir_key == key) {
> + cpumask_set_cpu(cpu, &data->mask);
> + return;
> + }
> + }
> +
> + data = kzalloc(sizeof(*data), GFP_KERNEL);
> + hash_add(freq_domain_map, &data->hash_node, key%SIZE);
> + cpumask_set_cpu(cpu, &data->mask);
> + data->chip_id = cpu_to_chip_id(cpu);
> + data->pir_key = key;
> +
> +}
> +
> /*
> * Initialize the freq table based on data obtained
> * from the firmware passed via device-tree
> @@ -206,6 +242,7 @@ static int init_powernv_pstates(void)
> u32 len_ids, len_freqs;
> u32 pstate_min, pstate_max, pstate_nominal;
> u32 pstate_turbo, pstate_ultra_turbo;
> + u32 key;
>
> power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
> if (!power_mgt) {
> @@ -246,9 +283,18 @@ static int init_powernv_pstates(void)
> else
> powernv_pstate_info.wof_enabled = true;
>
> + if (of_device_is_compatible(power_mgt, "freq-domain-v1") &&
> + of_property_read_u32(power_mgt, "ibm,freq-domain-indicator",
> + &freq_domain_indicator))
> + pr_warn("ibm,freq-domain-indicator not found\n");
> +
> + if (of_device_is_compatible(power_mgt, "p9-occ-quirk"))
> + p9_occ_quirk = true;
> +
> next:
> pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min,
> pstate_nominal, pstate_max);
> + pr_info("frequency domain indicator %d", freq_domain_indicator);
> pr_info("Workload Optimized Frequency is %s in the platform\n",
> (powernv_pstate_info.wof_enabled) ? "enabled" : "disabled");
>
> @@ -276,6 +322,15 @@ static int init_powernv_pstates(void)
> return -ENODEV;
> }
>
> + if (freq_domain_indicator) {
> + hash_init(freq_domain_map);
> + for_each_possible_cpu(i) {
> + key = ((u32) get_hard_smp_processor_id(i) &
> + freq_domain_indicator);
> + insert(key, i);
> + }
> + }
> +
> powernv_pstate_info.nr_pstates = nr_pstates;
> pr_debug("NR PStates %d\n", nr_pstates);
> for (i = 0; i < nr_pstates; i++) {
> @@ -760,25 +815,56 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
>
> spin_unlock(&gpstates->gpstate_lock);
>
> - /*
> - * Use smp_call_function to send IPI and execute the
> - * mtspr on target CPU. We could do that without IPI
> - * if current CPU is within policy->cpus (core)
> + /* The current DVFS implementation in firmware requires that to set a
> + * frequency in a quad, all cores of the quad need to set frequency in
> + * their respective PMCR's. Ideally setting frequency on any of the
> + * core of that quad should change frequency for the quad.
> */
> - smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
> + if (p9_occ_quirk) {
> + cpumask_t temp;
> + u32 cpu;
> +
> + cpumask_copy(&temp, policy->cpus);
> + while (!cpumask_empty(&temp)) {
> + cpu = cpumask_first(&temp);
> + smp_call_function_any(cpu_sibling_mask(cpu),
> + set_pstate, &freq_data, 1);
> + cpumask_andnot(&temp, &temp, cpu_sibling_mask(cpu));
> + }
> + } else {
> + smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
> + }
> +
> return 0;
> }
>
> static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
> {
> - int base, i, ret;
> + int ret;
> struct kernfs_node *kn;
> struct global_pstate_info *gpstates;
>
> - base = cpu_first_thread_sibling(policy->cpu);
> + if (!freq_domain_indicator) {

IMO it would be more straightforward to write this conditional as

if (freq_domain_indicator) {
do something
} else {
do something else
}

> + int base, i;
>
> - for (i = 0; i < threads_per_core; i++)
> - cpumask_set_cpu(base + i, policy->cpus);
> + base = cpu_first_thread_sibling(policy->cpu);
> + for (i = 0; i < threads_per_core; i++)
> + cpumask_set_cpu(base + i, policy->cpus);
> + } else {
> + u32 key;
> + struct hashmap *data;
> +
> + key = ((u32) get_hard_smp_processor_id(policy->cpu) &
> + freq_domain_indicator);

The outer paren on the right-hand side is not necessary.

The explicit case to u32 isn't necessary as well AFAICS.

> + hash_for_each_possible(freq_domain_map, data, hash_node,
> + key%SIZE) {
> + if (data->chip_id == cpu_to_chip_id(policy->cpu) &&
> + data->pir_key == key) {

You have slightly confusing indentation here, as the continuation of the
condition is at the same level as the statements in the conditional block.

> + cpumask_copy(policy->cpus, &data->mask);
> + break;
> + }
> + }
> + }
>
> kn = kernfs_find_and_get(policy->kobj.sd, throttle_attr_grp.name);
> if (!kn) {
>

Overall, from cpufreq perspective, I don't see anything to complain about
here.

Thanks,
Rafael