[RFC/RFT][PATCH v3 2/2] cpufreq: schedutil: Switching frequencies from interrupt context

From: Rafael J. Wysocki
Date: Wed Feb 24 2016 - 18:29:22 EST


From: Rafael J. Wysocki <rafael.j.wysocki@xxxxxxxxx>

Modify the ACPI cpufreq driver to provide a method for switching
CPU frequencies from interrupt context and update the cpufreq core
and the schedutil governor to use that method if available.

Introduce a new cpufreq driver callback, ->fast_switch, to be
invoked for frequency switching from interrupt context via a
new helper function, cpufreq_driver_fast_switch().

Modify the schedutil governor to call cpufreq_driver_fast_switch()
from its sugov_update_commit() function and avoid queuing up the
irq_work if that is successful.

Implement the ->fast_switch callback in the ACPI cpufreq driver
(with limited coverage for the time being).

In addition to the above, cpufreq_governor_limits() is modified so
it doesn't call __cpufreq_driver_target() to enforce the new limits
immediately if the fast_switch_enabled flag is set for the policy,
because in that case the frequency will be updated immediately
using the new limits anyway, and the additional invocation of
__cpufreq_driver_target() might race with that update, violating
the cpufreq_driver_fast_switch() requirements.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@xxxxxxxxx>
---

An update here.

I wasn't sure about the changes in cpufreq_governor_limits() and I finally
concluded that for the case when "fast switching" was not used, that function
still should set the frequency within the limits directly (because it may
take some time for a regular update to happen even though the sample delay
is reset in there). OTOH, if "fast switching" is used, a regular update is
pretty much guaranteed to happen immediately, so the direct frequency setting
should not be necessary in that case.

That observation led to some rework all over (except for acpi_cpufreq_fast_switch()
and cpu_freq_fast_write_intel(), which are the same as before).

Of course, the patch still is a prototype. :-)

Thanks,
Rafael

---
drivers/cpufreq/acpi-cpufreq.c | 60 ++++++++++++++++++++++++++++++++++++
drivers/cpufreq/cpufreq.c | 31 ++++++++++++++++++
drivers/cpufreq/cpufreq_governor.c | 13 ++++---
drivers/cpufreq/cpufreq_governor.h | 1
drivers/cpufreq/cpufreq_schedutil.c | 22 ++++++++++---
include/linux/cpufreq.h | 5 +++
6 files changed, 122 insertions(+), 10 deletions(-)

Index: linux-pm/drivers/cpufreq/acpi-cpufreq.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/acpi-cpufreq.c
+++ linux-pm/drivers/cpufreq/acpi-cpufreq.c
@@ -70,6 +70,7 @@ struct acpi_cpufreq_data {
unsigned int cpu_feature;
unsigned int acpi_perf_cpu;
cpumask_var_t freqdomain_cpus;
+ void (*cpu_freq_fast_write)(u32 val);
};

/* acpi_perf_data is a pointer to percpu data. */
@@ -243,6 +244,21 @@ static unsigned extract_freq(u32 val, st
}
}

+/*
+ * Write @val to MSR_IA32_PERF_CTL, touching only the INTEL_MSR_RANGE bits.
+ *
+ * The register is read first so that every bit outside INTEL_MSR_RANGE (and
+ * the whole upper half) is written back unchanged.
+ */
+static void cpu_freq_fast_write_intel(u32 val)
+{
+	u32 lo, hi;
+
+	rdmsr(MSR_IA32_PERF_CTL, lo, hi);
+	lo = (lo & ~INTEL_MSR_RANGE) | (val & INTEL_MSR_RANGE);
+	wrmsr(MSR_IA32_PERF_CTL, lo, hi);
+}
+
struct msr_addr {
u32 reg;
};
@@ -484,6 +494,70 @@ out:
return result;
}

+/*
+ * Set the frequency table entry closest to @target_freq for @policy.
+ *
+ * Walk data->freq_table, skipping CPUFREQ_ENTRY_INVALID entries, and pick the
+ * entry whose frequency differs least from @target_freq (the first one wins
+ * on a tie).  The control value of the matching performance state is written
+ * via data->cpu_freq_fast_write() unless perf->state already equals that
+ * state and data->resume is clear.
+ *
+ * Returns the selected frequency, or CPUFREQ_ENTRY_INVALID, without writing
+ * anything, if the table holds no valid entry.
+ */
+static unsigned int acpi_cpufreq_fast_switch(struct cpufreq_policy *policy,
+					     unsigned int target_freq)
+{
+	struct acpi_cpufreq_data *data = policy->driver_data;
+	struct cpufreq_frequency_table *entry;
+	struct acpi_processor_performance *perf;
+	unsigned int uninitialized_var(next_perf_state);
+	unsigned int uninitialized_var(next_freq);
+	unsigned int best_diff = UINT_MAX;
+
+	for (entry = data->freq_table;
+	     entry->frequency != CPUFREQ_TABLE_END; entry++) {
+		unsigned int diff, freq = entry->frequency;
+
+		if (freq == CPUFREQ_ENTRY_INVALID)
+			continue;
+
+		/* Unsigned operands: take the absolute difference by hand. */
+		if (freq > target_freq)
+			diff = freq - target_freq;
+		else
+			diff = target_freq - freq;
+
+		if (diff >= best_diff)
+			continue;
+
+		best_diff = diff;
+		next_perf_state = entry->driver_data;
+		next_freq = freq;
+		if (best_diff == 0)
+			break;
+	}
+	if (best_diff == UINT_MAX)
+		return CPUFREQ_ENTRY_INVALID;
+
+	perf = to_perf_data(data);
+	if (perf->state == next_perf_state) {
+		/*
+		 * Already in the chosen state: skip the write unless
+		 * data->resume is set, in which case clear it and write.
+		 */
+		if (unlikely(data->resume))
+			data->resume = 0;
+		else
+			return next_freq;
+	}
+
+	data->cpu_freq_fast_write(perf->states[next_perf_state].control);
+	perf->state = next_perf_state;
+	return next_freq;
+}
+
static unsigned long
acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
{
@@ -745,6 +799,7 @@ static int acpi_cpufreq_cpu_init(struct
pr_debug("HARDWARE addr space\n");
if (check_est_cpu(cpu)) {
data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
+ data->cpu_freq_fast_write = cpu_freq_fast_write_intel;
break;
}
if (check_amd_hwpstate_cpu(cpu)) {
@@ -760,6 +815,10 @@ static int acpi_cpufreq_cpu_init(struct
goto err_unreg;
}

+ policy->fast_switch_possible = data->cpu_freq_fast_write != NULL &&
+ !acpi_pstate_strict && !(policy_is_shared(policy) &&
+ policy->shared_type != CPUFREQ_SHARED_TYPE_ANY);
+
data->freq_table = kzalloc(sizeof(*data->freq_table) *
(perf->state_count+1), GFP_KERNEL);
if (!data->freq_table) {
@@ -894,6 +953,7 @@ static struct freq_attr *acpi_cpufreq_at
static struct cpufreq_driver acpi_cpufreq_driver = {
.verify = cpufreq_generic_frequency_table_verify,
.target_index = acpi_cpufreq_target,
+ .fast_switch = acpi_cpufreq_fast_switch,
.bios_limit = acpi_processor_get_bios_limit,
.init = acpi_cpufreq_cpu_init,
.exit = acpi_cpufreq_cpu_exit,
Index: linux-pm/include/linux/cpufreq.h
===================================================================
--- linux-pm.orig/include/linux/cpufreq.h
+++ linux-pm/include/linux/cpufreq.h
@@ -82,6 +82,7 @@ struct cpufreq_policy {
void *governor_data;
bool governor_enabled; /* governor start/stop flag */
char last_governor[CPUFREQ_NAME_LEN]; /* last governor used */
+ bool fast_switch_possible;

struct work_struct update; /* if update_policy() needs to be
* called, but you're in IRQ context */
@@ -271,6 +272,8 @@ struct cpufreq_driver {
unsigned int relation); /* Deprecated */
int (*target_index)(struct cpufreq_policy *policy,
unsigned int index);
+ unsigned int (*fast_switch)(struct cpufreq_policy *policy,
+ unsigned int target_freq);
/*
* Only for drivers with target_index() and CPUFREQ_ASYNC_NOTIFICATION
* unset.
@@ -485,6 +488,8 @@ struct cpufreq_governor {
};

/* Pass a target to the cpufreq driver */
+void cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
+ unsigned int target_freq);
int cpufreq_driver_target(struct cpufreq_policy *policy,
unsigned int target_freq,
unsigned int relation);
Index: linux-pm/drivers/cpufreq/cpufreq.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/cpufreq.c
+++ linux-pm/drivers/cpufreq/cpufreq.c
@@ -1814,6 +1814,41 @@ EXPORT_SYMBOL(cpufreq_unregister_notifie
* GOVERNORS *
*********************************************************************/

+/**
+ * cpufreq_driver_fast_switch - Fast frequency switch for a cpufreq policy.
+ * @policy: cpufreq policy whose frequency is to be changed.
+ * @target_freq: Frequency to switch to (may be approximate).
+ *
+ * Intended to be called from interrupt context.  Does nothing when
+ * @target_freq already equals policy->cur; otherwise the request is passed to
+ * the driver's ->fast_switch() callback and, unless the callback returns
+ * CPUFREQ_ENTRY_INVALID, policy->cur is set to the returned frequency and
+ * trace_cpu_frequency() is called with it.
+ *
+ * Rules for governors calling this function:
+ *  - policy->fast_switch_possible must be set for @policy.
+ *  - Calls for the same policy must never run in parallel with each other or
+ *    with ->target() or ->target_index().
+ *
+ * Rule for drivers: a ->fast_switch() callback that returns
+ * CPUFREQ_ENTRY_INVALID must have preserved the hardware configuration.
+ */
+void cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
+				unsigned int target_freq)
+{
+	unsigned int new_freq;
+
+	if (policy->cur == target_freq)
+		return;
+
+	new_freq = cpufreq_driver->fast_switch(policy, target_freq);
+	if (new_freq == CPUFREQ_ENTRY_INVALID)
+		return;
+
+	policy->cur = new_freq;
+	trace_cpu_frequency(new_freq, smp_processor_id());
+}
+
/* Must set freqs->new to intermediate frequency */
static int __target_intermediate(struct cpufreq_policy *policy,
struct cpufreq_freqs *freqs, int index)
Index: linux-pm/drivers/cpufreq/cpufreq_governor.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/cpufreq_governor.c
+++ linux-pm/drivers/cpufreq/cpufreq_governor.c
@@ -613,15 +613,18 @@ static int cpufreq_governor_limits(struc

mutex_lock(&policy_dbs->timer_mutex);

- if (policy->max < policy->cur)
- __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
- else if (policy->min > policy->cur)
- __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);
+ if (!policy_dbs->fast_switch_enabled) {
+ if (policy->max < policy->cur)
+ __cpufreq_driver_target(policy, policy->max,
+ CPUFREQ_RELATION_H);
+ else if (policy->min > policy->cur)
+ __cpufreq_driver_target(policy, policy->min,
+ CPUFREQ_RELATION_L);
+ }

gov_update_sample_delay(policy_dbs, 0);

mutex_unlock(&policy_dbs->timer_mutex);
-
return 0;
}

Index: linux-pm/drivers/cpufreq/cpufreq_schedutil.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/cpufreq_schedutil.c
+++ linux-pm/drivers/cpufreq/cpufreq_schedutil.c
@@ -83,12 +83,23 @@ static unsigned int sugov_next_freq(stru
static void sugov_update_commit(struct policy_dbs_info *policy_dbs, u64 time,
unsigned int next_freq)
{
- struct sugov_policy *sg_policy = to_sg_policy(policy_dbs);
-
- sg_policy->next_freq = next_freq;
policy_dbs->last_sample_time = time;
- policy_dbs->work_in_progress = true;
- irq_work_queue(&policy_dbs->irq_work);
+
+ if (policy_dbs->fast_switch_enabled) {
+ cpufreq_driver_fast_switch(policy_dbs->policy, next_freq);
+ /*
+ * Restore the sample delay in case it has been set to 0 from
+ * sysfs in the meantime.
+ */
+ gov_update_sample_delay(policy_dbs,
+ policy_dbs->dbs_data->sampling_rate);
+ } else {
+ struct sugov_policy *sg_policy = to_sg_policy(policy_dbs);
+
+ sg_policy->next_freq = next_freq;
+ policy_dbs->work_in_progress = true;
+ irq_work_queue(&policy_dbs->irq_work);
+ }
}

static void sugov_update_shared(struct update_util_data *data, u64 time,
@@ -188,6 +199,7 @@ static bool sugov_start(struct cpufreq_p

gov_update_sample_delay(policy_dbs, policy_dbs->dbs_data->sampling_rate);
policy_dbs->last_sample_time = 0;
+ policy_dbs->fast_switch_enabled = policy->fast_switch_possible;

for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
Index: linux-pm/drivers/cpufreq/cpufreq_governor.h
===================================================================
--- linux-pm.orig/drivers/cpufreq/cpufreq_governor.h
+++ linux-pm/drivers/cpufreq/cpufreq_governor.h
@@ -124,6 +124,7 @@ struct policy_dbs_info {
/* Status indicators */
bool is_shared; /* This object is used by multiple CPUs */
bool work_in_progress; /* Work is being queued up or in progress */
+ bool fast_switch_enabled; /* Switch frequencies from interrupt context */
};

static inline void gov_update_sample_delay(struct policy_dbs_info *policy_dbs,