[PATCH v1 07/13] perf/core: add idle hooks

From: Stephane Eranian
Date: Thu Sep 09 2021 - 03:59:10 EST


This patch adds a new set of hooks to connect perf_events with the
idle task. On some PMU models, it may be necessary to flush or stop
the PMU when going to low power, and to re-enable it upon return from
low power. The patch adds perf_register_lopwr_cb() to register a
callback to be invoked on entry to and return from low power. The
callback takes a boolean argument: true means entering low power,
false means returning from it.
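
A minimal sketch of the driver-side usage (the "foo" names and the
counter start/stop helpers below are illustrative assumptions, not
part of this patch):

  /* hypothetical driver helpers, assumed to exist elsewhere */
  static void foo_stop_counters(void);
  static void foo_start_counters(void);
  static struct pmu foo_pmu;

  static void foo_lopwr_cb(bool lopwr_in)
  {
          if (lopwr_in)
                  foo_stop_counters();    /* entering low power: flush/stop the PMU */
          else
                  foo_start_counters();   /* returning from low power: re-enable it */
  }

  static int __init foo_pmu_init(void)
  {
          perf_register_lopwr_cb(&foo_pmu, foo_lopwr_cb);
          return perf_pmu_register(&foo_pmu, "foo", -1);
  }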

The callback is invoked from the idle code with interrupts already
disabled.
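
The list walk in perf_lopwr_cb() (see below) is skipped when the
per-CPU lopwr_nr_active count is zero, so the idle path only pays for
the callbacks when a registered PMU actually has work to do. The count
is maintained through perf_lopwr_active_inc()/perf_lopwr_active_dec().
The patch does not show where a driver should call them; one plausible
spot is the pmu::add/del callbacks (sketch, hook placement assumed):

  static int foo_pmu_add(struct perf_event *event, int flags)
  {
          /* ... program the counter ... */
          perf_lopwr_active_inc();        /* idle callbacks needed while counting */
          return 0;
  }

  static void foo_pmu_del(struct perf_event *event, int flags)
  {
          perf_lopwr_active_dec();        /* no events left that need the hook */
          /* ... stop the counter ... */
  }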

Signed-off-by: Stephane Eranian <eranian@xxxxxxxxxx>
---
include/linux/perf_event.h | 8 ++++++
kernel/events/core.c | 58 ++++++++++++++++++++++++++++++++++++++
kernel/sched/idle.c | 15 +++++++++-
3 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2d510ad750ed..32ffc009b2ec 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -300,6 +300,7 @@ struct pmu {
 	/* number of address filters this PMU can do */
 	unsigned int			nr_addr_filters;
 
+	struct list_head		lopwr_entry;
 	/*
 	 * Fully disable/enable this PMU, can be used to protect from the PMI
 	 * as well as for lazy/batch writing of the MSRs.
@@ -430,6 +431,8 @@ struct pmu {
 	void (*sched_task)		(struct perf_event_context *ctx,
 					bool sched_in);
 
+	void (*lopwr_cb)		(bool lopwr_in);
+
 	/*
 	 * Kmem cache of PMU specific data
 	 */
@@ -1429,6 +1432,11 @@ extern void perf_event_task_tick(void);
 extern int perf_event_account_interrupt(struct perf_event *event);
 extern int perf_event_period(struct perf_event *event, u64 value);
 extern u64 perf_event_pause(struct perf_event *event, bool reset);
+extern void perf_lopwr_cb(bool lopwr_in);
+extern void perf_lopwr_active_inc(void);
+extern void perf_lopwr_active_dec(void);
+extern void perf_register_lopwr_cb(struct pmu *pmu, void (*lowpwr_cb)(bool));
+
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1cb1f9b8392e..f739fd92e74b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3521,6 +3521,64 @@ void perf_sched_cb_inc(struct pmu *pmu)
 	this_cpu_inc(perf_sched_cb_usages);
 }
 
+/*
+ * perf_lopwr_cb() is invoked from the idle task and therefore cannot
+ * grab a mutex that may end up sleeping: the idle task cannot sleep
+ * by construction. Instead we maintain a separate list of PMUs to
+ * invoke on idle, protected by a spinlock. Normally we would take
+ * pmus_lock and iterate over all PMUs, but that is a mutex, and we
+ * only need to iterate over the PMUs which do require an idle
+ * callback.
+ */
+static DEFINE_SPINLOCK(lopwr_cb_lock);
+static LIST_HEAD(lopwr_cb_pmus);
+static DEFINE_PER_CPU(int, lopwr_nr_active);
+
+void perf_lopwr_active_inc(void)
+{
+	__this_cpu_inc(lopwr_nr_active);
+}
+
+void perf_lopwr_active_dec(void)
+{
+	__this_cpu_dec(lopwr_nr_active);
+}
+
+/*
+ * lopwr_in = true means going to low power state
+ * lopwr_in = false means returning from low power state
+ */
+void perf_lopwr_cb(bool lopwr_in)
+{
+	struct pmu *pmu;
+	unsigned long flags;
+
+	if (!__this_cpu_read(lopwr_nr_active))
+		return;
+
+	spin_lock_irqsave(&lopwr_cb_lock, flags);
+
+	list_for_each_entry(pmu, &lopwr_cb_pmus, lopwr_entry) {
+		if (pmu->lopwr_cb)
+			pmu->lopwr_cb(lopwr_in);
+	}
+
+	spin_unlock_irqrestore(&lopwr_cb_lock, flags);
+}
+EXPORT_SYMBOL_GPL(perf_lopwr_cb);
+
+void perf_register_lopwr_cb(struct pmu *pmu, void (*func)(bool))
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&lopwr_cb_lock, flags);
+
+	pmu->lopwr_cb = func;
+	list_add_tail(&pmu->lopwr_entry, &lopwr_cb_pmus);
+
+	spin_unlock_irqrestore(&lopwr_cb_lock, flags);
+}
+
 /*
  * This function provides the context switch callback to the lower code
  * layer. It is invoked ONLY when the context switch callback is enabled.
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 912b47aa99d8..14ce130aee1b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -179,6 +179,7 @@ static void cpuidle_idle_call(void)
 	 */
 	if (need_resched()) {
 		local_irq_enable();
+		perf_lopwr_cb(false);
 		return;
 	}
 
@@ -191,7 +192,14 @@ static void cpuidle_idle_call(void)
 	if (cpuidle_not_available(drv, dev)) {
 		tick_nohz_idle_stop_tick();
 
+		if (!cpu_idle_force_poll)
+			perf_lopwr_cb(true);
+
 		default_idle_call();
+
+		if (!cpu_idle_force_poll)
+			perf_lopwr_cb(false);
+
 		goto exit_idle;
 	}
 
@@ -249,8 +257,10 @@ static void cpuidle_idle_call(void)
 	/*
 	 * It is up to the idle functions to reenable local interrupts
 	 */
-	if (WARN_ON_ONCE(irqs_disabled()))
+	if (WARN_ON_ONCE(irqs_disabled())) {
 		local_irq_enable();
+		perf_lopwr_cb(false);
+	}
 }

/*
@@ -279,9 +289,12 @@ static void do_idle(void)
 	__current_set_polling();
 	tick_nohz_idle_enter();
 
+
 	while (!need_resched()) {
 		rmb();
 
+		perf_lopwr_cb(true);
+
 		local_irq_disable();
 
 		if (cpu_is_offline(cpu)) {
--
2.33.0.153.gba50c8fa24-goog