[PATCH 06/12] perf: Fix race between event install and jump_labels
From: Peter Zijlstra
Date: Wed Feb 24 2016 - 12:52:59 EST
perf_install_in_context() relies upon the context switch hooks to have
scheduled in events when the IPI misses its target -- after all, if
the task has moved from the CPU (or wasn't running at all), it will
have to context switch to run elsewhere.
This however doesn't appear to be happening.
It is possible for the IPI to not happen (task wasn't running) only to
later observe the task running with an inactive context.
The only possible explanation is that the context switch hooks are not
called. Therefore put in a sync_sched() after toggling the jump_label
to guarantee all CPUs will have them enabled before we install an
event.
A simple if (0->1) sync_sched() will not in fact work, because any
further increment can race and complete before the sync_sched().
Therefore we must jump through some hoops.
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
include/linux/perf_event.h | 6 ++---
kernel/events/core.c | 49 +++++++++++++++++++++++++++++++++++++--------
2 files changed, 44 insertions(+), 11 deletions(-)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -906,7 +906,7 @@ perf_sw_event_sched(u32 event_id, u64 nr
}
}
-extern struct static_key_deferred perf_sched_events;
+extern struct static_key_false perf_sched_events;
static __always_inline bool
perf_sw_migrate_enabled(void)
@@ -925,7 +925,7 @@ static inline void perf_event_task_migra
static inline void perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task)
{
- if (static_key_false(&perf_sched_events.key))
+ if (static_branch_unlikely(&perf_sched_events))
__perf_event_task_sched_in(prev, task);
if (perf_sw_migrate_enabled() && task->sched_migrated) {
@@ -942,7 +942,7 @@ static inline void perf_event_task_sched
{
perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
- if (static_key_false(&perf_sched_events.key))
+ if (static_branch_unlikely(&perf_sched_events))
__perf_event_task_sched_out(prev, next);
}
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -321,7 +321,13 @@ enum event_type_t {
* perf_sched_events : >0 events exist
* perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
-struct static_key_deferred perf_sched_events __read_mostly;
+
+static void perf_sched_delayed(struct work_struct *work);
+DEFINE_STATIC_KEY_FALSE(perf_sched_events);
+static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
+static DEFINE_MUTEX(perf_sched_mutex);
+static atomic_t perf_sched_count;
+
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
@@ -3542,12 +3548,22 @@ static void unaccount_event(struct perf_
if (has_branch_stack(event))
dec = true;
- if (dec)
- static_key_slow_dec_deferred(&perf_sched_events);
+ if (dec) {
+ if (!atomic_add_unless(&perf_sched_count, -1, 1))
+ schedule_delayed_work(&perf_sched_work, HZ);
+ }
unaccount_event_cpu(event, event->cpu);
}
+static void perf_sched_delayed(struct work_struct *work)
+{
+ mutex_lock(&perf_sched_mutex);
+ if (atomic_dec_and_test(&perf_sched_count))
+ static_branch_disable(&perf_sched_events);
+ mutex_unlock(&perf_sched_mutex);
+}
+
/*
* The following implement mutual exclusion of events on "exclusive" pmus
* (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
@@ -7786,8 +7802,28 @@ static void account_event(struct perf_ev
if (is_cgroup_event(event))
inc = true;
- if (inc)
- static_key_slow_inc(&perf_sched_events.key);
+ if (inc) {
+ if (atomic_inc_not_zero(&perf_sched_count))
+ goto enabled;
+
+ mutex_lock(&perf_sched_mutex);
+ if (!atomic_read(&perf_sched_count)) {
+ static_branch_enable(&perf_sched_events);
+ /*
+ * Guarantee that all CPUs observe they key change and
+ * call the perf scheduling hooks before proceeding to
+ * install events that need them.
+ */
+ synchronize_sched();
+ }
+ /*
+ * Now that we have waited for the sync_sched(), allow further
+ * increments to by-pass the mutex.
+ */
+ atomic_inc(&perf_sched_count);
+ mutex_unlock(&perf_sched_mutex);
+ }
+enabled:
account_event_cpu(event, event->cpu);
}
@@ -9352,9 +9388,6 @@ void __init perf_event_init(void)
ret = init_hw_breakpoint();
WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
- /* do not patch jump label more than once per second */
- jump_label_rate_limit(&perf_sched_events, HZ);
-
/*
* Build time assertion that we keep the data_head at the intended
* location. IOW, validation we got the __reserved[] size right.