[tip:perf/urgent] perf: Fix race between event install and jump_labels

From: tip-bot for Peter Zijlstra
Date: Thu Feb 25 2016 - 03:06:28 EST


Commit-ID: 9107c89e269d2738019861bb518e3d59bef01781
Gitweb: http://git.kernel.org/tip/9107c89e269d2738019861bb518e3d59bef01781
Author: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
AuthorDate: Wed, 24 Feb 2016 18:45:45 +0100
Committer: Ingo Molnar <mingo@xxxxxxxxxx>
CommitDate: Thu, 25 Feb 2016 08:42:34 +0100

perf: Fix race between event install and jump_labels

perf_install_in_context() relies upon the context switch hooks to have
scheduled in events when the IPI misses its target -- after all, if
the task has moved from the CPU (or wasn't running at all), it will
have to context switch to run elsewhere.

This however doesn't appear to be happening.

It is possible for the IPI to not happen (task wasn't running) only to
later observe the task running with an inactive context.

The only possible explanation is that the context switch hooks are not
called. Therefore put in a sync_sched() after toggling the jump_label
to guarantee all CPUs will have them enabled before we install an
event.

A simple if (0->1) sync_sched() will not in fact work, because any
further increment can race and complete before the sync_sched().
Therefore we must jump through some hoops.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Cc: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
Cc: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
Cc: Jiri Olsa <jolsa@xxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: dvyukov@xxxxxxxxxx
Cc: eranian@xxxxxxxxxx
Cc: oleg@xxxxxxxxxx
Cc: panand@xxxxxxxxxx
Cc: sasha.levin@xxxxxxxxxx
Cc: vince@xxxxxxxxxx
Link: http://lkml.kernel.org/r/20160224174947.980211985@xxxxxxxxxxxxx
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
include/linux/perf_event.h | 6 +++---
kernel/events/core.c | 49 ++++++++++++++++++++++++++++++++++++++--------
2 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3915661..f5c5a3f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -906,7 +906,7 @@ perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
}
}

-extern struct static_key_deferred perf_sched_events;
+extern struct static_key_false perf_sched_events;

static __always_inline bool
perf_sw_migrate_enabled(void)
@@ -925,7 +925,7 @@ static inline void perf_event_task_migrate(struct task_struct *task)
static inline void perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task)
{
- if (static_key_false(&perf_sched_events.key))
+ if (static_branch_unlikely(&perf_sched_events))
__perf_event_task_sched_in(prev, task);

if (perf_sw_migrate_enabled() && task->sched_migrated) {
@@ -942,7 +942,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
{
perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);

- if (static_key_false(&perf_sched_events.key))
+ if (static_branch_unlikely(&perf_sched_events))
__perf_event_task_sched_out(prev, next);
}

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 92d6999..ea064ca 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -321,7 +321,13 @@ enum event_type_t {
* perf_sched_events : >0 events exist
* perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
-struct static_key_deferred perf_sched_events __read_mostly;
+
+static void perf_sched_delayed(struct work_struct *work);
+DEFINE_STATIC_KEY_FALSE(perf_sched_events);
+static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
+static DEFINE_MUTEX(perf_sched_mutex);
+static atomic_t perf_sched_count;
+
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);

@@ -3536,12 +3542,22 @@ static void unaccount_event(struct perf_event *event)
if (has_branch_stack(event))
dec = true;

- if (dec)
- static_key_slow_dec_deferred(&perf_sched_events);
+ if (dec) {
+ if (!atomic_add_unless(&perf_sched_count, -1, 1))
+ schedule_delayed_work(&perf_sched_work, HZ);
+ }

unaccount_event_cpu(event, event->cpu);
}

+static void perf_sched_delayed(struct work_struct *work)
+{
+ mutex_lock(&perf_sched_mutex);
+ if (atomic_dec_and_test(&perf_sched_count))
+ static_branch_disable(&perf_sched_events);
+ mutex_unlock(&perf_sched_mutex);
+}
+
/*
* The following implement mutual exclusion of events on "exclusive" pmus
* (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
@@ -7780,8 +7796,28 @@ static void account_event(struct perf_event *event)
if (is_cgroup_event(event))
inc = true;

- if (inc)
- static_key_slow_inc(&perf_sched_events.key);
+ if (inc) {
+ if (atomic_inc_not_zero(&perf_sched_count))
+ goto enabled;
+
+ mutex_lock(&perf_sched_mutex);
+ if (!atomic_read(&perf_sched_count)) {
+ static_branch_enable(&perf_sched_events);
+ /*
+ * Guarantee that all CPUs observe they key change and
+ * call the perf scheduling hooks before proceeding to
+ * install events that need them.
+ */
+ synchronize_sched();
+ }
+ /*
+ * Now that we have waited for the sync_sched(), allow further
+ * increments to by-pass the mutex.
+ */
+ atomic_inc(&perf_sched_count);
+ mutex_unlock(&perf_sched_mutex);
+ }
+enabled:

account_event_cpu(event, event->cpu);
}
@@ -9344,9 +9380,6 @@ void __init perf_event_init(void)
ret = init_hw_breakpoint();
WARN(ret, "hw_breakpoint initialization failed with: %d", ret);

- /* do not patch jump label more than once per second */
- jump_label_rate_limit(&perf_sched_events, HZ);
-
/*
* Build time assertion that we keep the data_head at the intended
* location. IOW, validation we got the __reserved[] size right.