[RFC V2 PATCH 1/4] perf: Add PERF_RECORD_SWITCH to indicate context switches

From: Adrian Hunter
Date: Fri Jul 03 2015 - 08:56:06 EST


There are already two events for context switches, namely
the tracepoint sched:sched_switch and the software event
context_switches. Unfortunately neither is suitable for
use by non-privileged users for the purpose of synchronizing
hardware trace data (e.g. Intel PT) to the context switch.

Tracepoints are no good at all for non-privileged users
because they need either CAP_SYS_ADMIN or
/proc/sys/kernel/perf_event_paranoid <= -1.

On the other hand, kernel software events need either
CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= 1.

Now many distributions do default perf_event_paranoid to 1,
making context_switches a contender, except it has another
problem (which is also shared with sched:sched_switch)
which is that it happens before perf schedules events out
instead of after perf schedules events in. Whereas a
privileged user can see all the events anyway, a
non-privileged user only sees events for their own processes,
in other words they see when their process was scheduled out
not when it was scheduled in. That presents two problems for
using the event: 1. the information comes too late, so tools
have to look ahead in the event stream to find out what the
current state is; 2. if they are unlucky, tracing might have
stopped before the context-switches event is recorded.

This new PERF_RECORD_SWITCH event does not have those problems
and it also has a couple of other small advantages. It is
easier to use because it is an auxiliary event (like mmap,
comm and task events) which can be enabled by setting a single
bit. It is smaller than sched:sched_switch and easier to parse.

To make the event useful for privileged users also, if the
context is cpu-wide then the event will also provide the
next or previous pid/tid.

Signed-off-by: Adrian Hunter <adrian.hunter@xxxxxxxxx>
---
include/uapi/linux/perf_event.h | 20 +++++++-
kernel/events/core.c | 102 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 121 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index d97f84c080da..7f1664b818c0 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -330,7 +330,8 @@ struct perf_event_attr {
mmap2 : 1, /* include mmap with inode data */
comm_exec : 1, /* flag comm events that are due to an exec */
use_clockid : 1, /* use @clockid for time fields */
- __reserved_1 : 38;
+ context_switch : 1, /* context switch data */
+ __reserved_1 : 37;

union {
__u32 wakeup_events; /* wakeup every n events */
@@ -572,9 +573,11 @@ struct perf_event_mmap_page {
/*
* PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on
* different events so can reuse the same bit position.
+ * Ditto PERF_RECORD_MISC_SWITCH_OUT.
*/
#define PERF_RECORD_MISC_MMAP_DATA (1 << 13)
#define PERF_RECORD_MISC_COMM_EXEC (1 << 13)
+#define PERF_RECORD_MISC_SWITCH_OUT (1 << 13)
/*
* Indicates that the content of PERF_SAMPLE_IP points to
* the actual instruction that triggered the event. See also
@@ -818,6 +821,21 @@ enum perf_event_type {
*/
PERF_RECORD_LOST_SAMPLES = 13,

+ /*
+ * Records a context switch in or out (flagged by
+ * PERF_RECORD_MISC_SWITCH_OUT). next_prev_pid and next_prev_tid are
+ * (u32)-1 unless the context is cpu-wide, in which case they are the
+ * next (switching out) or previous (switching in) pid/tid.
+ *
+ * struct {
+ * struct perf_event_header header;
+ * u32 next_prev_pid;
+ * u32 next_prev_tid;
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_SWITCH = 14,
+
PERF_RECORD_MAX, /* non-ABI */
};

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8e13f3e54ec3..5d5e6f5d2829 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -163,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
+static atomic_t nr_switch_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -2613,6 +2614,9 @@ static void perf_pmu_sched_task(struct task_struct *prev,
local_irq_restore(flags);
}

+static void perf_event_switch(struct task_struct *task,
+ struct task_struct *next_prev, bool out);
+
#define for_each_task_context_nr(ctxn) \
for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)

@@ -2635,6 +2639,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(task, next, false);

+ if (atomic_read(&nr_switch_events))
+ perf_event_switch(task, next, true);
+
for_each_task_context_nr(ctxn)
perf_event_context_sched_out(task, ctxn, next);

@@ -2825,6 +2832,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
perf_cgroup_sched_in(prev, task);

+ if (atomic_read(&nr_switch_events))
+ perf_event_switch(task, prev, false);
+
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(prev, task, true);
}
@@ -3448,6 +3458,10 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_task_events);
if (event->attr.freq)
atomic_dec(&nr_freq_events);
+ if (event->attr.context_switch) {
+ static_key_slow_dec_deferred(&perf_sched_events);
+ atomic_dec(&nr_switch_events);
+ }
if (is_cgroup_event(event))
static_key_slow_dec_deferred(&perf_sched_events);
if (has_branch_stack(event))
@@ -5984,6 +5998,90 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost)
}

/*
+ * context_switch tracking
+ */
+
+struct perf_switch_event { /* state used to emit one PERF_RECORD_SWITCH */
+ struct task_struct *task; /* the task being switched in or out */
+ struct task_struct *next_prev; /* next task (switch out) or previous task (switch in) */
+
+ struct { /* record payload handed to perf_output_put() */
+ struct perf_event_header header;
+ u32 next_prev_pid;
+ u32 next_prev_tid;
+ } event_id;
+};
+
+static int perf_event_switch_match(struct perf_event *event)
+{
+ return event->attr.context_switch; /* non-zero iff the event set attr.context_switch */
+}
+
+static void perf_event_switch_output(struct perf_event *event, void *data)
+{ /* Write one PERF_RECORD_SWITCH record for @event; @data is a struct perf_switch_event. */
+ struct perf_switch_event *switch_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int size = switch_event->event_id.header.size; /* saved so the out: label can put it back */
+ int ret;
+
+ if (!perf_event_switch_match(event))
+ return;
+
+ perf_event_header__init_id(&switch_event->event_id.header, &sample, event); /* NOTE(review): presumably grows header.size for the sample_id fields - confirm */
+
+ ret = perf_output_begin(&handle, event,
+ switch_event->event_id.header.size);
+ if (ret)
+ goto out; /* non-zero ret: nothing is written for this event */
+
+ /* Only CPU-wide events are allowed to see next/prev pid/tid */
+ if (event->ctx->task) {
+ switch_event->event_id.next_prev_pid = -1; /* stored in a u32, i.e. (u32)-1 */
+ switch_event->event_id.next_prev_tid = -1;
+ } else {
+ switch_event->event_id.next_prev_pid =
+ perf_event_pid(event, switch_event->next_prev);
+ switch_event->event_id.next_prev_tid =
+ perf_event_tid(event, switch_event->next_prev);
+ }
+
+ perf_output_put(&handle, switch_event->event_id);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ switch_event->event_id.header.size = size; /* NOTE(review): presumably restored because the same switch_event is reused for other events - confirm */
+}
+
+static void perf_event_switch(struct task_struct *task,
+ struct task_struct *next_prev, bool out)
+{
+ struct perf_switch_event switch_event;
+
+ /* Build a PERF_RECORD_SWITCH template and pass it to perf_event_aux(). N.B. caller checks nr_switch_events != 0 */
+
+ switch_event = (struct perf_switch_event){
+ .task = task,
+ .next_prev = next_prev,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_SWITCH,
+ .misc = out ? PERF_RECORD_MISC_SWITCH_OUT : 0, /* flag set only when switching out */
+ .size = sizeof(switch_event.event_id), /* header plus pid/tid fields */
+ },
+ /* .next_prev_pid: assigned per event in perf_event_switch_output() */
+ /* .next_prev_tid: assigned per event in perf_event_switch_output() */
+ },
+ };
+
+ perf_event_aux(perf_event_switch_output,
+ &switch_event,
+ NULL);
+}
+
+/*
* IRQ throttle logging
*/

@@ -7481,6 +7579,10 @@ static void account_event(struct perf_event *event)
if (atomic_inc_return(&nr_freq_events) == 1)
tick_nohz_full_kick_all();
}
+ if (event->attr.context_switch) {
+ atomic_inc(&nr_switch_events);
+ static_key_slow_inc(&perf_sched_events.key);
+ }
if (has_branch_stack(event))
static_key_slow_inc(&perf_sched_events.key);
if (is_cgroup_event(event))
--
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/