[PATCH 03/15] perf, core: Introduce pmu context switch callback

From: Yan, Zheng
Date: Wed Dec 11 2013 - 02:15:04 EST


From: "Yan, Zheng" <zheng.z.yan@xxxxxxxxx>

The callback is invoked when process is scheduled in/out. To avoid
unnecessary overhead, the callback can be enabled/disabled.

Signed-off-by: Yan, Zheng <zheng.z.yan@xxxxxxxxx>
---
arch/x86/kernel/cpu/perf_event.c | 7 +++++
arch/x86/kernel/cpu/perf_event.h | 4 +++
include/linux/perf_event.h | 8 ++++++
kernel/events/core.c | 59 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 78 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8e13293..6703d17 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1846,6 +1846,12 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
NULL,
};

+static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+ if (x86_pmu.sched_task)
+ x86_pmu.sched_task(ctx, sched_in);
+}
+
static void x86_pmu_flush_branch_stack(void)
{
if (x86_pmu.flush_branch_stack)
@@ -1879,6 +1885,7 @@ static struct pmu pmu = {

.event_idx = x86_pmu_event_idx,
.flush_branch_stack = x86_pmu_flush_branch_stack,
+ .sched_task = x86_pmu_sched_task,
};

void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 1213641..5eb3a58 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -417,6 +417,8 @@ struct x86_pmu {

void (*check_microcode)(void);
void (*flush_branch_stack)(void);
+ void (*sched_task)(struct perf_event_context *ctx,
+ bool sched_in);

/*
* Intel Arch Perfmon v2+
@@ -678,6 +680,8 @@ void intel_pmu_pebs_disable_all(void);

void intel_ds_init(void);

+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+
void intel_pmu_lbr_reset(void);

void intel_pmu_lbr_enable(struct perf_event *event);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8f4a70f..6a3e603 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -251,6 +251,12 @@ struct pmu {
* flush branch stack on context-switches (needed in cpu-wide mode)
*/
void (*flush_branch_stack) (void);
+
+ /*
+ * PMU callback for context-switches. optional
+ */
+ void (*sched_task) (struct perf_event_context *ctx,
+ bool sched_in);
};

/**
@@ -546,6 +552,8 @@ extern void perf_event_delayed_put(struct task_struct *task);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
+extern void perf_sched_cb_disable(struct pmu *pmu);
+extern void perf_sched_cb_enable(struct pmu *pmu);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);
extern int perf_event_refresh(struct perf_event *event, int refresh);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 403b781..11c63d6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -141,6 +141,7 @@ enum event_type_t {
struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
@@ -2327,6 +2328,59 @@ unlock:
}
}

+void perf_sched_cb_disable(struct pmu *pmu)
+{
+ __get_cpu_var(perf_sched_cb_usages)--;
+}
+
+void perf_sched_cb_enable(struct pmu *pmu)
+{
+ __get_cpu_var(perf_sched_cb_usages)++;
+}
+
+/*
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when the context switch callback is enabled.
+ */
+static void perf_pmu_sched_task(struct task_struct *prev,
+ struct task_struct *next,
+ bool sched_in)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+ int count = 0;
+
+ if (prev == next)
+ return;
+
+ local_irq_save(flags);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ if (!pmu->sched_task)
+ continue;
+
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ pmu = cpuctx->ctx.pmu;
+
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+ perf_pmu_disable(pmu);
+
+ pmu->sched_task(cpuctx->task_ctx, sched_in);
+
+ perf_pmu_enable(pmu);
+
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
#define for_each_task_context_nr(ctxn) \
for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)

@@ -2346,6 +2400,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
{
int ctxn;

+ if (__get_cpu_var(perf_sched_cb_usages))
+ perf_pmu_sched_task(task, next, false);
+
for_each_task_context_nr(ctxn)
perf_event_context_sched_out(task, ctxn, next);

@@ -2605,6 +2662,8 @@ void __perf_event_task_sched_in(struct task_struct *prev,
/* check for system-wide branch_stack events */
if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
perf_branch_stack_sched_in(prev, task);
+ if (__get_cpu_var(perf_sched_cb_usages))
+ perf_pmu_sched_task(prev, task, true);
}

static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
--
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/