[PATCH 4/7] perf, x86: Save/restore LBR stack during context switch

From: Yan, Zheng
Date: Tue Jun 25 2013 - 04:50:48 EST


From: "Yan, Zheng" <zheng.z.yan@xxxxxxxxx>

When the LBR call stack is enabled, it is necessary to save/restore
the stack on context switch. The solution is to save/restore the
stack to/from the task's perf event context. If the task has no perf
event context, just flush the stack on context switch.

Signed-off-by: Yan, Zheng <zheng.z.yan@xxxxxxxxx>
---
arch/x86/kernel/cpu/perf_event.c | 18 +++--
arch/x86/kernel/cpu/perf_event.h | 13 +++-
arch/x86/kernel/cpu/perf_event_intel.c | 13 ++--
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 108 ++++++++++++++++++++++++++---
include/linux/perf_event.h | 6 +-
kernel/events/core.c | 65 +++++++++--------
6 files changed, 168 insertions(+), 55 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b2eada9..3843f80 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1768,6 +1768,13 @@ static int x86_pmu_event_idx(struct perf_event *event)
return idx + 1;
}

+static void x86_pmu_branch_stack_sched(struct perf_event_context *ctx,
+ bool sched_in)
+{
+ if (x86_pmu.branch_stack_sched)
+ x86_pmu.branch_stack_sched(ctx, sched_in);
+}
+
static void *x86_pmu_event_context_alloc(struct perf_event_context *parent_ctx)
{
struct perf_event_context *ctx;
@@ -1776,6 +1783,9 @@ static void *x86_pmu_event_context_alloc(struct perf_event_context *parent_ctx)
if (!ctx)
return ERR_PTR(-ENOMEM);

+ if (parent_ctx)
+ intel_pmu_lbr_init_context(ctx, parent_ctx);
+
return ctx;
}

@@ -1833,12 +1843,6 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
NULL,
};

-static void x86_pmu_flush_branch_stack(void)
-{
- if (x86_pmu.flush_branch_stack)
- x86_pmu.flush_branch_stack();
-}
-
void perf_check_microcode(void)
{
if (x86_pmu.check_microcode)
@@ -1865,7 +1869,7 @@ static struct pmu pmu = {
.commit_txn = x86_pmu_commit_txn,

.event_idx = x86_pmu_event_idx,
- .flush_branch_stack = x86_pmu_flush_branch_stack,
+ .branch_stack_sched = x86_pmu_branch_stack_sched,
.event_context_alloc = x86_pmu_event_context_alloc,
};

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 08469de..0116970 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -405,7 +405,6 @@ struct x86_pmu {
void (*cpu_dead)(int cpu);

void (*check_microcode)(void);
- void (*flush_branch_stack)(void);

/*
* Intel Arch Perfmon v2+
@@ -434,6 +433,8 @@ struct x86_pmu {
int lbr_nr; /* hardware stack size */
u64 lbr_sel_mask; /* LBR_SELECT valid bits */
const int *lbr_sel_map; /* lbr_select mappings */
+ void (*branch_stack_sched)(struct perf_event_context *ctx,
+ bool sched_in);

/*
* Extra registers for events
@@ -456,6 +457,12 @@ enum {

struct x86_perf_event_context {
struct perf_event_context ctx;
+
+ u64 lbr_from[MAX_LBR_ENTRIES];
+ u64 lbr_to[MAX_LBR_ENTRIES];
+ u64 lbr_stack_gen;
+ int lbr_callstack_users;
+ bool lbr_stack_saved;
};

#define x86_add_quirk(func_) \
@@ -668,8 +675,12 @@ void intel_pmu_pebs_disable_all(void);

void intel_ds_init(void);

+void intel_pmu_lbr_init_context(struct perf_event_context *child_ctx,
+ struct perf_event_context *parent_ctx);
void intel_pmu_lbr_reset(void);

+void intel_pmu_lbr_sched(struct perf_event_context *ctx, bool sched_in);
+
void intel_pmu_lbr_enable(struct perf_event *event);

void intel_pmu_lbr_disable(struct perf_event *event);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3e92a68..f59b46e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1851,16 +1851,11 @@ static void intel_pmu_cpu_dying(int cpu)
fini_debug_store_on_cpu(cpu);
}

-static void intel_pmu_flush_branch_stack(void)
+static void intel_pmu_branch_stack_sched(struct perf_event_context *ctx,
+ bool sched_in)
{
- /*
- * Intel LBR does not tag entries with the
- * PID of the current task, then we need to
- * flush it on ctxsw
- * For now, we simply reset it
- */
if (x86_pmu.lbr_nr)
- intel_pmu_lbr_reset();
+ intel_pmu_lbr_sched(ctx, sched_in);
}

PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
@@ -1914,7 +1909,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
.guest_get_msrs = intel_guest_get_msrs,
- .flush_branch_stack = intel_pmu_flush_branch_stack,
+ .branch_stack_sched = intel_pmu_branch_stack_sched,
};

static __init void intel_clovertown_quirk(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 2136320..43b16b4 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -181,6 +181,13 @@ void intel_pmu_lbr_reset(void)
intel_pmu_lbr_reset_32();
else
intel_pmu_lbr_reset_64();
+
+ wrmsrl(x86_pmu.lbr_tos, 0);
+}
+
+static inline bool branch_user_callstack(unsigned br_sel)
+{
+ return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
}

void intel_pmu_lbr_enable(struct perf_event *event)
@@ -190,17 +197,23 @@ void intel_pmu_lbr_enable(struct perf_event *event)
if (!x86_pmu.lbr_nr)
return;

- /*
- * Reset the LBR stack if we changed task context to
- * avoid data leaks.
- */
- if (event->ctx->task && cpuc->lbr_context != event->ctx) {
- intel_pmu_lbr_reset();
- cpuc->lbr_context = event->ctx;
- }
cpuc->br_sel = event->hw.branch_reg.reg;
-
cpuc->lbr_users++;
+
+ if (event->ctx->task &&
+ branch_user_callstack(event->hw.branch_reg.reg)) {
+ struct x86_perf_event_context *task_ctx = (void *)event->ctx;
+ /*
+ * Reset the LBR stack if the call stack is not
+ * continuously enabled
+ */
+ if (task_ctx->lbr_callstack_users == 0 &&
+ task_ctx->lbr_stack_gen + 1 < event->ctx->sched_gen)
+ intel_pmu_lbr_reset();
+
+ task_ctx->lbr_callstack_users++;
+ task_ctx->lbr_stack_gen = event->ctx->sched_gen;
+ }
}

void intel_pmu_lbr_disable(struct perf_event *event)
@@ -210,6 +223,13 @@ void intel_pmu_lbr_disable(struct perf_event *event)
if (!x86_pmu.lbr_nr)
return;

+ if (event->ctx->task &&
+ branch_user_callstack(event->hw.branch_reg.reg)) {
+ struct x86_perf_event_context *task_ctx = (void *)event->ctx;
+
+ task_ctx->lbr_callstack_users--;
+ }
+
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);

@@ -334,6 +354,76 @@ void intel_pmu_lbr_read(void)
intel_pmu_lbr_filter(cpuc);
}

+static void __intel_pmu_lbr_restore(struct x86_perf_event_context *task_ctx)
+{
+ int i;
+ unsigned lbr_idx, mask = x86_pmu.lbr_nr - 1;
+ u64 tos = intel_pmu_lbr_tos();
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+ wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ }
+ task_ctx->lbr_stack_saved = false;
+}
+
+static void __intel_pmu_lbr_save(struct x86_perf_event_context *task_ctx)
+{
+ int i;
+ unsigned lbr_idx, mask = x86_pmu.lbr_nr - 1;
+ u64 tos = intel_pmu_lbr_tos();
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+ rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ }
+ task_ctx->lbr_stack_gen = task_ctx->ctx.sched_gen;
+ task_ctx->lbr_stack_saved = true;
+}
+
+void intel_pmu_lbr_init_context(struct perf_event_context *child_ctx,
+ struct perf_event_context *parent_ctx)
+{
+ struct x86_perf_event_context *task_ctx, *parent_task_ctx;
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ task_ctx = (struct x86_perf_event_context *)child_ctx;
+ parent_task_ctx = (struct x86_perf_event_context *)parent_ctx;
+
+ if (parent_task_ctx->lbr_callstack_users)
+ __intel_pmu_lbr_save(task_ctx);
+ else
+ task_ctx->lbr_stack_saved = false;
+}
+
+void intel_pmu_lbr_sched(struct perf_event_context *ctx, bool sched_in)
+{
+ struct x86_perf_event_context *task_ctx;
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ if (!ctx) {
+ if (sched_in)
+ intel_pmu_lbr_reset();
+ return;
+ }
+
+ task_ctx = (struct x86_perf_event_context *)ctx;
+ if (sched_in) {
+ if (!task_ctx->lbr_stack_saved)
+ intel_pmu_lbr_reset();
+ else
+ __intel_pmu_lbr_restore(task_ctx);
+ } else {
+ __intel_pmu_lbr_save(task_ctx);
+ }
+}
+
/*
* SW filter is used:
* - in case there is no HW filter
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f6d1d59..b3e4faf 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -271,9 +271,10 @@ struct pmu {
int (*event_idx) (struct perf_event *event); /*optional */

/*
- * flush branch stack on context-switches (needed in cpu-wide mode)
+ * Save/restore LBR stack on context-switches
*/
- void (*flush_branch_stack) (void);
+ void (*branch_stack_sched) (struct perf_event_context *ctx,
+ bool sched_in);

/*
* Allocate PMU special perf event context
@@ -495,6 +496,7 @@ struct perf_event_context {
struct perf_event_context *parent_ctx;
u64 parent_gen;
u64 generation;
+ u64 sched_gen;
int pin_count;
int nr_cgroups; /* cgroup evts */
int nr_branch_stack; /* branch_stack evt */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3aececc..1101ce8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -140,7 +140,7 @@ enum event_type_t {
*/
struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
+static DEFINE_PER_CPU(int, perf_branch_stack_events);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
@@ -278,6 +278,9 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
struct task_struct *task);
+static void perf_branch_stack_sched(struct task_struct *task1,
+ struct task_struct *task2,
+ bool sched_in);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
@@ -1271,8 +1274,11 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
cpuctx->cgrp = NULL;
}

- if (has_branch_stack(event))
+ if (has_branch_stack(event)) {
+ if (ctx->is_active)
+ __get_cpu_var(perf_branch_stack_events)--;
ctx->nr_branch_stack--;
+ }

ctx->nr_events--;
if (event->attr.inherit_stat)
@@ -1796,8 +1802,10 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct task_struct *task)
{
cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
- if (ctx)
+ if (ctx) {
+ ctx->sched_gen++;
ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
+ }
cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
if (ctx)
ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
@@ -2102,6 +2110,9 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (likely(!ctx->nr_events))
return;

+ if (!ctx->is_active && is_active)
+ __get_cpu_var(perf_branch_stack_events) -= ctx->nr_branch_stack;
+
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx);
if (!ctx->nr_active)
@@ -2291,6 +2302,10 @@ void __perf_event_task_sched_out(struct task_struct *task,
{
int ctxn;

+ /* check for branch_stack events running on this cpu */
+ if (__get_cpu_var(perf_branch_stack_events))
+ perf_branch_stack_sched(task, next, false);
+
for_each_task_context_nr(ctxn)
perf_event_context_sched_out(task, ctxn, next);

@@ -2398,6 +2413,9 @@ ctx_sched_in(struct perf_event_context *ctx,
if (likely(!ctx->nr_events))
return;

+ if (ctx->is_active && !is_active)
+ __get_cpu_var(perf_branch_stack_events) += ctx->nr_branch_stack;
+
now = perf_clock();
ctx->timestamp = now;
perf_cgroup_set_timestamp(task, ctx);
@@ -2471,15 +2489,17 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* layer. It is invoked ONLY when there is at least one system-wide context
* with at least one active event using taken branch sampling.
*/
-static void perf_branch_stack_sched_in(struct task_struct *prev,
- struct task_struct *task)
+static void perf_branch_stack_sched(struct task_struct *task1,
+ struct task_struct *task2,
+ bool sched_in)
{
struct perf_cpu_context *cpuctx;
+ struct perf_event_context *task_ctx;
struct pmu *pmu;
unsigned long flags;

/* no need to flush branch stack if not changing task */
- if (prev == task)
+ if (task1 == task2)
return;

local_irq_save(flags);
@@ -2488,25 +2508,26 @@ static void perf_branch_stack_sched_in(struct task_struct *prev,

list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ task_ctx = cpuctx->task_ctx;

/*
* check if the context has at least one
* event using PERF_SAMPLE_BRANCH_STACK
*/
- if (cpuctx->ctx.nr_branch_stack > 0
- && pmu->flush_branch_stack) {
-
+ if (pmu->branch_stack_sched &&
+ (cpuctx->ctx.nr_branch_stack > 0 ||
+ (task_ctx && task_ctx->nr_branch_stack > 0))) {
pmu = cpuctx->ctx.pmu;

- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_ctx_lock(cpuctx, task_ctx);

perf_pmu_disable(pmu);

- pmu->flush_branch_stack();
+ pmu->branch_stack_sched(task_ctx, sched_in);

perf_pmu_enable(pmu);

- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ perf_ctx_unlock(cpuctx, task_ctx);
}
}

@@ -2547,9 +2568,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
perf_cgroup_sched_in(prev, task);

- /* check for system-wide branch_stack events */
- if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
- perf_branch_stack_sched_in(prev, task);
+ /* check for branch_stack events running on this cpu */
+ if (__get_cpu_var(perf_branch_stack_events))
+ perf_branch_stack_sched(prev, task, true);
}

static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -3134,14 +3155,8 @@ static void free_event(struct perf_event *event)
static_key_slow_dec_deferred(&perf_sched_events);
}

- if (has_branch_stack(event)) {
+ if (has_branch_stack(event))
static_key_slow_dec_deferred(&perf_sched_events);
- /* is system-wide event */
- if (!(event->attach_state & PERF_ATTACH_TASK)) {
- atomic_dec(&per_cpu(perf_branch_stack_events,
- event->cpu));
- }
- }
}

if (event->rb) {
@@ -6562,12 +6577,8 @@ done:
return ERR_PTR(err);
}
}
- if (has_branch_stack(event)) {
+ if (has_branch_stack(event))
static_key_slow_inc(&perf_sched_events.key);
- if (!(event->attach_state & PERF_ATTACH_TASK))
- atomic_inc(&per_cpu(perf_branch_stack_events,
- event->cpu));
- }
}

return event;
--
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/