[RFC PATCH 1/1] proc: introduce /proc/<pid>/lbr_stack

From: Kan Liang
Date: Mon Feb 23 2015 - 10:31:14 EST


From: Kan Liang <kan.liang@xxxxxxxxx>

Haswell has a new feature that utilizes the existing Last Branch Record
facility to record call chains. It has been implemented in perf. The
call chain information is saved into the per-task perf event context
when the task is scheduled out.

This patch exposes a /proc/<pid>/lbr_stack file that shows the saved LBR
call chain information.
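
For illustration only, a minimal way to exercise the new file by hand
(hypothetical PID; the file only has data once perf has saved an LBR
call stack for the task, which happens when the monitored task is
scheduled out):

perf record --call-graph lbr --pid 8058 &
sleep 2
cat /proc/8058/lbr_stack   # one user-space address per line, innermost frame first

The perf_stack script at the end of this mail wraps this up, forces a
context switch on the target's CPU, and resolves the addresses with
addr2line.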

There are already some tools that can dump a stack (e.g. gstack).
However, all of these tools rely on frame pointers or DWARF debug
information.
The LBR call stack facility provides an alternative way to get the
stack: it does not need debug information to reconstruct the call chain.
One common case is backtracing through the libpthread library in glibc,
which is partially written in assembler and has neither full DWARF
annotations nor frame pointers.
It is also helpful for JITed code.

Here are some examples. perf_stack (the script shown at the end of this
mail) uses /proc/<pid>/lbr_stack to dump stack information.
Example 1:
tchain_edit is a binary with debug information.

./tchain_edit &
[1] 8058

gstack 8058
#0  0x000000000040054d in f3 ()
#1  0x0000000000400587 in f2 ()
#2  0x00000000004005b3 in f1 ()
#3  0x00000000004005f4 in main ()

./perf_stack 8058
#0 0x0000000000400540: f3 at ??:?
#1 0x000000000040057d: f2 at ??:?
#2 0x00000000004005ae: f1 at ??:?
#3 0x00000000004005f9: main at ??:?

With debug information available, both gstack and perf_stack dump the
stack correctly.

Example 2:
tchain_edit_ch is a binary that contains neither DWARF nor frame
pointer information.

./tchain_edit_ch &
[1] 8084

gstack 8084
#0  0x0000000000400568 in ?? ()
#1  0x00007fff134a7960 in ?? ()
#2  0x0000000000400587 in ?? ()
#3  0x00007fff134a7aa8 in ?? ()
#4  0x0000004600000000 in ?? ()
#5  0x00007fff134a7980 in ?? ()
#6  0x00000000004005b8 in ?? ()
#7  0x0000000000000000 in ?? ()

gstack shows the wrong stack.

./perf_stack 8084
#0 0x0000000000400540: ?? ??:0
#1 0x0000000000400582: ?? ??:0
#2 0x00000000004005ae: ?? ??:0
#3 0x00000000004005f9: ?? ??:0

The LBR call stack shows the correct stack.

Here is the perf_stack script.

#!/bin/sh
# usage: perf_stack <pid>
perf record --call-graph lbr --pid $1 &
perf_pid=$!
# field 39 of /proc/<pid>/stat is the CPU the task last ran on
running_cpu=`cat /proc/$1/stat | awk '{print $39}'`
cpu_tmp=$((1<<$running_cpu))
cpu=`printf "0x%X" $cpu_tmp`
# run something on that CPU to force a context switch,
# which makes perf save the LBR call stack into the task context
taskset $cpu sleep 2
# dump the LBR call stack
i=0
while read -r line
do
	function=$(addr2line $line -e /proc/$1/exe -fap)
	echo "#"$i" "$function
	i=`expr $i + 1`
done < /proc/$1/lbr_stack
kill -9 $perf_pid
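
To reproduce the examples above, save the script as perf_stack, make it
executable, and run it as root or as the owner of the target process,
e.g. "./perf_stack 8058". Pinning "sleep 2" onto the target's CPU with
taskset is what forces the context switch that causes perf to save the
LBR call stack into the task context, so the /proc file has something
to report.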

The LBR call stack has the following known limitations:
- Only available on Haswell and later platforms
- Only dumps the user stack
- Exception handling such as setjmp/longjmp will have calls/returns
that do not match
- Pushing a different return address onto the stack will have
calls/returns that do not match
- If the call stack is deeper than the LBR, only the most recent
entries are captured

Signed-off-by: Kan Liang <kan.liang@xxxxxxxxx>
---
arch/x86/include/asm/perf_event.h | 2 ++
arch/x86/kernel/cpu/perf_event.c | 9 +++++++
arch/x86/kernel/cpu/perf_event.h | 9 +++++--
arch/x86/kernel/cpu/perf_event_intel.c | 1 +
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 16 +++++++++++
fs/proc/base.c | 43 ++++++++++++++++++++++++++++++
include/linux/perf_event.h | 8 +++++-
7 files changed, 85 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index dc0f6ed..70f07fd 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -11,6 +11,8 @@

#define X86_PMC_IDX_MAX 64

+#define MAX_LBR_ENTRIES 16
+
#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
#define MSR_ARCH_PERFMON_PERFCTR1 0xc2

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e0dab5c..0b39f72 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1922,6 +1922,14 @@ static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
x86_pmu.sched_task(ctx, sched_in);
}

+static void x86_pmu_save_lbr_stack(struct perf_event_context *ctx,
+ __u64 *lbr_nr,
+ struct perf_branch_entry *lbr_entries)
+{
+ if (x86_pmu.save_lbr_stack)
+ x86_pmu.save_lbr_stack(ctx, lbr_nr, lbr_entries);
+}
+
void perf_check_microcode(void)
{
if (x86_pmu.check_microcode)
@@ -1952,6 +1960,7 @@ static struct pmu pmu = {

.event_idx = x86_pmu_event_idx,
.sched_task = x86_pmu_sched_task,
+ .save_lbr_stack = x86_pmu_save_lbr_stack,
.task_ctx_size = sizeof(struct x86_perf_task_context),
};

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index a371d27..29d8b14 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -123,8 +123,6 @@ struct intel_shared_regs {
unsigned core_id; /* per-core: core id */
};

-#define MAX_LBR_ENTRIES 16
-
struct cpu_hw_events {
/*
* Generic x86 PMC bits
@@ -474,6 +472,9 @@ struct x86_pmu {
void (*check_microcode)(void);
void (*sched_task)(struct perf_event_context *ctx,
bool sched_in);
+ void (*save_lbr_stack)(struct perf_event_context *ctx,
+ __u64 *lbr_nr,
+ struct perf_branch_entry *lbr_entries);

/*
* Intel Arch Perfmon v2+
@@ -743,6 +744,10 @@ void intel_ds_init(void);

void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);

+void intel_pmu_save_lbr_stack(struct perf_event_context *ctx,
+ __u64 *lbr_nr,
+ struct perf_branch_entry *lbr_entries);
+
void intel_pmu_lbr_reset(void);

void intel_pmu_lbr_enable(struct perf_event *event);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9f1dd18..5123581 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2082,6 +2082,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.cpu_dying = intel_pmu_cpu_dying,
.guest_get_msrs = intel_guest_get_msrs,
.sched_task = intel_pmu_lbr_sched_task,
+ .save_lbr_stack = intel_pmu_save_lbr_stack,
};

static __init void intel_clovertown_quirk(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 0473874..01d3d8e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -293,6 +293,22 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
}
}

+void intel_pmu_save_lbr_stack(struct perf_event_context *ctx,
+ __u64 *lbr_nr,
+ struct perf_branch_entry *lbr_entries)
+{
+ struct x86_perf_task_context *task_ctx = ctx->task_ctx_data;
+ int i;
+
+ if (task_ctx) {
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr_entries[i].from = task_ctx->lbr_from[i];
+ lbr_entries[i].to = task_ctx->lbr_to[i];
+ }
+ *lbr_nr = x86_pmu.lbr_nr;
+ }
+}
+
static inline bool branch_user_callstack(unsigned br_sel)
{
return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3f3d7ae..660cdf2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -87,6 +87,7 @@
#include <linux/slab.h>
#include <linux/flex_array.h>
#include <linux/posix-timers.h>
+#include <linux/perf_event.h>
#ifdef CONFIG_HARDWALL
#include <asm/hardwall.h>
#endif
@@ -265,6 +266,42 @@ static void unlock_trace(struct task_struct *task)
mutex_unlock(&task->signal->cred_guard_mutex);
}

+#ifdef CONFIG_PERF_EVENTS
+static int proc_pid_lbr(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct perf_event_context *ctx;
+ struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
+ __u64 lbr_nr = 0;
+ int i, err = 0;
+
+ rcu_read_lock();
+ ctx = task->perf_event_ctxp[perf_hw_context];
+ if (!ctx || !ctx->pmu || !ctx->pmu->save_lbr_stack) {
+ err = -ENOENT;
+ goto end;
+ }
+
+ raw_spin_lock(&ctx->lock);
+ ctx->pmu->save_lbr_stack(ctx, &lbr_nr, lbr_entries);
+ raw_spin_unlock(&ctx->lock);
+
+ if ((lbr_nr > 0) && (lbr_entries[0].from != 0) && (lbr_entries[0].to != 0)) {
+ seq_printf(m, "%pS\n", (void *)lbr_entries[0].to);
+ for (i = 0; i < lbr_nr; i++) {
+ if ((lbr_entries[i].to == 0) && (lbr_entries[i].from == 0))
+ break;
+ seq_printf(m, "%pS\n", (void *)lbr_entries[i].from);
+ }
+ }
+
+end:
+ rcu_read_unlock();
+
+ return err;
+}
+#endif
+
#ifdef CONFIG_STACKTRACE

#define MAX_STACK_TRACE_DEPTH 64
@@ -2628,6 +2665,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_CHECKPOINT_RESTORE
REG("timers", S_IRUGO, proc_timers_operations),
#endif
+#ifdef CONFIG_PERF_EVENTS
+ ONE("lbr_stack", S_IRUSR, proc_pid_lbr),
+#endif
};

static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -2970,6 +3010,9 @@ static const struct pid_entry tid_base_stuff[] = {
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
+#ifdef CONFIG_PERF_EVENTS
+ ONE("lbr_stack", S_IRUSR, proc_pid_lbr),
+#endif
};

static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 724d372..0d246a5 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -267,10 +267,16 @@ struct pmu {
void (*sched_task) (struct perf_event_context *ctx,
bool sched_in);
/*
+ * Save LBR stack
+ */
+ void (*save_lbr_stack) (struct perf_event_context *ctx,
+ __u64 *lbr_nr,
+ struct perf_branch_entry *lbr_entries);
+
+ /*
* PMU specific data size
*/
size_t task_ctx_size;
-
};

/**
--
1.7.11.7
