[PATCH 14/25] perf intel-pt: Support generating branch stack
From: Adrian Hunter
Date: Fri Sep 25 2015 - 09:21:50 EST
Add support for generating branch stack context for PT samples.
The decoder reports a configurable number of branches as branch
context for each sample. Internally it keeps track of them by
using a simple sliding window. We also flush the last branch
buffer on each sample to avoid overlapping intervals.
This is useful for:
- Reporting accurate basic block edge frequencies through the perf
report branch view
- Using with --branch-history to get the wider context of samples
- Other users of LBRs
Also the Documentation is updated.
Examples:
Record with Intel PT:
perf record -e intel_pt//u ls
Branch stacks are used by default if synthesized so:
perf report --itrace=ile
is the same as:
perf report --itrace=ile -b
Branch history can be requested also:
perf report --itrace=igle --branch-history
Based-on-patch-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
Signed-off-by: Adrian Hunter <adrian.hunter@xxxxxxxxx>
---
tools/perf/Documentation/intel-pt.txt | 10 +++
tools/perf/util/intel-pt.c | 115 ++++++++++++++++++++++++++++++++++
2 files changed, 125 insertions(+)
diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt
index 4a0501d7a3b4..f4d8e706619c 100644
--- a/tools/perf/Documentation/intel-pt.txt
+++ b/tools/perf/Documentation/intel-pt.txt
@@ -686,6 +686,7 @@ The letters are:
e synthesize tracing error events
d create a debug log
g synthesize a call chain (use with i or x)
+ l synthesize last branch entries (use with i or x)
"Instructions" events look like they were recorded by "perf record -e
instructions".
@@ -728,6 +729,15 @@ transactions events can be specified. e.g.
--itrace=ig32
--itrace=xg32
+Also the number of last branch entries (default 64, max. 1024) for instructions or
+transactions events can be specified. e.g.
+
+ --itrace=il10
+ --itrace=xl10
+
+Note that last branch entries are cleared for each sample, so there is no overlap
+from one sample to the next.
+
To disable trace decoding entirely, use the option --no-itrace.
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 2c01e723826a..05e8fcc5188b 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -22,6 +22,7 @@
#include "../perf.h"
#include "session.h"
#include "machine.h"
+#include "sort.h"
#include "tool.h"
#include "event.h"
#include "evlist.h"
@@ -115,6 +116,9 @@ struct intel_pt_queue {
void *decoder;
const struct intel_pt_state *state;
struct ip_callchain *chain;
+ struct branch_stack *last_branch;
+ struct branch_stack *last_branch_rb;
+ size_t last_branch_pos;
union perf_event *event_buf;
bool on_heap;
bool stop;
@@ -675,6 +679,19 @@ static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt,
goto out_free;
}
+ if (pt->synth_opts.last_branch) {
+ size_t sz = sizeof(struct branch_stack);
+
+ sz += pt->synth_opts.last_branch_sz *
+ sizeof(struct branch_entry);
+ ptq->last_branch = zalloc(sz);
+ if (!ptq->last_branch)
+ goto out_free;
+ ptq->last_branch_rb = zalloc(sz);
+ if (!ptq->last_branch_rb)
+ goto out_free;
+ }
+
ptq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
if (!ptq->event_buf)
goto out_free;
@@ -732,6 +749,8 @@ static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt,
out_free:
zfree(&ptq->event_buf);
+ zfree(&ptq->last_branch);
+ zfree(&ptq->last_branch_rb);
zfree(&ptq->chain);
free(ptq);
return NULL;
@@ -746,6 +765,8 @@ static void intel_pt_free_queue(void *priv)
thread__zput(ptq->thread);
intel_pt_decoder_free(ptq->decoder);
zfree(&ptq->event_buf);
+ zfree(&ptq->last_branch);
+ zfree(&ptq->last_branch_rb);
zfree(&ptq->chain);
free(ptq);
}
@@ -876,6 +897,57 @@ static int intel_pt_setup_queues(struct intel_pt *pt)
return 0;
}
+static inline void intel_pt_copy_last_branch_rb(struct intel_pt_queue *ptq)
+{
+ struct branch_stack *bs_src = ptq->last_branch_rb;
+ struct branch_stack *bs_dst = ptq->last_branch;
+ size_t nr = 0;
+
+ bs_dst->nr = bs_src->nr;
+
+ if (!bs_src->nr)
+ return;
+
+ nr = ptq->pt->synth_opts.last_branch_sz - ptq->last_branch_pos;
+ memcpy(&bs_dst->entries[0],
+ &bs_src->entries[ptq->last_branch_pos],
+ sizeof(struct branch_entry) * nr);
+
+ if (bs_src->nr >= ptq->pt->synth_opts.last_branch_sz) {
+ memcpy(&bs_dst->entries[nr],
+ &bs_src->entries[0],
+ sizeof(struct branch_entry) * ptq->last_branch_pos);
+ }
+}
+
+static inline void intel_pt_reset_last_branch_rb(struct intel_pt_queue *ptq)
+{
+ ptq->last_branch_pos = 0;
+ ptq->last_branch_rb->nr = 0;
+}
+
+static void intel_pt_update_last_branch_rb(struct intel_pt_queue *ptq)
+{
+ const struct intel_pt_state *state = ptq->state;
+ struct branch_stack *bs = ptq->last_branch_rb;
+ struct branch_entry *be;
+
+ if (!ptq->last_branch_pos)
+ ptq->last_branch_pos = ptq->pt->synth_opts.last_branch_sz;
+
+ ptq->last_branch_pos -= 1;
+
+ be = &bs->entries[ptq->last_branch_pos];
+ be->from = state->from_ip;
+ be->to = state->to_ip;
+ be->flags.abort = !!(state->flags & INTEL_PT_ABORT_TX);
+ be->flags.in_tx = !!(state->flags & INTEL_PT_IN_TX);
+ /* No support for mispredict */
+
+ if (bs->nr < ptq->pt->synth_opts.last_branch_sz)
+ bs->nr += 1;
+}
+
static int intel_pt_inject_event(union perf_event *event,
struct perf_sample *sample, u64 type,
bool swapped)
@@ -890,6 +962,10 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
struct intel_pt *pt = ptq->pt;
union perf_event *event = ptq->event_buf;
struct perf_sample sample = { .ip = 0, };
+ struct dummy_branch_stack {
+ u64 nr;
+ struct branch_entry entries;
+ } dummy_bs;
if (pt->branches_filter && !(pt->branches_filter & ptq->flags))
return 0;
@@ -912,6 +988,21 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
sample.flags = ptq->flags;
sample.insn_len = ptq->insn_len;
+ /*
+ * perf report cannot handle events without a branch stack when using
+ * SORT_MODE__BRANCH so make a dummy one.
+ */
+ if (pt->synth_opts.last_branch && sort__mode == SORT_MODE__BRANCH) {
+ dummy_bs = (struct dummy_branch_stack){
+ .nr = 1,
+ .entries = {
+ .from = sample.ip,
+ .to = sample.addr,
+ },
+ };
+ sample.branch_stack = (struct branch_stack *)&dummy_bs;
+ }
+
if (pt->synth_opts.inject) {
ret = intel_pt_inject_event(event, &sample,
pt->branches_sample_type,
@@ -961,6 +1052,11 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)
sample.callchain = ptq->chain;
}
+ if (pt->synth_opts.last_branch) {
+ intel_pt_copy_last_branch_rb(ptq);
+ sample.branch_stack = ptq->last_branch;
+ }
+
if (pt->synth_opts.inject) {
ret = intel_pt_inject_event(event, &sample,
pt->instructions_sample_type,
@@ -974,6 +1070,9 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)
pr_err("Intel Processor Trace: failed to deliver instruction event, error %d\n",
ret);
+ if (pt->synth_opts.last_branch)
+ intel_pt_reset_last_branch_rb(ptq);
+
return ret;
}
@@ -1008,6 +1107,11 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
sample.callchain = ptq->chain;
}
+ if (pt->synth_opts.last_branch) {
+ intel_pt_copy_last_branch_rb(ptq);
+ sample.branch_stack = ptq->last_branch;
+ }
+
if (pt->synth_opts.inject) {
ret = intel_pt_inject_event(event, &sample,
pt->transactions_sample_type,
@@ -1021,6 +1125,9 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
pr_err("Intel Processor Trace: failed to deliver transaction event, error %d\n",
ret);
+ if (pt->synth_opts.callchain)
+ intel_pt_reset_last_branch_rb(ptq);
+
return ret;
}
@@ -1116,6 +1223,9 @@ static int intel_pt_sample(struct intel_pt_queue *ptq)
return err;
}
+ if (pt->synth_opts.last_branch)
+ intel_pt_update_last_branch_rb(ptq);
+
if (!pt->sync_switch)
return 0;
@@ -1763,6 +1873,8 @@ static int intel_pt_synth_events(struct intel_pt *pt,
pt->instructions_sample_period = attr.sample_period;
if (pt->synth_opts.callchain)
attr.sample_type |= PERF_SAMPLE_CALLCHAIN;
+ if (pt->synth_opts.last_branch)
+ attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
pr_debug("Synthesizing 'instructions' event with id %" PRIu64 " sample type %#" PRIx64 "\n",
id, (u64)attr.sample_type);
err = intel_pt_synth_event(session, &attr, id);
@@ -1782,6 +1894,8 @@ static int intel_pt_synth_events(struct intel_pt *pt,
attr.sample_period = 1;
if (pt->synth_opts.callchain)
attr.sample_type |= PERF_SAMPLE_CALLCHAIN;
+ if (pt->synth_opts.last_branch)
+ attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
pr_debug("Synthesizing 'transactions' event with id %" PRIu64 " sample type %#" PRIx64 "\n",
id, (u64)attr.sample_type);
err = intel_pt_synth_event(session, &attr, id);
@@ -1808,6 +1922,7 @@ static int intel_pt_synth_events(struct intel_pt *pt,
attr.sample_period = 1;
attr.sample_type |= PERF_SAMPLE_ADDR;
attr.sample_type &= ~(u64)PERF_SAMPLE_CALLCHAIN;
+ attr.sample_type &= ~(u64)PERF_SAMPLE_BRANCH_STACK;
pr_debug("Synthesizing 'branches' event with id %" PRIu64 " sample type %#" PRIx64 "\n",
id, (u64)attr.sample_type);
err = intel_pt_synth_event(session, &attr, id);
--
1.9.1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/