[PATCH 4/5] perf, tools, script: Add brstackasm output for branch stacks

From: Andi Kleen
Date: Mon Mar 28 2016 - 13:46:33 EST


From: Andi Kleen <ak@xxxxxxxxxxxxxxx>

Implement printing full disassembled sequences for branch stacks in perf
script. This makes it possible to directly print hot paths for individual
samples, together with branch misprediction and even cycle count information.

% perf record -b ...
% perf script -F brstackasm
...
00007f0668d54e88 movsx (%rsi), %ecx
00007f0668d54e8b lea -0x30(%rcx), %eax
00007f0668d54e8e cmp $0x9, %al
00007f0668d54e90 jbe 0x68d54eaf
00007f0668d54e92 cmp %cl, %dl
00007f0668d54e94 jnz 0x68d54eb5
00007f0668d54e96 add $0x1, %rdi
00007f0668d54e9a movsx (%rdi), %edx
00007f0668d54e9d add $0x1, %rsi
00007f0668d54ea1 test %dl, %dl
00007f0668d54ea3 jnz _dl_cache_libcmp+11 # PRED 21 cycles
00007f0668d54dfb lea -0x30(%rdx), %eax
00007f0668d54dfe cmp $0x9, %al
00007f0668d54e00 ja _dl_cache_libcmp+152 # PRED 2 cycles
00007f0668d54e88 movsx (%rsi), %ecx
00007f0668d54e8b lea -0x30(%rcx), %eax
00007f0668d54e8e cmp $0x9, %al
00007f0668d54e90 jbe 0x68d54eaf
00007f0668d54e92 cmp %cl, %dl
00007f0668d54e94 jnz 0x68d54eb5 # PRED 3 cycles
00007f0668d54eb5 movsx %dl, %eax
00007f0668d54eb8 sub %ecx, %eax
00007f0668d54eba ret # PRED 1 cycles
00007f0668d54fae test %eax, %eax
00007f0668d54fb0 jz _dl_load_cache_lookup+688
00007f0668d54fb6 jns 0x68d54f70
00007f0668d54fb8 lea 0x1(%r14), %ebx
00007f0668d54fbc cmp %r15d, %ebx
00007f0668d54fbf nop
00007f0668d54fc0 jle 0x68d54f79 # PRED 2 cycles

Open issues:
- Occasionally the path does not reach up to the sample IP, as the LBRs
may be frozen earlier. Use precise events to avoid that.

v2: Remove bogus hunk. Document --max-blocks. Fix some printfs.
Port to latest tree.
Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
---
tools/perf/Documentation/perf-script.txt | 14 ++-
tools/perf/builtin-script.c | 183 +++++++++++++++++++++++++++++++
2 files changed, 195 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index f2b81d8..0903985 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -116,7 +116,7 @@ OPTIONS
--fields::
Comma separated list of fields to print. Options are:
comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
- srcline, period, iregs, brstack, brstacksym, flags, asm.
+ srcline, period, iregs, brstack, brstacksym, flags, asm, brstackasm.
Field list can be prepended with the type, trace, sw or hw,
to indicate to which event type the field list applies.
e.g., -f sw:comm,tid,time,ip,sym and -f trace:time,cpu,trace
@@ -176,17 +176,24 @@ OPTIONS
i.e., -f "" is not allowed.

The brstack output includes branch related information with raw addresses using the
- /v/v/v/v/ syntax in the following order:
+ /v/v/v/v/cycles syntax in the following order:
FROM: branch source instruction
TO : branch target instruction
M/P/-: M=branch target mispredicted or branch direction was mispredicted, P=target predicted or direction predicted, -=not supported
X/- : X=branch inside a transactional region, -=not in transaction region or not supported
A/- : A=TSX abort entry, -=not aborted region or not supported
+ cycles

The brstacksym is identical to brstack, except that the FROM and TO addresses are printed in a symbolic form if possible.

When asm is specified the assembler instruction of each sample is printed in disassembled form.

+ When brstackasm is specified the full assembler sequences of branch blocks for each sample
+ are printed (a branch block is a sequence of instructions not containing taken branches).
+ This is the full execution path leading to the sample. This is only supported when the
+ sample was recorded with perf record -b or -j any.
+ The maximum number of branch blocks to print can be configured with the --max-blocks option.
+
-k::
--vmlinux=<file>::
vmlinux pathname
@@ -268,6 +275,9 @@ include::itrace.txt[]
--force::
Don't do ownership validation.

+--max-blocks=N::
+ Maximum number of branch blocks to print with -F brstackasm
+
SEE ALSO
--------
linkperf:perf-record[1], linkperf:perf-script-perl[1],
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 323572e..1072cbb 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -42,6 +42,7 @@ static bool nanosecs;
static const char *cpu_list;
static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
static struct perf_stat_config stat_config;
+static int max_blocks;

unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH;

@@ -67,6 +68,7 @@ enum perf_output_field {
PERF_OUTPUT_WEIGHT = 1U << 18,
PERF_OUTPUT_BPF_OUTPUT = 1U << 19,
PERF_OUTPUT_ASM = 1U << 20,
+ PERF_OUTPUT_BRSTACKASM = 1U << 21,
};

struct output_option {
@@ -94,6 +96,7 @@ struct output_option {
{.str = "weight", .field = PERF_OUTPUT_WEIGHT},
{.str = "bpf-output", .field = PERF_OUTPUT_BPF_OUTPUT},
{.str = "asm", .field = PERF_OUTPUT_ASM},
+ {.str = "brstackasm", .field = PERF_OUTPUT_BRSTACKASM},
};

/* default set to maintain compatibility with current format */
@@ -293,6 +296,13 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
"selected.\n");
return -EINVAL;
}
+ if (PRINT_FIELD(BRSTACKASM) &&
+ !(perf_evlist__combined_branch_type(session->evlist) &
+ PERF_SAMPLE_BRANCH_ANY)) {
+ pr_err("Display of branch stack assembler requested, but non all-branch filter set\n");
+ return -EINVAL;
+ }
+
if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) &&
perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID",
PERF_OUTPUT_TID|PERF_OUTPUT_PID))
@@ -621,6 +631,175 @@ static void print_sample_brstacksym(struct perf_sample *sample,
}
}

+#ifdef HAVE_UDIS86
+#define MAXBB 16384UL
+#define MAXINSN 16
+
+/*
+ * Read the raw code bytes for the address range starting at @start into
+ * @buffer, so that the caller can disassemble it.
+ *
+ * @buffer:  destination; up to end - start + MAXINSN bytes are requested,
+ *           which the length check below bounds to MAXBB bytes.
+ * @start:   first address of the block.
+ * @end:     last address of interest in the block.
+ * @machine: machine used for the kernel/user test and the dso read.
+ * @thread:  thread whose maps are searched for @start.
+ * @is64bit: set from the dso's is_64_bit flag; only written when a map
+ *           with a usable dso was found.
+ * @cpumode: set to PERF_RECORD_MISC_KERNEL or PERF_RECORD_MISC_USER
+ *           depending on whether @start is a kernel address.
+ *
+ * Returns the value of dso__data_read_offset() when a read was attempted,
+ * or 0 when the range is empty, crosses the kernel/user boundary, is too
+ * long, or cannot be resolved to a dso.
+ */
+static int grab_bb(char *buffer, u64 start, u64 end,
+		   struct machine *machine, struct thread *thread,
+		   bool *is64bit, u8 *cpumode)
+{
+	/*
+	 * Hold the map-relative offset in a u64 so it is not narrowed
+	 * to int before being handed to dso__data_read_offset().
+	 */
+	u64 offset;
+	int len;
+	struct addr_location al;
+	bool kernel;
+
+	if (!start || !end)
+		return 0;
+
+	kernel = machine__kernel_ip(machine, start);
+	if (kernel)
+		*cpumode = PERF_RECORD_MISC_KERNEL;
+	else
+		*cpumode = PERF_RECORD_MISC_USER;
+	/* A block cannot straddle the kernel/user boundary. */
+	if (kernel != machine__kernel_ip(machine, end))
+		return 0;
+
+	memset(&al, 0, sizeof(al));
+	/*
+	 * end - start is unsigned, so an end below start wraps to a huge
+	 * value and is rejected here as well.
+	 */
+	if (end - start > MAXBB - MAXINSN) {
+		printf("\tbasic block %" PRIx64 "-%" PRIx64 " (%" PRIu64 ") too long to dump\n",
+		       start, end, end - start);
+		return 0;
+	}
+
+	thread__find_addr_map(thread, *cpumode, MAP__FUNCTION, start, &al);
+	if (!al.map || !al.map->dso) {
+		printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n",
+		       start, end);
+		return 0;
+	}
+	if (al.map->dso->data.status == DSO_DATA_STATUS_ERROR) {
+		printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n",
+		       start, end);
+		return 0;
+	}
+
+	/* Load maps to ensure dso->is_64_bit has been updated */
+	map__load(al.map, machine->symbol_filter);
+
+	offset = al.map->map_ip(al.map, start);
+	/* Read MAXINSN extra bytes so the instruction at @end is complete. */
+	len = dso__data_read_offset(al.map->dso, machine,
+				    offset, (u8 *)buffer,
+				    end - start + MAXINSN);
+
+	*is64bit = al.map->dso->is_64_bit;
+	return len;
+}
+#endif
+
+/*
+ * Print the disassembled instruction sequences described by the branch
+ * stack of @sample.
+ *
+ * For each pair of adjacent branch entries, walking from the highest
+ * index down to 0, the code between the target of entry i + 1 and the
+ * source of entry i is disassembled and printed one instruction per
+ * line. The branch instruction itself is annotated with the
+ * prediction/transaction flags and the cycle count of entry i. Finally
+ * the block from the target of entry 0 up to the sample IP is printed.
+ *
+ * At most max_blocks + 1 branch entries are used when max_blocks is
+ * non-zero. Without HAVE_UDIS86 this function does nothing, which is
+ * why every parameter is marked __maybe_unused.
+ */
+static void print_sample_brstackasm(struct perf_sample *sample __maybe_unused,
+				    struct thread *thread __maybe_unused,
+				    struct perf_event_attr *attr __maybe_unused,
+				    struct machine *machine __maybe_unused)
+{
+#ifdef HAVE_UDIS86
+	struct branch_stack *br = sample->branch_stack;
+	u64 start, end;
+	int i;
+	static bool ud_initialized = false;
+	static struct perf_ud ud;
+	char buffer[MAXBB];
+	int len;
+	bool last;
+	bool is64bit;
+	int nr;
+
+	if (!(br && br->nr))
+		return;
+	nr = br->nr;
+	if (max_blocks && nr > max_blocks + 1)
+		nr = max_blocks + 1;
+
+	/* The disassembler object is set up once and reused for all samples. */
+	if (!ud_initialized) {
+		ud_initialized = true;
+		ud_init(&ud.ud_obj);
+		ud_set_syntax(&ud.ud_obj, UD_SYN_ATT);
+		ud_set_sym_resolver(&ud.ud_obj, dis_resolve);
+	}
+	ud.thread = thread;
+	ud.cpu = sample->cpu;
+
+	putchar('\n');
+	for (i = nr - 2; i >= 0; i--) {
+		if (br->entries[i].from || br->entries[i].to)
+			printf("%d: %" PRIx64 "-%" PRIx64 "\n", i,
+			       br->entries[i].from,
+			       br->entries[i].to);
+		start = br->entries[i + 1].to;
+		end = br->entries[i].from;
+
+		/*
+		 * Leave extra bytes for the final jump instruction for
+		 * which we don't know the length
+		 */
+		len = grab_bb(buffer, start, end + MAXINSN,
+			      machine, thread, &is64bit,
+			      &ud.cpumode);
+		if (len <= 0)
+			continue;
+
+		ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+		ud_set_pc(&ud.ud_obj, start);
+		ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+		last = false;
+		/* Stop decoding as soon as the branch instruction was printed. */
+		while (!last && ud_disassemble(&ud.ud_obj)) {
+			if (ud_insn_ptr(&ud.ud_obj) ==
+			    (uint8_t *)buffer + end - start) {
+				/*
+				 * This is the branch that ends the block.
+				 * Keep flags and cycle count on the same
+				 * line and terminate it afterwards.
+				 */
+				printf("\t%016" PRIx64 "\t%-30s\t#%s%s%s%s",
+				       ud_insn_off(&ud.ud_obj),
+				       ud_insn_asm(&ud.ud_obj),
+				       br->entries[i].flags.predicted ? " PRED" : "",
+				       br->entries[i].flags.mispred ? " MISPRED" : "",
+				       br->entries[i].flags.in_tx ? " INTX" : "",
+				       br->entries[i].flags.abort ? " ABORT" : "");
+				if (br->entries[i].flags.cycles)
+					printf(" %d cycles", br->entries[i].flags.cycles);
+				putchar('\n');
+				last = true;
+			} else {
+				printf("\t%016" PRIx64 "\t%s\n",
+				       ud_insn_off(&ud.ud_obj),
+				       ud_insn_asm(&ud.ud_obj));
+			}
+		}
+	}
+
+	/*
+	 * Hit the branch? In this case we are already done, and the target
+	 * has not been executed yet.
+	 */
+	if (br->entries[0].from == sample->ip)
+		return;
+	if (br->entries[0].flags.abort)
+		return;
+
+	/*
+	 * Print final block up to sample
+	 */
+	start = br->entries[0].to;
+	end = sample->ip;
+	len = grab_bb(buffer, start, end, machine, thread, &is64bit,
+		      &ud.cpumode);
+	if (len <= 0) {
+		/* Print at least last IP if basic block did not work */
+		len = grab_bb(buffer, sample->ip, sample->ip + MAXINSN,
+			      machine, thread, &is64bit, &ud.cpumode);
+		if (len <= 0)
+			return;
+		ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+		ud_set_pc(&ud.ud_obj, sample->ip);
+		/* Point the disassembler at the bytes that were just read. */
+		ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+		if (ud_disassemble(&ud.ud_obj))
+			printf("\t%016" PRIx64 "\t%s\n", ud_insn_off(&ud.ud_obj),
+			       ud_insn_asm(&ud.ud_obj));
+		return;
+	}
+	ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+	ud_set_pc(&ud.ud_obj, start);
+	/* Only hand the buffer over once len is known to be positive. */
+	ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+	while (ud_disassemble(&ud.ud_obj) &&
+	       ud_insn_ptr(&ud.ud_obj) <= (uint8_t *)buffer + end - start)
+		printf("\t%016" PRIx64 "\t%s\n", ud_insn_off(&ud.ud_obj),
+		       ud_insn_asm(&ud.ud_obj));
+#endif
+}

static void print_sample_addr(struct perf_sample *sample,
struct thread *thread,
@@ -898,6 +1077,8 @@ print_rest:
if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
print_sample_bpf_output(sample);

+ if (PRINT_FIELD(BRSTACKASM))
+ print_sample_brstackasm(sample, thread, attr, machine);
if (PRINT_FIELD(ASM))
print_sample_asm(sample, thread, attr, al, machine);

@@ -2129,6 +2310,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
"Show the mmap events"),
OPT_BOOLEAN('\0', "show-switch-events", &script.show_switch_events,
"Show context switch events (if recorded)"),
+ OPT_INTEGER(0, "max-blocks", &max_blocks,
+ "Maximum number of code blocks to dump with brstackasm"),
OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
OPT_BOOLEAN(0, "ns", &nanosecs,
"Use 9 decimal places when displaying time"),
--
2.5.5