[PATCH 12/12] perf, tools, script: Implement dwarf resolving of instructions
From: Andi Kleen
Date: Mon Nov 27 2017 - 19:26:37 EST
From: Andi Kleen <ak@xxxxxxxxxxxxxxx>
Implement resolving arguments of instructions to dwarf variable names.
When we sample an instruction, decode the instruction and try to
symbolize the register or destination it is using. Also print the type.
It builds on the perf probe debugging information reverse lookup
infrastructure added earlier.
The dwarf decoding magic is all done using Masami Hiramatsu's perf probe code.
This is useful for
- The PTWRITE instruction: when the compiler generates debugging information
for PTWRITE arguments. The value logged by PTWRITE is available to the
PT decoder, so it can print the value.
- It also works for other samples with an IP, so it's possible to follow
their memory access patterns (but not the values).
For the sample we use the instruction decoder to decode the instruction
at the sample point, and then map the arguments to dwarf information.
For structure references we only print the numeric offset, but do not
resolve the field name.
Absolute memory references are not supported.
It doesn't distinguish SSE registers from GPRs (VEX-encoded AVX
instructions are detected and skipped); telling them apart would require
extending the instruction decoder to detect SSE instructions.
Example:
From perf itself:
% perf record -e intel_pt//u -a sleep 1
% perf script --itrace=i0ns -F insnvar,insn,ip,sym -f 2>&1 | xed -F insn: -A -64 | less
...
4f7e61 xyarray__max_y pushq %rbp
4f7e62 xyarray__max_y mov %rsp, %rbp
4f7e65 xyarray__max_y sub $0x20, %rsp
4f7e69 xyarray__max_y movq %rdi, -0x18(%rbp) { -24(xy), struct xyarray* }
4f7e6d xyarray__max_y movq %fs:0x28, %rax
4f7e76 xyarray__max_y movq %rax, -0x8(%rbp) { -8(xy), struct xyarray* }
4f7e7a xyarray__max_y xor %eax, %eax
4f7e7c xyarray__max_y movq -0x18(%rbp), %rax { -24(xy), struct xyarray* }
4f7e80 xyarray__max_y movq 0x20(%rax), %rax
4f7e84 xyarray__max_y movq -0x8(%rbp), %rdx { -8(xy), struct xyarray* }
4f7e88 xyarray__max_y xorq %fs:0x28, %rdx
4f7e91 xyarray__max_y jz 0x7
4f7e98 xyarray__max_y leaveq
4f7e99 xyarray__max_y retq
In this example we now know that this function accesses two fields in struct xyarray *
Open Issues:
- It is fairly slow. Some caching would likely help.
- Frame pointer references, which are common in unoptimized code,
are usually not resolved correctly. That's usually fine
because memory accesses on the stack are not very interesting.
- It cannot resolve some references.
But I find it already quite useful.
Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
---
tools/perf/Documentation/perf-script.txt | 8 +-
tools/perf/arch/x86/util/Build | 1 +
tools/perf/arch/x86/util/operand.c | 131 +++++++++++++++++++++++++
tools/perf/builtin-script.c | 162 ++++++++++++++++++++++++++++++-
tools/perf/util/Build | 1 +
tools/perf/util/operand.c | 16 +++
tools/perf/util/operand.h | 16 +++
tools/perf/util/probe-event.c | 3 +
8 files changed, 335 insertions(+), 3 deletions(-)
create mode 100644 tools/perf/arch/x86/util/operand.c
create mode 100644 tools/perf/util/operand.c
create mode 100644 tools/perf/util/operand.h
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index e296944cc03f..d3b93b7f804b 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -117,7 +117,9 @@ OPTIONS
Comma separated list of fields to print. Options are:
comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output, brstackinsn,
- brstackoff, callindent, insn, insnlen, synth, phys_addr, iregvals.
+ brstackoff, callindent, insn, insnlen, synth, phys_addr, iregvals,
+ insnvar
+
Field list can be prepended with the type, trace, sw or hw,
to indicate to which event type the field list applies.
e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace
@@ -221,6 +223,10 @@ OPTIONS
(with perf record -I ...) to their symbolic names in the program. This requires availability
of debug information in the binaries.
+ With insnvar try to decode and symbolize operands of sampled or traced instructions
+ using debug information. When PTWRITEs are synthesized with Intel PT the values of the
+ PTWRITEs are automatically symbolized.
+
-k::
--vmlinux=<file>::
vmlinux pathname
diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build
index 139f9f1a56f9..8e9a2140e72b 100644
--- a/tools/perf/arch/x86/util/Build
+++ b/tools/perf/arch/x86/util/Build
@@ -5,6 +5,7 @@ libperf-y += insnlen.o
libperf-y += kvm-stat.o
libperf-y += perf_regs.o
libperf-y += group.o
+libperf-y += operand.o
libperf-$(CONFIG_DWARF) += dwarf-regs.o
libperf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o
diff --git a/tools/perf/arch/x86/util/operand.c b/tools/perf/arch/x86/util/operand.c
new file mode 100644
index 000000000000..c78c21e8955f
--- /dev/null
+++ b/tools/perf/arch/x86/util/operand.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+/* Decode instructions to resolve operands. */
+#include <stdio.h>
+#include "debug.h"
+#include "perf.h"
+#include "operand.h"
+#include "intel-pt-decoder/insn.h"
+#include "intel-pt-decoder/inat.h"
+
+/*
+ * Map an x86 register number (0-15, ModRM r/m plus the REX.B extension) to
+ * the corresponding PERF_REG_X86_* index.
+ *
+ * Entries 8-15 are only filled in when HAVE_ARCH_X86_64_SUPPORT is defined;
+ * without it they are left zero-initialized.
+ */
+static const unsigned char x86_reg_to_perf[16] = {
+	[0] = PERF_REG_X86_AX,
+	[1] = PERF_REG_X86_CX,
+	[2] = PERF_REG_X86_DX,
+	[3] = PERF_REG_X86_BX,
+	[4] = PERF_REG_X86_SP,
+	[5] = PERF_REG_X86_BP,
+	[6] = PERF_REG_X86_SI,
+	[7] = PERF_REG_X86_DI,
+#ifdef HAVE_ARCH_X86_64_SUPPORT
+	[8] = PERF_REG_X86_R8,
+	[9] = PERF_REG_X86_R9,
+	[10] = PERF_REG_X86_R10,
+	[11] = PERF_REG_X86_R11,
+	[12] = PERF_REG_X86_R12,
+	[13] = PERF_REG_X86_R13,
+	[14] = PERF_REG_X86_R14,
+	[15] = PERF_REG_X86_R15,
+#endif
+};
+
+/*
+ * Decode one x86 instruction and report its ModRM r/m operand through @ops.
+ *
+ * @insnbytes: buffer holding the instruction bytes
+ * @insnlen:   number of valid bytes in @insnbytes
+ * @is64bit:   true to decode in 64-bit mode
+ * @ip:        address of the instruction, used for RIP-relative operands
+ * @val:       value supplied by the caller (e.g. a PTWRITE payload); it is
+ *             replaced by the immediate for the MOV-immediate opcodes
+ * @ops:       callbacks used to print the operand
+ * @ctx:       opaque pointer handed to every callback
+ *
+ * Instructions that cannot be handled (incomplete decode, VEX prefix, no
+ * ModRM byte, SIB addressing, absolute disp32 addressing) are reported via
+ * ops->print_unknown().
+ *
+ * Always returns 0.
+ */
+int arch_resolve_operand(char *insnbytes, int insnlen, bool is64bit,
+			 u64 ip,
+			 u64 val,
+			 struct operand_print_ops *ops,
+			 void *ctx)
+{
+	struct insn insn;
+	/* Must start out false: only some opcodes below supply a value. */
+	bool has_value = false;
+	int reg;
+
+	insn_init(&insn, insnbytes, insnlen, is64bit);
+	insn_get_length(&insn);
+	if (!insn_complete(&insn))
+		goto unknown;
+	/* Cannot handle Y/Zmm */
+	if (insn.vex_prefix.nbytes > 0)
+		goto unknown;
+	if (!insn.modrm.nbytes)
+		goto unknown;
+
+	switch (insn.opcode.bytes[0]) {
+	case 0x0f:
+		/*
+		 * For PTWRITE use the caller value.
+		 * NOTE(review): 0f ae is a whole opcode group; neither the
+		 * prefix nor the ModRM reg field is checked here, so other
+		 * members of the group are treated like PTWRITE too.
+		 */
+		if (insn.opcode.bytes[1] == 0xae)
+			has_value = true;
+		break;
+	case 0xb0:
+	case 0xb8:
+	case 0xc6:
+	case 0xc7:
+		/*
+		 * For MOV $xxx use the immediate.
+		 * NOTE(review): 0xb0/0xb8 have no ModRM byte, so they were
+		 * already rejected above and never get here.
+		 */
+		if (insn.immediate.nbytes) {
+			has_value = true;
+			val = insn.immediate.value;
+		}
+		break;
+	default:
+		break;
+	}
+
+	/* Could also get known register values from caller */
+
+	/* Should check for SSE instructions to detect XMM* */
+
+	if (insn_rip_relative(&insn)) {
+		/* Target is relative to the address of the next instruction. */
+		ops->print_symbol(ctx, ip + insn.length + insn.displacement.value,
+				  has_value, val);
+		return 0;
+	}
+
+	/*
+	 * mod == 0 with r/m == 5 encodes a bare disp32, not [BP]. The
+	 * RIP-relative use of that encoding was handled above; what remains
+	 * is an absolute address, which is not supported yet.
+	 */
+	if (X86_MODRM_MOD(insn.modrm.value) == 0 &&
+	    X86_MODRM_RM(insn.modrm.value) == 5)
+		goto unknown;
+
+	reg = X86_MODRM_RM(insn.modrm.value);
+	if (insn.rex_prefix.nbytes && X86_REX_B(insn.rex_prefix.value))
+		reg += 8;
+	reg = x86_reg_to_perf[reg];
+
+	switch (X86_MODRM_MOD(insn.modrm.value)) {
+	case 0: /* [r/m] */
+	case 1: /* [r/m + disp8] */
+	case 2: /* [r/m + disp32] */
+		if (insn.sib.nbytes) {
+			/*
+			 * Scaling and multiple registers
+			 * not supported for now.
+			 */
+			pr_debug("SIB encoding not supported\n");
+			goto unknown;
+		}
+		ops->print_indirect_reg(ctx, reg, insn.displacement.value,
+					has_value, val);
+		break;
+
+	case 3: /* register value */
+		ops->print_reg(ctx, reg, has_value, val);
+		break;
+
+	default:
+		goto unknown;
+	}
+	return 0;
+
+unknown:
+	ops->print_unknown(ctx);
+	return 0;
+}
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 7913ec732620..792e1d2dfdd4 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -36,6 +36,7 @@
#include "util/dump-insn.h"
#include "util/probe-finder.h"
#include "util/dwarf-sample.h"
+#include "util/operand.h"
#include <dirent.h>
#include <errno.h>
#include <inttypes.h>
@@ -93,6 +94,7 @@ enum perf_output_field {
PERF_OUTPUT_PHYS_ADDR = 1U << 26,
PERF_OUTPUT_UREGS = 1U << 27,
PERF_OUTPUT_IREG_VALS = 1U << 28,
+ PERF_OUTPUT_INSN_VAR = 1U << 29,
};
struct output_option {
@@ -128,6 +130,7 @@ struct output_option {
{.str = "synth", .field = PERF_OUTPUT_SYNTH},
{.str = "phys_addr", .field = PERF_OUTPUT_PHYS_ADDR},
{.str = "iregvals", .field = PERF_OUTPUT_IREG_VALS},
+ {.str = "insnvar", .field = PERF_OUTPUT_INSN_VAR},
};
enum {
@@ -442,7 +445,7 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
PERF_OUTPUT_PHYS_ADDR))
return -EINVAL;
- if (PRINT_FIELD(IREG_VALS)) {
+ if (PRINT_FIELD(IREG_VALS) || PRINT_FIELD(INSN_VAR)) {
if (init_probe_symbol_maps(false) >= 0)
probe_conf.max_probes = MAX_PROBES;
}
@@ -1068,6 +1071,159 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample,
return printed;
}
+#ifdef HAVE_DWARF_SUPPORT
+
+struct operand_print_ctx { /* state shared by the print_op_*() callbacks */
+ FILE *fp; /* stream the callbacks print to */
+ struct variable_list *vls; /* filled in by dwarf_resolve_sample() */
+ int dret; /* return value of dwarf_resolve_sample() */
+ struct thread *thread; /* thread used for address-to-map lookups */
+ u8 cpumode; /* cpumode used for address-to-map lookups */
+};
+
+/*
+ * print_reg callback: print the DWARF variable found for register @reg as
+ * " { name[ = value], type }". When no variable matches, a marker is
+ * printed in verbose mode only.
+ */
+static void print_op_reg(void *ctx, int reg, bool has_value, u64 val)
+{
+	struct operand_print_ctx *pctx = ctx;
+	char *var_name, *var_type;
+	int type_len;
+
+	if (dwarf_varlist_find_reg(pctx->vls, pctx->dret, reg,
+				   &var_name, &var_type) != 0) {
+		if (verbose)
+			fprintf(pctx->fp, " {?NO-MATCH-REG}");
+		return;
+	}
+
+	fprintf(pctx->fp, " { %s", var_name);
+	if (has_value)
+		fprintf(pctx->fp, " = %#" PRIx64, val);
+
+	/* Only the part of the type string before the first tab is printed. */
+	type_len = (int)strcspn(var_type, "\t");
+	fprintf(pctx->fp, ", %.*s }", type_len, var_type);
+}
+
+/*
+ * print_symbol callback: resolve @addr to a symbol and print it as
+ * " { symbol+offset[ = value], symbol }".
+ *
+ * The address is looked up in the variable maps first and, if no map is
+ * found there, in the function maps. " {?BAD-SYM}" is printed when no map
+ * or symbol can be found.
+ */
+static void print_op_symbol(void *ctx, u64 addr, bool has_val, u64 val)
+{
+	struct operand_print_ctx *oc = ctx;
+	struct addr_location al;
+
+	memset(&al, 0, sizeof(struct addr_location));
+	thread__find_addr_map(oc->thread, oc->cpumode, MAP__VARIABLE,
+			      addr, &al);
+	if (!al.map)
+		thread__find_addr_map(oc->thread, oc->cpumode, MAP__FUNCTION,
+				      addr, &al);
+	if (al.map)
+		al.sym = map__find_symbol(al.map, al.addr);
+
+	if (al.map && al.sym) {
+		fprintf(oc->fp, " { ");
+		symbol__fprintf_symname_offs(al.sym, &al, oc->fp);
+		/*
+		 * val is a u64: use PRIx64 rather than %lx so the format
+		 * matches the argument where long is 32 bits, and print it
+		 * the same way print_op_reg() does.
+		 */
+		if (has_val)
+			fprintf(oc->fp, " = %#" PRIx64, val);
+		fprintf(oc->fp, ", symbol }");
+	} else {
+		fprintf(oc->fp, " {?BAD-SYM}");
+	}
+}
+
+/*
+ * print_unknown callback: in verbose mode, emit a marker for an operand
+ * that could not be decoded; otherwise print nothing.
+ */
+static void print_op_unknown(void *ctx)
+{
+	struct operand_print_ctx *pctx = ctx;
+
+	if (!verbose)
+		return;
+
+	fprintf(pctx->fp, " {?}");
+}
+
+/*
+ * print_indirect_reg callback: print a memory operand addressed through
+ * register @reg with displacement @off as
+ * " { off(name)[ = value], type }", where name and type come from the
+ * DWARF variable found for @reg.
+ *
+ * Nothing is printed when the variable's type starts with "unknown_type".
+ * When no variable matches, a marker is printed in verbose mode only.
+ */
+static void print_op_indirect_reg(void *ctx,
+				  int reg,
+				  s32 off,
+				  bool has_val,
+				  u64 val)
+{
+	struct operand_print_ctx *oc = ctx;
+	char *name, *type;
+
+	/* Should resolve field names too, for now just print offsets */
+	if (!dwarf_varlist_find_reg(oc->vls, oc->dret, reg, &name, &type)) {
+		/* Likely frame pointer. Should resolve separately. */
+		if (!strncmp(type, "unknown_type", 12))
+			return;
+
+		fprintf(oc->fp, " { %d(%s)", off, name);
+		/* Same "0x"-prefixed hex form that print_op_reg() uses. */
+		if (has_val)
+			fprintf(oc->fp, " = %#" PRIx64, val);
+		/* Only the part of the type string before the first tab. */
+		fprintf(oc->fp, ", %.*s }", (int)strcspn(type, "\t"), type);
+	} else if (verbose) {
+		fprintf(oc->fp, " {?NO-MATCH-IND-REG}");
+	}
+}
+
+static struct operand_print_ops operand_ops = { /* callback table passed to arch_resolve_operand() */
+ .print_reg = print_op_reg,
+ .print_symbol = print_op_symbol,
+ .print_unknown = print_op_unknown,
+ .print_indirect_reg = print_op_indirect_reg,
+};
+
+#define MAX_INSN 16
+
+/*
+ * Resolve the operand of the instruction at sample->ip to its dwarf name
+ * and print it to @fp.
+ *
+ * The instruction bytes are fetched with grab_bb(), the dwarf variables for
+ * the sample with dwarf_resolve_sample(), and the decoding is done by
+ * arch_resolve_operand() using the print_op_*() callbacks. For synthesized
+ * PTWRITE samples the payload is passed along as the operand value.
+ *
+ * Failures are only reported in verbose mode, as " {?...}" markers.
+ */
+static void perf_sample__fprint_insn_var(struct perf_sample *sample,
+					 struct thread *thread,
+					 struct perf_event_attr *attr,
+					 struct machine *machine,
+					 FILE *fp)
+{
+	struct operand_print_ctx oc = {
+		.fp = fp,
+		.thread = thread,
+	};
+	u8 ibuf[MAX_INSN*2];
+	bool is64bit = false;
+	u64 val = 0;
+	int len;
+
+	len = grab_bb(ibuf, sample->ip, sample->ip + MAX_INSN,
+		      machine,
+		      thread,
+		      &is64bit,
+		      &oc.cpumode,
+		      false);
+	/*
+	 * Also bail out on 0: ibuf is not initialized here, so there would
+	 * be nothing valid to decode.
+	 * NOTE(review): assumes grab_bb() returns the number of bytes it
+	 * stored in ibuf - confirm against its definition.
+	 */
+	if (len <= 0) {
+		if (verbose)
+			fprintf(fp, " {?NO-TEXT}");
+		return;
+	}
+	/* Never hand the decoder more bytes than one instruction needs. */
+	if (len > MAX_INSN)
+		len = MAX_INSN;
+
+	/* The sample's cpumode takes precedence over the one from grab_bb(). */
+	oc.cpumode = sample->cpumode;
+
+	/*
+	 * NOTE(review): ownership of oc.vls is not visible here; confirm
+	 * whether it has to be released after use.
+	 */
+	oc.dret = dwarf_resolve_sample(sample, thread, &oc.vls);
+	if (oc.dret < 0) {
+		if (verbose)
+			fprintf(fp, " {?BAD-DWARF}");
+		return;
+	}
+
+	if (attr->config == PERF_SYNTH_INTEL_PTWRITE) {
+		struct perf_synth_intel_ptwrite *data =
+			perf_sample__synth_ptr(sample);
+		if (!perf_sample__bad_synth_size(sample, *data))
+			val = le64_to_cpu(data->payload);
+	}
+
+	/* A negative result means the operand could not be resolved at all. */
+	if (arch_resolve_operand((char *)ibuf, len, is64bit,
+				 sample->ip,
+				 val,
+				 &operand_ops,
+				 &oc) < 0)
+		print_op_unknown(&oc);
+}
+
+#else
+
+/*
+ * Variant used when HAVE_DWARF_SUPPORT is not defined: operands cannot be
+ * resolved, so only print the "unknown" marker in verbose mode.
+ * @fp is used below and therefore not marked __maybe_unused.
+ */
+static void perf_sample__fprint_insn_var(
+		struct perf_sample *sample __maybe_unused,
+		struct thread *thread __maybe_unused,
+		struct perf_event_attr *attr __maybe_unused,
+		struct machine *machine __maybe_unused,
+		FILE *fp)
+{
+	if (verbose)
+		fprintf(fp, " {?}");
+}
+
+#endif
+
static int perf_sample__fprintf_addr(struct perf_sample *sample,
struct thread *thread,
struct perf_event_attr *attr, FILE *fp)
@@ -1183,6 +1339,8 @@ static int perf_sample__fprintf_insn(struct perf_sample *sample,
for (i = 0; i < sample->insn_len; i++)
printed += fprintf(fp, " %02x", (unsigned char)sample->insn[i]);
}
+ if (PRINT_FIELD(INSN_VAR))
+ perf_sample__fprint_insn_var(sample, thread, attr, machine, fp);
if (PRINT_FIELD(BRSTACKINSN))
printed += perf_sample__fprintf_brstackinsn(sample, thread, attr, machine, fp);
@@ -3009,7 +3167,7 @@ int cmd_script(int argc, const char **argv)
"Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
"addr,symoff,period,iregs,uregs,brstack,brstacksym,flags,"
"bpf-output,callindent,insn,insnlen,brstackinsn,synth,phys_addr,"
- "iregvals",
+ "iregvals,insnvar",
parse_output_fields),
OPT_BOOLEAN('a', "all-cpus", &system_wide,
"system-wide collection from all CPUs"),
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 361db92a4bfd..4f42b2fad398 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -42,6 +42,7 @@ libperf-y += callchain.o
libperf-y += values.o
libperf-y += debug.o
libperf-y += machine.o
+libperf-y += operand.o
libperf-y += dwarf-sample.o
libperf-y += map.o
libperf-y += pstack.o
diff --git a/tools/perf/util/operand.c b/tools/perf/util/operand.c
new file mode 100644
index 000000000000..88d9284a7049
--- /dev/null
+++ b/tools/perf/util/operand.c
@@ -0,0 +1,16 @@
+#include <errno.h>
+#include "perf.h"
+#include "operand.h"
+
+/*
+ * Fallback implementation. It is declared __weak so that it can be
+ * overridden per architecture. It ignores all of its arguments and
+ * returns -EINVAL.
+ */
+__weak
+int arch_resolve_operand(char *insn __maybe_unused,
+ int insnlen __maybe_unused,
+ bool is64bit __maybe_unused,
+ u64 ip __maybe_unused,
+ u64 val __maybe_unused,
+ struct operand_print_ops *ops __maybe_unused,
+ void *ctx __maybe_unused)
+{
+ return -EINVAL;
+}
diff --git a/tools/perf/util/operand.h b/tools/perf/util/operand.h
new file mode 100644
index 000000000000..63a7602727a1
--- /dev/null
+++ b/tools/perf/util/operand.h
@@ -0,0 +1,16 @@
+#ifndef OPERAND_H
+#define OPERAND_H 1
+
+/* bool, u64 and s32 are used below; include them so the header stands alone. */
+#include <stdbool.h>
+#include <linux/types.h>
+
+/*
+ * Callbacks invoked by arch_resolve_operand() to print a decoded operand.
+ * @ctx is the opaque pointer given to arch_resolve_operand(); @has_val
+ * tells whether @val carries a value for the operand.
+ */
+struct operand_print_ops {
+	/* Operand is the register @reg itself. */
+	void (*print_reg)(void *ctx, int reg, bool has_val, u64 val);
+	/* Operand is the memory location at address @addr. */
+	void (*print_symbol)(void *ctx, u64 addr, bool has_val, u64 val);
+	/* Operand is the memory location at @off(@reg). */
+	void (*print_indirect_reg)(void *ctx, int reg, s32 off, bool has_val, u64 val);
+	/* Operand could not be decoded. */
+	void (*print_unknown)(void *ctx);
+};
+
+/*
+ * Decode the instruction in @insn (@insnlen bytes, located at @ip) and
+ * report its operand through @ops, passing @ctx to each callback. @val is
+ * a value supplied by the caller for the operand. The generic fallback
+ * returns -EINVAL.
+ */
+int arch_resolve_operand(char *insn, int insnlen, bool is64bit, u64 ip,
+			 u64 val,
+			 struct operand_print_ops *ops,
+			 void *ctx);
+
+#endif
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 85fbeeb364bf..2a65ebed0998 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -78,6 +78,9 @@ int init_probe_symbol_maps(bool user_only)
{
int ret;
+ if (host_machine)
+ return 0;
+
symbol_conf.sort_by_name = true;
symbol_conf.allow_aliases = true;
ret = symbol__init(NULL);
--
2.13.6