Re: [PATCH 6/6] perf thread-stack: Hide x86 retpolines

From: Arnaldo Carvalho de Melo
Date: Fri Feb 22 2019 - 14:42:52 EST


Em Wed, Jan 09, 2019 at 11:18:35AM +0200, Adrian Hunter escreveu:
> x86 retpoline functions pollute the call graph by showing up everywhere
> there is an indirect branch, but they do not really mean anything. Make
> changes so that the default retpoline functions will no longer appear in
> the call graph. Note this only affects the call graph, since all the
> original branches are left unchanged.
>
> This does not handle function return thunks, nor is there any improvement
> for the handling of inline thunks or extern thunks.
>
> Example:
>
> $ cat simple-retpoline.c
> __attribute__((noinline)) int bar(void)
> {
> return -1;
> }
>
> int foo(void)
> {
> return bar() + 1;
> }
>
> __attribute__((indirect_branch("thunk"))) int main()
> {
> int (*volatile fn)(void) = foo;
>
> fn();
> return fn();
> }
> $ gcc -ggdb3 -Wall -Wextra -O2 -o simple-retpoline simple-retpoline.c
> $ objdump -d simple-retpoline
> <SNIP>
> 0000000000001040 <main>:
> 1040: 48 83 ec 18 sub $0x18,%rsp
> 1044: 48 8d 05 25 01 00 00 lea 0x125(%rip),%rax # 1170 <foo>
> 104b: 48 89 44 24 08 mov %rax,0x8(%rsp)
> 1050: 48 8b 44 24 08 mov 0x8(%rsp),%rax
> 1055: e8 1f 01 00 00 callq 1179 <__x86_indirect_thunk_rax>
> 105a: 48 8b 44 24 08 mov 0x8(%rsp),%rax
> 105f: 48 83 c4 18 add $0x18,%rsp
> 1063: e9 11 01 00 00 jmpq 1179 <__x86_indirect_thunk_rax>
> <SNIP>
> 0000000000001160 <bar>:
> 1160: b8 ff ff ff ff mov $0xffffffff,%eax
> 1165: c3 retq
> <SNIP>
> 0000000000001170 <foo>:
> 1170: e8 eb ff ff ff callq 1160 <bar>
> 1175: 83 c0 01 add $0x1,%eax
> 1178: c3 retq
> 0000000000001179 <__x86_indirect_thunk_rax>:
> 1179: e8 07 00 00 00 callq 1185 <__x86_indirect_thunk_rax+0xc>
> 117e: f3 90 pause
> 1180: 0f ae e8 lfence
> 1183: eb f9 jmp 117e <__x86_indirect_thunk_rax+0x5>
> 1185: 48 89 04 24 mov %rax,(%rsp)
> 1189: c3 retq
> <SNIP>
> $ perf record -o simple-retpoline.perf.data -e intel_pt/cyc/u ./simple-retpoline
> [ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 0,017 MB simple-retpoline.perf.data ]
> $ perf script -i simple-retpoline.perf.data --itrace=be -s ~/libexec/perf-core/scripts/python/export-to-sqlite.py simple-retpoline.db branches calls
> 2019-01-08 14:03:37.851655 Creating database...
> 2019-01-08 14:03:37.863256 Writing records...
> 2019-01-08 14:03:38.069750 Adding indexes
> 2019-01-08 14:03:38.078799 Done
> $ ~/libexec/perf-core/scripts/python/exported-sql-viewer.py simple-retpoline.db
>
> Before:
>
> main
> -> __x86_indirect_thunk_rax
> -> __x86_indirect_thunk_rax
> -> foo
> -> bar
>
> After:
>
> main
> -> foo
> -> bar
>
> Signed-off-by: Adrian Hunter <adrian.hunter@xxxxxxxxx>
> ---
> tools/perf/util/thread-stack.c | 112 ++++++++++++++++++++++++++++++++-
> 1 file changed, 109 insertions(+), 3 deletions(-)
>
> diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c
> index 632c07a125ab..805e30836460 100644
> --- a/tools/perf/util/thread-stack.c
> +++ b/tools/perf/util/thread-stack.c
> @@ -20,6 +20,7 @@
> #include "thread.h"
> #include "event.h"
> #include "machine.h"
> +#include "env.h"
> #include "util.h"
> #include "debug.h"
> #include "symbol.h"
> @@ -29,6 +30,19 @@
>
> #define STACK_GROWTH 2048
>
> +/*
> + * State of retpoline detection.
> + *
> + * RETPOLINE_NONE: no retpoline detection
> + * X86_RETPOLINE_POSSIBLE: x86 retpoline possible
> + * X86_RETPOLINE_DETECTED: x86 retpoline detected
> + */
> +enum retpoline_state_t {
> + RETPOLINE_NONE,
> + X86_RETPOLINE_POSSIBLE,
> + X86_RETPOLINE_DETECTED,
> +};
> +
> /**
> * struct thread_stack_entry - thread stack entry.
> * @ret_addr: return address
> @@ -64,6 +78,7 @@ struct thread_stack_entry {
> * @crp: call/return processor
> * @comm: current comm
> * @arr_sz: size of array if this is the first element of an array
> + * @rstate: used to detect retpolines
> */
> struct thread_stack {
> struct thread_stack_entry *stack;
> @@ -76,6 +91,7 @@ struct thread_stack {
> struct call_return_processor *crp;
> struct comm *comm;
> unsigned int arr_sz;
> + enum retpoline_state_t rstate;
> };
>
> /*
> @@ -115,10 +131,16 @@ static int thread_stack__init(struct thread_stack *ts, struct thread *thread,
> if (err)
> return err;
>
> - if (thread->mg && thread->mg->machine)
> - ts->kernel_start = machine__kernel_start(thread->mg->machine);
> - else
> + if (thread->mg && thread->mg->machine) {
> + struct machine *machine = thread->mg->machine;
> + const char *arch = perf_env__arch(machine->env);
> +
> + ts->kernel_start = machine__kernel_start(machine);
> + if (!strcmp(arch, "x86"))
> + ts->rstate = X86_RETPOLINE_POSSIBLE;
> + } else {
> ts->kernel_start = 1ULL << 63;
> + }
> ts->crp = crp;
>
> return 0;
> @@ -733,6 +755,70 @@ static int thread_stack__trace_end(struct thread_stack *ts,
> false, true);
> }
>
> +static bool is_x86_retpoline(const char *name)
> +{
> + const char *p = strstr(name, "__x86_indirect_thunk_");
> +
> + return p == name || !strcmp(name, "__indirect_thunk_start");
> +}
> +
> +/*
> + * x86 retpoline functions pollute the call graph. This function removes them.
> + * This does not handle function return thunks, nor is there any improvement
> + * for the handling of inline thunks or extern thunks.
> + */
> +static int thread_stack__x86_retpoline(struct thread_stack *ts,
> + struct perf_sample *sample,
> + struct addr_location *to_al)
> +{
> + struct thread_stack_entry *tse = &ts->stack[ts->cnt - 1];
> + struct call_path_root *cpr = ts->crp->cpr;
> + struct symbol *sym = tse->cp->sym;
> + struct symbol *tsym = to_al->sym;
> + struct call_path *cp;
> +
> + if (sym && sym->name && is_x86_retpoline(sym->name)) {


CC /tmp/build/perf/util/scripting-engines/trace-event-perl.o
CC /tmp/build/perf/util/intel-pt.o
CC /tmp/build/perf/util/intel-pt-decoder/intel-pt-log.o
util/thread-stack.c:780:18: error: address of array 'sym->name' will always evaluate to 'true' [-Werror,-Wpointer-bool-conversion]
if (sym && sym->name && is_x86_retpoline(sym->name)) {
~~ ~~~~~^~~~
1 error generated.
mv: cannot stat '/tmp/build/perf/util/.thread-stack.o.tmp': No such file or directory
make[4]: *** [/git/linux/tools/build/Makefile.build:96: /tmp/build/perf/util/thread-stack.o] Error 1


[acme@quaco perf]$ pahole -C symbol ~/bin/perf
struct symbol {
struct rb_node rb_node; /* 0 24 */
u64 start; /* 24 8 */
u64 end; /* 32 8 */
u16 namelen; /* 40 2 */
u8 type:4; /* 42: 4 1 */
u8 binding:4; /* 42: 0 1 */
u8 idle:1; /* 43: 7 1 */
u8 ignore:1; /* 43: 6 1 */
u8 inlined:1; /* 43: 5 1 */

/* XXX 5 bits hole, try to pack */

u8 arch_sym; /* 44 1 */
_Bool annotate2; /* 45 1 */
char name[0]; /* 46 0 */

/* size: 48, cachelines: 1, members: 12 */
/* bit holes: 1, sum bit holes: 5 bits */
/* padding: 2 */
/* last cacheline: 48 bytes */
};
[acme@quaco perf]$

I'm removing that sym->name test.

> + /*
> + * This is a x86 retpoline fn. It pollutes the call graph by
> + * showing up everywhere there is an indirect branch, but does
> + * not itself mean anything. Here the top-of-stack is removed,
> + * by decrementing the stack count, and then further down, the
> + * resulting top-of-stack is replaced with the actual target.
> + * The result is that the retpoline functions will no longer
> + * appear in the call graph. Note this only affects the call
> + * graph, since all the original branches are left unchanged.
> + */
> + ts->cnt -= 1;
> + sym = ts->stack[ts->cnt - 2].cp->sym;
> + if (sym && sym == tsym && to_al->addr != tsym->start) {
> + /*
> + * Target is back to the middle of the symbol we came
> + * from so assume it is an indirect jmp and forget it
> + * altogether.
> + */
> + ts->cnt -= 1;
> + return 0;
> + }
> + } else if (sym && sym == tsym) {
> + /*
> + * Target is back to the symbol we came from so assume it is an
> + * indirect jmp and forget it altogether.
> + */
> + ts->cnt -= 1;
> + return 0;
> + }
> +
> + cp = call_path__findnew(cpr, ts->stack[ts->cnt - 2].cp, tsym,
> + sample->addr, ts->kernel_start);
> + if (!cp)
> + return -ENOMEM;
> +
> + /* Replace the top-of-stack with the actual target */
> + ts->stack[ts->cnt - 1].cp = cp;
> +
> + return 0;
> +}
> +
> int thread_stack__process(struct thread *thread, struct comm *comm,
> struct perf_sample *sample,
> struct addr_location *from_al,
> @@ -740,6 +826,7 @@ int thread_stack__process(struct thread *thread, struct comm *comm,
> struct call_return_processor *crp)
> {
> struct thread_stack *ts = thread__stack(thread, sample->cpu);
> + enum retpoline_state_t rstate;
> int err = 0;
>
> if (ts && !ts->crp) {
> @@ -755,6 +842,10 @@ int thread_stack__process(struct thread *thread, struct comm *comm,
> ts->comm = comm;
> }
>
> + rstate = ts->rstate;
> + if (rstate == X86_RETPOLINE_DETECTED)
> + ts->rstate = X86_RETPOLINE_POSSIBLE;
> +
> /* Flush stack on exec */
> if (ts->comm != comm && thread->pid_ == thread->tid) {
> err = __thread_stack__flush(thread, ts);
> @@ -791,10 +882,25 @@ int thread_stack__process(struct thread *thread, struct comm *comm,
> ts->kernel_start);
> err = thread_stack__push_cp(ts, ret_addr, sample->time, ref,
> cp, false, trace_end);
> +
> + /*
> + * A call to the same symbol but not the start of the symbol,
> + * may be the start of a x86 retpoline.
> + */
> + if (!err && rstate == X86_RETPOLINE_POSSIBLE && to_al->sym &&
> + from_al->sym == to_al->sym &&
> + to_al->addr != to_al->sym->start)
> + ts->rstate = X86_RETPOLINE_DETECTED;
> +
> } else if (sample->flags & PERF_IP_FLAG_RETURN) {
> if (!sample->ip || !sample->addr)
> return 0;
>
> + /* x86 retpoline 'return' doesn't match the stack */
> + if (rstate == X86_RETPOLINE_DETECTED && ts->cnt > 2 &&
> + ts->stack[ts->cnt - 1].ret_addr != sample->addr)
> + return thread_stack__x86_retpoline(ts, sample, to_al);
> +
> err = thread_stack__pop_cp(thread, ts, sample->addr,
> sample->time, ref, from_al->sym);
> if (err) {
> --
> 2.17.1

--

- Arnaldo