Re: [PATCH perf/core 00/22] perf refcnt debugger API and fixes

From: 'Arnaldo Carvalho de Melo'
Date: Thu Dec 10 2015 - 10:12:55 EST


Em Thu, Dec 10, 2015 at 08:52:02PM +0800, Wangnan (F) escreveu:
> On 2015/12/10 19:04, åæéå / HIRAMATUïMASAMI wrote:
> >>From: Arnaldo Carvalho de Melo [mailto:acme@xxxxxxxxxx]
> >>Em Wed, Dec 09, 2015 at 11:10:48AM +0900, Masami Hiramatsu escreveu:
> >>>Here is a series of patches for perf refcnt debugger and
> >>>some fixes.
> >>>
> >>>In this series I've replaced all atomic reference counters
> >>>with the refcnt interface, including dso, map, map_groups,
> >>>thread, cpu_map, comm_str, cgroup_sel, thread_map, and
> >>>perf_mmap.
> >>>
> >>> refcnt debugger (or refcnt leak checker)
> >>> ===============
> >>>
> >>>At first, note that this change doesn't affect any compiled
> >>>code unless building with REFCNT_DEBUG=1 (see macros in
> >>>refcnt.h). So, this feature is only enabled in the debug binary.
> >>>But before releasing, we can ensure that all objects are safely
> >>>reclaimed before exit in -rc phase.
> >>That helps and is finding bugs and is really great stuff, thank you!
> >>
> >>But I wonder if we couldn't get the same results on an unmodified binary
> >>by using things like 'perf probe', the BPF code we're introducing, have
> >>you thought about this possibility?
> >That's possible, but it will require pre-analysis of the binary, because
> >refcnt interface is not fixed API like a "systemcall" (moreover, it could
> >be just a non-atomic variable).
> >
> >Thus we need a kind of "annotation" event by source code level.
> >
> >>I.e. trying to use 'perf probe' to do this would help in using the same
> >>technique in other code bases where we can't change the sources, etc.
> >>
> >>For perf we could perhaps use a 'noinline' on the __get/__put
> >>operations, so that we could have the right places to hook using
> >>uprobes, other codebases would have to rely in the 'perf probe'
> >>infrastructure that knows where inlines were expanded, etc.
> >>
> >>Such a toold could work like:
> >>
> >> perf dbgrefcnt ~/bin/perf thread
> >This works only for the binary which is coded as you said.
> >I actually doubt that this is universal solution. We'd better introduce
> >librefcnt.so if you need more general solution, so that we can fix the
> >refcnt API and we can also hook the interface easily.
> >
> >But with this way, we don't need ebpf/uprobes anymore, since we've already
> >have LD_PRELOAD (like valgrind does). :(
> >
> >So, IMHO, using ebpf and perf probe for this issue sounds like using
> >a sledgeâhammer...
>
> But this is an interesting problem and can inspire us the direction
> for eBPF improvement. I guess if we can solve this problem with eBPF

That is the spirit! All those efforts in the kernel to have always
available tracing facilities, kprobes, uprobes, eBPF, jump labels, you
name it, leaves me thinking that doing things by having to rebuild from
sources using defines, and even having to restart a workload to use
LD_PRELOAD tricks looks backwards :-)

> we can also solve many similar problems with much lower cost than what
> you have done in first 5 patches?
>
> This is what we have done today:
>
> With a much simpler patch which create 4 stub functions:
>
> diff --git a/tools/perf/util/Build b/tools/perf/util/Build
> index 65fef59..2c45478 100644
> --- a/tools/perf/util/Build
> +++ b/tools/perf/util/Build
> @@ -87,6 +87,7 @@ libperf-$(CONFIG_AUXTRACE) += intel-bts.o
> libperf-y += parse-branch-options.o
> libperf-y += parse-regs-options.o
> libperf-y += term.o
> +libperf-y += refcnt.o
>
> libperf-$(CONFIG_LIBBPF) += bpf-loader.o
> libperf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
> diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
> index e8e9a9d..de52ae8 100644
> --- a/tools/perf/util/dso.c
> +++ b/tools/perf/util/dso.c
> @@ -1,6 +1,7 @@
> #include <asm/bug.h>
> #include <sys/time.h>
> #include <sys/resource.h>
> +#include "refcnt.h"
> #include "symbol.h"
> #include "dso.h"
> #include "machine.h"
> diff --git a/tools/perf/util/refcnt.c b/tools/perf/util/refcnt.c
> new file mode 100644
> index 0000000..f5a6659
> --- /dev/null
> +++ b/tools/perf/util/refcnt.c
> @@ -0,0 +1,29 @@
> +#include <linux/compiler.h>
> +#include "util/refcnt.h"
> +
> +void __attribute__ ((noinline))
> +__refcnt__init(atomic_t *refcnt, int n,
> + void *obj __maybe_unused,
> + const char *name __maybe_unused)
> +{
> + atomic_set(refcnt, n);
> +}
> +
> +void __attribute__ ((noinline))
> +refcnt__delete(void *obj __maybe_unused)
> +{
> +}
> +
> +void __attribute__ ((noinline))
> +__refcnt__get(atomic_t *refcnt, void *obj __maybe_unused,
> + const char *name __maybe_unused)
> +{
> + atomic_inc(refcnt);
> +}
> +
> +int __attribute__ ((noinline))
> +__refcnt__put(atomic_t *refcnt, void *obj __maybe_unused,
> + const char *name __maybe_unused)
> +{
> + return atomic_dec_and_test(refcnt);
> +}
> diff --git a/tools/perf/util/refcnt.h b/tools/perf/util/refcnt.h
> new file mode 100644
> index 0000000..04f5390
> --- /dev/null
> +++ b/tools/perf/util/refcnt.h
> @@ -0,0 +1,21 @@
> +#ifndef PERF_REFCNT_H
> +#define PERF_REFCNT_H
> +#include <linux/atomic.h>
> +
> +void __refcnt__init(atomic_t *refcnt, int n, void *obj, const char *name);
> +void refcnt__delete(void *obj);
> +void __refcnt__get(atomic_t *refcnt, void *obj, const char *name);
> +int __refcnt__put(atomic_t *refcnt, void *obj, const char *name);
> +
> +#define refcnt__init(obj, member, n) \
> + __refcnt__init(&(obj)->member, n, obj, #obj)
> +#define refcnt__init_as(obj, member, n, name) \
> + __refcnt__init(&(obj)->member, n, obj, name)
> +#define refcnt__exit(obj, member) \
> + refcnt__delete(obj)
> +#define refcnt__get(obj, member) \
> + __refcnt__get(&(obj)->member, obj, #obj)
> +#define refcnt__put(obj, member) \
> + __refcnt__put(&(obj)->member, obj, #obj)
> +
> +#endif

But this requires having these special refcnt__ routines, that will make
tools/perf/ code patterns for reference counts look different that the
refcount patterns in the kernel :-\

And would be a requirement to change the observed workload :-\

Is this _strictly_ required? Can't we, for a project like perf, where we
know where some refcount (say, the one for 'struct thread') gets
initialized, use that (thread__new()) and then hook into thread__get and
thread__put and then use the destructor, thread__delete() as the place
to dump leaks?

> And a relative complex eBPF script attached at the end of
> this mail, with following cmdline:
>
> # ./perf record -e ./refcnt.c ./perf probe vfs_read
> # cat /sys/kernel/debug/tracing/trace
> ...
> perf-18419 [004] d... 613572.513083: : Type 0 leak 2
> perf-18419 [004] d... 613572.513084: : Type 1 leak 1
>
> I know we have 2 dsos and 1 map get leak.
>
> However I have to analysis full stack trace from 'perf script' to find
> which one get leak, because currently my eBPF script is unable to report
> which object is leak. I know I can use a hashtable with object address
> as key, but currently I don't know how to enumerate keys in a hash table,



> except maintaining a relationship between index and object address.
> Now I'm waiting for Daniel's persistent map to be enforced for that. When
> it ready we can create a tool with the following eBPF script embedded into
> perf as a small subcommand, and report call stack of 'alloc' method of
> leak object in 'perf report' style, so we can solve similar problem easier.
> To make it genereic, we can even make it attach to '{m,c}alloc%return' and
> 'free', or 'mmap/munmap'.
>
> Thank you.
>
>
> -------------- eBPF script --------------
>
> typedef int u32;
> typedef unsigned long long u64;
> #define NULL ((void *)(0))
>
> #define BPF_ANY 0 /* create new element or update existing */
> #define BPF_NOEXIST 1 /* create new element if it didn't exist */
> #define BPF_EXIST 2 /* update existing element */
>
> enum bpf_map_type {
> BPF_MAP_TYPE_UNSPEC,
> BPF_MAP_TYPE_HASH,
> BPF_MAP_TYPE_ARRAY,
> BPF_MAP_TYPE_PROG_ARRAY,
> BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> };
>
> struct bpf_map_def {
> unsigned int type;
> unsigned int key_size;
> unsigned int value_size;
> unsigned int max_entries;
> };
>
> #define SEC(NAME) __attribute__((section(NAME), used))
> static int (*bpf_probe_read)(void *dst, int size, void *src) =
> (void *)4;
> static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
> (void *)6;
> static int (*bpf_get_smp_processor_id)(void) =
> (void *)8;
> static int (*map_update_elem)(struct bpf_map_def *, void *, void *, unsigned
> long long flags) =
> (void *)2;
> static void *(*map_lookup_elem)(struct bpf_map_def *, void *) =
> (void *)1;
> static unsigned long long (*get_current_pid_tgid)(void) =
> (void *)14;
> static unsigned long long (*get_current_comm)(char *buf, int size_of_buf) =
> (void *)16;
>
> char _license[] SEC("license") = "GPL";
> int _version SEC("version") = LINUX_VERSION_CODE;
>
> enum global_var {
> G_pid,
> G_LEAK_START,
> G_dso_leak = G_LEAK_START,
> G_map_group_leak,
> G_LEAK_END,
> G_NR = G_LEAK_END,
> };
>
> struct bpf_map_def SEC("maps") global_vars = {
> .type = BPF_MAP_TYPE_ARRAY,
> .key_size = sizeof(int),
> .value_size = sizeof(u64),
> .max_entries = G_NR,
> };
>
> static inline int filter_pid(void)
> {
> int key_pid = G_pid;
> unsigned long long *p_pid, pid;
>
> char fmt[] = "%d vs %d\n";
>
> p_pid = map_lookup_elem(&global_vars, &key_pid);
> if (!p_pid)
> return 0;
>
> pid = get_current_pid_tgid() & 0xffffffff;
>
> if (*p_pid != pid)
> return 0;
> return 1;
> }
>
> static inline void print_leak(int type)
> {
> unsigned long long *p_cnt;
> char fmt[] = "Type %d leak %llu\n";
>
> p_cnt = map_lookup_elem(&global_vars, &type);
> if (!p_cnt)
> return;
> bpf_trace_printk(fmt, sizeof(fmt), type - G_LEAK_START, *p_cnt);
> }
>
> SEC("execve=sys_execve")
> int execve(void *ctx)
> {
> char name[sizeof(u64)] = "";
> char name_perf[sizeof(u64)] = "perf";
> unsigned long long *p_pid, pid;
> int key = G_pid;
>
> p_pid = map_lookup_elem(&global_vars, &key);
> if (!p_pid)
> return 0;
> pid = *p_pid;
> if (pid)
> return 0;
> if (get_current_comm(name, sizeof(name)))
> return 0;
> if (*(u32*)name != *(u32*)name_perf)
> return 0;
>
> pid = get_current_pid_tgid() & 0xffffffff;
> map_update_elem(&global_vars, &key, &pid, BPF_ANY);
> return 0;
> }
>
> static inline int func_exit(void *ctx)
> {
> if (!filter_pid())
> return 0;
> print_leak(G_dso_leak);
> print_leak(G_map_group_leak);
> return 0;
> }
>
> SEC("exit_group=sys_exit_group")
> int exit_group(void *ctx)
> {
> return func_exit(ctx);
> }
>
> SEC("exit_=sys_exit")
> int exit_(void *ctx)
> {
> return func_exit(ctx);
> }
> static inline void inc_leak_from_type(int type, int n)
> {
> u64 *p_cnt, cnt;
>
> type += G_LEAK_START;
> if (type >= G_LEAK_END)
> return;
>
> p_cnt = map_lookup_elem(&global_vars, &type);
> if (!p_cnt)
> cnt = n;
> else
> cnt = *p_cnt + n;
>
> map_update_elem(&global_vars, &type, &cnt, BPF_ANY);
> return;
> }
>
> SEC("exec=/home/wangnan/perf;"
> "refcnt_init=__refcnt__init n obj type")
> int refcnt_init(void *ctx, int err, int n, void *obj, int type)
> {
> if (!filter_pid())
> return 0;
> inc_leak_from_type(type, n);
> return 0;
> }
> SEC("exec=/home/wangnan/perf;"
> "refcnt_del=refcnt__delete obj type")
> int refcnt_del(void *ctx, int err, void *obj, int type)
> {
> if (!filter_pid())
> return 0;
> return 0;
> }
>
> SEC("exec=/home/wangnan/perf;"
> "refcnt_get=__refcnt__get obj type")
> int refcnt_get(void *ctx, int err, void *obj, int type)
> {
> if (!filter_pid())
> return 0;
> inc_leak_from_type(type, 1);
> return 0;
> }
>
> SEC("exec=/home/wangnan/perf;"
> "refcnt_put=__refcnt__put refcnt obj type")
> int refcnt_put(void *ctx, int err, void *refcnt, void *obj, int type)
> {
> int old_cnt = -1;
>
> if (!filter_pid())
> return 0;
> if (bpf_probe_read(&old_cnt, sizeof(int), refcnt))
> return 0;
> if (old_cnt)
> inc_leak_from_type(type, -1);
> return 0;
> }
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/