Re: [PATCH 3/3] perf stat: Enable BPF counter with --for-each-cgroup

From: Song Liu
Date: Thu Jun 24 2021 - 00:54:43 EST




> On Jun 22, 2021, at 12:12 AM, Namhyung Kim <namhyung@xxxxxxxxxx> wrote:
>
> Recently bperf was added to use BPF to count perf events for various
> purposes. This is an extension of that approach, targeting cgroup
> usage.
>
> Unlike the other bperf, it doesn't share the events with other
> processes but it'd reduce unnecessary events (and the overhead of
> multiplexing) for each monitored cgroup within the perf session.
>
> When --for-each-cgroup is used with --bpf-counters, it will open a
> cgroup-switches event per cpu internally and attach the new BPF
> program to read the given perf_events and to aggregate the results
> for cgroups. It's only called when a task is switched to a task in a
> different cgroup.
>
> Cc: Song Liu <songliubraving@xxxxxx>
> Signed-off-by: Namhyung Kim <namhyung@xxxxxxxxxx>
> ---
> tools/perf/Makefile.perf | 7 +-
> tools/perf/util/Build | 1 +
> tools/perf/util/bpf_counter.c | 5 +
> tools/perf/util/bpf_counter_cgroup.c | 337 ++++++++++++++++++++
> tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 207 ++++++++++++
> tools/perf/util/cgroup.c | 2 +
> tools/perf/util/cgroup.h | 1 +
> 7 files changed, 559 insertions(+), 1 deletion(-)
> create mode 100644 tools/perf/util/bpf_counter_cgroup.c
> create mode 100644 tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
>
> diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
> index e47f04e5b51e..786cba8f3798 100644
> --- a/tools/perf/Makefile.perf
> +++ b/tools/perf/Makefile.perf
> @@ -1015,6 +1015,7 @@ SKEL_OUT := $(abspath $(OUTPUT)util/bpf_skel)
> SKEL_TMP_OUT := $(abspath $(SKEL_OUT)/.tmp)
> SKELETONS := $(SKEL_OUT)/bpf_prog_profiler.skel.h
> SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
> +SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h
>
> ifdef BUILD_BPF_SKEL
> BPFTOOL := $(SKEL_TMP_OUT)/bootstrap/bpftool
> @@ -1032,7 +1033,11 @@ $(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) | $(SKEL_TMP_OUT)
> $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -Wall -Werror $(BPF_INCLUDE) \
> -c $(filter util/bpf_skel/%.bpf.c,$^) -o $@ && $(LLVM_STRIP) -g $@
>
> -$(SKEL_OUT)/%.skel.h: $(SKEL_TMP_OUT)/%.bpf.o | $(BPFTOOL)
> +$(SKEL_OUT)/vmlinux.h:
> + $(MAKE) -C ../bpf/bpftool OUTPUT=$(SKEL_TMP_OUT)/ $(SKEL_TMP_OUT)/vmlinux.h

We build bpftool with $(BPFTOOL), which is a few lines above.
Can we reuse some of that?

> + $(Q)mv $(SKEL_TMP_OUT)/vmlinux.h $(SKEL_OUT)/vmlinux.h
> +
> +$(SKEL_OUT)/%.skel.h: $(SKEL_TMP_OUT)/%.bpf.o $(SKEL_OUT)/vmlinux.h | $(BPFTOOL)
> $(QUIET_GENSKEL)$(BPFTOOL) gen skeleton $< > $@
>
> bpf-skel: $(SKELETONS)
> diff --git a/tools/perf/util/Build b/tools/perf/util/Build
> index 95e15d1035ab..700d635448ff 100644
> --- a/tools/perf/util/Build
> +++ b/tools/perf/util/Build
> @@ -140,6 +140,7 @@ perf-y += clockid.o
> perf-$(CONFIG_LIBBPF) += bpf-loader.o
> perf-$(CONFIG_LIBBPF) += bpf_map.o
> perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
> +perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter_cgroup.o
> perf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
> perf-$(CONFIG_LIBELF) += symbol-elf.o
> perf-$(CONFIG_LIBELF) += probe-file.o
> diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
> index 974f10e356f0..7812c5d9b826 100644
> --- a/tools/perf/util/bpf_counter.c
> +++ b/tools/perf/util/bpf_counter.c
> @@ -22,6 +22,7 @@
> #include "evsel.h"
> #include "evlist.h"
> #include "target.h"
> +#include "cgroup.h"
> #include "cpumap.h"
> #include "thread_map.h"
>
> @@ -792,6 +793,8 @@ struct bpf_counter_ops bperf_ops = {
> .destroy = bperf__destroy,
> };
>
> +extern struct bpf_counter_ops bperf_cgrp_ops;
> +
> static inline bool bpf_counter_skip(struct evsel *evsel)
> {
> return list_empty(&evsel->bpf_counter_list) &&
> @@ -809,6 +812,8 @@ int bpf_counter__load(struct evsel *evsel, struct target *target)
> {
> if (target->bpf_str)
> evsel->bpf_counter_ops = &bpf_program_profiler_ops;
> + else if (cgrp_event_expanded && target->use_bpf)
> + evsel->bpf_counter_ops = &bperf_cgrp_ops;
> else if (target->use_bpf || evsel->bpf_counter ||
> evsel__match_bpf_counter_events(evsel->name))
> evsel->bpf_counter_ops = &bperf_ops;

[...]


> +
> +#include "bpf_skel/bperf_cgroup.skel.h"
> +
> +static struct perf_event_attr cgrp_switch_attr = {
> + .type = PERF_TYPE_SOFTWARE,
> + .config = PERF_COUNT_SW_CGROUP_SWITCHES,
> + .size = sizeof(cgrp_switch_attr),
> + .sample_period = 1,
> + .disabled = 1,
> +};
> +
> +static struct evsel *cgrp_switch;
> +static struct xyarray *cgrp_prog_fds;
> +static struct bperf_cgroup_bpf *skel;
> +
> +#define FD(evt, cpu) (*(int *)xyarray__entry(evt->core.fd, cpu, 0))
> +#define PROG(cpu) (*(int *)xyarray__entry(cgrp_prog_fds, cpu, 0))
> +
> +static void set_max_rlimit(void)
> +{
> + struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY };
> +
> + setrlimit(RLIMIT_MEMLOCK, &rinf);
> +}
> +
> +static __u32 bpf_link_get_prog_id(int fd)
> +{
> + struct bpf_link_info link_info = {0};
> + __u32 link_info_len = sizeof(link_info);
> +
> + bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len);
> + return link_info.prog_id;
> +}

How about we move set_max_rlimit() and bpf_link_get_prog_id() to
a header so we don't have to duplicate them?

> +
> +static int bperf_load_program(struct evlist *evlist)
> +{
> + struct bpf_link *link;
> + struct evsel *evsel;
> + struct cgroup *cgrp, *leader_cgrp;
> + __u32 i, cpu, prog_id;
> + int nr_cpus = evlist->core.all_cpus->nr;
> + int map_size, map_fd;
> + int prog_fd, err;
> +
> + skel = bperf_cgroup_bpf__open();
> + if (!skel) {
> + pr_err("Failed to open cgroup skeleton\n");
> + return -1;
> + }
> +
> + skel->rodata->num_cpus = nr_cpus;
> + skel->rodata->num_events = evlist->core.nr_entries / nr_cgroups;
> +
> + BUG_ON(evlist->core.nr_entries % nr_cgroups != 0);
> +
> + /* we need one copy of events per cpu for reading */
> + map_size = nr_cpus * evlist->core.nr_entries / nr_cgroups;
> + bpf_map__resize(skel->maps.events, map_size);
> + bpf_map__resize(skel->maps.cpu_idx, nr_cpus);
> + bpf_map__resize(skel->maps.cgrp_idx, nr_cgroups);
> + /* previous result is saved in a per-cpu array */
> + map_size = evlist->core.nr_entries / nr_cgroups;
> + bpf_map__resize(skel->maps.prev_readings, map_size);
> + /* cgroup result needs all events */
> + map_size = nr_cpus * evlist->core.nr_entries;
> + bpf_map__resize(skel->maps.cgrp_readings, map_size);

We are setting map_size back and forth here.

[...]


> diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> new file mode 100644
> index 000000000000..6d74e93dd1f5
> --- /dev/null
> +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> @@ -0,0 +1,207 @@
> +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> +// Copyright (c) 2021 Facebook
> +// Copyright (c) 2021 Google
> +#include "vmlinux.h"
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +#include <bpf/bpf_core_read.h>
> +
> +#define MAX_LEVELS 10 // max cgroup hierarchy level: arbitrary
> +#define MAX_EVENTS 32 // max events per cgroup: arbitrary
> +
> +// NOTE: many of the maps and global data will be modified before loading
> +// from userspace (perf tool) using the skeleton helpers.
> +
> +// single set of global perf events to measure
> +struct {
> + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
> + __uint(key_size, sizeof(__u32));
> + __uint(value_size, sizeof(int));
> + __uint(max_entries, 1);
> +} events SEC(".maps");
> +
> +// from logical cpu number to event index
> +// useful when user wants to count subset of cpus
> +struct {
> + __uint(type, BPF_MAP_TYPE_HASH);
> + __uint(key_size, sizeof(__u32));
> + __uint(value_size, sizeof(__u32));
> + __uint(max_entries, 1);
> +} cpu_idx SEC(".maps");

How about we make cpu_idx a percpu array and use 0,1 for
disable/enable profiling on this cpu?

> +
> +// from cgroup id to event index
> +struct {
> + __uint(type, BPF_MAP_TYPE_HASH);
> + __uint(key_size, sizeof(__u64));
> + __uint(value_size, sizeof(__u32));
> + __uint(max_entries, 1);
> +} cgrp_idx SEC(".maps");
> +
> +// per-cpu event snapshots to calculate delta
> +struct {
> + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> + __uint(key_size, sizeof(__u32));
> + __uint(value_size, sizeof(struct bpf_perf_event_value));
> +} prev_readings SEC(".maps");
> +
> +// aggregated event values for each cgroup
> +// will be read from the user-space
> +struct {
> + __uint(type, BPF_MAP_TYPE_ARRAY);
> + __uint(key_size, sizeof(__u32));
> + __uint(value_size, sizeof(struct bpf_perf_event_value));
> +} cgrp_readings SEC(".maps");

Maybe also make this a percpu array? This should make the BPF program
faster.

> +
> +const volatile __u32 num_events = 1;
> +const volatile __u32 num_cpus = 1;
> +
> +int enabled = 0;
> +int use_cgroup_v2 = 0;
> +
[...]