Re: [PATCH bpf-next 5/5] selftests/bpf: add benchmark for uprobe vs. user_prog

From: Song Liu
Date: Tue Aug 04 2020 - 17:02:18 EST




> On Aug 2, 2020, at 10:10 PM, Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> wrote:
>
> On Sun, Aug 2, 2020 at 9:47 PM Song Liu <songliubraving@xxxxxx> wrote:
>>
>>
>>> On Aug 2, 2020, at 6:51 PM, Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> wrote:
>>>
>>> On Sat, Aug 1, 2020 at 1:50 AM Song Liu <songliubraving@xxxxxx> wrote:
>>>>
>>>> Add a benchmark to compare performance of
>>>> 1) uprobe;
>>>> 2) user program w/o args;
>>>> 3) user program w/ args;
>>>> 4) user program w/ args on random cpu.
>>>>
>>>
>>> Can you please add it to the existing benchmark runner instead, e.g.,
>>> along the other bench_trigger benchmarks? No need to re-implement
>>> benchmark setup. That would also allow comparing the existing
>>> ways of cheaply triggering a program with this new _USER program.
>>
>> Will try.
>>
>>>
>>> If the performance is not significantly better than other ways, do you
>>> think it still makes sense to add a new BPF program type? I think
>>> triggering KPROBE/TRACEPOINT from bpf_prog_test_run() would be very
>>> nice, maybe it's possible to add that instead of a new program type?
>>> Either way, let's see comparison with other program triggering
>>> mechanisms first.
>>
>> Triggering KPROBE and TRACEPOINT from bpf_prog_test_run() will be useful.
>> But I don't think they can be used instead of a user program, for a couple
>> of reasons. First, KPROBE/TRACEPOINT may be triggered by other programs
>> running in the system, so the user will have to filter that noise out in
>> each program. Second, it is not easy to specify CPU for KPROBE/TRACEPOINT,
>> while this feature could be useful in many cases, e.g. get stack trace
>> on a given CPU.
>>
>
> Right, it's not as convenient with KPROBE/TRACEPOINT as with the USER
> program you've added specifically with that feature in mind. But if
> you pin user-space thread on the needed CPU and trigger kprobe/tp,
> then you'll get what you want. As for the "noise", see how
> bench_trigger() deals with that: it records thread ID and filters
> everything not matching. You can do the same with CPU ID. It's not as
> automatic as with a special BPF program type, but still pretty simple,
> which is why I'm still deciding (for myself) whether USER program type
> is necessary :)

Here are some bench_trigger numbers:

base : 1.698 ± 0.001M/s
tp : 1.477 ± 0.001M/s
rawtp : 1.567 ± 0.001M/s
kprobe : 1.431 ± 0.000M/s
fentry : 1.691 ± 0.000M/s
fmodret : 1.654 ± 0.000M/s
user : 1.253 ± 0.000M/s
fentry-on-cpu: 0.022 ± 0.011M/s
user-on-cpu: 0.315 ± 0.001M/s

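The two "on-cpu" tests trigger the program on a different CPU on every
iteration, cycling round-robin over all possible CPUs: "fentry-on-cpu" moves
the producer thread with sched_setaffinity() before each syscall, while
"user-on-cpu" asks BPF_TEST_RUN to run the program on the target CPU (see the
patch at the end).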

"user" is about 25% slower than "fentry". I think this is mostly because
getpgid() is a faster syscall than bpf(BPF_TEST_RUN).

"user-on-cpu" is more than 10x faster than "fentry-on-cpu", because IPI
is way faster than moving the process (via sched_setaffinity).

For use cases where we would like to run a BPF program on a specific CPU,
triggering it via IPI is a lot faster.

Thanks,
Song


========================== 8< ==========================

diff --git c/tools/testing/selftests/bpf/bench.c w/tools/testing/selftests/bpf/bench.c
index 944ad4721c83c..5394a1d2dfd21 100644
--- c/tools/testing/selftests/bpf/bench.c
+++ w/tools/testing/selftests/bpf/bench.c
@@ -317,7 +317,10 @@ extern const struct bench bench_trig_tp;
extern const struct bench bench_trig_rawtp;
extern const struct bench bench_trig_kprobe;
extern const struct bench bench_trig_fentry;
+extern const struct bench bench_trig_fentry_on_cpu;
extern const struct bench bench_trig_fmodret;
+extern const struct bench bench_trig_user;
+extern const struct bench bench_trig_user_on_cpu;
extern const struct bench bench_rb_libbpf;
extern const struct bench bench_rb_custom;
extern const struct bench bench_pb_libbpf;
@@ -338,7 +341,10 @@ static const struct bench *benchs[] = {
&bench_trig_rawtp,
&bench_trig_kprobe,
&bench_trig_fentry,
+ &bench_trig_fentry_on_cpu,
&bench_trig_fmodret,
+ &bench_trig_user,
+ &bench_trig_user_on_cpu,
&bench_rb_libbpf,
&bench_rb_custom,
&bench_pb_libbpf,
@@ -462,4 +468,3 @@ int main(int argc, char **argv)

return 0;
}
-
diff --git c/tools/testing/selftests/bpf/benchs/bench_trigger.c w/tools/testing/selftests/bpf/benchs/bench_trigger.c
index 49c22832f2169..a1ebaebf6070c 100644
--- c/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ w/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+#define _GNU_SOURCE
+#include <sched.h>
#include "bench.h"
#include "trigger_bench.skel.h"

@@ -39,6 +41,22 @@ static void *trigger_producer(void *input)
return NULL;
}

+static void *trigger_on_cpu_producer(void *input)
+{
+ cpu_set_t set;
+ int i = 0, nr_cpu;
+
+ nr_cpu = libbpf_num_possible_cpus();
+ while (true) {
+ CPU_ZERO(&set);
+ CPU_SET(i, &set);
+ sched_setaffinity(0, sizeof(set), &set);
+ (void)syscall(__NR_getpgid);
+ i = (i + 1) % nr_cpu;
+ }
+ return NULL;
+}
+
static void trigger_measure(struct bench_res *res)
{
res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
@@ -96,6 +114,39 @@ static void trigger_fmodret_setup()
attach_bpf(ctx.skel->progs.bench_trigger_fmodret);
}

+static void trigger_user_setup()
+{
+ setup_ctx();
+}
+
+static void *trigger_producer_user(void *input)
+{
+ struct bpf_prog_test_run_attr attr = {};
+
+ attr.prog_fd = bpf_program__fd(ctx.skel->progs.bench_trigger_user);
+
+ while (true)
+ (void)bpf_prog_test_run_xattr(&attr);
+ return NULL;
+}
+
+static void *trigger_producer_user_on_cpu(void *input)
+{
+ struct bpf_prog_test_run_attr attr = {};
+ int i = 0, nr_cpu;
+
+ nr_cpu = libbpf_num_possible_cpus();
+
+ attr.prog_fd = bpf_program__fd(ctx.skel->progs.bench_trigger_user);
+
+ while (true) {
+ attr.cpu_plus = i + 1;
+ (void)bpf_prog_test_run_xattr(&attr);
+ i = (i + 1) % nr_cpu;
+ }
+ return NULL;
+}
+
static void *trigger_consumer(void *input)
{
return NULL;
@@ -155,6 +206,17 @@ const struct bench bench_trig_fentry = {
.report_final = hits_drops_report_final,
};

+const struct bench bench_trig_fentry_on_cpu = {
+ .name = "trig-fentry-on-cpu",
+ .validate = trigger_validate,
+ .setup = trigger_fentry_setup,
+ .producer_thread = trigger_on_cpu_producer,
+ .consumer_thread = trigger_consumer,
+ .measure = trigger_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
const struct bench bench_trig_fmodret = {
.name = "trig-fmodret",
.validate = trigger_validate,
@@ -165,3 +227,25 @@ const struct bench bench_trig_fmodret = {
.report_progress = hits_drops_report_progress,
.report_final = hits_drops_report_final,
};
+
+const struct bench bench_trig_user = {
+ .name = "trig-user",
+ .validate = trigger_validate,
+ .setup = trigger_user_setup,
+ .producer_thread = trigger_producer_user,
+ .consumer_thread = trigger_consumer,
+ .measure = trigger_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_user_on_cpu = {
+ .name = "trig-user-on-cpu",
+ .validate = trigger_validate,
+ .setup = trigger_user_setup,
+ .producer_thread = trigger_producer_user_on_cpu,
+ .consumer_thread = trigger_consumer,
+ .measure = trigger_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
diff --git c/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh w/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
index 78e83f2432946..f10b7aea76aa3 100755
--- c/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
+++ w/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
@@ -2,7 +2,7 @@

set -eufo pipefail

-for i in base tp rawtp kprobe fentry fmodret
+for i in base tp rawtp kprobe fentry fmodret user fentry-on-cpu user-on-cpu
do
summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
printf "%-10s: %s\n" $i "$summary"
diff --git c/tools/testing/selftests/bpf/progs/trigger_bench.c w/tools/testing/selftests/bpf/progs/trigger_bench.c
index 8b36b6640e7e9..a6ac11e68d287 100644
--- c/tools/testing/selftests/bpf/progs/trigger_bench.c
+++ w/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -45,3 +45,10 @@ int bench_trigger_fmodret(void *ctx)
__sync_add_and_fetch(&hits, 1);
return -22;
}
+
+SEC("user")
+int BPF_PROG(bench_trigger_user)
+{
+ __sync_add_and_fetch(&hits, 1);
+ return 0;
+}
~