[PATCH bpf-next v1] selftests/bpf: Add benchmark for tail call performance

From: Tiezhu Yang

Date: Mon Jun 29 2026 - 22:23:54 EST

Add a dedicated benchmark to measure the runtime performance and overhead
of tail calls. This helps developers detect performance regressions across
different kernel versions and optimization phases.

The benchmark attaches to the raw_syscalls/sys_enter tracepoint to intercept
syscalls issued by a dedicated producer thread running a busy loop. The
execution path is strictly bounded by the tail call depth limit, so there is
no risk of a core lockup or infinite recursion.

To avoid cacheline bouncing and global locking variance, a per-CPU array
map is used to count execution hits across multiple cores.

To evaluate the JIT compiler architecture under complex control flows, it
interleaves direct tail calls with bpf2bpf tail calls.

This forces the tail call counter state seen by the target program's entry
prologue to alternate between a scalar count (0 to 33) and a kernel pointer,
providing a robust micro-architectural stress test. The benchmark consists
of:

1. tailcall_bench_main: The entry program, which filters by PID and, based
on the first syscall argument, alternates at high frequency between a
direct tail call and a bpf2bpf tail call.
2. tailcall_bench_target: The final tail call target, which safely
terminates the mixed execution flow and increments the hit counter.

All functions utilize explicit "struct tracepoint_raw_syscalls_sys_enter"
context types to ensure strict type alignment and clear pointer provenance
for the BPF verifier.

Additionally, provide a test script run_bench_tailcall.sh to automate the
execution under strict core affinity and isolation for reliable profiling,
formatting the captured metrics directly into the performance report.

Signed-off-by: Tiezhu Yang <yangtiezhu@xxxxxxxxxxx>
---
tools/testing/selftests/bpf/Makefile | 2 +
tools/testing/selftests/bpf/bench.c | 2 +
.../selftests/bpf/benchs/bench_tailcall.c | 90 +++++++++++++++++++
.../bpf/benchs/run_bench_tailcall.sh | 18 ++++
.../selftests/bpf/progs/tailcall_bench.c | 89 ++++++++++++++++++
5 files changed, 201 insertions(+)
create mode 100644 tools/testing/selftests/bpf/benchs/bench_tailcall.c
create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_tailcall.sh
create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bench.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index b642ee489ea6..584504bc87a6 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -980,6 +980,7 @@ $(OUTPUT)/bench_lpm_trie_map.o: $(OUTPUT)/lpm_trie_bench.skel.h $(OUTPUT)/lpm_tr
$(OUTPUT)/bench_bpf_nop.o: $(OUTPUT)/bpf_nop_bench.skel.h bench_bpf_timing.h
$(OUTPUT)/bench_xdp_lb.o: $(OUTPUT)/xdp_lb_bench.skel.h bench_bpf_timing.h
$(OUTPUT)/bench_bpf_timing.o: bench_bpf_timing.h
+$(OUTPUT)/bench_tailcall.o: $(OUTPUT)/tailcall_bench.skel.h
$(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
$(OUTPUT)/bench: LDLIBS += -lm
$(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -1005,6 +1006,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
$(OUTPUT)/bench_bpf_timing.o \
$(OUTPUT)/bench_bpf_nop.o \
$(OUTPUT)/bench_xdp_lb.o \
+ $(OUTPUT)/bench_tailcall.o \
$(OUTPUT)/usdt_1.o \
$(OUTPUT)/usdt_2.o \
#
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index 3d9d2cd7764b..a79b86316d28 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -582,6 +582,7 @@ extern const struct bench bench_lpm_trie_delete;
extern const struct bench bench_lpm_trie_free;
extern const struct bench bench_bpf_nop;
extern const struct bench bench_xdp_lb;
+extern const struct bench bench_tailcall;

static const struct bench *benchs[] = {
&bench_count_global,
@@ -665,6 +666,7 @@ static const struct bench *benchs[] = {
&bench_lpm_trie_free,
&bench_bpf_nop,
&bench_xdp_lb,
+ &bench_tailcall,
};

static void find_benchmark(void)
diff --git a/tools/testing/selftests/bpf/benchs/bench_tailcall.c b/tools/testing/selftests/bpf/benchs/bench_tailcall.c
new file mode 100644
index 000000000000..a203017f5e28
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_tailcall.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <bpf/bpf.h>
+#include "bench.h"
+#include "tailcall_bench.skel.h"
+
+static struct ctx { /* benchmark state: filled in by tailcall_setup(), read by tailcall_measure() */
+ struct tailcall_bench *skel; /* skeleton returned by tailcall_bench__open() */
+ struct bpf_link *link; /* attachment of tailcall_bench_main */
+ int map_fd; /* fd of pcpu_hits_map */
+ int ncpus; /* result of libbpf_num_possible_cpus() */
+ unsigned int percpu_size; /* value size of pcpu_hits_map; used as the per-CPU stride when summing */
+} ctx;
+
+/*
+ * Report the number of BPF program hits recorded since the previous call.
+ *
+ * The per-CPU counters in pcpu_hits_map are never reset, so their sum is a
+ * running total. The ops reporters used by this benchmark turn res->hits
+ * into a rate over the sampling interval, so the value stored here must be
+ * the delta against the previous sample, not the running total.
+ *
+ * @res: result slot for this sample; res->hits is set to 0 if the counters
+ *       cannot be read.
+ */
+static void tailcall_measure(struct bench_res *res)
+{
+	static __u64 prev_total;	/* running total seen at the previous sample */
+	__u64 total_hits = 0;
+	__u32 key = 0;
+	__u8 *values;
+
+	res->hits = 0;
+
+	/* A per-CPU lookup returns one value per possible CPU. */
+	values = calloc(ctx.ncpus, ctx.percpu_size);
+	if (!values)
+		return;
+
+	if (bpf_map_lookup_elem(ctx.map_fd, &key, values) == 0) {
+		for (int i = 0; i < ctx.ncpus; i++)
+			total_hits += *(__u64 *)(values + i * ctx.percpu_size);
+
+		res->hits = total_hits - prev_total;
+		prev_total = total_hits;
+	}
+
+	/* Single exit path: the buffer is released on lookup failure too. */
+	free(values);
+}
+
+/*
+ * Producer thread body: issue getpgid() syscalls in an endless loop, passing
+ * 0 and 1 alternately as the argument so that consecutive syscalls take
+ * different branches in tailcall_bench_main. Never returns in practice.
+ *
+ * @input: unused.
+ */
+static void *tailcall_producer(void *input)
+{
+	unsigned long parity = 0;
+
+	for (;;) {
+		syscall(__NR_getpgid, parity);
+		/* Flip between 0 and 1 for the next iteration */
+		parity ^= 1;
+	}
+
+	return NULL;
+}
+
+/*
+ * One-time benchmark setup: open and load the BPF skeleton, record the
+ * benchmark's PID for in-kernel filtering, install tailcall_bench_target in
+ * slot 1 of the jump table and attach tailcall_bench_main.
+ *
+ * Any failure terminates the process with exit status 1.
+ */
+static void tailcall_setup(void)
+{
+	int target_fd, jmp_map_fd;
+	__u32 key1 = 1;
+
+	ctx.skel = tailcall_bench__open();
+	if (!ctx.skel)
+		exit(1);
+
+	/* Must be written before load so the program sees the value in .data */
+	ctx.skel->data->my_pid = getpid();
+
+	/* tailcall_measure() sizes its lookup buffer from this count */
+	ctx.ncpus = libbpf_num_possible_cpus();
+	if (ctx.ncpus <= 0)
+		exit(1);
+
+	if (tailcall_bench__load(ctx.skel))
+		exit(1);
+
+	jmp_map_fd = bpf_map__fd(ctx.skel->maps.jmp_table);
+	ctx.map_fd = bpf_map__fd(ctx.skel->maps.pcpu_hits_map);
+	ctx.percpu_size = bpf_map__value_size(ctx.skel->maps.pcpu_hits_map);
+
+	if (ctx.map_fd < 0 || jmp_map_fd < 0)
+		exit(1);
+
+	target_fd = bpf_program__fd(ctx.skel->progs.tailcall_bench_target);
+	if (target_fd < 0)
+		exit(1);
+
+	/*
+	 * Map key 1 directly to the final target program. If this fails, every
+	 * tail call would miss and the benchmark would measure the wrong path,
+	 * so treat it as fatal.
+	 */
+	if (bpf_map_update_elem(jmp_map_fd, &key1, &target_fd, BPF_ANY))
+		exit(1);
+
+	ctx.link = bpf_program__attach(ctx.skel->progs.tailcall_bench_main);
+	if (!ctx.link)
+		exit(1);
+}
+
+const struct bench bench_tailcall = { /* benchmark descriptor; referenced from the benchs[] table in bench.c */
+ .name = "tailcall", /* name passed to the bench binary by run_bench_tailcall.sh */
+ .setup = tailcall_setup, /* loads and attaches the BPF programs */
+ .producer_thread = tailcall_producer, /* issues the traced syscalls */
+ .measure = tailcall_measure, /* reads the per-CPU hit counters */
+ .report_progress = ops_report_progress,
+ .report_final = ops_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_tailcall.sh b/tools/testing/selftests/bpf/benchs/run_bench_tailcall.sh
new file mode 100755
index 000000000000..c687f34455e8
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/run_bench_tailcall.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# 1. Load the shared benchmark helpers (summarize_ops, used below, is expected to come from here)
+source ./benchs/run_common.sh
+
+# 2. Strict mode: exit on error (-e), on unset variables (-u), no globbing (-f), fail on pipeline errors
+set -eufo pipefail
+
+# 3. Use $BENCH as the bench binary if the caller exported it, otherwise ./bench
+BENCH_BIN=${BENCH:-./bench}
+
+# 4. Pin the run to CPUs 0 and 2 and NUMA node 0 memory, at nice -20 (assumes numactl, those CPUs and sufficient privilege are available)
+RUN_BENCH="numactl --physcpubind=0,2 --membind=0 nice -n -20 $BENCH_BIN -w5 -d20 -a"
+
+# 5. Run the "tailcall" benchmark, capture its output and pass it to summarize_ops
+# Both arguments are always supplied, so the helper's positional parameters are set under set -u.
+summarize_ops "tailcall" "$($RUN_BENCH tailcall)"
diff --git a/tools/testing/selftests/bpf/progs/tailcall_bench.c b/tools/testing/selftests/bpf/progs/tailcall_bench.c
new file mode 100644
index 000000000000..68a50c7b1d06
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall_bench.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+/*
+ * Context layout used by the raw_syscalls/sys_enter programs in this file,
+ * declared explicitly so the compiler has exact field offsets. Only
+ * args[0] is read here; the layout is presumably meant to mirror the
+ * tracepoint's format description (confirm against the running kernel).
+ */
+struct tracepoint_raw_syscalls_sys_enter {
+ unsigned long long unused; /* leading 8 bytes, not accessed by these programs */
+ long id; /* presumably the syscall number; not read in this file */
+ unsigned long args[6]; /* syscall arguments; args[0] selects the tail call path */
+};
+
+__u32 my_pid SEC(".data") = 0; /* set by userspace to getpid() before load; tailcall_bench_main ignores other processes */
+
+/*
+ * Per-CPU array with a single __u64 slot (key 0) used as the hit counter.
+ * Each CPU increments its own copy via increment_pcpu_counter(); the
+ * userspace side of the benchmark reads all copies and sums them.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+} pcpu_hits_map SEC(".maps");
+
+struct { /* program array used as the tail call jump table */
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(max_entries, 2); /* only index 1 is used; userspace installs tailcall_bench_target there */
+ __type(key, __u32);
+ __type(value, __u32);
+} jmp_table SEC(".maps");
+
+/*
+ * Add one to the current CPU's copy of the hit counter stored at key 0 of
+ * pcpu_hits_map. Does nothing if the lookup returns NULL.
+ */
+static __always_inline void increment_pcpu_counter(void)
+{
+	__u32 slot = 0;
+	__u64 *hits = bpf_map_lookup_elem(&pcpu_hits_map, &slot);
+
+	if (!hits)
+		return;
+
+	/* Plain increment: the value belongs to this CPU's copy of the map */
+	*hits += 1;
+}
+
+/*
+ * Target program: the final destination of the tail call. Userspace
+ * installs it at index 1 of jmp_table. It records one hit in the per-CPU
+ * counter and returns 0 without chaining any further tail call.
+ */
+SEC("tracepoint/raw_syscalls/sys_enter")
+int tailcall_bench_target(struct tracepoint_raw_syscalls_sys_enter *ctx)
+{
+ increment_pcpu_counter();
+ return 0;
+}
+
+/*
+ * Non-inlined subprogram (reached through a bpf2bpf call) that performs the
+ * tail call to jmp_table[1] on behalf of tailcall_bench_main. The return
+ * statement is the fall-through path, reached only when the tail call does
+ * not transfer control.
+ */
+static __noinline int bpf2bpf_tailcall(struct tracepoint_raw_syscalls_sys_enter *ctx)
+{
+ bpf_tail_call(ctx, &jmp_table, 1);
+ return 0;
+}
+
+/*
+ * Main program: entry point attached to raw_syscalls/sys_enter.
+ *
+ * Syscalls from processes other than the benchmark (my_pid) are ignored.
+ * For the benchmark's own syscalls it records one hit and then tail-calls
+ * jmp_table[1], choosing the route from the low bit of the first syscall
+ * argument:
+ *   - bit clear: through the bpf2bpf_tailcall() subprogram
+ *   - bit set:   directly from this program
+ * The producer alternates that bit on every syscall. Per the patch
+ * description, this is meant to make the tail call counter seen by the
+ * target's prologue alternate between a scalar and an inherited pointer.
+ *
+ * Returns 0 on every path that returns at all.
+ */
+SEC("tracepoint/raw_syscalls/sys_enter")
+int tailcall_bench_main(struct tracepoint_raw_syscalls_sys_enter *ctx)
+{
+	__u32 tgid = bpf_get_current_pid_tgid() >> 32;
+
+	if (tgid == my_pid) {
+		increment_pcpu_counter();
+
+		if (!(ctx->args[0] & 1)) {
+			/* Path B: tail call issued from inside a subprogram */
+			bpf2bpf_tailcall(ctx);
+		} else {
+			/* Path A: tail call issued directly from the entry program */
+			bpf_tail_call(ctx, &jmp_table, 1);
+		}
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
--
2.42.0