[RFC 1/2] page_pool: add benchmarking for napi-based recycling

From: Dragos Tatulea

Date: Fri Nov 07 2025 - 05:30:31 EST


This brings back the previously removed tasklet-based benchmark code
so that the tests can run in softirq context.

One additional test is added which benchmarks the
impact of page_pool_napi_local().

Signed-off-by: Dragos Tatulea <dtatulea@xxxxxxxxxx>
---
.../bench/page_pool/bench_page_pool_simple.c | 92 ++++++++++++++++++-
1 file changed, 90 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c
index cb6468adbda4..84683c547814 100644
--- a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c
+++ b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c
@@ -9,6 +9,7 @@
#include <linux/limits.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/netdevice.h>
#include <net/page_pool/helpers.h>

#include "time_bench.h"
@@ -16,6 +17,8 @@
static int verbose = 1;
#define MY_POOL_SIZE 1024

+DEFINE_MUTEX(wait_for_tasklet);
+
/* Makes tests selectable. Useful for perf-record to analyze a single test.
* Hint: Bash shells support writing binary number like: $((2#101010)
*
@@ -31,6 +34,10 @@ enum benchmark_bit {
bit_run_bench_no_softirq01,
bit_run_bench_no_softirq02,
bit_run_bench_no_softirq03,
+ bit_run_bench_tasklet01,
+ bit_run_bench_tasklet02,
+ bit_run_bench_tasklet03,
+ bit_run_bench_tasklet04,
};

#define bit(b) (1 << (b))
@@ -120,7 +127,12 @@ static void pp_fill_ptr_ring(struct page_pool *pp, int elems)
kfree(array);
}

-enum test_type { type_fast_path, type_ptr_ring, type_page_allocator };
+enum test_type {
+ type_fast_path,
+ type_napi_aware,
+ type_ptr_ring,
+ type_page_allocator,
+};

/* Depends on compile optimizing this function */
static int time_bench_page_pool(struct time_bench_record *rec, void *data,
@@ -132,6 +144,7 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,

struct page_pool *pp;
struct page *page;
+ struct napi_struct napi = {0};

struct page_pool_params pp_params = {
.order = 0,
@@ -141,6 +154,7 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
.dev = NULL, /* Only use for DMA mapping */
.dma_dir = DMA_BIDIRECTIONAL,
};
+ struct page_pool_stats stats = {0};

pp = page_pool_create(&pp_params);
if (IS_ERR(pp)) {
@@ -155,6 +169,11 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
else
pr_warn("%s(): Cannot use page_pool fast-path\n", func);

+ if (type == type_napi_aware) {
+ napi.list_owner = smp_processor_id();
+ page_pool_enable_direct_recycling(pp, &napi);
+ }
+
time_bench_start(rec);
/** Loop to measure **/
for (i = 0; i < rec->loops; i++) {
@@ -173,7 +192,13 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
page_pool_recycle_direct(pp, page);

} else if (type == type_ptr_ring) {
- /* Normal return path */
+ /* Normal return path, either direct or via ptr_ring */
+ page_pool_put_page(pp, page, -1, false);
+
+ } else if (type == type_napi_aware) {
+ /* NAPI-aware recycling: uses fast-path recycling if
+ * possible.
+ */
page_pool_put_page(pp, page, -1, false);

} else if (type == type_page_allocator) {
@@ -188,6 +213,14 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
}
}
time_bench_stop(rec, loops_cnt);
+
+ if (type == type_napi_aware) {
+ page_pool_get_stats(pp, &stats);
+ if (stats.recycle_stats.cached < rec->loops)
+ pr_warn("%s(): NAPI-aware recycling wasn't used\n",
+ func);
+ }
+
out:
page_pool_destroy(pp);
return loops_cnt;
@@ -211,6 +244,54 @@ static int time_bench_page_pool03_slow(struct time_bench_record *rec,
return time_bench_page_pool(rec, data, type_page_allocator, __func__);
}

+static int time_bench_page_pool04_napi_aware(struct time_bench_record *rec,
+ void *data)
+{
+ return time_bench_page_pool(rec, data, type_napi_aware, __func__);
+}
+
+/* Testing page_pool requires running under softirq.
+ *
+ * Running under a tasklet satisfies this, as tasklets are built on top of
+ * softirq.
+ */
+static void pp_tasklet_handler(struct tasklet_struct *t)
+{
+ uint32_t nr_loops = loops;
+
+ if (in_serving_softirq())
+ pr_warn("%s(): in_serving_softirq fast-path\n",
+ __func__); // True
+ else
+ pr_warn("%s(): Cannot use page_pool fast-path\n", __func__);
+
+ if (enabled(bit_run_bench_tasklet01))
+ time_bench_loop(nr_loops, 0, "tasklet_page_pool01_fast_path",
+ NULL, time_bench_page_pool01_fast_path);
+
+ if (enabled(bit_run_bench_tasklet02))
+ time_bench_loop(nr_loops, 0, "tasklet_page_pool02_ptr_ring",
+ NULL, time_bench_page_pool02_ptr_ring);
+
+ if (enabled(bit_run_bench_tasklet03))
+ time_bench_loop(nr_loops, 0, "tasklet_page_pool03_slow", NULL,
+ time_bench_page_pool03_slow);
+
+ if (enabled(bit_run_bench_tasklet04))
+ time_bench_loop(nr_loops, 0, "tasklet_page_pool04_napi_aware",
+ NULL, time_bench_page_pool04_napi_aware);
+
+ mutex_unlock(&wait_for_tasklet); /* Module __init waiting on unlock */
+}
+DECLARE_TASKLET_DISABLED(pp_tasklet, pp_tasklet_handler);
+
+static void run_tasklet_tests(void)
+{
+ tasklet_enable(&pp_tasklet);
+	/* "Async" schedule tasklet, which runs on the CPU that scheduled it */
+ tasklet_schedule(&pp_tasklet);
+}
+
static int run_benchmark_tests(void)
{
uint32_t nr_loops = loops;
@@ -251,12 +332,19 @@ static int __init bench_page_pool_simple_module_init(void)

run_benchmark_tests();

+ mutex_lock(&wait_for_tasklet);
+ run_tasklet_tests();
+ /* Sleep on mutex, waiting for tasklet to release */
+ mutex_lock(&wait_for_tasklet);
+
return 0;
}
module_init(bench_page_pool_simple_module_init);

static void __exit bench_page_pool_simple_module_exit(void)
{
+ tasklet_kill(&pp_tasklet);
+
if (verbose)
pr_info("Unloaded\n");
}
--
2.50.1