Re: [PATCH RFC net-next v1] page_pool: import Jesper's page_pool benchmark

From: Jesper Dangaard Brouer
Date: Mon Mar 24 2025 - 18:34:57 EST



On 09/03/2025 09.41, Mina Almasry wrote:
diff --git a/lib/bench/time_bench.c b/lib/bench/time_bench.c
new file mode 100644
index 000000000000..4f5314419644
--- /dev/null
+++ b/lib/bench/time_bench.c
@@ -0,0 +1,426 @@
+/*
+ * Benchmarking code execution time inside the kernel
+ *
+ * Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer
+ * for licensing details see kernel-base/COPYING
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/time_bench.h>
+
+#include <linux/perf_event.h> /* perf_event_create_kernel_counter() */
+
+/* For concurrency testing */
+#include <linux/completion.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+
+static int verbose = 1;
+
+/** TSC (Time-Stamp Counter) based **
+ * See: linux/time_bench.h
+ * tsc_start_clock() and tsc_stop_clock()
+ */
+
+/** Wall-clock based **
+ */
+
+/** PMU (Performance Monitor Unit) based **
+ */
+#define PERF_FORMAT \
+ (PERF_FORMAT_GROUP | PERF_FORMAT_ID | \
+ PERF_FORMAT_TOTAL_TIME_ENABLED | \
+ PERF_FORMAT_TOTAL_TIME_RUNNING)
+
+struct raw_perf_event {
+ uint64_t config; /* event */
+ uint64_t config1; /* umask */
+ struct perf_event *save;
+ char *desc;
+};
+
+/* if HT is enable a maximum of 4 events (5 if one is instructions
+ * retired can be specified, if HT is disabled a maximum of 8 (9 if
+ * one is instructions retired) can be specified.
+ *
+ * From Table 19-1. Architectural Performance Events
+ * Architectures Software Developer’s Manual Volume 3: System Programming Guide
+ */
+struct raw_perf_event perf_events[] = {
+ { 0x3c, 0x00, NULL, "Unhalted CPU Cycles" },
+ { 0xc0, 0x00, NULL, "Instruction Retired" }
+};
+
+#define NUM_EVTS (sizeof(perf_events) / sizeof(struct raw_perf_event))
+
+/* WARNING: PMU config is currently broken!
+ */

As the comment says PMU usage is in a broken state...

I've not used it for years... below config setup code doesn't enable PMU correctly.
The way I've activated it (in the past) is by loading (modprobe) the
module under the `perf stat` commands, which does the correct setup, and
then benchmark code can read the PMU counters.

IMHO we should just remove all the PMU code (i.e. not upstream it).
Unless, ACME can fix below setup code in seconds... else let's not bother.

+bool time_bench_PMU_config(bool enable)
+{
+ int i;
+ struct perf_event_attr perf_conf;
+ struct perf_event *perf_event;
+ int cpu;
+
+ preempt_disable();
+ cpu = smp_processor_id();
+ pr_info("DEBUG: cpu:%d\n", cpu);
+ preempt_enable();
+
+ memset(&perf_conf, 0, sizeof(struct perf_event_attr));
+ perf_conf.type = PERF_TYPE_RAW;
+ perf_conf.size = sizeof(struct perf_event_attr);
+ perf_conf.read_format = PERF_FORMAT;
+ perf_conf.pinned = 1;
+ perf_conf.exclude_user = 1; /* No userspace events */
+ perf_conf.exclude_kernel = 0; /* Only kernel events */
+
+ for (i = 0; i < NUM_EVTS; i++) {
+ perf_conf.disabled = enable;
+ perf_conf.config = perf_events[i].config;
+ perf_conf.config1 = perf_events[i].config1;
+ if (verbose)
+ pr_info("%s() enable PMU counter: %s\n",
+ __func__, perf_events[i].desc);
+ perf_event = perf_event_create_kernel_counter(&perf_conf, cpu,
+ NULL /* task */,
+ NULL /* overflow_handler*/,
+ NULL /* context */);
+ if (perf_event) {
+ perf_events[i].save = perf_event;
+ pr_info("%s():DEBUG perf_event success\n", __func__);
+
+ perf_event_enable(perf_event);
+ } else {
+ pr_info("%s():DEBUG perf_event is NULL\n", __func__);
+ }
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(time_bench_PMU_config);

Below code that reads the PMU registers are likely also flawed.
ACME feel free to yell a me ;-)

[...]
> +/** PMU (Performance Monitor Unit) based **
> + *
> + * Needed for calculating: Instructions Per Cycle (IPC)
> + * - The IPC number tell how efficient the CPU pipelining were
> + */
> +//lookup: perf_event_create_kernel_counter()
> +
> +bool time_bench_PMU_config(bool enable);
> +
> +/* Raw reading via rdpmc() using fixed counters
> + *
> + * From:https://github.com/andikleen/simple-pmu
> + */
> +enum {
> + FIXED_SELECT = (1U << 30), /* == 0x40000000 */
> + FIXED_INST_RETIRED_ANY = 0,
> + FIXED_CPU_CLK_UNHALTED_CORE = 1,
> + FIXED_CPU_CLK_UNHALTED_REF = 2,
> +};
> +
> +static __always_inline unsigned long long p_rdpmc(unsigned in)
> +{
> + unsigned d, a;
> +
> + asm volatile("rdpmc" : "=d" (d), "=a" (a) : "c" (in) : "memory");
> + return ((unsigned long long)d << 32) | a;
> +}
> +
> +/* These PMU counter needs to be enabled, but I don't have the
> + * configure code implemented. My current hack is running:
> + * sudo perf stat -e cycles:k -e instructions:k insmod lib/ring_queue_test.ko
> + */

Here I spell out how I run the `insmod` under `perf stat`.


> +/* Reading all pipelined instruction */
> +static __always_inline unsigned long long pmc_inst(void)
> +{
> + return p_rdpmc(FIXED_SELECT | FIXED_INST_RETIRED_ANY);
> +}
> +
> +/* Reading CPU clock cycles */
> +static __always_inline unsigned long long pmc_clk(void)
> +{
> + return p_rdpmc(FIXED_SELECT | FIXED_CPU_CLK_UNHALTED_CORE);
> +}
> +
> +/* Raw reading via MSR rdmsr() is likely wrong
> + * FIXME: How can I know which raw MSR registers are conf for what?
> + */
> +#define MSR_IA32_PCM0 0x400000C1 /* PERFCTR0 */
> +#define MSR_IA32_PCM1 0x400000C2 /* PERFCTR1 */
> +#define MSR_IA32_PCM2 0x400000C3
> +static inline uint64_t msr_inst(unsigned long long *msr_result)
> +{
> + return rdmsrl_safe(MSR_IA32_PCM0, msr_result);
> +}


--Jesper