[PATCH V18 3/9] drivers: perf: arm_pmu: Add infrastructure for branch stack sampling

From: Anshuman Khandual
Date: Thu Jun 13 2024 - 02:18:54 EST


In order to support the Branch Record Buffer Extension (BRBE), we need to
extend the arm_pmu framework with some basic infrastructure for branch
stack sampling which arm_pmu drivers can opt-in to using. Subsequent
patches will use this to add support for BRBE in the PMUv3 driver.

With BRBE, the hardware records branches into a hardware FIFO, which will
be sampled by software when perf events overflow. A task may be context-
switched an arbitrary number of times between overflows, and to avoid
losing samples we need to save the current records when a task is context-
switched out. To do these we'll need to use the pmu::sched_task() callback,
and we'll also need to allocate some per-task storage space via event flag
PERF_ATTACH_TASK_DATA.

Cc: Catalin Marinas <catalin.marinas@xxxxxxx>
Cc: Will Deacon <will@xxxxxxxxxx>
Cc: Mark Rutland <mark.rutland@xxxxxxx>
Cc: linux-arm-kernel@xxxxxxxxxxxxxxxxxxx
Cc: linux-kernel@xxxxxxxxxxxxxxx
Signed-off-by: Anshuman Khandual <anshuman.khandual@xxxxxxx>
----
Changes in V18:

- Scan valid branch stack events in armpmu_start() to create merged filter
- Updated the commit message

drivers/perf/arm_pmu.c | 42 +++++++++++++++++++++++++++++++++---
include/linux/perf/arm_pmu.h | 32 ++++++++++++++++++++++++++-
2 files changed, 70 insertions(+), 4 deletions(-)

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 8458fe2cebb4..219c1e276327 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -289,6 +289,23 @@ static void armpmu_start(struct perf_event *event, int flags)
{
struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
struct hw_perf_event *hwc = &event->hw;
+ struct pmu_hw_events *cpuc = this_cpu_ptr(armpmu->hw_events);
+ int idx;
+
+ /*
+ * Merge all branch filter requests from different perf
+ * events being added into this PMU. This includes both
+ * privilege and branch type filters.
+ */
+ if (armpmu->has_branch_stack) {
+ cpuc->branch_sample_type = 0;
+ for (idx = 0; idx < ARMPMU_MAX_HWEVENTS; idx++) {
+ struct perf_event *event_idx = cpuc->events[idx];
+
+ if (event_idx && has_branch_stack(event_idx))
+ cpuc->branch_sample_type |= event_idx->attr.branch_sample_type;
+ }
+ }

/*
* ARM pmu always has to reprogram the period, so ignore
@@ -317,6 +334,9 @@ armpmu_del(struct perf_event *event, int flags)
struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;

+ if (has_branch_stack(event))
+ armpmu->branch_stack_del(event, hw_events);
+
armpmu_stop(event, PERF_EF_UPDATE);
hw_events->events[idx] = NULL;
armpmu->clear_event_idx(hw_events, event);
@@ -342,6 +362,9 @@ armpmu_add(struct perf_event *event, int flags)
if (idx < 0)
return idx;

+ if (has_branch_stack(event))
+ armpmu->branch_stack_add(event, hw_events);
+
/*
* If there is an event in the counter we are going to use then make
* sure it is disabled.
@@ -511,13 +534,25 @@ static int armpmu_event_init(struct perf_event *event)
!cpumask_test_cpu(event->cpu, &armpmu->supported_cpus))
return -ENOENT;

- /* does not support taken branch sampling */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
+ if (has_branch_stack(event)) {
+ if (!armpmu->has_branch_stack)
+ return -EOPNOTSUPP;
+
+ if (!armpmu->branch_stack_init(event))
+ return -EOPNOTSUPP;
+ }

return __hw_perf_event_init(event);
}

+static void armpmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+{
+ struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
+
+ if (armpmu->sched_task)
+ armpmu->sched_task(pmu_ctx, sched_in);
+}
+
static void armpmu_enable(struct pmu *pmu)
{
struct arm_pmu *armpmu = to_arm_pmu(pmu);
@@ -864,6 +899,7 @@ struct arm_pmu *armpmu_alloc(void)
}

pmu->pmu = (struct pmu) {
+ .sched_task = armpmu_sched_task,
.pmu_enable = armpmu_enable,
.pmu_disable = armpmu_disable,
.event_init = armpmu_event_init,
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index b3b34f6670cf..9eda16dd684e 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -46,6 +46,18 @@ static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_63BIT) == ARMPMU_EVT_63BIT);
}, \
}

+/*
+ * Maximum branch record entries which could be processed
+ * for core perf branch stack sampling support, regardless
+ * of the hardware support available on a given ARM PMU.
+ */
+#define MAX_BRANCH_RECORDS 64
+
+struct branch_records {
+ struct perf_branch_stack branch_stack;
+ struct perf_branch_entry branch_entries[MAX_BRANCH_RECORDS];
+};
+
/* The events for a given PMU register set. */
struct pmu_hw_events {
/*
@@ -66,6 +78,17 @@ struct pmu_hw_events {
struct arm_pmu *percpu_pmu;

int irq;
+
+ struct branch_records *branches;
+
+ /* Active context for task events */
+ void *branch_context;
+
+ /* Active events requesting branch records */
+ unsigned int branch_users;
+
+ /* Active branch sample type filters */
+ unsigned long branch_sample_type;
};

enum armpmu_attr_groups {
@@ -96,8 +119,15 @@ struct arm_pmu {
void (*stop)(struct arm_pmu *);
void (*reset)(void *);
int (*map_event)(struct perf_event *event);
+ void (*sched_task)(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
+ bool (*branch_stack_init)(struct perf_event *event);
+ void (*branch_stack_add)(struct perf_event *event, struct pmu_hw_events *cpuc);
+ void (*branch_stack_del)(struct perf_event *event, struct pmu_hw_events *cpuc);
+ void (*branch_stack_reset)(void);
int num_events;
- bool secure_access; /* 32-bit ARM only */
+ unsigned int secure_access : 1, /* 32-bit ARM only */
+ has_branch_stack: 1, /* 64-bit ARM only */
+ reserved : 30;
#define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40
DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
#define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE 0x4000
--
2.25.1