[PATCH] perf/arm-cmn: Add shutdown routine

From: Geoff Blake
Date: Fri Nov 25 2022 - 18:03:15 EST


The CMN driver does not gracefully handle all
restart cases, such as kexec. If the arm-cmn driver
is in use when a kexec occurs, the hardware can be left
with events still active. These can cause spurious and/or
unhandled interrupts that show up as non-fatal kernel errors
like the one below, which can be confusing and misleading:

[ 3.895093] irq 28: nobody cared (try booting with the "irqpoll" option)
[ 3.895170] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.19.0-1011-aws #12
[ 3.895172] Hardware name: Amazon EC2 c6g.metal/Not Specified, BIOS 1.0 10/16/2017
[ 3.895174] Call trace:
[ 3.895175] dump_backtrace+0xe8/0x150
[ 3.895181] show_stack+0x28/0x70
[ 3.895183] dump_stack_lvl+0x68/0x9c
[ 3.895188] dump_stack+0x1c/0x48
[ 3.895190] __report_bad_irq+0x58/0x138
[ 3.895193] note_interrupt+0x23c/0x360
[ 3.895196] handle_irq_event+0x108/0x1a0
[ 3.895198] handle_fasteoi_irq+0xd0/0x24c
[ 3.895201] generic_handle_domain_irq+0x3c/0x70
[ 3.895203] __gic_handle_irq_from_irqson.isra.0+0xcc/0x2c0
[ 3.895207] gic_handle_irq+0x34/0xb0
[ 3.895209] call_on_irq_stack+0x40/0x50
[ 3.895211] do_interrupt_handler+0xb0/0xb4
[ 3.895214] el1_interrupt+0x4c/0xe0
[ 3.895217] el1h_64_irq_handler+0x1c/0x40
[ 3.895220] el1h_64_irq+0x78/0x7c
[ 3.895222] __do_softirq+0xd0/0x450
[ 3.895223] __irq_exit_rcu+0xcc/0x120
[ 3.895227] irq_exit_rcu+0x20/0x40
[ 3.895229] el1_interrupt+0x50/0xe0
[ 3.895231] el1h_64_irq_handler+0x1c/0x40
[ 3.895233] el1h_64_irq+0x78/0x7c
[ 3.895235] arch_cpu_idle+0x1c/0x6c
[ 3.895238] default_idle_call+0x4c/0x19c
[ 3.895240] cpuidle_idle_call+0x18c/0x1f0
[ 3.895243] do_idle+0xb0/0x11c
[ 3.895245] cpu_startup_entry+0x34/0x40
[ 3.895248] rest_init+0xec/0x104
[ 3.895250] arch_post_acpi_subsys_init+0x0/0x30
[ 3.895254] start_kernel+0x4d0/0x534
[ 3.895256] __primary_switched+0xc4/0xcc
[ 3.895259] handlers:
[ 3.895292] [<000000008f5364c7>] arm_cmn_handle_irq [arm_cmn]
[ 3.895369] Disabling IRQ #28

This type of kernel error can be reproduced by running perf with
an arm_cmn event active and then forcing a kexec. While the
kexec'd kernel boots, this message can appear semi-regularly.

Add a shutdown routine that is called by the reboot/kexec path
to put the PMU nodes back into a clean state, avoiding leaked
events and the unhandled interrupts shown above.
Additionally, modify the arm_cmn_discover() loop to initialize all
PMU-capable nodes to have no active events, as is already done for
XP watchpoints in arm_cmn_init_dtm(). Testing hundreds of kexecs with
this patch applied has shown that this error message goes away.

Signed-off-by: Geoff Blake <blakgeof@xxxxxxxxxx>
---
drivers/perf/arm-cmn.c | 58 ++++++++++++++++++++++++++++++++++++++++--
1 file changed, 56 insertions(+), 2 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index b80a9b74662b..032e0c87e8b6 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -112,6 +112,7 @@
#define CMN_DTM_UNIT_INFO 0x0910

#define CMN_DTM_NUM_COUNTERS 4
+#define CMN_DTM_NUM_WPS 4
/* Want more local counters? Why not replicate the whole DTM! Ugh... */
#define CMN_DTM_OFFSET(n) ((n) * 0x200)

@@ -1865,7 +1866,7 @@ static void arm_cmn_init_dtm(struct arm_cmn_dtm *dtm, struct arm_cmn_node *xp, i

dtm->base = xp->pmu_base + CMN_DTM_OFFSET(idx);
dtm->pmu_config_low = CMN_DTM_PMU_CONFIG_PMU_EN;
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < CMN_DTM_NUM_WPS; i++) {
dtm->wp_event[i] = -1;
writeq_relaxed(0, dtm->base + CMN_DTM_WPn_MASK(i));
writeq_relaxed(~0ULL, dtm->base + CMN_DTM_WPn_VAL(i));
@@ -2137,6 +2138,7 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
case CMN_TYPE_CCRA:
case CMN_TYPE_CCHA:
case CMN_TYPE_CCLA:
+ writeq_relaxed(0, dn->pmu_base + CMN_PMU_EVENT_SEL);
dn++;
break;
/* Nothing to see here */
@@ -2156,6 +2158,8 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
dn[1] = dn[0];
dn[0].pmu_base += CMN_HNP_PMU_EVENT_SEL;
dn[1].type = arm_cmn_subtype(dn->type);
+ writeq_relaxed(0, dn[0].pmu_base + CMN_PMU_EVENT_SEL);
+ writeq_relaxed(0, dn[1].pmu_base + CMN_PMU_EVENT_SEL);
dn += 2;
break;
/* Something has gone horribly wrong */
@@ -2312,15 +2316,64 @@ static int arm_cmn_probe(struct platform_device *pdev)
return err;
}

-static int arm_cmn_remove(struct platform_device *pdev)
+static void arm_cmn_shutdown(struct platform_device *pdev)
{
struct arm_cmn *cmn = platform_get_drvdata(pdev);
+ struct arm_cmn_dtm *dtm = cmn->dtms;
+ struct arm_cmn_node *dn = cmn->dns;
+ int i, j = 0;

+ writel_relaxed(0, cmn->dtc[0].base + CMN_DT_PMCR);
writel_relaxed(0, cmn->dtc[0].base + CMN_DT_DTC_CTL);

+ /* Go through all DTMs and disable PMU events and WPs */
+ for (i = 0, dtm = cmn->dtms; i < cmn->num_xps; i++, dtm++) {
+ writeq_relaxed(0, dtm->base + CMN_DTM_PMU_CONFIG);
+ for (j = 0; j < CMN_DTM_NUM_WPS; j++) {
+ writeq_relaxed(0, dtm->base + CMN_DTM_WPn_CONFIG(j));
+ writeq_relaxed(0, dtm->base + CMN_DTM_WPn_VAL(j));
+ writeq_relaxed(0, dtm->base + CMN_DTM_WPn_MASK(j));
+ }
+ }
+
+ /* Go through all relevant DN types and disable any leaked PMU events */
+ for (i = 0, dn = cmn->dns; i < cmn->num_dns; i++, dn++) {
+ switch (dn->type) {
+ case CMN_TYPE_DVM:
+ case CMN_TYPE_HNI:
+ case CMN_TYPE_HNF:
+ case CMN_TYPE_HNP:
+ case CMN_TYPE_SBSX:
+ case CMN_TYPE_RNI:
+ case CMN_TYPE_RND:
+ case CMN_TYPE_MTSX:
+ case CMN_TYPE_CXRA:
+ case CMN_TYPE_CXHA:
+ case CMN_TYPE_CCRA:
+ case CMN_TYPE_CCHA:
+ case CMN_TYPE_CCLA:
+ case CMN_TYPE_CCLA_RNI:
+ case CMN_TYPE_XP:
+ writeq_relaxed(0, dn->pmu_base + CMN_PMU_EVENT_SEL);
+ break;
+ case CMN_TYPE_DTC:
+ case CMN_TYPE_MPAM_S:
+ case CMN_TYPE_MPAM_NS:
+ case CMN_TYPE_RNSAM:
+ case CMN_TYPE_CXLA:
+ default:
+ break;
+ }
+ }
+
perf_pmu_unregister(&cmn->pmu);
cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node);
debugfs_remove(cmn->debug);
+}
+
+static int arm_cmn_remove(struct platform_device *pdev)
+{
+ arm_cmn_shutdown(pdev);
return 0;
}

@@ -2353,6 +2406,7 @@ static struct platform_driver arm_cmn_driver = {
},
.probe = arm_cmn_probe,
.remove = arm_cmn_remove,
+ .shutdown = arm_cmn_shutdown,
};

static int __init arm_cmn_init(void)
--
2.24.3 (Apple Git-128)