Re: [PATCH V2 3/3] perf/x86/intel: Support auto counter reload

From: Peter Zijlstra
Date: Fri Mar 14 2025 - 06:20:44 EST


On Thu, Oct 10, 2024 at 12:28:44PM -0700, kan.liang@xxxxxxxxxxxxxxx wrote:

> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
> index 726ef13c2c81..d3bdc7d18d3f 100644
> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -2851,6 +2851,54 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
>  	cpuc->fixed_ctrl_val |= bits;
>  }
>
> +static void intel_pmu_config_acr(int idx, u64 mask, u32 reload)
> +{
> +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> +	int msr_b, msr_c;
> +
> +	if (!mask && !cpuc->acr_cfg_b[idx])
> +		return;
> +
> +	if (idx < INTEL_PMC_IDX_FIXED) {
> +		msr_b = MSR_IA32_PMC_V6_GP0_CFG_B;
> +		msr_c = MSR_IA32_PMC_V6_GP0_CFG_C;
> +	} else {
> +		msr_b = MSR_IA32_PMC_V6_FX0_CFG_B;
> +		msr_c = MSR_IA32_PMC_V6_FX0_CFG_C;
> +		idx -= INTEL_PMC_IDX_FIXED;
> +	}
> +
> +	if (cpuc->acr_cfg_b[idx] != mask) {
> +		wrmsrl(msr_b + x86_pmu.addr_offset(idx, false), mask);
> +		cpuc->acr_cfg_b[idx] = mask;
> +	}
> +	/* Only need to update the reload value when there is a valid config value. */
> +	if (mask && cpuc->acr_cfg_c[idx] != reload) {
> +		wrmsrl(msr_c + x86_pmu.addr_offset(idx, false), reload);
> +		cpuc->acr_cfg_c[idx] = reload;
> +	}
> +}
> +
> +static void intel_pmu_enable_acr(struct perf_event *event)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +
> +	/* The PMU doesn't support ACR */
> +	if (!hybrid(event->pmu, acr_cntr_mask64))
> +		return;

In which case we shouldn't have been able to create such an event,
right?

But you need this because you do the cleanup here. Hmm, perhaps use a
static_call() to elide the whole call if the hardware doesn't support
this?
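
Something completely untested along these lines, say (the static_call
key name is made up here, and I'm assuming the x86_pmu.acr_cntr_mask64
field this series adds):

DEFINE_STATIC_CALL_NULL(intel_pmu_acr_enable, intel_pmu_enable_acr);

	/* call site in intel_pmu_enable_event(); a NOP until patched */
	static_call_cond(intel_pmu_acr_enable)(event);

	/* and at init time, only patch the call in when ACR exists */
	if (x86_pmu.acr_cntr_mask64)
		static_call_update(intel_pmu_acr_enable, intel_pmu_enable_acr);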

> +
> +	if (!is_acr_event_group(event) || !event->attr.config2) {
> +		/*
> +		 * The disable doesn't clear the ACR CFG register.
> +		 * Check and clear the ACR CFG register.
> +		 */
> +		intel_pmu_config_acr(hwc->idx, 0, 0);
> +		return;
> +	}
> +
> +	intel_pmu_config_acr(hwc->idx, hwc->config1, -hwc->sample_period);
> +}
> +
>  static void intel_pmu_enable_event(struct perf_event *event)
>  {
>  	u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE;
> @@ -2866,8 +2914,11 @@ static void intel_pmu_enable_event(struct perf_event *event)
>  			enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR;
>  		intel_set_masks(event, idx);
>  		__x86_pmu_enable_event(hwc, enable_mask);
> +		intel_pmu_enable_acr(event);
>  		break;
>  	case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1:
> +		intel_pmu_enable_acr(event);
> +		fallthrough;
>  	case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
>  		intel_pmu_enable_fixed(event);
>  		break;

This order is somewhat inconsistent. The GP events get enable,acr, while
the FIXED ones get acr,enable.
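
One way to make that consistent is to do the ACR programming before
enabling the counter on both paths, e.g. (sketch only):

		intel_set_masks(event, idx);
		intel_pmu_enable_acr(event);
		__x86_pmu_enable_event(hwc, enable_mask);
		break;
	case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1:
		intel_pmu_enable_acr(event);
		fallthrough;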

> @@ -3687,6 +3738,12 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
>  		c2->weight = hweight64(c2->idxmsk64);
>  	}
>
> +	if (is_acr_event_group(event)) {
> +		c2 = dyn_constraint(cpuc, c2, idx);
> +		c2->idxmsk64 &= event->hw.dyn_mask;
> +		c2->weight = hweight64(c2->idxmsk64);
> +	}
> +
>  	return c2;
>  }
>
> @@ -3945,6 +4002,78 @@ static inline bool intel_pmu_has_cap(struct perf_event *event, int idx)
>  	return test_bit(idx, (unsigned long *)&intel_cap->capabilities);
>  }
>
> +static bool intel_pmu_is_acr_group(struct perf_event *event)
> +{
> +	if (!hybrid(event->pmu, acr_cntr_mask64))
> +		return false;

Why the above check? Aaaah, because the function does two distinct
things. Perhaps split?

> +	/* The group leader has the ACR flag set */
> +	if (is_acr_event_group(event))
> +		return true;
> +
> +	/* The acr_mask is set */
> +	if (event->attr.config2)
> +		return true;
> +
> +	return false;
> +}
> +
> +static int intel_pmu_acr_check_reloadable_event(struct perf_event *event)
> +{
> +	struct perf_event *sibling, *leader = event->group_leader;
> +	int num = 0;
> +
> +	/*
> +	 * The acr_mask(config2) indicates the event can be reloaded by
> +	 * other events. Apply the acr_cntr_mask.
> +	 */
> +	if (leader->attr.config2) {
> +		leader->hw.dyn_mask = hybrid(leader->pmu, acr_cntr_mask64);
> +		num++;
> +	} else
> +		leader->hw.dyn_mask = ~0ULL;

Here..

> +	for_each_sibling_event(sibling, leader) {
> +		if (sibling->attr.config2) {
> +			sibling->hw.dyn_mask = hybrid(sibling->pmu, acr_cntr_mask64);
> +			num++;
> +		} else
> +			sibling->hw.dyn_mask = ~0ULL;

.. here ..

> +	}
> +
> +	if (event->attr.config2) {
> +		event->hw.dyn_mask = hybrid(event->pmu, acr_cntr_mask64);
> +		num++;
> +	} else
> +		event->hw.dyn_mask = ~0ULL;

.. and here, the else branch lost its {}.

Also, you managed to write the exact same logic 3 times; surely that can
be a helper function?
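
Something like the below, perhaps (helper name invented here):

static bool intel_pmu_set_acr_init_mask(struct perf_event *event)
{
	/* config2 set: restrict the event to reloadable counters */
	if (event->attr.config2) {
		event->hw.dyn_mask = hybrid(event->pmu, acr_cntr_mask64);
		return true;
	}
	event->hw.dyn_mask = ~0ULL;
	return false;
}

and then the three call sites collapse to:

	num += intel_pmu_set_acr_init_mask(leader);
	for_each_sibling_event(sibling, leader)
		num += intel_pmu_set_acr_init_mask(sibling);
	num += intel_pmu_set_acr_init_mask(event);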

> +
> +	if (num > hweight64(hybrid(event->pmu, acr_cntr_mask64)))
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +/*
> + * Update the dyn_mask of each event to guarantee the event is scheduled
> + * to the counters which be able to cause a reload.
> + */
> +static void intel_pmu_set_acr_dyn_mask(struct perf_event *event, int idx,
> +				       struct perf_event *last)
> +{
> +	struct perf_event *sibling, *leader = event->group_leader;
> +	u64 mask = hybrid(event->pmu, acr_cntr_cause_mask);
> +
> +	/* The event set in the acr_mask(config2) can causes a reload. */
> +	if (test_bit(idx, (unsigned long *)&leader->attr.config2))
> +		event->hw.dyn_mask &= mask;
> +	for_each_sibling_event(sibling, leader) {
> +		if (test_bit(idx, (unsigned long *)&sibling->attr.config2))
> +			event->hw.dyn_mask &= mask;
> +	}
> +	if (test_bit(idx, (unsigned long *)&last->attr.config2))
> +		event->hw.dyn_mask &= mask;
> +}
> +
>  static int intel_pmu_hw_config(struct perf_event *event)
>  {
>  	int ret = x86_pmu_hw_config(event);
> @@ -4056,6 +4185,50 @@ static int intel_pmu_hw_config(struct perf_event *event)
>  		event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT;
>  	}
>
> +	if (intel_pmu_is_acr_group(event)) {

This is where you want to check 2 things:

- does the hardware support ACR; if not, fail,
- is the event (group) ACR; if so, check its constraints.

The current code obscures this distinction.
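
Something like this (sketch only), with the capability check taken out
of intel_pmu_is_acr_group():

	if (intel_pmu_is_acr_group(event)) {
		/* an ACR group on hardware without ACR must fail */
		if (!hybrid(event->pmu, acr_cntr_mask64))
			return -EINVAL;

		/* ... the existing ACR constraint checks follow ... */
	}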

> +		struct perf_event *sibling, *leader = event->group_leader;
> +		int event_idx = 0;
> +
> +		/* Not support perf metrics */
> +		if (is_metric_event(event))
> +			return -EINVAL;
> +
> +		/* Not support freq mode */
> +		if (event->attr.freq)
> +			return -EINVAL;
> +
> +		/* PDist is not supported */
> +		if (event->attr.config2 && event->attr.precise_ip > 2)
> +			return -EINVAL;
> +
> +		/* The reload value cannot exceeds the max period */
> +		if (event->attr.sample_period > x86_pmu.max_period)
> +			return -EINVAL;
> +		/*
> +		 * It's hard to know whether the event is the last one of
> +		 * the group. Reconfigure the dyn_mask of each X86 event
> +		 * every time when add a new event.
> +		 * It's impossible to verify whether the bits of
> +		 * the event->attr.config2 exceeds the group. But it's
> +		 * harmless, because the invalid bits are ignored. See
> +		 * intel_pmu_update_acr_mask(). The n - n0 guarantees that
> +		 * only the bits in the group is used.
> +		 *
> +		 * Check whether the reloadable counters is enough and
> +		 * initialize the dyn_mask.
> +		 */
> +		if (intel_pmu_acr_check_reloadable_event(event))
> +			return -EINVAL;
> +
> +		/* Reconfigure the dyn_mask for each event */
> +		intel_pmu_set_acr_dyn_mask(leader, event_idx++, event);
> +		for_each_sibling_event(sibling, leader)
> +			intel_pmu_set_acr_dyn_mask(sibling, event_idx++, event);
> +		intel_pmu_set_acr_dyn_mask(event, event_idx, event);
> +
> +		leader->hw.flags |= PERF_X86_EVENT_ACR;
> +	}
> +
>  	if ((event->attr.type == PERF_TYPE_HARDWARE) ||
>  	    (event->attr.type == PERF_TYPE_HW_CACHE))
>  		return 0;
> @@ -4159,6 +4332,49 @@ static int intel_pmu_hw_config(struct perf_event *event)
>  	return 0;
>  }
>
> +static void intel_pmu_update_acr_mask(struct cpu_hw_events *cpuc, int n, int *assign)
> +{
> +	struct perf_event *event;
> +	int n0, i, off;
> +
> +	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
> +		n0 = cpuc->n_events - cpuc->n_txn;
> +	else
> +		n0 = cpuc->n_events;
> +
> +	for (i = n0; i < n; i++) {
> +		event = cpuc->event_list[i];
> +		event->hw.config1 = 0;
> +
> +		/* Convert the group index into the counter index */
> +		for_each_set_bit(off, (unsigned long *)&event->attr.config2, n - n0)
> +			set_bit(assign[n0 + off], (unsigned long *)&event->hw.config1);

Is the atomic set_bit() really required here?
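
Nothing modifies event->hw.config1 concurrently here AFAICT, so the
non-atomic variant should do:

		for_each_set_bit(off, (unsigned long *)&event->attr.config2, n - n0)
			__set_bit(assign[n0 + off], (unsigned long *)&event->hw.config1);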

> +	}
> +}
> +
> +static int intel_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
> +{
> +	struct perf_event *event;
> +	int ret = x86_schedule_events(cpuc, n, assign);
> +
> +	if (ret)
> +		return ret;
> +
> +	if (cpuc->is_fake)
> +		return ret;
> +
> +	event = cpuc->event_list[n - 1];

ISTR seeing this pattern somewhere before and arguing that it was all
sorts of broken. Why is it sane to only look at the last event here?

> +	/*
> +	 * The acr_mask(config2) is the event-enabling order.
> +	 * Update it to the counter order after the counters are assigned.
> +	 */
> +	if (event && is_acr_event_group(event))
> +		intel_pmu_update_acr_mask(cpuc, n, assign);
> +
> +	return 0;
> +}
> +
> +
>  /*
>   * Currently, the only caller of this function is the atomic_switch_perf_msrs().
>   * The host perf context helps to prepare the values of the real hardware for
> @@ -5305,7 +5521,7 @@ static __initconst const struct x86_pmu intel_pmu = {
>  	.set_period		= intel_pmu_set_period,
>  	.update			= intel_pmu_update,
>  	.hw_config		= intel_pmu_hw_config,
> -	.schedule_events	= x86_schedule_events,
> +	.schedule_events	= intel_pmu_schedule_events,
>  	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
>  	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
>  	.fixedctr		= MSR_ARCH_PERFMON_FIXED_CTR0,

How about only setting that function if the PMU actually supports ACR?
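
That is, keep x86_schedule_events as the default and, once capabilities
are known at init time, do something like (again assuming the
acr_cntr_mask64 field from this series):

	if (x86_pmu.acr_cntr_mask64)
		x86_pmu.schedule_events = intel_pmu_schedule_events;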