Re: [Patch v9 12/12] perf/x86/intel: Add counter group support for arch-PEBS

From: Ian Rogers

Date: Tue Mar 10 2026 - 00:37:29 EST


On Mon, Mar 9, 2026 at 7:06 PM Mi, Dapeng <dapeng1.mi@xxxxxxxxxxxxxxx> wrote:
>
>
> On 3/10/2026 6:59 AM, Ian Rogers wrote:
> > On Wed, Oct 29, 2025 at 3:25 AM Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx> wrote:
> >> Based on the previous adaptive PEBS counter snapshot support, add counter
> >> group support for architectural PEBS. Since arch-PEBS shares the same
> >> counter group layout with adaptive PEBS, directly reuse the
> >> __setup_pebs_counter_group() helper to process arch-PEBS counter groups.
> >>
> >> Signed-off-by: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx>
> >> ---
> >> arch/x86/events/intel/core.c | 38 ++++++++++++++++++++++++++++---
> >> arch/x86/events/intel/ds.c | 29 ++++++++++++++++++++---
> >> arch/x86/include/asm/msr-index.h | 6 +++++
> >> arch/x86/include/asm/perf_event.h | 13 ++++++++---
> >> 4 files changed, 77 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
> >> index 75cba28b86d5..cb64018321dd 100644
> >> --- a/arch/x86/events/intel/core.c
> >> +++ b/arch/x86/events/intel/core.c
> >> @@ -3014,6 +3014,17 @@ static void intel_pmu_enable_event_ext(struct perf_event *event)
> >>
> >> if (pebs_data_cfg & PEBS_DATACFG_LBRS)
> >> ext |= ARCH_PEBS_LBR & cap.caps;
> >> +
> >> + if (pebs_data_cfg &
> >> + (PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT))
> >> + ext |= ARCH_PEBS_CNTR_GP & cap.caps;
> >> +
> >> + if (pebs_data_cfg &
> >> + (PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT))
> >> + ext |= ARCH_PEBS_CNTR_FIXED & cap.caps;
> >> +
> >> + if (pebs_data_cfg & PEBS_DATACFG_METRICS)
> >> + ext |= ARCH_PEBS_CNTR_METRICS & cap.caps;
> >> }
> >>
> >> if (cpuc->n_pebs == cpuc->n_large_pebs)
> >> @@ -3038,6 +3049,9 @@ static void intel_pmu_enable_event_ext(struct perf_event *event)
> >> }
> >> }
> >>
> >> + if (is_pebs_counter_event_group(event))
> >> + ext |= ARCH_PEBS_CNTR_ALLOW;
> >> +
> >> if (cpuc->cfg_c_val[hwc->idx] != ext)
> >> __intel_pmu_update_event_ext(hwc->idx, ext);
> >> }
> >> @@ -4323,6 +4337,20 @@ static bool intel_pmu_is_acr_group(struct perf_event *event)
> >> return false;
> >> }
> >>
> >> +static inline bool intel_pmu_has_pebs_counter_group(struct pmu *pmu)
> >> +{
> >> + u64 caps;
> >> +
> >> + if (x86_pmu.intel_cap.pebs_format >= 6 && x86_pmu.intel_cap.pebs_baseline)
> >> + return true;
> >> +
> >> + caps = hybrid(pmu, arch_pebs_cap).caps;
> >> + if (x86_pmu.arch_pebs && (caps & ARCH_PEBS_CNTR_MASK))
> >> + return true;
> >> +
> >> + return false;
> >> +}
> >> +
> >> static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event,
> >> u64 *cause_mask, int *num)
> >> {
> >> @@ -4471,8 +4499,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
> >> }
> >>
> >> if ((event->attr.sample_type & PERF_SAMPLE_READ) &&
> >> - (x86_pmu.intel_cap.pebs_format >= 6) &&
> >> - x86_pmu.intel_cap.pebs_baseline &&
> >> + intel_pmu_has_pebs_counter_group(event->pmu) &&
> >> is_sampling_event(event) &&
> >> event->attr.precise_ip)
> >> event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
> >> @@ -5420,6 +5447,8 @@ static inline void __intel_update_large_pebs_flags(struct pmu *pmu)
> >> x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME;
> >> if (caps & ARCH_PEBS_LBR)
> >> x86_pmu.large_pebs_flags |= PERF_SAMPLE_BRANCH_STACK;
> >> + if (caps & ARCH_PEBS_CNTR_MASK)
> >> + x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ;
> >>
> >> if (!(caps & ARCH_PEBS_AUX))
> >> x86_pmu.large_pebs_flags &= ~PERF_SAMPLE_DATA_SRC;
> >> @@ -7134,8 +7163,11 @@ __init int intel_pmu_init(void)
> >> * Many features on and after V6 require dynamic constraint,
> >> * e.g., Arch PEBS, ACR.
> >> */
> >> - if (version >= 6)
> >> + if (version >= 6) {
> >> x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT;
> >> + x86_pmu.late_setup = intel_pmu_late_setup;
> >> + }
> >> +
> >> /*
> >> * Install the hw-cache-events table:
> >> */
> >> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> >> index c66e9b562de3..c93bf971d97b 100644
> >> --- a/arch/x86/events/intel/ds.c
> >> +++ b/arch/x86/events/intel/ds.c
> >> @@ -1530,13 +1530,20 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
> >>
> >> u64 intel_get_arch_pebs_data_config(struct perf_event *event)
> >> {
> >> + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> >> u64 pebs_data_cfg = 0;
> >> + u64 cntr_mask;
> >>
> >> if (WARN_ON(event->hw.idx < 0 || event->hw.idx >= X86_PMC_IDX_MAX))
> >> return 0;
> >>
> >> pebs_data_cfg |= pebs_update_adaptive_cfg(event);
> >>
> >> + cntr_mask = (PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT) |
> >> + (PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT) |
> >> + PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS;
> >> + pebs_data_cfg |= cpuc->pebs_data_cfg & cntr_mask;
> >> +
> >> return pebs_data_cfg;
> >> }
> >>
> >> @@ -2444,6 +2451,24 @@ static void setup_arch_pebs_sample_data(struct perf_event *event,
> >> }
> >> }
> >>
> >> + if (header->cntr) {
> >> + struct arch_pebs_cntr_header *cntr = next_record;
> >> + unsigned int nr;
> >> +
> >> + next_record += sizeof(struct arch_pebs_cntr_header);
> >> +
> >> + if (is_pebs_counter_event_group(event)) {
> >> + __setup_pebs_counter_group(cpuc, event,
> >> + (struct pebs_cntr_header *)cntr, next_record);
> >> + data->sample_flags |= PERF_SAMPLE_READ;
> >> + }
> >> +
> >> + nr = hweight32(cntr->cntr) + hweight32(cntr->fixed);
> >> + if (cntr->metrics == INTEL_CNTR_METRICS)
> >> + nr += 2;
> >> + next_record += nr * sizeof(u64);
> >> + }
> >> +
> >> /* Parse followed fragments if there are. */
> >> if (arch_pebs_record_continued(header)) {
> >> at = at + header->size;
> >> @@ -3094,10 +3119,8 @@ static void __init intel_ds_pebs_init(void)
> >> break;
> >>
> >> case 6:
> >> - if (x86_pmu.intel_cap.pebs_baseline) {
> >> + if (x86_pmu.intel_cap.pebs_baseline)
> >> x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ;
> >> - x86_pmu.late_setup = intel_pmu_late_setup;
> >> - }
> > Hi Dapeng,
> >
> > I'm trying to understand why the late_setup initialization was changed
> > here and its connection with counter group support. I couldn't find a
> > mention in the commit message.
>
> It's because arch-PEBS also supports counter group sampling, not just the
> legacy PEBS with PEBS format 6. Currently both ACR (auto counter reload) and
> PEBS counter group sampling need the late_setup. Since ACR and counter group
> sampling (regardless of legacy PEBS or arch-PEBS) were introduced with
> Perfmon v6, the late_setup initialization is moved to the unified place of
> the Perfmon v6 initialization. Thanks.

Ah, okay. I think it would have been clearer if those changes and the
counter group changes were kept in separate patches. Anyway, this code
has landed and nothing broke with this change.

Thanks,
Ian

>
> >
> > Thanks,
> > Ian
> >
> >> fallthrough;
> >> case 5:
> >> x86_pmu.pebs_ept = 1;
> >> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> >> index f1ef9ac38bfb..65cc528fbad8 100644
> >> --- a/arch/x86/include/asm/msr-index.h
> >> +++ b/arch/x86/include/asm/msr-index.h
> >> @@ -334,12 +334,18 @@
> >> #define ARCH_PEBS_INDEX_WR_SHIFT 4
> >>
> >> #define ARCH_PEBS_RELOAD 0xffffffff
> >> +#define ARCH_PEBS_CNTR_ALLOW BIT_ULL(35)
> >> +#define ARCH_PEBS_CNTR_GP BIT_ULL(36)
> >> +#define ARCH_PEBS_CNTR_FIXED BIT_ULL(37)
> >> +#define ARCH_PEBS_CNTR_METRICS BIT_ULL(38)
> >> #define ARCH_PEBS_LBR_SHIFT 40
> >> #define ARCH_PEBS_LBR (0x3ull << ARCH_PEBS_LBR_SHIFT)
> >> #define ARCH_PEBS_VECR_XMM BIT_ULL(49)
> >> #define ARCH_PEBS_GPR BIT_ULL(61)
> >> #define ARCH_PEBS_AUX BIT_ULL(62)
> >> #define ARCH_PEBS_EN BIT_ULL(63)
> >> +#define ARCH_PEBS_CNTR_MASK (ARCH_PEBS_CNTR_GP | ARCH_PEBS_CNTR_FIXED | \
> >> + ARCH_PEBS_CNTR_METRICS)
> >>
> >> #define MSR_IA32_RTIT_CTL 0x00000570
> >> #define RTIT_CTL_TRACEEN BIT(0)
> >> diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
> >> index 3b3848f0d339..7276ba70c88a 100644
> >> --- a/arch/x86/include/asm/perf_event.h
> >> +++ b/arch/x86/include/asm/perf_event.h
> >> @@ -141,16 +141,16 @@
> >> #define ARCH_PERFMON_EVENTS_COUNT 7
> >>
> >> #define PEBS_DATACFG_MEMINFO BIT_ULL(0)
> >> -#define PEBS_DATACFG_GP BIT_ULL(1)
> >> +#define PEBS_DATACFG_GP BIT_ULL(1)
> >> #define PEBS_DATACFG_XMMS BIT_ULL(2)
> >> #define PEBS_DATACFG_LBRS BIT_ULL(3)
> >> -#define PEBS_DATACFG_LBR_SHIFT 24
> >> #define PEBS_DATACFG_CNTR BIT_ULL(4)
> >> +#define PEBS_DATACFG_METRICS BIT_ULL(5)
> >> +#define PEBS_DATACFG_LBR_SHIFT 24
> >> #define PEBS_DATACFG_CNTR_SHIFT 32
> >> #define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0)
> >> #define PEBS_DATACFG_FIX_SHIFT 48
> >> #define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0)
> >> -#define PEBS_DATACFG_METRICS BIT_ULL(5)
> >>
> >> /* Steal the highest bit of pebs_data_cfg for SW usage */
> >> #define PEBS_UPDATE_DS_SW BIT_ULL(63)
> >> @@ -603,6 +603,13 @@ struct arch_pebs_lbr_header {
> >> u64 ler_info;
> >> };
> >>
> >> +struct arch_pebs_cntr_header {
> >> + u32 cntr;
> >> + u32 fixed;
> >> + u32 metrics;
> >> + u32 reserved;
> >> +};
> >> +
> >> /*
> >> * AMD Extended Performance Monitoring and Debug cpuid feature detection
> >> */
> >> --
> >> 2.34.1
> >>
>