Re: [Patch v9 08/12] perf/x86/intel: Process arch-PEBS records or record fragments
From: Chun-Tse Shao
Date: Mon Mar 02 2026 - 19:21:16 EST
On Wed, Oct 29, 2025 at 3:39 AM Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx> wrote:
>
> A significant difference from adaptive PEBS is that an arch-PEBS record
> supports fragments: a record can be split into several independent
> fragments, each of which carries its own arch-PEBS header.
>
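(Restating my understanding of the fragment layout, based on the
structs added below: a record split into two fragments would look
roughly like

	+----------------------+  <- at
	| arch_pebs_header     |   .cont = 1, .size = S0
	| basic/aux/gpr groups |
	+----------------------+  <- at + S0
	| arch_pebs_header     |   .cont = 0, .size = S1
	| xmm/lbr groups       |
	+----------------------+  <- at + S0 + S1, next record

with each fragment carrying only the groups its own header
advertises.)
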
> This patch defines the architectural PEBS record layout structures and
> adds helpers to process arch-PEBS records or fragments. Only the legacy
> PEBS groups (basic, GPR, XMM and LBR) are supported by this patch;
> capturing the newly added YMM/ZMM/OPMASK vector registers will be
> supported in the future.
>
> Signed-off-by: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx>
> ---
> arch/x86/events/intel/core.c | 13 +++
> arch/x86/events/intel/ds.c | 184 ++++++++++++++++++++++++++++++
> arch/x86/include/asm/msr-index.h | 6 +
> arch/x86/include/asm/perf_event.h | 96 ++++++++++++++++
> 4 files changed, 299 insertions(+)
>
> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
> index 9ce27b326923..de4dbde28adc 100644
> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -3215,6 +3215,19 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
> status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT;
> }
>
> + /*
> + * Arch PEBS sets bit 54 in the global status register
> + */
> + if (__test_and_clear_bit(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT,
> + (unsigned long *)&status)) {
> + handled++;
> + static_call(x86_pmu_drain_pebs)(regs, &data);
> +
> + if (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS] &&
> + is_pebs_counter_event_group(cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS]))
> + status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT;
> + }
> +
> /*
> * Intel PT
> */
> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> index 68664526443f..fe1bf373409e 100644
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -2270,6 +2270,117 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
> format_group);
> }
>
> +static inline bool arch_pebs_record_continued(struct arch_pebs_header *header)
> +{
> + /* The continue bit or a null PEBS record indicates a fragment follows. */
> + return header->cont || !(header->format & GENMASK_ULL(63, 16));
> +}
> +
> +static void setup_arch_pebs_sample_data(struct perf_event *event,
> + struct pt_regs *iregs,
> + void *__pebs,
> + struct perf_sample_data *data,
> + struct pt_regs *regs)
> +{
> + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> + u64 sample_type = event->attr.sample_type;
> + struct arch_pebs_header *header = NULL;
> + struct arch_pebs_aux *meminfo = NULL;
> + struct arch_pebs_gprs *gprs = NULL;
> + struct x86_perf_regs *perf_regs;
> + void *next_record;
> + void *at = __pebs;
> +
> + if (at == NULL)
> + return;
> +
> + perf_regs = container_of(regs, struct x86_perf_regs, regs);
> + perf_regs->xmm_regs = NULL;
> +
> + __setup_perf_sample_data(event, iregs, data);
> +
> + *regs = *iregs;
> +
> +again:
> + header = at;
> + next_record = at + sizeof(struct arch_pebs_header);
> + if (header->basic) {
> + struct arch_pebs_basic *basic = next_record;
> + u16 retire = 0;
> +
> + next_record = basic + 1;
> +
> + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT)
> + retire = basic->valid ? basic->retire : 0;
> + __setup_pebs_basic_group(event, regs, data, sample_type,
> + basic->ip, basic->tsc, retire);
> + }
> +
> + /*
> + * The MEMINFO record sits in front of the GPR record,
> + * but PERF_SAMPLE_TRANSACTION needs gprs->ax.
> + * Save the pointer here and process it later.
> + */
> + if (header->aux) {
> + meminfo = next_record;
> + next_record = meminfo + 1;
> + }
> +
> + if (header->gpr) {
> + gprs = next_record;
> + next_record = gprs + 1;
> +
> + __setup_pebs_gpr_group(event, regs,
> + (struct pebs_gprs *)gprs,
> + sample_type);
> + }
> +
> + if (header->aux) {
> + u64 ax = gprs ? gprs->ax : 0;
> +
> + __setup_pebs_meminfo_group(event, data, sample_type,
> + meminfo->cache_latency,
> + meminfo->instr_latency,
> + meminfo->address, meminfo->aux,
> + meminfo->tsx_tuning, ax);
> + }
> +
> + if (header->xmm) {
> + struct pebs_xmm *xmm;
> +
> + next_record += sizeof(struct arch_pebs_xer_header);
> +
> + xmm = next_record;
> + perf_regs->xmm_regs = xmm->xmm;
> + next_record = xmm + 1;
> + }
> +
> + if (header->lbr) {
> + struct arch_pebs_lbr_header *lbr_header = next_record;
> + struct lbr_entry *lbr;
> + int num_lbr;
> +
> + next_record = lbr_header + 1;
> + lbr = next_record;
> +
> + num_lbr = header->lbr == ARCH_PEBS_LBR_NUM_VAR ?
> + lbr_header->depth :
> + header->lbr * ARCH_PEBS_BASE_LBR_ENTRIES;
> + next_record += num_lbr * sizeof(struct lbr_entry);
> +
> + if (has_branch_stack(event)) {
> + intel_pmu_store_pebs_lbrs(lbr);
> + intel_pmu_lbr_save_brstack(data, cpuc, event);
> + }
> + }
> +
> + /* Parse the following fragments, if any. */
> + if (arch_pebs_record_continued(header)) {
> + at = at + header->size;
If header->size is 0, will it cause an infinite loop?
I can see a 0 check below, but not here.
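
Maybe something like this (untested), mirroring the
WARN_ON_ONCE(!header->size) check in the drain loop below:

	if (arch_pebs_record_continued(header)) {
		/* Bail out on a malformed zero-sized fragment. */
		if (WARN_ON_ONCE(!header->size))
			return;
		at = at + header->size;
		goto again;
	}
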
Thanks,
CT
> + goto again;
> + }
> +}
> +
> static inline void *
> get_next_pebs_record_by_bit(void *base, void *top, int bit)
> {
> @@ -2753,6 +2864,78 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
> setup_pebs_adaptive_sample_data);
> }
>
> +static void intel_pmu_drain_arch_pebs(struct pt_regs *iregs,
> + struct perf_sample_data *data)
> +{
> + short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
> + void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
> + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> + union arch_pebs_index index;
> + struct x86_perf_regs perf_regs;
> + struct pt_regs *regs = &perf_regs.regs;
> + void *base, *at, *top;
> + u64 mask;
> +
> + rdmsrq(MSR_IA32_PEBS_INDEX, index.whole);
> +
> + if (unlikely(!index.wr)) {
> + intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX);
> + return;
> + }
> +
> + base = cpuc->ds_pebs_vaddr;
> + top = (void *)((u64)cpuc->ds_pebs_vaddr +
> + (index.wr << ARCH_PEBS_INDEX_WR_SHIFT));
> +
> + index.wr = 0;
> + index.full = 0;
> + wrmsrq(MSR_IA32_PEBS_INDEX, index.whole);
> +
> + mask = hybrid(cpuc->pmu, arch_pebs_cap).counters & cpuc->pebs_enabled;
> +
> + if (!iregs)
> + iregs = &dummy_iregs;
> +
> + /* Process all but the last event for each counter. */
> + for (at = base; at < top;) {
> + struct arch_pebs_header *header;
> + struct arch_pebs_basic *basic;
> + u64 pebs_status;
> +
> + header = at;
> +
> + if (WARN_ON_ONCE(!header->size))
> + break;
> +
> + /* 1st fragment or single record must have basic group */
> + if (!header->basic) {
> + at += header->size;
> + continue;
> + }
> +
> + basic = at + sizeof(struct arch_pebs_header);
> + pebs_status = mask & basic->applicable_counters;
> + __intel_pmu_handle_pebs_record(iregs, regs, data, at,
> + pebs_status, counts, last,
> + setup_arch_pebs_sample_data);
> +
> + /* Skip non-last fragments */
> + while (arch_pebs_record_continued(header)) {
> + if (!header->size)
> + break;
> + at += header->size;
> + header = at;
> + }
> +
> + /* Skip last fragment or the single record */
> + at += header->size;
> + }
> +
> + __intel_pmu_handle_last_pebs_record(iregs, regs, data, mask,
> + counts, last,
> + setup_arch_pebs_sample_data);
> +}
> +
> static void __init intel_arch_pebs_init(void)
> {
> /*
> @@ -2762,6 +2945,7 @@ static void __init intel_arch_pebs_init(void)
> */
> x86_pmu.arch_pebs = 1;
> x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
> + x86_pmu.drain_pebs = intel_pmu_drain_arch_pebs;
> x86_pmu.pebs_capable = ~0ULL;
>
> x86_pmu.pebs_enable = __intel_pmu_pebs_enable;
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index 9e1720d73244..fc7a4e7c718d 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -327,6 +327,12 @@
> PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \
> PERF_CAP_PEBS_TIMING_INFO)
>
> +/* Arch PEBS */
> +#define MSR_IA32_PEBS_BASE 0x000003f4
> +#define MSR_IA32_PEBS_INDEX 0x000003f5
> +#define ARCH_PEBS_OFFSET_MASK 0x7fffff
> +#define ARCH_PEBS_INDEX_WR_SHIFT 4
> +
> #define MSR_IA32_RTIT_CTL 0x00000570
> #define RTIT_CTL_TRACEEN BIT(0)
> #define RTIT_CTL_CYCLEACC BIT(1)
> diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
> index 0dfa06722bab..3b3848f0d339 100644
> --- a/arch/x86/include/asm/perf_event.h
> +++ b/arch/x86/include/asm/perf_event.h
> @@ -437,6 +437,8 @@ static inline bool is_topdown_idx(int idx)
> #define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT)
> #define GLOBAL_STATUS_TRACE_TOPAPMI_BIT 55
> #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT)
> +#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT 54
> +#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD BIT_ULL(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT)
> #define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48
>
> #define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48)
> @@ -507,6 +509,100 @@ struct pebs_cntr_header {
>
> #define INTEL_CNTR_METRICS 0x3
>
> +/*
> + * Arch PEBS
> + */
> +union arch_pebs_index {
> + struct {
> + u64 rsvd:4,
> + wr:23,
> + rsvd2:4,
> + full:1,
> + en:1,
> + rsvd3:3,
> + thresh:23,
> + rsvd4:5;
> + };
> + u64 whole;
> +};
> +
> +struct arch_pebs_header {
> + union {
> + u64 format;
> + struct {
> + u64 size:16, /* Record size */
> + rsvd:14,
> + mode:1, /* 64BIT_MODE */
> + cont:1,
> + rsvd2:3,
> + cntr:5,
> + lbr:2,
> + rsvd3:7,
> + xmm:1,
> + ymmh:1,
> + rsvd4:2,
> + opmask:1,
> + zmmh:1,
> + h16zmm:1,
> + rsvd5:5,
> + gpr:1,
> + aux:1,
> + basic:1;
> + };
> + };
> + u64 rsvd6;
> +};
> +
> +struct arch_pebs_basic {
> + u64 ip;
> + u64 applicable_counters;
> + u64 tsc;
> + u64 retire :16, /* Retire Latency */
> + valid :1,
> + rsvd :47;
> + u64 rsvd2;
> + u64 rsvd3;
> +};
> +
> +struct arch_pebs_aux {
> + u64 address;
> + u64 rsvd;
> + u64 rsvd2;
> + u64 rsvd3;
> + u64 rsvd4;
> + u64 aux;
> + u64 instr_latency :16,
> + pad2 :16,
> + cache_latency :16,
> + pad3 :16;
> + u64 tsx_tuning;
> +};
> +
> +struct arch_pebs_gprs {
> + u64 flags, ip, ax, cx, dx, bx, sp, bp, si, di;
> + u64 r8, r9, r10, r11, r12, r13, r14, r15, ssp;
> + u64 rsvd;
> +};
> +
> +struct arch_pebs_xer_header {
> + u64 xstate;
> + u64 rsvd;
> +};
> +
> +#define ARCH_PEBS_LBR_NAN 0x0
> +#define ARCH_PEBS_LBR_NUM_8 0x1
> +#define ARCH_PEBS_LBR_NUM_16 0x2
> +#define ARCH_PEBS_LBR_NUM_VAR 0x3
> +#define ARCH_PEBS_BASE_LBR_ENTRIES 8
> +struct arch_pebs_lbr_header {
> + u64 rsvd;
> + u64 ctl;
> + u64 depth;
> + u64 ler_from;
> + u64 ler_to;
> + u64 ler_info;
> +};
> +
> /*
> * AMD Extended Performance Monitoring and Debug cpuid feature detection
> */
> --
> 2.34.1
>
>