Re: [tip: perf/core] perf/x86/intel: Add support for PEBS memory auxiliary info field in DMR
From: Ian Rogers
Date: Tue Mar 10 2026 - 00:39:19 EST
On Mon, Mar 9, 2026 at 8:32 PM Mi, Dapeng <dapeng1.mi@xxxxxxxxxxxxxxx> wrote:
>
>
> On 3/10/2026 7:47 AM, Ian Rogers wrote:
> > On Thu, Jan 15, 2026 at 1:46 PM tip-bot2 for Dapeng Mi
> > <tip-bot2@xxxxxxxxxxxxx> wrote:
> >> The following commit has been merged into the perf/core branch of tip:
> >>
> >> Commit-ID: d2bdcde9626cbea0c44a6aaa33b440c8adf81e09
> >> Gitweb: https://git.kernel.org/tip/d2bdcde9626cbea0c44a6aaa33b440c8adf81e09
> >> Author: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx>
> >> AuthorDate: Wed, 14 Jan 2026 09:17:45 +08:00
> >> Committer: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> >> CommitterDate: Thu, 15 Jan 2026 10:04:26 +01:00
> >>
> >> perf/x86/intel: Add support for PEBS memory auxiliary info field in DMR
> >>
> >> With the introduction of the OMR feature, the PEBS memory auxiliary info
> >> field for load and store latency events has been restructured for DMR.
> >>
> >> The memory auxiliary info field's bit[8] indicates whether an L2 cache
> >> miss occurred for a memory load or store instruction. If bit[8] is 0,
> >> it signifies no L2 cache miss, and bits[7:0] specify the exact cache data
> >> source (up to the L2 cache level). If bit[8] is 1, bits[7:0] represent
> >> the OMR encoding, indicating the specific L3 cache or memory region
> >> involved in the memory access. A significant enhancement is OMR encoding
> >> provides up to 8 fine-grained memory regions besides the cache region.
> >>
> >> A significant enhancement for OMR encoding is the ability to provide
> >> up to 8 fine-grained memory regions in addition to the cache region,
> >> offering more detailed insights into memory access regions.
> >>
> >> For detailed information on the memory auxiliary info encoding, please
> >> refer to section 16.2 "PEBS LOAD LATENCY AND STORE LATENCY FACILITY" in
> >> the ISE documentation.
> >>
> >> This patch ensures that the PEBS memory auxiliary info field is correctly
> >> interpreted and utilized in DMR.
> >>
> >> Signed-off-by: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx>
> >> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> >> Link: https://patch.msgid.link/20260114011750.350569-3-dapeng1.mi@xxxxxxxxxxxxxxx
> >> ---
> >> arch/x86/events/intel/ds.c | 140 +++++++++++++++++++++++++-
> >> arch/x86/events/perf_event.h | 2 +-
> >> include/uapi/linux/perf_event.h | 27 ++++-
> >> tools/include/uapi/linux/perf_event.h | 27 ++++-
> >> 4 files changed, 190 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> >> index feb1c3c..272e652 100644
> >> --- a/arch/x86/events/intel/ds.c
> >> +++ b/arch/x86/events/intel/ds.c
> >> @@ -34,6 +34,17 @@ struct pebs_record_32 {
> >>
> >> */
> >>
> >> +union omr_encoding {
> >> + struct {
> >> + u8 omr_source : 4;
> >> + u8 omr_remote : 1;
> >> + u8 omr_hitm : 1;
> >> + u8 omr_snoop : 1;
> >> + u8 omr_promoted : 1;
> > Hi Dapeng,
> >
> > omr_snoop and omr_promoted are 1 bit fields here.
>
> Yes. According to the OMR encoding layout in "Table 16-5. OMR Encoding for
> P-Core and E-Core Microarchitectures" of the ISE doc, bit[6] represents
> the snoop information and bit[7] represents promoted prefetch in most
> cases. Bit[7] and bit[6] are combined to represent the snoop information
> only when the omr_source field is 0x2, which is an exception. So bit[6]
> is named omr_snoop and bit[7] is named omr_promoted here. Thanks.
Yep, there were more comments below.
> >
> >> + };
> >> + u8 omr_full;
> >> +};
> >> +
> >> union intel_x86_pebs_dse {
> >> u64 val;
> >> struct {
> >> @@ -73,6 +84,18 @@ union intel_x86_pebs_dse {
> >> unsigned int lnc_addr_blk:1;
> >> unsigned int ld_reserved6:18;
> >> };
> >> + struct {
> >> + unsigned int pnc_dse: 8;
> >> + unsigned int pnc_l2_miss:1;
> >> + unsigned int pnc_stlb_clean_hit:1;
> >> + unsigned int pnc_stlb_any_hit:1;
> >> + unsigned int pnc_stlb_miss:1;
> >> + unsigned int pnc_locked:1;
> >> + unsigned int pnc_data_blk:1;
> >> + unsigned int pnc_addr_blk:1;
> >> + unsigned int pnc_fb_full:1;
> >> + unsigned int ld_reserved8:16;
> >> + };
> >> };
> >>
> >>
> >> @@ -228,6 +251,85 @@ void __init intel_pmu_pebs_data_source_lnl(void)
> >> __intel_pmu_pebs_data_source_cmt(data_source);
> >> }
> >>
> >> +/* Version for Panthercove and later */
> >> +
> >> +/* L2 hit */
> >> +#define PNC_PEBS_DATA_SOURCE_MAX 16
> >> +static u64 pnc_pebs_l2_hit_data_source[PNC_PEBS_DATA_SOURCE_MAX] = {
> >> + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: non-cache access */
> >> + OP_LH | LEVEL(L0) | P(SNOOP, NONE), /* 0x01: L0 hit */
> >> + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x02: L1 hit */
> >> + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x03: L1 Miss Handling Buffer hit */
> >> + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x04: L2 Hit Clean */
> >> + 0, /* 0x05: Reserved */
> >> + 0, /* 0x06: Reserved */
> >> + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HIT), /* 0x07: L2 Hit Snoop HIT */
> >> + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HITM), /* 0x08: L2 Hit Snoop Hit Modified */
> >> + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x09: Prefetch Promotion */
> >> + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x0a: Cross Core Prefetch Promotion */
> >> + 0, /* 0x0b: Reserved */
> >> + 0, /* 0x0c: Reserved */
> >> + 0, /* 0x0d: Reserved */
> >> + 0, /* 0x0e: Reserved */
> >> + OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */
> >> +};
> >> +
> >> +/* L2 miss */
> >> +#define OMR_DATA_SOURCE_MAX 16
> >> +static u64 omr_data_source[OMR_DATA_SOURCE_MAX] = {
> >> + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: invalid */
> >> + 0, /* 0x01: Reserved */
> >> + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_SHARE), /* 0x02: local CA shared cache */
> >> + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_NON_SHARE),/* 0x03: local CA non-shared cache */
> >> + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_IO), /* 0x04: other CA IO agent */
> >> + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_SHARE), /* 0x05: other CA shared cache */
> >> + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_NON_SHARE),/* 0x06: other CA non-shared cache */
> >> + OP_LH | LEVEL(RAM) | P(REGION, MMIO), /* 0x07: MMIO */
> >> + OP_LH | LEVEL(RAM) | P(REGION, MEM0), /* 0x08: Memory region 0 */
> >> + OP_LH | LEVEL(RAM) | P(REGION, MEM1), /* 0x09: Memory region 1 */
> >> + OP_LH | LEVEL(RAM) | P(REGION, MEM2), /* 0x0a: Memory region 2 */
> >> + OP_LH | LEVEL(RAM) | P(REGION, MEM3), /* 0x0b: Memory region 3 */
> >> + OP_LH | LEVEL(RAM) | P(REGION, MEM4), /* 0x0c: Memory region 4 */
> >> + OP_LH | LEVEL(RAM) | P(REGION, MEM5), /* 0x0d: Memory region 5 */
> >> + OP_LH | LEVEL(RAM) | P(REGION, MEM6), /* 0x0e: Memory region 6 */
> >> + OP_LH | LEVEL(RAM) | P(REGION, MEM7), /* 0x0f: Memory region 7 */
> >> +};
> >> +
> >> +static u64 parse_omr_data_source(u8 dse)
> >> +{
> >> + union omr_encoding omr;
> >> + u64 val = 0;
> >> +
> >> + omr.omr_full = dse;
> >> + val = omr_data_source[omr.omr_source];
> >> + if (omr.omr_source > 0x1 && omr.omr_source < 0x7)
> >> + val |= omr.omr_remote ? P(LVL, REM_CCE1) : 0;
> >> + else if (omr.omr_source > 0x7)
> >> + val |= omr.omr_remote ? P(LVL, REM_RAM1) : P(LVL, LOC_RAM);
> >> +
> >> + if (omr.omr_remote)
> >> + val |= REM;
> >> +
> >> + val |= omr.omr_hitm ? P(SNOOP, HITM) : P(SNOOP, HIT);
> >> +
> >> + if (omr.omr_source == 0x2) {
> >> + u8 snoop = omr.omr_snoop | omr.omr_promoted;
> > Or-ing the values together should mean snoop is only ever 0 or 1.
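This is the comment about the OR only yielding 0 or 1: omr_snoop and
omr_promoted are both 1-bit fields, so "omr_snoop | omr_promoted" can
never be 0x2 or 0x3.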
> >> +
> >> + if (snoop == 0x0)
> >> + val |= P(SNOOP, NA);
> >> + else if (snoop == 0x1)
> >> + val |= P(SNOOP, MISS);
> >> + else if (snoop == 0x2)
> >> + val |= P(SNOOP, HIT);
> >> + else if (snoop == 0x3)
> >> + val |= P(SNOOP, NONE);
> > How can snoop equal 0x2 or 0x3 here? Should snoop be "(omr.omr_snoop
> > << 1) | omr.omr_promoted" ?
And then this comment: the values 0x2 and 0x3 seem unreachable.
Thanks,
Ian
> >
> > Thanks,
> > Ian
> >
> >> + } else if (omr.omr_source > 0x2 && omr.omr_source < 0x7) {
> >> + val |= omr.omr_snoop ? P(SNOOPX, FWD) : 0;
> >> + }
> >> +
> >> + return val;
> >> +}
> >> +
> >> static u64 precise_store_data(u64 status)
> >> {
> >> union intel_x86_pebs_dse dse;
> >> @@ -411,6 +513,44 @@ u64 arl_h_latency_data(struct perf_event *event, u64 status)
> >> return lnl_latency_data(event, status);
> >> }
> >>
> >> +u64 pnc_latency_data(struct perf_event *event, u64 status)
> >> +{
> >> + union intel_x86_pebs_dse dse;
> >> + union perf_mem_data_src src;
> >> + u64 val;
> >> +
> >> + dse.val = status;
> >> +
> >> + if (!dse.pnc_l2_miss)
> >> + val = pnc_pebs_l2_hit_data_source[dse.pnc_dse & 0xf];
> >> + else
> >> + val = parse_omr_data_source(dse.pnc_dse);
> >> +
> >> + if (!val)
> >> + val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA);
> >> +
> >> + if (dse.pnc_stlb_miss)
> >> + val |= P(TLB, MISS) | P(TLB, L2);
> >> + else
> >> + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
> >> +
> >> + if (dse.pnc_locked)
> >> + val |= P(LOCK, LOCKED);
> >> +
> >> + if (dse.pnc_data_blk)
> >> + val |= P(BLK, DATA);
> >> + if (dse.pnc_addr_blk)
> >> + val |= P(BLK, ADDR);
> >> + if (!dse.pnc_data_blk && !dse.pnc_addr_blk)
> >> + val |= P(BLK, NA);
> >> +
> >> + src.val = val;
> >> + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
> >> + src.mem_op = P(OP, STORE);
> >> +
> >> + return src.val;
> >> +}
> >> +
> >> static u64 load_latency_data(struct perf_event *event, u64 status)
> >> {
> >> union intel_x86_pebs_dse dse;
> >> diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
> >> index 586e3fd..bd501c2 100644
> >> --- a/arch/x86/events/perf_event.h
> >> +++ b/arch/x86/events/perf_event.h
> >> @@ -1664,6 +1664,8 @@ u64 lnl_latency_data(struct perf_event *event, u64 status);
> >>
> >> u64 arl_h_latency_data(struct perf_event *event, u64 status);
> >>
> >> +u64 pnc_latency_data(struct perf_event *event, u64 status);
> >> +
> >> extern struct event_constraint intel_core2_pebs_event_constraints[];
> >>
> >> extern struct event_constraint intel_atom_pebs_event_constraints[];
> >> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> >> index c44a8fb..533393e 100644
> >> --- a/include/uapi/linux/perf_event.h
> >> +++ b/include/uapi/linux/perf_event.h
> >> @@ -1330,14 +1330,16 @@ union perf_mem_data_src {
> >> mem_snoopx : 2, /* Snoop mode, ext */
> >> mem_blk : 3, /* Access blocked */
> >> mem_hops : 3, /* Hop level */
> >> - mem_rsvd : 18;
> >> + mem_region : 5, /* cache/memory regions */
> >> + mem_rsvd : 13;
> >> };
> >> };
> >> #elif defined(__BIG_ENDIAN_BITFIELD)
> >> union perf_mem_data_src {
> >> __u64 val;
> >> struct {
> >> - __u64 mem_rsvd : 18,
> >> + __u64 mem_rsvd : 13,
> >> + mem_region : 5, /* cache/memory regions */
> >> mem_hops : 3, /* Hop level */
> >> mem_blk : 3, /* Access blocked */
> >> mem_snoopx : 2, /* Snoop mode, ext */
> >> @@ -1394,7 +1396,7 @@ union perf_mem_data_src {
> >> #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */
> >> #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */
> >> #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */
> >> -/* 0x007 available */
> >> +#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */
> >> #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */
> >> #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */
> >> #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */
> >> @@ -1447,6 +1449,25 @@ union perf_mem_data_src {
> >> /* 5-7 available */
> >> #define PERF_MEM_HOPS_SHIFT 43
> >>
> >> +/* Cache/Memory region */
> >> +#define PERF_MEM_REGION_NA 0x0 /* Invalid */
> >> +#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */
> >> +#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */
> >> +#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */
> >> +#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */
> >> +#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */
> >> +#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */
> >> +#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */
> >> +#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */
> >> +#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */
> >> +#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */
> >> +#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */
> >> +#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */
> >> +#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */
> >> +#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */
> >> +#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */
> >> +#define PERF_MEM_REGION_SHIFT 46
> >> +
> >> #define PERF_MEM_S(a, s) \
> >> (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
> >>
> >> diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
> >> index c44a8fb..d4b9961 100644
> >> --- a/tools/include/uapi/linux/perf_event.h
> >> +++ b/tools/include/uapi/linux/perf_event.h
> >> @@ -1330,14 +1330,16 @@ union perf_mem_data_src {
> >> mem_snoopx : 2, /* Snoop mode, ext */
> >> mem_blk : 3, /* Access blocked */
> >> mem_hops : 3, /* Hop level */
> >> - mem_rsvd : 18;
> >> + mem_region : 5, /* cache/memory regions */
> >> + mem_rsvd : 13;
> >> };
> >> };
> >> #elif defined(__BIG_ENDIAN_BITFIELD)
> >> union perf_mem_data_src {
> >> __u64 val;
> >> struct {
> >> - __u64 mem_rsvd : 18,
> >> + __u64 mem_rsvd : 13,
> >> + mem_region : 5, /* cache/memory regions */
> >> mem_hops : 3, /* Hop level */
> >> mem_blk : 3, /* Access blocked */
> >> mem_snoopx : 2, /* Snoop mode, ext */
> >> @@ -1394,7 +1396,7 @@ union perf_mem_data_src {
> >> #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */
> >> #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */
> >> #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */
> >> -/* 0x007 available */
> >> +#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */
> >> #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */
> >> #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */
> >> #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */
> >> @@ -1447,6 +1449,25 @@ union perf_mem_data_src {
> >> /* 5-7 available */
> >> #define PERF_MEM_HOPS_SHIFT 43
> >>
> >> +/* Cache/Memory region */
> >> +#define PERF_MEM_REGION_NA 0x0 /* Invalid */
> >> +#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */
> >> +#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */
> >> +#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */
> >> +#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */
> >> +#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */
> >> +#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */
> >> +#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */
> >> +#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */
> >> +#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */
> >> +#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */
> >> +#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */
> >> +#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */
> >> +#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */
> >> +#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */
> >> +#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */
> >> +#define PERF_MEM_REGION_SHIFT 46
> >> +
> >> #define PERF_MEM_S(a, s) \
> >> (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
> >>
> >>