Re: [tip: perf/core] perf/x86/intel: Add support for PEBS memory auxiliary info field in DMR

From: Mi, Dapeng

Date: Mon Mar 09 2026 - 23:32:37 EST



On 3/10/2026 7:47 AM, Ian Rogers wrote:
> On Thu, Jan 15, 2026 at 1:46 PM tip-bot2 for Dapeng Mi
> <tip-bot2@xxxxxxxxxxxxx> wrote:
>> The following commit has been merged into the perf/core branch of tip:
>>
>> Commit-ID: d2bdcde9626cbea0c44a6aaa33b440c8adf81e09
>> Gitweb: https://git.kernel.org/tip/d2bdcde9626cbea0c44a6aaa33b440c8adf81e09
>> Author: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx>
>> AuthorDate: Wed, 14 Jan 2026 09:17:45 +08:00
>> Committer: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
>> CommitterDate: Thu, 15 Jan 2026 10:04:26 +01:00
>>
>> perf/x86/intel: Add support for PEBS memory auxiliary info field in DMR
>>
>> With the introduction of the OMR feature, the PEBS memory auxiliary info
>> field for load and store latency events has been restructured for DMR.
>>
>> The memory auxiliary info field's bit[8] indicates whether a L2 cache
>> miss occurred for a memory load or store instruction. If bit[8] is 0,
>> it signifies no L2 cache miss, and bits[7:0] specify the exact cache data
>> source (up to the L2 cache level). If bit[8] is 1, bits[7:0] represent
>> the OMR encoding, indicating the specific L3 cache or memory region
>> involved in the memory access. A significant enhancement is OMR encoding
>> provides up to 8 fine-grained memory regions besides the cache region.
>>
>> A significant enhancement for OMR encoding is the ability to provide
>> up to 8 fine-grained memory regions in addition to the cache region,
>> offering more detailed insights into memory access regions.
>>
>> For detailed information on the memory auxiliary info encoding, please
>> refer to section 16.2 "PEBS LOAD LATENCY AND STORE LATENCY FACILITY" in
>> the ISE documentation.
>>
>> This patch ensures that the PEBS memory auxiliary info field is correctly
>> interpreted and utilized in DMR.
>>
>> Signed-off-by: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx>
>> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
>> Link: https://patch.msgid.link/20260114011750.350569-3-dapeng1.mi@xxxxxxxxxxxxxxx
>> ---
>> arch/x86/events/intel/ds.c | 140 +++++++++++++++++++++++++-
>> arch/x86/events/perf_event.h | 2 +-
>> include/uapi/linux/perf_event.h | 27 ++++-
>> tools/include/uapi/linux/perf_event.h | 27 ++++-
>> 4 files changed, 190 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
>> index feb1c3c..272e652 100644
>> --- a/arch/x86/events/intel/ds.c
>> +++ b/arch/x86/events/intel/ds.c
>> @@ -34,6 +34,17 @@ struct pebs_record_32 {
>>
>> */
>>
>> +union omr_encoding {
>> + struct {
>> + u8 omr_source : 4;
>> + u8 omr_remote : 1;
>> + u8 omr_hitm : 1;
>> + u8 omr_snoop : 1;
>> + u8 omr_promoted : 1;
> Hi Dapeng,
>
> omr_snoop and omr_promoted are 1 bit fields here.

Yes. According to the OMR encoding layout in "Table 16-5. OMR Encoding for
P-Core and E-Core Microarchitectures" of the ISE doc, bit[6] represents
the snoop information and bit[7] represents promoted prefetch in most
cases. Bit[7] and bit[6] are combined to represent the snoop information
only when the omr_source field is 0x2, which is an exception. So
bit[6] is named omr_snoop and bit[7] is named omr_promoted here. Thanks.

>
>> + };
>> + u8 omr_full;
>> +};
>> +
>> union intel_x86_pebs_dse {
>> u64 val;
>> struct {
>> @@ -73,6 +84,18 @@ union intel_x86_pebs_dse {
>> unsigned int lnc_addr_blk:1;
>> unsigned int ld_reserved6:18;
>> };
>> + struct {
>> + unsigned int pnc_dse: 8;
>> + unsigned int pnc_l2_miss:1;
>> + unsigned int pnc_stlb_clean_hit:1;
>> + unsigned int pnc_stlb_any_hit:1;
>> + unsigned int pnc_stlb_miss:1;
>> + unsigned int pnc_locked:1;
>> + unsigned int pnc_data_blk:1;
>> + unsigned int pnc_addr_blk:1;
>> + unsigned int pnc_fb_full:1;
>> + unsigned int ld_reserved8:16;
>> + };
>> };
>>
>>
>> @@ -228,6 +251,85 @@ void __init intel_pmu_pebs_data_source_lnl(void)
>> __intel_pmu_pebs_data_source_cmt(data_source);
>> }
>>
>> +/* Version for Panthercove and later */
>> +
>> +/* L2 hit */
>> +#define PNC_PEBS_DATA_SOURCE_MAX 16
>> +static u64 pnc_pebs_l2_hit_data_source[PNC_PEBS_DATA_SOURCE_MAX] = {
>> + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: non-cache access */
>> + OP_LH | LEVEL(L0) | P(SNOOP, NONE), /* 0x01: L0 hit */
>> + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x02: L1 hit */
>> + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x03: L1 Miss Handling Buffer hit */
>> + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x04: L2 Hit Clean */
>> + 0, /* 0x05: Reserved */
>> + 0, /* 0x06: Reserved */
>> + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HIT), /* 0x07: L2 Hit Snoop HIT */
>> + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HITM), /* 0x08: L2 Hit Snoop Hit Modified */
>> + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x09: Prefetch Promotion */
>> + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x0a: Cross Core Prefetch Promotion */
>> + 0, /* 0x0b: Reserved */
>> + 0, /* 0x0c: Reserved */
>> + 0, /* 0x0d: Reserved */
>> + 0, /* 0x0e: Reserved */
>> + OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */
>> +};
>> +
>> +/* L2 miss */
>> +#define OMR_DATA_SOURCE_MAX 16
>> +static u64 omr_data_source[OMR_DATA_SOURCE_MAX] = {
>> + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: invalid */
>> + 0, /* 0x01: Reserved */
>> + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_SHARE), /* 0x02: local CA shared cache */
>> + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_NON_SHARE),/* 0x03: local CA non-shared cache */
>> + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_IO), /* 0x04: other CA IO agent */
>> + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_SHARE), /* 0x05: other CA shared cache */
>> + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_NON_SHARE),/* 0x06: other CA non-shared cache */
>> + OP_LH | LEVEL(RAM) | P(REGION, MMIO), /* 0x07: MMIO */
>> + OP_LH | LEVEL(RAM) | P(REGION, MEM0), /* 0x08: Memory region 0 */
>> + OP_LH | LEVEL(RAM) | P(REGION, MEM1), /* 0x09: Memory region 1 */
>> + OP_LH | LEVEL(RAM) | P(REGION, MEM2), /* 0x0a: Memory region 2 */
>> + OP_LH | LEVEL(RAM) | P(REGION, MEM3), /* 0x0b: Memory region 3 */
>> + OP_LH | LEVEL(RAM) | P(REGION, MEM4), /* 0x0c: Memory region 4 */
>> + OP_LH | LEVEL(RAM) | P(REGION, MEM5), /* 0x0d: Memory region 5 */
>> + OP_LH | LEVEL(RAM) | P(REGION, MEM6), /* 0x0e: Memory region 6 */
>> + OP_LH | LEVEL(RAM) | P(REGION, MEM7), /* 0x0f: Memory region 7 */
>> +};
>> +
>> +static u64 parse_omr_data_source(u8 dse)
>> +{
>> + union omr_encoding omr;
>> + u64 val = 0;
>> +
>> + omr.omr_full = dse;
>> + val = omr_data_source[omr.omr_source];
>> + if (omr.omr_source > 0x1 && omr.omr_source < 0x7)
>> + val |= omr.omr_remote ? P(LVL, REM_CCE1) : 0;
>> + else if (omr.omr_source > 0x7)
>> + val |= omr.omr_remote ? P(LVL, REM_RAM1) : P(LVL, LOC_RAM);
>> +
>> + if (omr.omr_remote)
>> + val |= REM;
>> +
>> + val |= omr.omr_hitm ? P(SNOOP, HITM) : P(SNOOP, HIT);
>> +
>> + if (omr.omr_source == 0x2) {
>> + u8 snoop = omr.omr_snoop | omr.omr_promoted;
> Or-ing the values together should mean snoop is only ever 0 or 1.
>
>> +
>> + if (snoop == 0x0)
>> + val |= P(SNOOP, NA);
>> + else if (snoop == 0x1)
>> + val |= P(SNOOP, MISS);
>> + else if (snoop == 0x2)
>> + val |= P(SNOOP, HIT);
>> + else if (snoop == 0x3)
>> + val |= P(SNOOP, NONE);
> How can snoop equal 0x2 or 0x3 here? Should snoop be "(omr.omr_snoop
> << 1) | omr.omr_promoted" ?
>
> Thanks,
> Ian
>
>> + } else if (omr.omr_source > 0x2 && omr.omr_source < 0x7) {
>> + val |= omr.omr_snoop ? P(SNOOPX, FWD) : 0;
>> + }
>> +
>> + return val;
>> +}
>> +
>> static u64 precise_store_data(u64 status)
>> {
>> union intel_x86_pebs_dse dse;
>> @@ -411,6 +513,44 @@ u64 arl_h_latency_data(struct perf_event *event, u64 status)
>> return lnl_latency_data(event, status);
>> }
>>
>> +u64 pnc_latency_data(struct perf_event *event, u64 status)
>> +{
>> + union intel_x86_pebs_dse dse;
>> + union perf_mem_data_src src;
>> + u64 val;
>> +
>> + dse.val = status;
>> +
>> + if (!dse.pnc_l2_miss)
>> + val = pnc_pebs_l2_hit_data_source[dse.pnc_dse & 0xf];
>> + else
>> + val = parse_omr_data_source(dse.pnc_dse);
>> +
>> + if (!val)
>> + val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA);
>> +
>> + if (dse.pnc_stlb_miss)
>> + val |= P(TLB, MISS) | P(TLB, L2);
>> + else
>> + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
>> +
>> + if (dse.pnc_locked)
>> + val |= P(LOCK, LOCKED);
>> +
>> + if (dse.pnc_data_blk)
>> + val |= P(BLK, DATA);
>> + if (dse.pnc_addr_blk)
>> + val |= P(BLK, ADDR);
>> + if (!dse.pnc_data_blk && !dse.pnc_addr_blk)
>> + val |= P(BLK, NA);
>> +
>> + src.val = val;
>> + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
>> + src.mem_op = P(OP, STORE);
>> +
>> + return src.val;
>> +}
>> +
>> static u64 load_latency_data(struct perf_event *event, u64 status)
>> {
>> union intel_x86_pebs_dse dse;
>> diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
>> index 586e3fd..bd501c2 100644
>> --- a/arch/x86/events/perf_event.h
>> +++ b/arch/x86/events/perf_event.h
>> @@ -1664,6 +1664,8 @@ u64 lnl_latency_data(struct perf_event *event, u64 status);
>>
>> u64 arl_h_latency_data(struct perf_event *event, u64 status);
>>
>> +u64 pnc_latency_data(struct perf_event *event, u64 status);
>> +
>> extern struct event_constraint intel_core2_pebs_event_constraints[];
>>
>> extern struct event_constraint intel_atom_pebs_event_constraints[];
>> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>> index c44a8fb..533393e 100644
>> --- a/include/uapi/linux/perf_event.h
>> +++ b/include/uapi/linux/perf_event.h
>> @@ -1330,14 +1330,16 @@ union perf_mem_data_src {
>> mem_snoopx : 2, /* Snoop mode, ext */
>> mem_blk : 3, /* Access blocked */
>> mem_hops : 3, /* Hop level */
>> - mem_rsvd : 18;
>> + mem_region : 5, /* cache/memory regions */
>> + mem_rsvd : 13;
>> };
>> };
>> #elif defined(__BIG_ENDIAN_BITFIELD)
>> union perf_mem_data_src {
>> __u64 val;
>> struct {
>> - __u64 mem_rsvd : 18,
>> + __u64 mem_rsvd : 13,
>> + mem_region : 5, /* cache/memory regions */
>> mem_hops : 3, /* Hop level */
>> mem_blk : 3, /* Access blocked */
>> mem_snoopx : 2, /* Snoop mode, ext */
>> @@ -1394,7 +1396,7 @@ union perf_mem_data_src {
>> #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */
>> #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */
>> #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */
>> -/* 0x007 available */
>> +#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */
>> #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */
>> #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */
>> #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */
>> @@ -1447,6 +1449,25 @@ union perf_mem_data_src {
>> /* 5-7 available */
>> #define PERF_MEM_HOPS_SHIFT 43
>>
>> +/* Cache/Memory region */
>> +#define PERF_MEM_REGION_NA 0x0 /* Invalid */
>> +#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */
>> +#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */
>> +#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */
>> +#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */
>> +#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */
>> +#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */
>> +#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */
>> +#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */
>> +#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */
>> +#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */
>> +#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */
>> +#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */
>> +#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */
>> +#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */
>> +#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */
>> +#define PERF_MEM_REGION_SHIFT 46
>> +
>> #define PERF_MEM_S(a, s) \
>> (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
>>
>> diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
>> index c44a8fb..d4b9961 100644
>> --- a/tools/include/uapi/linux/perf_event.h
>> +++ b/tools/include/uapi/linux/perf_event.h
>> @@ -1330,14 +1330,16 @@ union perf_mem_data_src {
>> mem_snoopx : 2, /* Snoop mode, ext */
>> mem_blk : 3, /* Access blocked */
>> mem_hops : 3, /* Hop level */
>> - mem_rsvd : 18;
>> + mem_region : 5, /* cache/memory regions */
>> + mem_rsvd : 13;
>> };
>> };
>> #elif defined(__BIG_ENDIAN_BITFIELD)
>> union perf_mem_data_src {
>> __u64 val;
>> struct {
>> - __u64 mem_rsvd : 18,
>> + __u64 mem_rsvd : 13,
>> + mem_region : 5, /* cache/memory regions */
>> mem_hops : 3, /* Hop level */
>> mem_blk : 3, /* Access blocked */
>> mem_snoopx : 2, /* Snoop mode, ext */
>> @@ -1394,7 +1396,7 @@ union perf_mem_data_src {
>> #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */
>> #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */
>> #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */
>> -/* 0x007 available */
>> +#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */
>> #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */
>> #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */
>> #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */
>> @@ -1447,6 +1449,25 @@ union perf_mem_data_src {
>> /* 5-7 available */
>> #define PERF_MEM_HOPS_SHIFT 43
>>
>> +/* Cache/Memory region */
>> +#define PERF_MEM_REGION_NA 0x0 /* Invalid */
>> +#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */
>> +#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */
>> +#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */
>> +#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */
>> +#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */
>> +#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */
>> +#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */
>> +#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */
>> +#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */
>> +#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */
>> +#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */
>> +#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */
>> +#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */
>> +#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */
>> +#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */
>> +#define PERF_MEM_REGION_SHIFT 46
>> +
>> #define PERF_MEM_S(a, s) \
>> (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
>>
>>