Re: [tip: perf/core] perf/x86/intel: Add support for PEBS memory auxiliary info field in DMR
From: Mi, Dapeng
Date: Tue Mar 10 2026 - 01:04:27 EST
On 3/10/2026 12:38 PM, Ian Rogers wrote:
> On Mon, Mar 9, 2026 at 8:32 PM Mi, Dapeng <dapeng1.mi@xxxxxxxxxxxxxxx> wrote:
>>
>> On 3/10/2026 7:47 AM, Ian Rogers wrote:
>>> On Thu, Jan 15, 2026 at 1:46 PM tip-bot2 for Dapeng Mi
>>> <tip-bot2@xxxxxxxxxxxxx> wrote:
>>>> The following commit has been merged into the perf/core branch of tip:
>>>>
>>>> Commit-ID: d2bdcde9626cbea0c44a6aaa33b440c8adf81e09
>>>> Gitweb: https://git.kernel.org/tip/d2bdcde9626cbea0c44a6aaa33b440c8adf81e09
>>>> Author: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx>
>>>> AuthorDate: Wed, 14 Jan 2026 09:17:45 +08:00
>>>> Committer: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
>>>> CommitterDate: Thu, 15 Jan 2026 10:04:26 +01:00
>>>>
>>>> perf/x86/intel: Add support for PEBS memory auxiliary info field in DMR
>>>>
>>>> With the introduction of the OMR feature, the PEBS memory auxiliary info
>>>> field for load and store latency events has been restructured for DMR.
>>>>
>>>> The memory auxiliary info field's bit[8] indicates whether a L2 cache
>>>> miss occurred for a memory load or store instruction. If bit[8] is 0,
>>>> it signifies no L2 cache miss, and bits[7:0] specify the exact cache data
>>>> source (up to the L2 cache level). If bit[8] is 1, bits[7:0] represent
>>>> the OMR encoding, indicating the specific L3 cache or memory region
>>>> involved in the memory access. A significant enhancement is that the OMR
>>>> encoding provides up to 8 fine-grained memory regions besides the cache region.
>>>>
>>>> A significant enhancement for OMR encoding is the ability to provide
>>>> up to 8 fine-grained memory regions in addition to the cache region,
>>>> offering more detailed insights into memory access regions.
>>>>
>>>> For detailed information on the memory auxiliary info encoding, please
>>>> refer to section 16.2 "PEBS LOAD LATENCY AND STORE LATENCY FACILITY" in
>>>> the ISE documentation.
>>>>
>>>> This patch ensures that the PEBS memory auxiliary info field is correctly
>>>> interpreted and utilized in DMR.
>>>>
>>>> Signed-off-by: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx>
>>>> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
>>>> Link: https://patch.msgid.link/20260114011750.350569-3-dapeng1.mi@xxxxxxxxxxxxxxx
>>>> ---
>>>> arch/x86/events/intel/ds.c | 140 +++++++++++++++++++++++++-
>>>> arch/x86/events/perf_event.h | 2 +-
>>>> include/uapi/linux/perf_event.h | 27 ++++-
>>>> tools/include/uapi/linux/perf_event.h | 27 ++++-
>>>> 4 files changed, 190 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
>>>> index feb1c3c..272e652 100644
>>>> --- a/arch/x86/events/intel/ds.c
>>>> +++ b/arch/x86/events/intel/ds.c
>>>> @@ -34,6 +34,17 @@ struct pebs_record_32 {
>>>>
>>>> */
>>>>
>>>> +union omr_encoding {
>>>> + struct {
>>>> + u8 omr_source : 4;
>>>> + u8 omr_remote : 1;
>>>> + u8 omr_hitm : 1;
>>>> + u8 omr_snoop : 1;
>>>> + u8 omr_promoted : 1;
>>> Hi Dapeng,
>>>
>>> omr_snoop and omr_promoted are 1 bit fields here.
Yes. According to the OMR encoding layout in "Table 16-5. OMR Encoding for
P-Core and E-Core Microarchitectures" of the ISE doc, bit[6] represents
the snoop information and bit[7] represents promoted prefetch in most
cases. Although bit[7] and bit[6] are combined to represent the snoop
information when the omr_source field is 0x2, that is only an exception. So
bit[6] is named omr_snoop and bit[7] is named omr_promoted here. Thanks.
> Yep, there were more comments below.
>
>>>> + };
>>>> + u8 omr_full;
>>>> +};
>>>> +
>>>> union intel_x86_pebs_dse {
>>>> u64 val;
>>>> struct {
>>>> @@ -73,6 +84,18 @@ union intel_x86_pebs_dse {
>>>> unsigned int lnc_addr_blk:1;
>>>> unsigned int ld_reserved6:18;
>>>> };
>>>> + struct {
>>>> + unsigned int pnc_dse: 8;
>>>> + unsigned int pnc_l2_miss:1;
>>>> + unsigned int pnc_stlb_clean_hit:1;
>>>> + unsigned int pnc_stlb_any_hit:1;
>>>> + unsigned int pnc_stlb_miss:1;
>>>> + unsigned int pnc_locked:1;
>>>> + unsigned int pnc_data_blk:1;
>>>> + unsigned int pnc_addr_blk:1;
>>>> + unsigned int pnc_fb_full:1;
>>>> + unsigned int ld_reserved8:16;
>>>> + };
>>>> };
>>>>
>>>>
>>>> @@ -228,6 +251,85 @@ void __init intel_pmu_pebs_data_source_lnl(void)
>>>> __intel_pmu_pebs_data_source_cmt(data_source);
>>>> }
>>>>
>>>> +/* Version for Panthercove and later */
>>>> +
>>>> +/* L2 hit */
>>>> +#define PNC_PEBS_DATA_SOURCE_MAX 16
>>>> +static u64 pnc_pebs_l2_hit_data_source[PNC_PEBS_DATA_SOURCE_MAX] = {
>>>> + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: non-cache access */
>>>> + OP_LH | LEVEL(L0) | P(SNOOP, NONE), /* 0x01: L0 hit */
>>>> + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x02: L1 hit */
>>>> + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x03: L1 Miss Handling Buffer hit */
>>>> + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x04: L2 Hit Clean */
>>>> + 0, /* 0x05: Reserved */
>>>> + 0, /* 0x06: Reserved */
>>>> + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HIT), /* 0x07: L2 Hit Snoop HIT */
>>>> + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HITM), /* 0x08: L2 Hit Snoop Hit Modified */
>>>> + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x09: Prefetch Promotion */
>>>> + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x0a: Cross Core Prefetch Promotion */
>>>> + 0, /* 0x0b: Reserved */
>>>> + 0, /* 0x0c: Reserved */
>>>> + 0, /* 0x0d: Reserved */
>>>> + 0, /* 0x0e: Reserved */
>>>> + OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */
>>>> +};
>>>> +
>>>> +/* L2 miss */
>>>> +#define OMR_DATA_SOURCE_MAX 16
>>>> +static u64 omr_data_source[OMR_DATA_SOURCE_MAX] = {
>>>> + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: invalid */
>>>> + 0, /* 0x01: Reserved */
>>>> + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_SHARE), /* 0x02: local CA shared cache */
>>>> + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_NON_SHARE),/* 0x03: local CA non-shared cache */
>>>> + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_IO), /* 0x04: other CA IO agent */
>>>> + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_SHARE), /* 0x05: other CA shared cache */
>>>> + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_NON_SHARE),/* 0x06: other CA non-shared cache */
>>>> + OP_LH | LEVEL(RAM) | P(REGION, MMIO), /* 0x07: MMIO */
>>>> + OP_LH | LEVEL(RAM) | P(REGION, MEM0), /* 0x08: Memory region 0 */
>>>> + OP_LH | LEVEL(RAM) | P(REGION, MEM1), /* 0x09: Memory region 1 */
>>>> + OP_LH | LEVEL(RAM) | P(REGION, MEM2), /* 0x0a: Memory region 2 */
>>>> + OP_LH | LEVEL(RAM) | P(REGION, MEM3), /* 0x0b: Memory region 3 */
>>>> + OP_LH | LEVEL(RAM) | P(REGION, MEM4), /* 0x0c: Memory region 4 */
>>>> + OP_LH | LEVEL(RAM) | P(REGION, MEM5), /* 0x0d: Memory region 5 */
>>>> + OP_LH | LEVEL(RAM) | P(REGION, MEM6), /* 0x0e: Memory region 6 */
>>>> + OP_LH | LEVEL(RAM) | P(REGION, MEM7), /* 0x0f: Memory region 7 */
>>>> +};
>>>> +
>>>> +static u64 parse_omr_data_source(u8 dse)
>>>> +{
>>>> + union omr_encoding omr;
>>>> + u64 val = 0;
>>>> +
>>>> + omr.omr_full = dse;
>>>> + val = omr_data_source[omr.omr_source];
>>>> + if (omr.omr_source > 0x1 && omr.omr_source < 0x7)
>>>> + val |= omr.omr_remote ? P(LVL, REM_CCE1) : 0;
>>>> + else if (omr.omr_source > 0x7)
>>>> + val |= omr.omr_remote ? P(LVL, REM_RAM1) : P(LVL, LOC_RAM);
>>>> +
>>>> + if (omr.omr_remote)
>>>> + val |= REM;
>>>> +
>>>> + val |= omr.omr_hitm ? P(SNOOP, HITM) : P(SNOOP, HIT);
>>>> +
>>>> + if (omr.omr_source == 0x2) {
>>>> + u8 snoop = omr.omr_snoop | omr.omr_promoted;
>>> Or-ing the values together should mean snoop is only ever 0 or 1.
> This comment about the OR only yielding 0 or 1.
Oh, yes, it's a bug. I will submit a patch to fix it. Thanks a lot.
>
>>>> +
>>>> + if (snoop == 0x0)
>>>> + val |= P(SNOOP, NA);
>>>> + else if (snoop == 0x1)
>>>> + val |= P(SNOOP, MISS);
>>>> + else if (snoop == 0x2)
>>>> + val |= P(SNOOP, HIT);
>>>> + else if (snoop == 0x3)
>>>> + val |= P(SNOOP, NONE);
>>> How can snoop equal 0x2 or 0x3 here? Should snoop be "(omr.omr_snoop
>>> << 1) | omr.omr_promoted" ?
> And then this comment: the values 0x2 and 0x3 seem unreachable.
>
> Thanks,
> Ian
>
>>> Thanks,
>>> Ian
>>>
>>>> + } else if (omr.omr_source > 0x2 && omr.omr_source < 0x7) {
>>>> + val |= omr.omr_snoop ? P(SNOOPX, FWD) : 0;
>>>> + }
>>>> +
>>>> + return val;
>>>> +}
>>>> +
>>>> static u64 precise_store_data(u64 status)
>>>> {
>>>> union intel_x86_pebs_dse dse;
>>>> @@ -411,6 +513,44 @@ u64 arl_h_latency_data(struct perf_event *event, u64 status)
>>>> return lnl_latency_data(event, status);
>>>> }
>>>>
>>>> +u64 pnc_latency_data(struct perf_event *event, u64 status)
>>>> +{
>>>> + union intel_x86_pebs_dse dse;
>>>> + union perf_mem_data_src src;
>>>> + u64 val;
>>>> +
>>>> + dse.val = status;
>>>> +
>>>> + if (!dse.pnc_l2_miss)
>>>> + val = pnc_pebs_l2_hit_data_source[dse.pnc_dse & 0xf];
>>>> + else
>>>> + val = parse_omr_data_source(dse.pnc_dse);
>>>> +
>>>> + if (!val)
>>>> + val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA);
>>>> +
>>>> + if (dse.pnc_stlb_miss)
>>>> + val |= P(TLB, MISS) | P(TLB, L2);
>>>> + else
>>>> + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
>>>> +
>>>> + if (dse.pnc_locked)
>>>> + val |= P(LOCK, LOCKED);
>>>> +
>>>> + if (dse.pnc_data_blk)
>>>> + val |= P(BLK, DATA);
>>>> + if (dse.pnc_addr_blk)
>>>> + val |= P(BLK, ADDR);
>>>> + if (!dse.pnc_data_blk && !dse.pnc_addr_blk)
>>>> + val |= P(BLK, NA);
>>>> +
>>>> + src.val = val;
>>>> + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
>>>> + src.mem_op = P(OP, STORE);
>>>> +
>>>> + return src.val;
>>>> +}
>>>> +
>>>> static u64 load_latency_data(struct perf_event *event, u64 status)
>>>> {
>>>> union intel_x86_pebs_dse dse;
>>>> diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
>>>> index 586e3fd..bd501c2 100644
>>>> --- a/arch/x86/events/perf_event.h
>>>> +++ b/arch/x86/events/perf_event.h
>>>> @@ -1664,6 +1664,8 @@ u64 lnl_latency_data(struct perf_event *event, u64 status);
>>>>
>>>> u64 arl_h_latency_data(struct perf_event *event, u64 status);
>>>>
>>>> +u64 pnc_latency_data(struct perf_event *event, u64 status);
>>>> +
>>>> extern struct event_constraint intel_core2_pebs_event_constraints[];
>>>>
>>>> extern struct event_constraint intel_atom_pebs_event_constraints[];
>>>> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>>>> index c44a8fb..533393e 100644
>>>> --- a/include/uapi/linux/perf_event.h
>>>> +++ b/include/uapi/linux/perf_event.h
>>>> @@ -1330,14 +1330,16 @@ union perf_mem_data_src {
>>>> mem_snoopx : 2, /* Snoop mode, ext */
>>>> mem_blk : 3, /* Access blocked */
>>>> mem_hops : 3, /* Hop level */
>>>> - mem_rsvd : 18;
>>>> + mem_region : 5, /* cache/memory regions */
>>>> + mem_rsvd : 13;
>>>> };
>>>> };
>>>> #elif defined(__BIG_ENDIAN_BITFIELD)
>>>> union perf_mem_data_src {
>>>> __u64 val;
>>>> struct {
>>>> - __u64 mem_rsvd : 18,
>>>> + __u64 mem_rsvd : 13,
>>>> + mem_region : 5, /* cache/memory regions */
>>>> mem_hops : 3, /* Hop level */
>>>> mem_blk : 3, /* Access blocked */
>>>> mem_snoopx : 2, /* Snoop mode, ext */
>>>> @@ -1394,7 +1396,7 @@ union perf_mem_data_src {
>>>> #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */
>>>> #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */
>>>> #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */
>>>> -/* 0x007 available */
>>>> +#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */
>>>> #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */
>>>> #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */
>>>> #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */
>>>> @@ -1447,6 +1449,25 @@ union perf_mem_data_src {
>>>> /* 5-7 available */
>>>> #define PERF_MEM_HOPS_SHIFT 43
>>>>
>>>> +/* Cache/Memory region */
>>>> +#define PERF_MEM_REGION_NA 0x0 /* Invalid */
>>>> +#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */
>>>> +#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */
>>>> +#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */
>>>> +#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */
>>>> +#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */
>>>> +#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */
>>>> +#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */
>>>> +#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */
>>>> +#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */
>>>> +#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */
>>>> +#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */
>>>> +#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */
>>>> +#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */
>>>> +#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */
>>>> +#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */
>>>> +#define PERF_MEM_REGION_SHIFT 46
>>>> +
>>>> #define PERF_MEM_S(a, s) \
>>>> (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
>>>>
>>>> diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
>>>> index c44a8fb..d4b9961 100644
>>>> --- a/tools/include/uapi/linux/perf_event.h
>>>> +++ b/tools/include/uapi/linux/perf_event.h
>>>> @@ -1330,14 +1330,16 @@ union perf_mem_data_src {
>>>> mem_snoopx : 2, /* Snoop mode, ext */
>>>> mem_blk : 3, /* Access blocked */
>>>> mem_hops : 3, /* Hop level */
>>>> - mem_rsvd : 18;
>>>> + mem_region : 5, /* cache/memory regions */
>>>> + mem_rsvd : 13;
>>>> };
>>>> };
>>>> #elif defined(__BIG_ENDIAN_BITFIELD)
>>>> union perf_mem_data_src {
>>>> __u64 val;
>>>> struct {
>>>> - __u64 mem_rsvd : 18,
>>>> + __u64 mem_rsvd : 13,
>>>> + mem_region : 5, /* cache/memory regions */
>>>> mem_hops : 3, /* Hop level */
>>>> mem_blk : 3, /* Access blocked */
>>>> mem_snoopx : 2, /* Snoop mode, ext */
>>>> @@ -1394,7 +1396,7 @@ union perf_mem_data_src {
>>>> #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */
>>>> #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */
>>>> #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */
>>>> -/* 0x007 available */
>>>> +#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */
>>>> #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */
>>>> #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */
>>>> #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */
>>>> @@ -1447,6 +1449,25 @@ union perf_mem_data_src {
>>>> /* 5-7 available */
>>>> #define PERF_MEM_HOPS_SHIFT 43
>>>>
>>>> +/* Cache/Memory region */
>>>> +#define PERF_MEM_REGION_NA 0x0 /* Invalid */
>>>> +#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */
>>>> +#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */
>>>> +#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */
>>>> +#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */
>>>> +#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */
>>>> +#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */
>>>> +#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */
>>>> +#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */
>>>> +#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */
>>>> +#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */
>>>> +#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */
>>>> +#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */
>>>> +#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */
>>>> +#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */
>>>> +#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */
>>>> +#define PERF_MEM_REGION_SHIFT 46
>>>> +
>>>> #define PERF_MEM_S(a, s) \
>>>> (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
>>>>
>>>>