[PATCH v1 2/3] perf vendor events intel: Update meteorlake events
From: Ian Rogers
Date: Tue Mar 14 2023 - 01:34:08 EST
Update from 1.00 to 1.01. Event description updates. Addition of
IDQ_BUBBLES.CORE, TOPDOWN.BACKEND_BOUND_SLOTS, UOPS_RETIRED.SLOTS.
Signed-off-by: Ian Rogers <irogers@xxxxxxxxxx>
---
tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +-
.../pmu-events/arch/x86/meteorlake/cache.json | 8 +++++
.../arch/x86/meteorlake/frontend.json | 9 +++++
.../arch/x86/meteorlake/memory.json | 13 +++++--
.../pmu-events/arch/x86/meteorlake/other.json | 4 +--
.../arch/x86/meteorlake/pipeline.json | 36 +++++++++++++++++--
.../arch/x86/meteorlake/virtual-memory.json | 4 +++
7 files changed, 69 insertions(+), 7 deletions(-)
diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 210dd9b2004f..34431709f7d0 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -18,7 +18,7 @@ GenuineIntel-6-3A,v23,ivybridge,core
GenuineIntel-6-3E,v22,ivytown,core
GenuineIntel-6-2D,v22,jaketown,core
GenuineIntel-6-(57|85),v10,knightslanding,core
-GenuineIntel-6-A[AC],v1.00,meteorlake,core
+GenuineIntel-6-A[AC],v1.01,meteorlake,core
GenuineIntel-6-1[AEF],v3,nehalemep,core
GenuineIntel-6-2E,v3,nehalemex,core
GenuineIntel-6-2A,v18,sandybridge,core
diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/cache.json b/tools/perf/pmu-events/arch/x86/meteorlake/cache.json
index 0970724a2984..bf24d3f25a3d 100644
--- a/tools/perf/pmu-events/arch/x86/meteorlake/cache.json
+++ b/tools/perf/pmu-events/arch/x86/meteorlake/cache.json
@@ -3,6 +3,7 @@
"BriefDescription": "L2 code requests",
"EventCode": "0x24",
"EventName": "L2_RQSTS.ALL_CODE_RD",
+ "PublicDescription": "Counts the total number of L2 code requests.",
"SampleAfterValue": "200003",
"UMask": "0xe4",
"Unit": "cpu_core"
@@ -11,6 +12,7 @@
"BriefDescription": "Demand Data Read access L2 cache",
"EventCode": "0x24",
"EventName": "L2_RQSTS.ALL_DEMAND_DATA_RD",
+ "PublicDescription": "Counts Demand Data Read requests accessing the L2 cache. These requests may hit or miss L2 cache. True-miss exclude misses that were merged with ongoing L2 misses. An access is counted once.",
"SampleAfterValue": "200003",
"UMask": "0xe1",
"Unit": "cpu_core"
@@ -19,6 +21,7 @@
"BriefDescription": "Counts the number of cacheable memory requests that miss in the LLC. Counts on a per core basis.",
"EventCode": "0x2e",
"EventName": "LONGEST_LAT_CACHE.MISS",
+ "PublicDescription": "Counts the number of cacheable memory requests that miss in the Last Level Cache (LLC). Requests include demand loads, reads for ownership (RFO), instruction fetches and L1 HW prefetches. If the platform has an L3 cache, the LLC is the L3 cache, otherwise it is the L2 cache. Counts on a per core basis.",
"SampleAfterValue": "200003",
"UMask": "0x41",
"Unit": "cpu_atom"
@@ -27,6 +30,7 @@
"BriefDescription": "Core-originated cacheable requests that missed L3 (Except hardware prefetches to the L3)",
"EventCode": "0x2e",
"EventName": "LONGEST_LAT_CACHE.MISS",
+ "PublicDescription": "Counts core-originated cacheable requests that miss the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3.",
"SampleAfterValue": "100003",
"UMask": "0x41",
"Unit": "cpu_core"
@@ -35,6 +39,7 @@
"BriefDescription": "Counts the number of cacheable memory requests that access the LLC. Counts on a per core basis.",
"EventCode": "0x2e",
"EventName": "LONGEST_LAT_CACHE.REFERENCE",
+ "PublicDescription": "Counts the number of cacheable memory requests that access the Last Level Cache (LLC). Requests include demand loads, reads for ownership (RFO), instruction fetches and L1 HW prefetches. If the platform has an L3 cache, the LLC is the L3 cache, otherwise it is the L2 cache. Counts on a per core basis.",
"SampleAfterValue": "200003",
"UMask": "0x4f",
"Unit": "cpu_atom"
@@ -43,6 +48,7 @@
"BriefDescription": "Core-originated cacheable requests that refer to L3 (Except hardware prefetches to the L3)",
"EventCode": "0x2e",
"EventName": "LONGEST_LAT_CACHE.REFERENCE",
+ "PublicDescription": "Counts core-originated cacheable requests to the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3.",
"SampleAfterValue": "100003",
"UMask": "0x4f",
"Unit": "cpu_core"
@@ -53,6 +59,7 @@
"EventCode": "0xd0",
"EventName": "MEM_INST_RETIRED.ALL_LOADS",
"PEBS": "1",
+ "PublicDescription": "Counts all retired load instructions. This event accounts for SW prefetch instructions of PREFETCHNTA or PREFETCHT0/1/2 or PREFETCHW.",
"SampleAfterValue": "1000003",
"UMask": "0x81",
"Unit": "cpu_core"
@@ -63,6 +70,7 @@
"EventCode": "0xd0",
"EventName": "MEM_INST_RETIRED.ALL_STORES",
"PEBS": "1",
+ "PublicDescription": "Counts all retired store instructions.",
"SampleAfterValue": "1000003",
"UMask": "0x82",
"Unit": "cpu_core"
diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json b/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json
index 7de11819dd0d..66e5609699ea 100644
--- a/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json
@@ -14,5 +14,14 @@
"SampleAfterValue": "200003",
"UMask": "0x2",
"Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "This event counts a subset of the Topdown Slots event that were no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations.",
+ "EventCode": "0x9c",
+ "EventName": "IDQ_BUBBLES.CORE",
+ "PublicDescription": "This event counts a subset of the Topdown Slots event that were no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations.\nThe count may be distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Frontend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.",
+ "SampleAfterValue": "1000003",
+ "UMask": "0x1",
+ "Unit": "cpu_core"
}
]
diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/memory.json b/tools/perf/pmu-events/arch/x86/meteorlake/memory.json
index b7715cec1dbc..20c2efe70eeb 100644
--- a/tools/perf/pmu-events/arch/x86/meteorlake/memory.json
+++ b/tools/perf/pmu-events/arch/x86/meteorlake/memory.json
@@ -7,6 +7,7 @@
"MSRIndex": "0x3F6",
"MSRValue": "0x80",
"PEBS": "2",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 128 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "1009",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -19,6 +20,7 @@
"MSRIndex": "0x3F6",
"MSRValue": "0x10",
"PEBS": "2",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 16 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "20011",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -31,6 +33,7 @@
"MSRIndex": "0x3F6",
"MSRValue": "0x100",
"PEBS": "2",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 256 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "503",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -43,6 +46,7 @@
"MSRIndex": "0x3F6",
"MSRValue": "0x20",
"PEBS": "2",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 32 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "100007",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -55,6 +59,7 @@
"MSRIndex": "0x3F6",
"MSRValue": "0x4",
"PEBS": "2",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 4 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "100003",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -67,6 +72,7 @@
"MSRIndex": "0x3F6",
"MSRValue": "0x200",
"PEBS": "2",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 512 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "101",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -79,6 +85,7 @@
"MSRIndex": "0x3F6",
"MSRValue": "0x40",
"PEBS": "2",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 64 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "2003",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -91,6 +98,7 @@
"MSRIndex": "0x3F6",
"MSRValue": "0x8",
"PEBS": "2",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 8 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "50021",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -101,12 +109,13 @@
"EventCode": "0xcd",
"EventName": "MEM_TRANS_RETIRED.STORE_SAMPLE",
"PEBS": "2",
+ "PublicDescription": "Counts Retired memory accesses with at least 1 store operation. This PEBS event is the precisely-distributed (PDist) trigger covering all stores uops for sampling by the PEBS Store Latency Facility. The facility is described in Intel SDM Volume 3 section 19.9.8",
"SampleAfterValue": "1000003",
"UMask": "0x2",
"Unit": "cpu_core"
},
{
- "BriefDescription": "Counts cacheable demand data reads were not supplied by the L3 cache.",
+ "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.",
"EventCode": "0xB7",
"EventName": "OCR.DEMAND_DATA_RD.L3_MISS",
"MSRIndex": "0x1a6,0x1a7",
@@ -126,7 +135,7 @@
"Unit": "cpu_core"
},
{
- "BriefDescription": "Counts demand reads for ownership, including SWPREFETCHW which is an RFO were not supplied by the L3 cache.",
+ "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.",
"EventCode": "0xB7",
"EventName": "OCR.DEMAND_RFO.L3_MISS",
"MSRIndex": "0x1a6,0x1a7",
diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/other.json b/tools/perf/pmu-events/arch/x86/meteorlake/other.json
index ae98e3d0e149..14e648bf11c5 100644
--- a/tools/perf/pmu-events/arch/x86/meteorlake/other.json
+++ b/tools/perf/pmu-events/arch/x86/meteorlake/other.json
@@ -1,6 +1,6 @@
[
{
- "BriefDescription": "Counts cacheable demand data reads Catch all value for any response types - this includes response types not define in the OCR. If this is set all other response types will be ignored",
+ "BriefDescription": "Counts demand data reads that have any type of response.",
"EventCode": "0xB7",
"EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE",
"MSRIndex": "0x1a6,0x1a7",
@@ -20,7 +20,7 @@
"Unit": "cpu_core"
},
{
- "BriefDescription": "Counts demand reads for ownership, including SWPREFETCHW which is an RFO Catch all value for any response types - this includes response types not define in the OCR. If this is set all other response types will be ignored",
+ "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.",
"EventCode": "0xB7",
"EventName": "OCR.DEMAND_RFO.ANY_RESPONSE",
"MSRIndex": "0x1a6,0x1a7",
diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json b/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json
index 7be7e40c03ac..639789478073 100644
--- a/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json
@@ -4,6 +4,7 @@
"EventCode": "0xc4",
"EventName": "BR_INST_RETIRED.ALL_BRANCHES",
"PEBS": "1",
+ "PublicDescription": "Counts the total number of instructions in which the instruction pointer (IP) of the processor is resteered due to a branch instruction and the branch instruction successfully retires. All branch type instructions are accounted for.",
"SampleAfterValue": "200003",
"Unit": "cpu_atom"
},
@@ -12,6 +13,7 @@
"EventCode": "0xc4",
"EventName": "BR_INST_RETIRED.ALL_BRANCHES",
"PEBS": "1",
+ "PublicDescription": "Counts all branch instructions retired.",
"SampleAfterValue": "400009",
"Unit": "cpu_core"
},
@@ -20,6 +22,7 @@
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.ALL_BRANCHES",
"PEBS": "1",
+ "PublicDescription": "Counts the total number of mispredicted branch instructions retired. All branch type instructions are accounted for. Prediction of the branch target address enables the processor to begin executing instructions before the non-speculative execution path is known. The branch prediction unit (BPU) predicts the target address based on the instruction pointer (IP) of the branch and on the execution path through which execution reached this IP. A branch misprediction occurs when the prediction is wrong, and results in discarding all instructions executed in the speculative path and re-fetching from the correct path.",
"SampleAfterValue": "200003",
"Unit": "cpu_atom"
},
@@ -28,6 +31,7 @@
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.ALL_BRANCHES",
"PEBS": "1",
+ "PublicDescription": "Counts all the retired branch instructions that were mispredicted by the processor. A branch misprediction occurs when the processor incorrectly predicts the destination of the branch. When the misprediction is discovered at execution, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path.",
"SampleAfterValue": "400009",
"Unit": "cpu_core"
},
@@ -39,7 +43,7 @@
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of unhalted core clock cycles[This event is alias to CPU_CLK_UNHALTED.THREAD_P]",
+ "BriefDescription": "Counts the number of unhalted core clock cycles [This event is alias to CPU_CLK_UNHALTED.THREAD_P]",
"EventCode": "0x3c",
"EventName": "CPU_CLK_UNHALTED.CORE_P",
"SampleAfterValue": "2000003",
@@ -55,6 +59,7 @@
{
"BriefDescription": "Reference cycles when the core is not in halt state.",
"EventName": "CPU_CLK_UNHALTED.REF_TSC",
+ "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the eight programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'. After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case.",
"SampleAfterValue": "2000003",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -63,6 +68,7 @@
"BriefDescription": "Reference cycles when the core is not in halt state.",
"EventCode": "0x3c",
"EventName": "CPU_CLK_UNHALTED.REF_TSC_P",
+ "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'. After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case.",
"SampleAfterValue": "2000003",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -77,12 +83,13 @@
{
"BriefDescription": "Core cycles when the thread is not in halt state",
"EventName": "CPU_CLK_UNHALTED.THREAD",
+ "PublicDescription": "Counts the number of core cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter, leaving the eight programmable counters available for other events.",
"SampleAfterValue": "2000003",
"UMask": "0x2",
"Unit": "cpu_core"
},
{
- "BriefDescription": "Counts the number of unhalted core clock cycles[This event is alias to CPU_CLK_UNHALTED.CORE_P]",
+ "BriefDescription": "Counts the number of unhalted core clock cycles [This event is alias to CPU_CLK_UNHALTED.CORE_P]",
"EventCode": "0x3c",
"EventName": "CPU_CLK_UNHALTED.THREAD_P",
"SampleAfterValue": "2000003",
@@ -92,6 +99,7 @@
"BriefDescription": "Thread cycles when thread is not in halt state",
"EventCode": "0x3c",
"EventName": "CPU_CLK_UNHALTED.THREAD_P",
+ "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time.",
"SampleAfterValue": "2000003",
"Unit": "cpu_core"
},
@@ -107,6 +115,7 @@
"BriefDescription": "Number of instructions retired. Fixed Counter - architectural event",
"EventName": "INST_RETIRED.ANY",
"PEBS": "1",
+ "PublicDescription": "Counts the number of X86 instructions retired - an Architectural PerfMon event. Counting continues during hardware interrupts, traps, and inside interrupt handlers. Notes: INST_RETIRED.ANY is counted by a designated fixed counter freeing up programmable counters to count other events. INST_RETIRED.ANY_P is counted by a programmable counter.",
"SampleAfterValue": "2000003",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -124,6 +133,7 @@
"EventCode": "0xc0",
"EventName": "INST_RETIRED.ANY_P",
"PEBS": "1",
+ "PublicDescription": "Counts the number of X86 instructions retired - an Architectural PerfMon event. Counting continues during hardware interrupts, traps, and inside interrupt handlers. Notes: INST_RETIRED.ANY is counted by a designated fixed counter freeing up programmable counters to count other events. INST_RETIRED.ANY_P is counted by a programmable counter.",
"SampleAfterValue": "2000003",
"Unit": "cpu_core"
},
@@ -131,13 +141,24 @@
"BriefDescription": "Loads blocked due to overlapping with a preceding store that cannot be forwarded.",
"EventCode": "0x03",
"EventName": "LD_BLOCKS.STORE_FORWARD",
+ "PublicDescription": "Counts the number of times where store forwarding was prevented for a load operation. The most common case is a load blocked due to the address of memory access (partially) overlapping with a preceding uncompleted store. Note: See the table of not supported store forwards in the Optimization Guide.",
"SampleAfterValue": "100003",
"UMask": "0x82",
"Unit": "cpu_core"
},
+ {
+ "BriefDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions.",
+ "EventCode": "0xa4",
+ "EventName": "TOPDOWN.BACKEND_BOUND_SLOTS",
+ "PublicDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions.\nThe count is distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Backend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.",
+ "SampleAfterValue": "10000003",
+ "UMask": "0x2",
+ "Unit": "cpu_core"
+ },
{
"BriefDescription": "TMA slots available for an unhalted logical processor. Fixed counter - architectural event",
"EventName": "TOPDOWN.SLOTS",
+ "PublicDescription": "Number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method (TMA). The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Software can use this event as the denominator for the top-level metrics of the TMA method. This architectural event is counted on a designated fixed counter (Fixed Counter 3).",
"SampleAfterValue": "10000003",
"UMask": "0x4",
"Unit": "cpu_core"
@@ -146,6 +167,7 @@
"BriefDescription": "TMA slots available for an unhalted logical processor. General counter - architectural event",
"EventCode": "0xa4",
"EventName": "TOPDOWN.SLOTS_P",
+ "PublicDescription": "Counts the number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core.",
"SampleAfterValue": "10000003",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -154,6 +176,7 @@
"BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.",
"EventCode": "0x73",
"EventName": "TOPDOWN_BAD_SPECULATION.ALL",
+ "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window, including relevant microcode flows, and while uops are not yet available in the instruction queue (IQ) or until an FE_BOUND event occurs besides OTHER and CISC. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.",
"SampleAfterValue": "1000003",
"Unit": "cpu_atom"
},
@@ -178,5 +201,14 @@
"PEBS": "1",
"SampleAfterValue": "1000003",
"Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric.",
+ "EventCode": "0xc2",
+ "EventName": "UOPS_RETIRED.SLOTS",
+ "PublicDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric.\nSoftware can use this event as the numerator for the Retiring metric (or top-level category) of the Top-down Microarchitecture Analysis method.",
+ "SampleAfterValue": "2000003",
+ "UMask": "0x2",
+ "Unit": "cpu_core"
}
]
diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/virtual-memory.json b/tools/perf/pmu-events/arch/x86/meteorlake/virtual-memory.json
index 0ee62378bf22..556e4292fcc8 100644
--- a/tools/perf/pmu-events/arch/x86/meteorlake/virtual-memory.json
+++ b/tools/perf/pmu-events/arch/x86/meteorlake/virtual-memory.json
@@ -3,6 +3,7 @@
"BriefDescription": "Load miss in all TLB levels causes a page walk that completes. (All page sizes)",
"EventCode": "0x12",
"EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED",
+ "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data loads. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.",
"SampleAfterValue": "100003",
"UMask": "0xe",
"Unit": "cpu_core"
@@ -11,6 +12,7 @@
"BriefDescription": "Store misses in all TLB levels causes a page walk that completes. (All page sizes)",
"EventCode": "0x13",
"EventName": "DTLB_STORE_MISSES.WALK_COMPLETED",
+ "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data stores. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.",
"SampleAfterValue": "100003",
"UMask": "0xe",
"Unit": "cpu_core"
@@ -19,6 +21,7 @@
"BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to any page size.",
"EventCode": "0x85",
"EventName": "ITLB_MISSES.WALK_COMPLETED",
+ "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size. Includes page walks that page fault.",
"SampleAfterValue": "200003",
"UMask": "0xe",
"Unit": "cpu_atom"
@@ -27,6 +30,7 @@
"BriefDescription": "Code miss in all TLB levels causes a page walk that completes. (All page sizes)",
"EventCode": "0x11",
"EventName": "ITLB_MISSES.WALK_COMPLETED",
+ "PublicDescription": "Counts completed page walks (all page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.",
"SampleAfterValue": "100003",
"UMask": "0xe",
"Unit": "cpu_core"
--
2.40.0.rc1.284.g88254d51c5-goog