[PATCH 1/3] perf, x86: Add new cache events table for Haswell

From: Andi Kleen
Date: Mon Feb 09 2015 - 14:17:57 EST


From: Andi Kleen <ak@xxxxxxxxxxxxxxx>

Haswell offcore events are quite different from Sandy Bridge.
Add a new table to handle Haswell properly.

Note that the offcore bits listed in the SDM are not quite correct
(this is currently being fixed). An uptodate list of bits is
in the patch.

The basic setup is similar to Sandy Bridge. The prefetch columns
have been removed, as prefetch counting is not very reliable
on Haswell. One L1 event that is not in the event list anymore
has been also removed.

- data reads do not include code reads (comparable to earlier Sandy
Bridge tables)
- data counts include speculative execution (except L1 write, dtlb, bpu)
- remote node access includes both remote memory, remote cache, remote mmio.
- prefetches are not included in the counts for consistency
(different from Sandy Bridge, which includes prefetches in the remote node)

The events with additional caveats have references to the specification update.

v2: Change name of variable.
v3: Based on older Broadwell patch, but now just for Haswell.
Various fixes with events being more similar to the older
Sandy Bridge tables.
Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
---
arch/x86/kernel/cpu/perf_event_intel.c | 197 ++++++++++++++++++++++++++++++++-
1 file changed, 195 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 498b6d9..387f580 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -415,6 +415,199 @@ static __initconst const u64 snb_hw_cache_event_ids

};

+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes both remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * The events with additional caveats have references to the specification update.
+ */
+
+#define HSW_DEMAND_DATA_RD BIT(0)
+#define HSW_DEMAND_RFO BIT(1)
+#define HSW_PF_L2_RFO BIT(5)
+#define HSW_PF_L3_RFO BIT(8)
+#define HSW_ALL_RFO (HSW_DEMAND_RFO| \
+ HSW_PF_L2_RFO|HSW_PF_L3_RFO)
+#define HSW_ANY_RESPONSE BIT(16)
+#define HSW_SUPPLIER_NONE BIT(17)
+#define HSW_L3_MISS_LOCAL BIT(22)
+#define HSW_L3_MISS_REMOTE_HOP0 BIT(27)
+#define HSW_L3_MISS_REMOTE_HOP1 BIT(28)
+#define HSW_L3_MISS_REMOTE_HOP2P BIT(29)
+#define HSW_L3_MISS (HSW_L3_MISS_LOCAL| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE BIT(31)
+#define HSW_SNOOP_NOT_NEEDED BIT(32)
+#define HSW_SNOOP_MISS BIT(33)
+#define HSW_SNOOP_HIT_NO_FWD BIT(34)
+#define HSW_SNOOP_HIT_WITH_FWD BIT(35)
+#define HSW_SNOOP_HITM BIT(36)
+#define HSW_SNOOP_NON_DRAM BIT(37)
+#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \
+ HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+ HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+ HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+
+static __initconst const u64 hsw_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+ [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+ [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_DATA_RD|
+ HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
+ HSW_SUPPLIER_NONE,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_DATA_RD|
+ HSW_L3_MISS|HSW_ANY_SNOOP|
+ HSW_SUPPLIER_NONE,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_RFO|
+ HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
+ HSW_SUPPLIER_NONE,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_RFO|HSW_L3_MISS|
+ HSW_ANY_SNOOP|HSW_SUPPLIER_NONE,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_DATA_RD|
+ HSW_L3_MISS_LOCAL|HSW_SUPPLIER_NONE|
+ HSW_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_DATA_RD|
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1|
+ HSW_L3_MISS_REMOTE_HOP2P|HSW_SUPPLIER_NONE|
+ HSW_ANY_SNOOP,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_RFO|
+ HSW_L3_MISS_LOCAL|HSW_SUPPLIER_NONE|
+ HSW_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_RFO|
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1|
+ HSW_L3_MISS_REMOTE_HOP2P|HSW_SUPPLIER_NONE|
+ HSW_ANY_SNOOP,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
static __initconst const u64 westmere_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2546,8 +2739,8 @@ __init int intel_pmu_init(void)
case 69: /* 22nm Haswell ULT */
case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
x86_pmu.late_ack = true;
- memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
- memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));

intel_pmu_lbr_init_snb();

--
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/