[tip:perf/core] perf/x86: Fix data source decoding for Skylake
From: tip-bot for Andi Kleen
Date: Fri Aug 25 2017 - 07:58:24 EST
Commit-ID: 6ae5fa61d27dcb055f4198bcf6c8dbbf1bb33f52
Gitweb: http://git.kernel.org/tip/6ae5fa61d27dcb055f4198bcf6c8dbbf1bb33f52
Author: Andi Kleen <ak@xxxxxxxxxxxxxxx>
AuthorDate: Wed, 16 Aug 2017 15:21:54 -0700
Committer: Ingo Molnar <mingo@xxxxxxxxxx>
CommitDate: Fri, 25 Aug 2017 11:04:17 +0200
perf/x86: Fix data source decoding for Skylake
Skylake changed the encoding of the PEBS data source field.
Some combinations are not available anymore, but some new cases
e.g. for L4 cache hit are added.
Fix up the conversion table for Skylake, similar as had been done
for Nehalem.
On Skylake server the encoding for L4 actually means persistent
memory. Handle this case too.
To properly describe it in the abstracted perf format I had to add
some new fields. Since a hit can have only one level add a new
field that is an enumeration, not a bit field to describe
the level. It can describe any level. Some numbers are also
used to describe PMEM and LFB.
Also add a new generic remote flag that can be combined with
the generic level to signify a remote cache.
And there is an extension field for the snoop indication to handle
the Forward state.
I didn't add a generic flag for hops because it's not needed
for Skylake.
I changed the existing encodings for older CPUs to also fill in the
new level and remote fields.
Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Madhavan Srinivasan <maddy@xxxxxxxxxxxxxxxxxx>
Cc: Michael Ellerman <mpe@xxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: acme@xxxxxxxxxx
Cc: jolsa@xxxxxxxxxx
Link: http://lkml.kernel.org/r/20170816222156.19953-3-andi@xxxxxxxxxxxxxx
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
arch/x86/events/intel/core.c | 2 ++
arch/x86/events/intel/ds.c | 51 ++++++++++++++++++++++++++---------------
arch/x86/events/perf_event.h | 2 ++
include/uapi/linux/perf_event.h | 30 ++++++++++++++++++++++--
4 files changed, 64 insertions(+), 21 deletions(-)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index c3439a3..6f34200 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4208,6 +4208,8 @@ __init int intel_pmu_init(void)
skl_format_attr);
WARN_ON(!x86_pmu.format_attrs);
x86_pmu.cpu_events = hsw_events_attrs;
+ intel_pmu_pebs_data_source_skl(
+ boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
pr_cont("Skylake events, ");
break;
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 3ccdf8c..98e36e0 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -49,34 +49,47 @@ union intel_x86_pebs_dse {
*/
#define P(a, b) PERF_MEM_S(a, b)
#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
+#define LEVEL(x) P(LVLNUM, x)
+#define REM P(REMOTE, REMOTE)
#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
/* Version for Sandy Bridge and later */
static u64 pebs_data_source[] = {
- P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
- OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */
- OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
- OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */
- OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */
- OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */
- OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */
- OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */
- OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */
- OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
- OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */
- OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */
- OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
- OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
- OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */
- OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
+ P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
+ OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */
+ OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
+ OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x03: L2 hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x04: L3 hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */
+ OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */
+ OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
+ OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */
+ OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */
+ OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | SNOOP_NONE_MISS, /* 0x0c: L3 miss, excl */
+ OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */
+ OP_LH | P(LVL, IO) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */
+ OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */
};
/* Patch up minor differences in the bits */
void __init intel_pmu_pebs_data_source_nhm(void)
{
- pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT);
- pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM);
- pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM);
+ pebs_data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT);
+ pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+ pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+}
+
+void __init intel_pmu_pebs_data_source_skl(bool pmem)
+{
+ u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4);
+
+ pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
+ pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT);
+ pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
+ pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD);
+ pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM);
}
static u64 precise_store_data(u64 status)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 2e9636e..0f7dad8 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -948,6 +948,8 @@ void intel_pmu_lbr_init_knl(void);
void intel_pmu_pebs_data_source_nhm(void);
+void intel_pmu_pebs_data_source_skl(bool pmem);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 642db5f..2a37ae9 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -954,14 +954,20 @@ union perf_mem_data_src {
mem_snoop:5, /* snoop mode */
mem_lock:2, /* lock instr */
mem_dtlb:7, /* tlb access */
- mem_rsvd:31;
+ mem_lvl_num:4, /* memory hierarchy level number */
+ mem_remote:1, /* remote */
+ mem_snoopx:2, /* snoop mode, ext */
+ mem_rsvd:24;
};
};
#elif defined(__BIG_ENDIAN_BITFIELD)
union perf_mem_data_src {
__u64 val;
struct {
- __u64 mem_rsvd:31,
+ __u64 mem_rsvd:24,
+ mem_snoopx:2, /* snoop mode, ext */
+ mem_remote:1, /* remote */
+ mem_lvl_num:4, /* memory hierarchy level number */
mem_dtlb:7, /* tlb access */
mem_lock:2, /* lock instr */
mem_snoop:5, /* snoop mode */
@@ -998,6 +1004,22 @@ union perf_mem_data_src {
#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */
#define PERF_MEM_LVL_SHIFT 5
+#define PERF_MEM_REMOTE_REMOTE 0x01 /* Remote */
+#define PERF_MEM_REMOTE_SHIFT 37
+
+#define PERF_MEM_LVLNUM_L1 0x01 /* L1 */
+#define PERF_MEM_LVLNUM_L2 0x02 /* L2 */
+#define PERF_MEM_LVLNUM_L3 0x03 /* L3 */
+#define PERF_MEM_LVLNUM_L4 0x04 /* L4 */
+/* 5-0xa available */
+#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */
+#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */
+#define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */
+#define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */
+#define PERF_MEM_LVLNUM_NA 0x0f /* N/A */
+
+#define PERF_MEM_LVLNUM_SHIFT 33
+
/* snoop mode */
#define PERF_MEM_SNOOP_NA 0x01 /* not available */
#define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */
@@ -1006,6 +1028,10 @@ union perf_mem_data_src {
#define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */
#define PERF_MEM_SNOOP_SHIFT 19
+#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */
+/* 1 free */
+#define PERF_MEM_SNOOPX_SHIFT 37
+
/* locked instruction */
#define PERF_MEM_LOCK_NA 0x01 /* not available */
#define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */