[Patch v3 21/22] perf tools: Support to capture more vector registers (x86/Intel)
From: Dapeng Mi
Date: Tue Apr 15 2025 - 04:28:39 EST
Intel architectural PEBS supports capturing more vector registers, such as
the OPMASK/YMM/ZMM registers, in addition to the already supported XMM
registers.
This patch adds Intel-specific support for capturing these new vector
registers in perf tools.
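For example, once both the kernel and tool sides are in place, the new
registers can be requested by name via the existing --intr-regs/--user-regs
options (names as listed by 'perf record -I?'). A hypothetical invocation,
using the names added to sample_reg_masks[] below:

  perf record -e cycles:p --intr-regs=YMM0,ZMM0,OPMASK0 -- ./workload
  perf report -D

The sampled values then appear in the register dump of each PEBS record.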
Co-developed-by: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
Signed-off-by: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
Signed-off-by: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx>
---
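A minimal sketch of how a register index from the extended enum maps onto the
mask words probed by check_ext2_regs_mask(); illustrative only, assuming the
sample_regs_{intr,user}_ext[] layout introduced earlier in this series
(word 0 is the legacy sample_regs_intr/user mask, words 1..N the ext array):

  /* Illustrative only, not part of the patch. */
  static inline void reg_idx_to_word_bit(int reg_idx, int *word, int *bit)
  {
  	/* e.g. PERF_REG_X86_YMM0 (128) -> word 2, i.e. sample_regs_*_ext[1] */
  	*word = reg_idx / 64;
  	*bit = reg_idx % 64;
  }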
tools/arch/x86/include/uapi/asm/perf_regs.h | 79 ++++++++++-
tools/perf/arch/x86/util/perf_regs.c | 129 +++++++++++++++++-
.../perf/util/perf-regs-arch/perf_regs_x86.c | 82 +++++++++++
3 files changed, 285 insertions(+), 5 deletions(-)
diff --git a/tools/arch/x86/include/uapi/asm/perf_regs.h b/tools/arch/x86/include/uapi/asm/perf_regs.h
index 1c7ab5af5cc1..c05c6ec127c8 100644
--- a/tools/arch/x86/include/uapi/asm/perf_regs.h
+++ b/tools/arch/x86/include/uapi/asm/perf_regs.h
@@ -36,7 +36,7 @@ enum perf_event_x86_regs {
/* PERF_REG_INTEL_PT_MAX ignores the SSP register. */
PERF_REG_INTEL_PT_MAX = PERF_REG_X86_R15 + 1,
- /* These all need two bits set because they are 128bit */
+ /* These all need two bits set because they are 128 bits */
PERF_REG_X86_XMM0 = 32,
PERF_REG_X86_XMM1 = 34,
PERF_REG_X86_XMM2 = 36,
@@ -56,6 +56,83 @@ enum perf_event_x86_regs {
/* These include both GPRs and XMMX registers */
PERF_REG_X86_XMM_MAX = PERF_REG_X86_XMM15 + 2,
+
+ /* Leave bits[127:64] for other GP registers, like R16 ~ R31. */
+
+ /*
+ * Each YMM register needs 4 bits because it is 256 bits wide.
+ * PERF_REG_X86_YMM0 = 128
+ */
+ PERF_REG_X86_YMM0 = 128,
+ PERF_REG_X86_YMM1 = PERF_REG_X86_YMM0 + 4,
+ PERF_REG_X86_YMM2 = PERF_REG_X86_YMM1 + 4,
+ PERF_REG_X86_YMM3 = PERF_REG_X86_YMM2 + 4,
+ PERF_REG_X86_YMM4 = PERF_REG_X86_YMM3 + 4,
+ PERF_REG_X86_YMM5 = PERF_REG_X86_YMM4 + 4,
+ PERF_REG_X86_YMM6 = PERF_REG_X86_YMM5 + 4,
+ PERF_REG_X86_YMM7 = PERF_REG_X86_YMM6 + 4,
+ PERF_REG_X86_YMM8 = PERF_REG_X86_YMM7 + 4,
+ PERF_REG_X86_YMM9 = PERF_REG_X86_YMM8 + 4,
+ PERF_REG_X86_YMM10 = PERF_REG_X86_YMM9 + 4,
+ PERF_REG_X86_YMM11 = PERF_REG_X86_YMM10 + 4,
+ PERF_REG_X86_YMM12 = PERF_REG_X86_YMM11 + 4,
+ PERF_REG_X86_YMM13 = PERF_REG_X86_YMM12 + 4,
+ PERF_REG_X86_YMM14 = PERF_REG_X86_YMM13 + 4,
+ PERF_REG_X86_YMM15 = PERF_REG_X86_YMM14 + 4,
+ PERF_REG_X86_YMM_MAX = PERF_REG_X86_YMM15 + 4,
+
+ /*
+ * Each ZMM register needs 8 bits because it is 512 bits wide.
+ * PERF_REG_X86_ZMM0 = 192
+ */
+ PERF_REG_X86_ZMM0 = PERF_REG_X86_YMM_MAX,
+ PERF_REG_X86_ZMM1 = PERF_REG_X86_ZMM0 + 8,
+ PERF_REG_X86_ZMM2 = PERF_REG_X86_ZMM1 + 8,
+ PERF_REG_X86_ZMM3 = PERF_REG_X86_ZMM2 + 8,
+ PERF_REG_X86_ZMM4 = PERF_REG_X86_ZMM3 + 8,
+ PERF_REG_X86_ZMM5 = PERF_REG_X86_ZMM4 + 8,
+ PERF_REG_X86_ZMM6 = PERF_REG_X86_ZMM5 + 8,
+ PERF_REG_X86_ZMM7 = PERF_REG_X86_ZMM6 + 8,
+ PERF_REG_X86_ZMM8 = PERF_REG_X86_ZMM7 + 8,
+ PERF_REG_X86_ZMM9 = PERF_REG_X86_ZMM8 + 8,
+ PERF_REG_X86_ZMM10 = PERF_REG_X86_ZMM9 + 8,
+ PERF_REG_X86_ZMM11 = PERF_REG_X86_ZMM10 + 8,
+ PERF_REG_X86_ZMM12 = PERF_REG_X86_ZMM11 + 8,
+ PERF_REG_X86_ZMM13 = PERF_REG_X86_ZMM12 + 8,
+ PERF_REG_X86_ZMM14 = PERF_REG_X86_ZMM13 + 8,
+ PERF_REG_X86_ZMM15 = PERF_REG_X86_ZMM14 + 8,
+ PERF_REG_X86_ZMM16 = PERF_REG_X86_ZMM15 + 8,
+ PERF_REG_X86_ZMM17 = PERF_REG_X86_ZMM16 + 8,
+ PERF_REG_X86_ZMM18 = PERF_REG_X86_ZMM17 + 8,
+ PERF_REG_X86_ZMM19 = PERF_REG_X86_ZMM18 + 8,
+ PERF_REG_X86_ZMM20 = PERF_REG_X86_ZMM19 + 8,
+ PERF_REG_X86_ZMM21 = PERF_REG_X86_ZMM20 + 8,
+ PERF_REG_X86_ZMM22 = PERF_REG_X86_ZMM21 + 8,
+ PERF_REG_X86_ZMM23 = PERF_REG_X86_ZMM22 + 8,
+ PERF_REG_X86_ZMM24 = PERF_REG_X86_ZMM23 + 8,
+ PERF_REG_X86_ZMM25 = PERF_REG_X86_ZMM24 + 8,
+ PERF_REG_X86_ZMM26 = PERF_REG_X86_ZMM25 + 8,
+ PERF_REG_X86_ZMM27 = PERF_REG_X86_ZMM26 + 8,
+ PERF_REG_X86_ZMM28 = PERF_REG_X86_ZMM27 + 8,
+ PERF_REG_X86_ZMM29 = PERF_REG_X86_ZMM28 + 8,
+ PERF_REG_X86_ZMM30 = PERF_REG_X86_ZMM29 + 8,
+ PERF_REG_X86_ZMM31 = PERF_REG_X86_ZMM30 + 8,
+ PERF_REG_X86_ZMM_MAX = PERF_REG_X86_ZMM31 + 8,
+
+ /*
+ * OPMASK Registers
+ * PERF_REG_X86_OPMASK0 = 448
+ */
+ PERF_REG_X86_OPMASK0 = PERF_REG_X86_ZMM_MAX,
+ PERF_REG_X86_OPMASK1 = PERF_REG_X86_OPMASK0 + 1,
+ PERF_REG_X86_OPMASK2 = PERF_REG_X86_OPMASK1 + 1,
+ PERF_REG_X86_OPMASK3 = PERF_REG_X86_OPMASK2 + 1,
+ PERF_REG_X86_OPMASK4 = PERF_REG_X86_OPMASK3 + 1,
+ PERF_REG_X86_OPMASK5 = PERF_REG_X86_OPMASK4 + 1,
+ PERF_REG_X86_OPMASK6 = PERF_REG_X86_OPMASK5 + 1,
+ PERF_REG_X86_OPMASK7 = PERF_REG_X86_OPMASK6 + 1,
+
+ PERF_REG_X86_VEC_MAX = PERF_REG_X86_OPMASK7 + 1,
};
#define PERF_REG_EXTENDED_MASK (~((1ULL << PERF_REG_X86_XMM0) - 1))
diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c
index 5b163f0a651a..bade6c64770c 100644
--- a/tools/perf/arch/x86/util/perf_regs.c
+++ b/tools/perf/arch/x86/util/perf_regs.c
@@ -54,6 +54,66 @@ static const struct sample_reg sample_reg_masks[] = {
SMPL_REG2(XMM13, PERF_REG_X86_XMM13),
SMPL_REG2(XMM14, PERF_REG_X86_XMM14),
SMPL_REG2(XMM15, PERF_REG_X86_XMM15),
+
+ SMPL_REG4_EXT(YMM0, PERF_REG_X86_YMM0),
+ SMPL_REG4_EXT(YMM1, PERF_REG_X86_YMM1),
+ SMPL_REG4_EXT(YMM2, PERF_REG_X86_YMM2),
+ SMPL_REG4_EXT(YMM3, PERF_REG_X86_YMM3),
+ SMPL_REG4_EXT(YMM4, PERF_REG_X86_YMM4),
+ SMPL_REG4_EXT(YMM5, PERF_REG_X86_YMM5),
+ SMPL_REG4_EXT(YMM6, PERF_REG_X86_YMM6),
+ SMPL_REG4_EXT(YMM7, PERF_REG_X86_YMM7),
+ SMPL_REG4_EXT(YMM8, PERF_REG_X86_YMM8),
+ SMPL_REG4_EXT(YMM9, PERF_REG_X86_YMM9),
+ SMPL_REG4_EXT(YMM10, PERF_REG_X86_YMM10),
+ SMPL_REG4_EXT(YMM11, PERF_REG_X86_YMM11),
+ SMPL_REG4_EXT(YMM12, PERF_REG_X86_YMM12),
+ SMPL_REG4_EXT(YMM13, PERF_REG_X86_YMM13),
+ SMPL_REG4_EXT(YMM14, PERF_REG_X86_YMM14),
+ SMPL_REG4_EXT(YMM15, PERF_REG_X86_YMM15),
+
+ SMPL_REG8_EXT(ZMM0, PERF_REG_X86_ZMM0),
+ SMPL_REG8_EXT(ZMM1, PERF_REG_X86_ZMM1),
+ SMPL_REG8_EXT(ZMM2, PERF_REG_X86_ZMM2),
+ SMPL_REG8_EXT(ZMM3, PERF_REG_X86_ZMM3),
+ SMPL_REG8_EXT(ZMM4, PERF_REG_X86_ZMM4),
+ SMPL_REG8_EXT(ZMM5, PERF_REG_X86_ZMM5),
+ SMPL_REG8_EXT(ZMM6, PERF_REG_X86_ZMM6),
+ SMPL_REG8_EXT(ZMM7, PERF_REG_X86_ZMM7),
+ SMPL_REG8_EXT(ZMM8, PERF_REG_X86_ZMM8),
+ SMPL_REG8_EXT(ZMM9, PERF_REG_X86_ZMM9),
+ SMPL_REG8_EXT(ZMM10, PERF_REG_X86_ZMM10),
+ SMPL_REG8_EXT(ZMM11, PERF_REG_X86_ZMM11),
+ SMPL_REG8_EXT(ZMM12, PERF_REG_X86_ZMM12),
+ SMPL_REG8_EXT(ZMM13, PERF_REG_X86_ZMM13),
+ SMPL_REG8_EXT(ZMM14, PERF_REG_X86_ZMM14),
+ SMPL_REG8_EXT(ZMM15, PERF_REG_X86_ZMM15),
+ SMPL_REG8_EXT(ZMM16, PERF_REG_X86_ZMM16),
+ SMPL_REG8_EXT(ZMM17, PERF_REG_X86_ZMM17),
+ SMPL_REG8_EXT(ZMM18, PERF_REG_X86_ZMM18),
+ SMPL_REG8_EXT(ZMM19, PERF_REG_X86_ZMM19),
+ SMPL_REG8_EXT(ZMM20, PERF_REG_X86_ZMM20),
+ SMPL_REG8_EXT(ZMM21, PERF_REG_X86_ZMM21),
+ SMPL_REG8_EXT(ZMM22, PERF_REG_X86_ZMM22),
+ SMPL_REG8_EXT(ZMM23, PERF_REG_X86_ZMM23),
+ SMPL_REG8_EXT(ZMM24, PERF_REG_X86_ZMM24),
+ SMPL_REG8_EXT(ZMM25, PERF_REG_X86_ZMM25),
+ SMPL_REG8_EXT(ZMM26, PERF_REG_X86_ZMM26),
+ SMPL_REG8_EXT(ZMM27, PERF_REG_X86_ZMM27),
+ SMPL_REG8_EXT(ZMM28, PERF_REG_X86_ZMM28),
+ SMPL_REG8_EXT(ZMM29, PERF_REG_X86_ZMM29),
+ SMPL_REG8_EXT(ZMM30, PERF_REG_X86_ZMM30),
+ SMPL_REG8_EXT(ZMM31, PERF_REG_X86_ZMM31),
+
+ SMPL_REG_EXT(OPMASK0, PERF_REG_X86_OPMASK0),
+ SMPL_REG_EXT(OPMASK1, PERF_REG_X86_OPMASK1),
+ SMPL_REG_EXT(OPMASK2, PERF_REG_X86_OPMASK2),
+ SMPL_REG_EXT(OPMASK3, PERF_REG_X86_OPMASK3),
+ SMPL_REG_EXT(OPMASK4, PERF_REG_X86_OPMASK4),
+ SMPL_REG_EXT(OPMASK5, PERF_REG_X86_OPMASK5),
+ SMPL_REG_EXT(OPMASK6, PERF_REG_X86_OPMASK6),
+ SMPL_REG_EXT(OPMASK7, PERF_REG_X86_OPMASK7),
+
SMPL_REG_END
};
@@ -283,13 +343,59 @@ const struct sample_reg *arch__sample_reg_masks(void)
return sample_reg_masks;
}
-void arch__intr_reg_mask(unsigned long *mask)
+static void check_ext2_regs_mask(struct perf_event_attr *attr, bool user,
+ int idx, u64 fmask, unsigned long *mask)
+{
+ u64 reg_mask[PERF_SAMPLE_ARRAY_SIZE] = { 0 };
+ int fd;
+
+ if (user) {
+ attr->sample_regs_user = 0;
+ attr->sample_regs_user_ext[idx] = fmask;
+ } else {
+ attr->sample_regs_intr = 0;
+ attr->sample_regs_intr_ext[idx] = fmask;
+ }
+
+ /* reg_mask[0] holds the sample_regs_intr/user word, so the ext index is offset by 1. */
+ reg_mask[idx + 1] = fmask;
+
+ fd = sys_perf_event_open(attr, 0, -1, -1, 0);
+ if (fd != -1) {
+ close(fd);
+ bitmap_or(mask, mask, (unsigned long *)reg_mask,
+ PERF_SAMPLE_REGS_NUM);
+ }
+}
+
+#define PERF_REG_EXTENDED_YMM_MASK GENMASK_ULL(63, 0)
+#define PERF_REG_EXTENDED_ZMM_MASK GENMASK_ULL(63, 0)
+#define PERF_REG_EXTENDED_OPMASK_MASK GENMASK_ULL(7, 0)
+
+static void get_ext2_regs_mask(struct perf_event_attr *attr, bool user,
+ unsigned long *mask)
+{
+ event_attr_init(attr);
+
+ /* Check YMM regs, bits 128 ~ 191. */
+ check_ext2_regs_mask(attr, user, 1, PERF_REG_EXTENDED_YMM_MASK, mask);
+ /* Check ZMM 0-7 regs, bits 192 ~ 255. */
+ check_ext2_regs_mask(attr, user, 2, PERF_REG_EXTENDED_ZMM_MASK, mask);
+ /* Check ZMM 8-15 regs, bits 256 ~ 319. */
+ check_ext2_regs_mask(attr, user, 3, PERF_REG_EXTENDED_ZMM_MASK, mask);
+ /* Check ZMM 16-23 regs, bits 320 ~ 383. */
+ check_ext2_regs_mask(attr, user, 4, PERF_REG_EXTENDED_ZMM_MASK, mask);
+ /* Check ZMM 24-31 regs, bits 384 ~ 447. */
+ check_ext2_regs_mask(attr, user, 5, PERF_REG_EXTENDED_ZMM_MASK, mask);
+ /* Check OPMASK regs, bits 448 ~ 455. */
+ check_ext2_regs_mask(attr, user, 6, PERF_REG_EXTENDED_OPMASK_MASK, mask);
+}
+
+static void arch__get_reg_mask(unsigned long *mask, bool user)
{
struct perf_event_attr attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
- .sample_type = PERF_SAMPLE_REGS_INTR,
- .sample_regs_intr = PERF_REG_EXTENDED_MASK,
.precise_ip = 1,
.disabled = 1,
.exclude_kernel = 1,
@@ -298,6 +404,14 @@ void arch__intr_reg_mask(unsigned long *mask)
*(u64 *)mask = PERF_REGS_MASK;
+ if (user) {
+ attr.sample_type = PERF_SAMPLE_REGS_USER;
+ attr.sample_regs_user = PERF_REG_EXTENDED_MASK;
+ } else {
+ attr.sample_type = PERF_SAMPLE_REGS_INTR;
+ attr.sample_regs_intr = PERF_REG_EXTENDED_MASK;
+ }
+
/*
* In an unnamed union, init it here to build on older gcc versions
*/
@@ -325,9 +439,16 @@ void arch__intr_reg_mask(unsigned long *mask)
close(fd);
*(u64 *)mask = PERF_REG_EXTENDED_MASK | PERF_REGS_MASK;
}
+
+ get_ext2_regs_mask(&attr, user, mask);
+}
+
+void arch__intr_reg_mask(unsigned long *mask)
+{
+ arch__get_reg_mask(mask, false);
}
void arch__user_reg_mask(unsigned long *mask)
{
- *(uint64_t *)mask = PERF_REGS_MASK;
+ arch__get_reg_mask(mask, true);
}
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_x86.c b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
index c0e95215b577..eb1e3d716f27 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_x86.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
@@ -78,6 +78,88 @@ const char *__perf_reg_name_x86(int id)
XMM(14)
XMM(15)
#undef XMM
+
+#define YMM(x) \
+ case PERF_REG_X86_YMM ## x: \
+ case PERF_REG_X86_YMM ## x + 1: \
+ case PERF_REG_X86_YMM ## x + 2: \
+ case PERF_REG_X86_YMM ## x + 3: \
+ return "YMM" #x;
+ YMM(0)
+ YMM(1)
+ YMM(2)
+ YMM(3)
+ YMM(4)
+ YMM(5)
+ YMM(6)
+ YMM(7)
+ YMM(8)
+ YMM(9)
+ YMM(10)
+ YMM(11)
+ YMM(12)
+ YMM(13)
+ YMM(14)
+ YMM(15)
+#undef YMM
+
+#define ZMM(x) \
+ case PERF_REG_X86_ZMM ## x: \
+ case PERF_REG_X86_ZMM ## x + 1: \
+ case PERF_REG_X86_ZMM ## x + 2: \
+ case PERF_REG_X86_ZMM ## x + 3: \
+ case PERF_REG_X86_ZMM ## x + 4: \
+ case PERF_REG_X86_ZMM ## x + 5: \
+ case PERF_REG_X86_ZMM ## x + 6: \
+ case PERF_REG_X86_ZMM ## x + 7: \
+ return "ZMM" #x;
+ ZMM(0)
+ ZMM(1)
+ ZMM(2)
+ ZMM(3)
+ ZMM(4)
+ ZMM(5)
+ ZMM(6)
+ ZMM(7)
+ ZMM(8)
+ ZMM(9)
+ ZMM(10)
+ ZMM(11)
+ ZMM(12)
+ ZMM(13)
+ ZMM(14)
+ ZMM(15)
+ ZMM(16)
+ ZMM(17)
+ ZMM(18)
+ ZMM(19)
+ ZMM(20)
+ ZMM(21)
+ ZMM(22)
+ ZMM(23)
+ ZMM(24)
+ ZMM(25)
+ ZMM(26)
+ ZMM(27)
+ ZMM(28)
+ ZMM(29)
+ ZMM(30)
+ ZMM(31)
+#undef ZMM
+
+#define OPMASK(x) \
+ case PERF_REG_X86_OPMASK ## x: \
+ return "opmask" #x;
+
+ OPMASK(0)
+ OPMASK(1)
+ OPMASK(2)
+ OPMASK(3)
+ OPMASK(4)
+ OPMASK(5)
+ OPMASK(6)
+ OPMASK(7)
+#undef OPMASK
default:
return NULL;
}
--
2.40.1