[Patch v8 12/23] perf: Add sampling support for SIMD registers

From: Dapeng Mi

Date: Fri May 29 2026 - 04:06:06 EST


From: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>

Users may be interested in sampling SIMD registers during profiling.
The current sample_regs_* bitmask fields do not have sufficient space
to represent all SIMD registers.

To address this, new attribute fields sample_simd_{pred|vec}_reg_* are
added to struct perf_event_attr to represent the SIMD registers that are
expected to be sampled.

Currently, the perf/x86 code supports XMM registers in sample_regs_*.
To unify the configuration of SIMD registers and ensure a consistent
method for configuring XMM and other SIMD registers, a new event
attribute field, sample_simd_regs_enabled, is introduced. When
sample_simd_regs_enabled is set, it indicates that all SIMD registers,
including XMM, will be represented by the newly introduced
sample_simd_{pred|vec}_reg_* fields. The original XMM space in
sample_regs_* is reserved for future use.

Since SIMD registers are wider than 64 bits, a new output format is
introduced. The number and width of SIMD registers are dumped first,
followed by the register values. The number and width are based on the
user's configuration.

A new ABI, PERF_SAMPLE_REGS_ABI_SIMD, is added to indicate the new format.
The enum perf_sample_regs_abi is now a bitmap. This change should not
impact existing tools, as the numeric values of PERF_SAMPLE_REGS_ABI_32
and PERF_SAMPLE_REGS_ABI_64 remain 1 and 2.

Additionally, two new __weak functions are introduced:
- perf_simd_reg_value(): Retrieves the value of the requested SIMD
register.
- perf_simd_reg_validate(): Validates the configuration of the SIMD
registers.

A new flag, PERF_PMU_CAP_SIMD_REGS, is added to indicate that the PMU
supports SIMD register dumping. Event initialization fails with
-EOPNOTSUPP if sample_simd_regs_enabled is mistakenly set for a PMU that
does not advertise this capability.

Suggested-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Signed-off-by: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
Co-developed-by: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx>
Signed-off-by: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx>
---
include/linux/perf_event.h | 8 ++
include/linux/perf_regs.h | 6 ++
include/uapi/linux/perf_event.h | 49 +++++++++-
kernel/events/core.c | 153 +++++++++++++++++++++++++++++---
4 files changed, 202 insertions(+), 14 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fb38affa7352..5f0642ef4fd2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -306,6 +306,7 @@ struct perf_event_pmu_context;
#define PERF_PMU_CAP_AUX_PAUSE 0x0200
#define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400
#define PERF_PMU_CAP_MEDIATED_VPMU 0x0800
+#define PERF_PMU_CAP_SIMD_REGS 0x1000

/**
* pmu::scope
@@ -1534,6 +1535,13 @@ perf_event__output_id_sample(struct perf_event *event,
extern void
perf_log_lost_samples(struct perf_event *event, u64 lost);

+static inline bool event_has_simd_regs(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+
+ return attr->sample_simd_regs_enabled != 0;
+}
+
static inline bool event_has_extended_regs(struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
diff --git a/include/linux/perf_regs.h b/include/linux/perf_regs.h
index 144bcc3ff19f..8fa3eeb14953 100644
--- a/include/linux/perf_regs.h
+++ b/include/linux/perf_regs.h
@@ -14,6 +14,12 @@ int perf_reg_validate(u64 mask);
u64 perf_reg_abi(struct task_struct *task);
void perf_get_regs_user(struct perf_regs *regs_user,
struct pt_regs *regs);
+int perf_simd_reg_validate(u16 simd_enabled, u16 vec_qwords,
+ u64 vec_mask_intr, u64 vec_mask_user,
+ u16 pred_qwords, u32 pred_mask_intr,
+ u32 pred_mask_user);
+u64 perf_simd_reg_value(struct pt_regs *regs, int idx,
+ u16 qwords_idx, bool pred);

#ifdef CONFIG_HAVE_PERF_REGS
#include <asm/perf_regs.h>
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index fd10aa8d697f..c49fc76292f7 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -314,8 +314,9 @@ enum {
*/
enum perf_sample_regs_abi {
PERF_SAMPLE_REGS_ABI_NONE = 0,
- PERF_SAMPLE_REGS_ABI_32 = 1,
- PERF_SAMPLE_REGS_ABI_64 = 2,
+ PERF_SAMPLE_REGS_ABI_32 = (1 << 0),
+ PERF_SAMPLE_REGS_ABI_64 = (1 << 1),
+ PERF_SAMPLE_REGS_ABI_SIMD = (1 << 2),
};

/*
@@ -383,6 +384,7 @@ enum perf_event_read_format {
#define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */
#define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */
#define PERF_ATTR_SIZE_VER9 144 /* add: config4 */
+#define PERF_ATTR_SIZE_VER10 176 /* Add: sample_simd_{vec|pred}_reg_* */

/*
* 'struct perf_event_attr' contains various attributes that define
@@ -547,6 +549,29 @@ struct perf_event_attr {

__u64 config3; /* extension of config2 */
__u64 config4; /* extension of config3 */
+
+ /*
+ * Bitmaps of the SIMD vector and predicate (PRED) registers to
+ * sample, and the length of each register in qwords (8 bytes).
+ *
+ * sample_simd_regs_enabled != 0 indicates that SIMD/PRED
+ * registers are to be sampled. The register bitmaps are given by
+ * sample_simd_{vec|pred}_reg_{intr|user} and the per-register
+ * lengths, in qwords, by sample_simd_{vec|pred}_reg_qwords,
+ * respectively.
+ *
+ * sample_simd_regs_enabled == 0 indicates no SIMD/PRED registers
+ * are sampled.
+ */
+ __u16 sample_simd_regs_enabled;
+ __u16 sample_simd_pred_reg_qwords;
+ __u16 sample_simd_vec_reg_qwords;
+ __u16 __reserved_4;
+
+ __u32 sample_simd_pred_reg_intr;
+ __u32 sample_simd_pred_reg_user;
+ __u64 sample_simd_vec_reg_intr;
+ __u64 sample_simd_vec_reg_user;
};

/*
@@ -1020,7 +1045,15 @@ enum perf_event_type {
* } && PERF_SAMPLE_BRANCH_STACK
*
* { u64 abi; # enum perf_sample_regs_abi
- * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
+ * u64 regs[weight(mask)];
+ * struct {
+ * u64 nr_vectors; # 0 ... weight(sample_simd_vec_reg_user)
+ * u64 vector_qwords; # 0 ... sample_simd_vec_reg_qwords
+ * u64 nr_pred; # 0 ... weight(sample_simd_pred_reg_user)
+ * u64 pred_qwords; # 0 ... sample_simd_pred_reg_qwords
+ * u64 data[nr_vectors * vector_qwords + nr_pred * pred_qwords];
+ * } && (abi & PERF_SAMPLE_REGS_ABI_SIMD)
+ * } && PERF_SAMPLE_REGS_USER
*
* { u64 size;
* char data[size];
@@ -1047,7 +1080,15 @@ enum perf_event_type {
* { u64 data_src; } && PERF_SAMPLE_DATA_SRC
* { u64 transaction; } && PERF_SAMPLE_TRANSACTION
* { u64 abi; # enum perf_sample_regs_abi
- * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+ * u64 regs[weight(mask)];
+ * struct {
+ * u64 nr_vectors; # 0 ... weight(sample_simd_vec_reg_intr)
+ * u64 vector_qwords; # 0 ... sample_simd_vec_reg_qwords
+ * u64 nr_pred; # 0 ... weight(sample_simd_pred_reg_intr)
+ * u64 pred_qwords; # 0 ... sample_simd_pred_reg_qwords
+ * u64 data[nr_vectors * vector_qwords + nr_pred * pred_qwords];
+ * } && (abi & PERF_SAMPLE_REGS_ABI_SIMD)
+ * } && PERF_SAMPLE_REGS_INTR
* { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
* { u64 cgroup;} && PERF_SAMPLE_CGROUP
* { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2ce553db4dcb..94bb034da9b9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7791,22 +7791,60 @@ void __weak perf_get_regs_user(struct perf_regs *regs_user,
regs_user->abi = perf_reg_abi(current);
}

+#define word_for_each_set_bit(bit, val) \
+ for (unsigned long long __v = (val); \
+ __v && ((bit = __builtin_ctzll(__v)), 1); \
+ __v &= __v - 1)
+
static void
perf_output_sample_regs(struct perf_output_handle *handle,
struct pt_regs *regs, u64 mask)
{
int bit;
- DECLARE_BITMAP(_mask, 64);
-
- bitmap_from_u64(_mask, mask);
- for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
- u64 val;

- val = perf_reg_value(regs, bit);
+ word_for_each_set_bit(bit, mask) {
+ u64 val = perf_reg_value(regs, bit);
perf_output_put(handle, val);
}
}

+static void
+perf_output_sample_simd_regs(struct perf_output_handle *handle,
+ struct perf_event *event,
+ struct pt_regs *regs,
+ u64 mask, u32 pred_mask)
+{
+ u64 pred_qwords = event->attr.sample_simd_pred_reg_qwords;
+ u64 vec_qwords = event->attr.sample_simd_vec_reg_qwords;
+ u64 nr_vectors = hweight64(mask);
+ u64 nr_pred = hweight32(pred_mask);
+ int bit;
+
+ perf_output_put(handle, nr_vectors);
+ perf_output_put(handle, vec_qwords);
+ perf_output_put(handle, nr_pred);
+ perf_output_put(handle, pred_qwords);
+
+ if (nr_vectors) {
+ word_for_each_set_bit(bit, mask) {
+ for (int i = 0; i < vec_qwords; i++) {
+ u64 val = perf_simd_reg_value(regs, bit,
+ i, false);
+ perf_output_put(handle, val);
+ }
+ }
+ }
+ if (nr_pred) {
+ word_for_each_set_bit(bit, pred_mask) {
+ for (int i = 0; i < pred_qwords; i++) {
+ u64 val = perf_simd_reg_value(regs, bit,
+ i, true);
+ perf_output_put(handle, val);
+ }
+ }
+ }
+}
+
static void perf_sample_regs_user(struct perf_regs *regs_user,
struct pt_regs *regs)
{
@@ -7828,6 +7866,22 @@ static void perf_sample_regs_intr(struct perf_regs *regs_intr,
regs_intr->abi = perf_reg_abi(current);
}

+int __weak perf_simd_reg_validate(u16 simd_enabled, u16 vec_qwords,
+ u64 vec_mask_intr, u64 vec_mask_user,
+ u16 pred_qwords, u32 pred_mask_intr,
+ u32 pred_mask_user)
+{
+ return simd_enabled ||
+ vec_qwords || vec_mask_intr || vec_mask_user ||
+ pred_qwords || pred_mask_intr || pred_mask_user ?
+ -EINVAL : 0;
+}
+
+u64 __weak perf_simd_reg_value(struct pt_regs *regs, int idx,
+ u16 qwords_idx, bool pred)
+{
+ return 0;
+}

/*
* Get remaining task size from user stack pointer.
@@ -8358,10 +8412,17 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_put(handle, abi);

if (abi) {
- u64 mask = event->attr.sample_regs_user;
+ struct perf_event_attr *attr = &event->attr;
+ u64 mask = attr->sample_regs_user;
perf_output_sample_regs(handle,
data->regs_user.regs,
mask);
+ if (abi & PERF_SAMPLE_REGS_ABI_SIMD) {
+ perf_output_sample_simd_regs(handle, event,
+ data->regs_user.regs,
+ attr->sample_simd_vec_reg_user,
+ attr->sample_simd_pred_reg_user);
+ }
}
}

@@ -8389,11 +8450,18 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_put(handle, abi);

if (abi) {
- u64 mask = event->attr.sample_regs_intr;
+ struct perf_event_attr *attr = &event->attr;
+ u64 mask = attr->sample_regs_intr;

perf_output_sample_regs(handle,
data->regs_intr.regs,
mask);
+ if (abi & PERF_SAMPLE_REGS_ABI_SIMD) {
+ perf_output_sample_simd_regs(handle, event,
+ data->regs_intr.regs,
+ attr->sample_simd_vec_reg_intr,
+ attr->sample_simd_pred_reg_intr);
+ }
}
}

@@ -8596,6 +8664,33 @@ static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d)
return d * !!(flags & s);
}

+static u64 perf_update_xregs_size(struct perf_event *event, bool intr)
+{
+ u16 pred_qwords = event->attr.sample_simd_pred_reg_qwords;
+ u16 vec_qwords = event->attr.sample_simd_vec_reg_qwords;
+ u64 pred_mask;
+ u64 mask;
+ int size;
+
+ if (intr) {
+ mask = event->attr.sample_simd_vec_reg_intr;
+ pred_mask = event->attr.sample_simd_pred_reg_intr;
+ } else {
+ mask = event->attr.sample_simd_vec_reg_user;
+ pred_mask = event->attr.sample_simd_pred_reg_user;
+ }
+
+ /* nr_vectors, vector_qwords, nr_pred, pred_qwords */
+ size = sizeof(u64) * 4;
+ size += (hweight64(mask) * vec_qwords +
+ hweight64(pred_mask) * pred_qwords) * sizeof(u64);
+
+ /* Warn if exceeding perf_event_header.size (u16). */
+ WARN_ON_ONCE(size > U16_MAX);
+
+ return size;
+}
+
void perf_prepare_sample(struct perf_sample_data *data,
struct perf_event *event,
struct pt_regs *regs)
@@ -8661,6 +8756,11 @@ void perf_prepare_sample(struct perf_sample_data *data,
size += hweight64(mask) * sizeof(u64);
}

+ if (data->regs_user.abi && event_has_simd_regs(event)) {
+ size += perf_update_xregs_size(event, false);
+ data->regs_user.abi |= PERF_SAMPLE_REGS_ABI_SIMD;
+ }
+
data->dyn_size += size;
data->sample_flags |= PERF_SAMPLE_REGS_USER;
}
@@ -8724,6 +8824,11 @@ void perf_prepare_sample(struct perf_sample_data *data,
size += hweight64(mask) * sizeof(u64);
}

+ if (data->regs_intr.abi && event_has_simd_regs(event)) {
+ size += perf_update_xregs_size(event, true);
+ data->regs_intr.abi |= PERF_SAMPLE_REGS_ABI_SIMD;
+ }
+
data->dyn_size += size;
data->sample_flags |= PERF_SAMPLE_REGS_INTR;
}
@@ -13089,6 +13194,12 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
if (ret)
goto err_pmu;

+ if (!(pmu->capabilities & PERF_PMU_CAP_SIMD_REGS) &&
+ event_has_simd_regs(event)) {
+ ret = -EOPNOTSUPP;
+ goto err_destroy;
+ }
+
if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
event_has_extended_regs(event)) {
ret = -EOPNOTSUPP;
@@ -13585,7 +13696,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,

attr->size = size;

- if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
+ if (attr->__reserved_1 || attr->__reserved_2 ||
+ attr->__reserved_3 || attr->__reserved_4)
return -EINVAL;

if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -13634,6 +13746,15 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
ret = perf_reg_validate(attr->sample_regs_user);
if (ret)
return ret;
+ ret = perf_simd_reg_validate(attr->sample_simd_regs_enabled,
+ attr->sample_simd_vec_reg_qwords,
+ attr->sample_simd_vec_reg_intr,
+ attr->sample_simd_vec_reg_user,
+ attr->sample_simd_pred_reg_qwords,
+ attr->sample_simd_pred_reg_intr,
+ attr->sample_simd_pred_reg_user);
+ if (ret)
+ return ret;
}

if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
@@ -13654,8 +13775,20 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (!attr->sample_max_stack)
attr->sample_max_stack = sysctl_perf_event_max_stack;

- if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
+ if (attr->sample_type & PERF_SAMPLE_REGS_INTR) {
ret = perf_reg_validate(attr->sample_regs_intr);
+ if (ret)
+ return ret;
+ ret = perf_simd_reg_validate(attr->sample_simd_regs_enabled,
+ attr->sample_simd_vec_reg_qwords,
+ attr->sample_simd_vec_reg_intr,
+ attr->sample_simd_vec_reg_user,
+ attr->sample_simd_pred_reg_qwords,
+ attr->sample_simd_pred_reg_intr,
+ attr->sample_simd_pred_reg_user);
+ if (ret)
+ return ret;
+ }

#ifndef CONFIG_CGROUP_PERF
if (attr->sample_type & PERF_SAMPLE_CGROUP)
--
2.34.1