[RFC PATCH 2/4] perf: Introduce PERF_SAMPLE_TLS_USER sample type

From: Beau Belgrave
Date: Thu Apr 11 2024 - 20:18:32 EST


When samples are generated, there is no way via the perf_event ABI to
fetch per-thread data. This data is very useful in tracing scenarios
that involve correlation IDs, such as OpenTelemetry. They are also
useful for tracking per-thread performance details directly within a
cooperating user process.

The newly establish OpenTelemetry profiling group requires a way to get
tracing correlations on both Linux and Windows. On Windows this
correlation is on a per-thread basis directly via ETW. On Linux we need
a fast mechanism to store these details and TLS seems like the best
option, see links for more details.

Add a new sample type (PERF_SAMPLE_TLS_USER) that fetches TLS data up to
X bytes per-sample. Use the existing PERF_SAMPLE_STACK_USER ABI for
outputting data out to consumers. Store requested data size by the user
in the previously reserved u16 (__reserved_2) within perf_event_attr.

Add tls_addr and tls_user_size to perf_sample_data and calculate them
during sample preparation. This allows the output side to know if
truncation is going to occur and not having to re-fetch the TLS value
from the user process a second time.

Add CONFIG_HAVE_PERF_USER_TLS_DUMP so that architectures can specify if
they have a TLS specific register (or other logic) that can be used for
dumping. This does not yet enable any architecture to do TLS dump, it
simply makes it possible by allowing a arch defined method named
arch_perf_user_tls_pointer().

Add perf_tls struct that arch_perf_user_tls_pointer() utilizes to set
TLS details of the address and size (for 32bit on 64bit compat cases).

Link: https://opentelemetry.io/blog/2024/profiling/
Link: https://www.elastic.co/blog/continuous-profiling-distributed-tracing-correlation
Signed-off-by: Beau Belgrave <beaub@xxxxxxxxxxxxxxxxxxx>
---
arch/Kconfig | 7 +++
include/linux/perf_event.h | 7 +++
include/uapi/linux/perf_event.h | 5 +-
kernel/events/core.c | 105 +++++++++++++++++++++++++++++++-
kernel/events/internal.h | 16 +++++
5 files changed, 137 insertions(+), 3 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 9f066785bb71..6afaf5f46e2f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -430,6 +430,13 @@ config HAVE_PERF_USER_STACK_DUMP
access to the user stack pointer which is not unified across
architectures.

+config HAVE_PERF_USER_TLS_DUMP
+ bool
+ help
+ Support user tls dumps for perf event samples. This needs
+ access to the user tls pointer which is not unified across
+ architectures.
+
config HAVE_ARCH_JUMP_LABEL
bool

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d2a15c0c6f8a..7fac81929eed 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1202,8 +1202,15 @@ struct perf_sample_data {
u64 data_page_size;
u64 code_page_size;
u64 aux_size;
+ u64 tls_addr;
+ u64 tls_user_size;
} ____cacheline_aligned;

+struct perf_tls {
+ unsigned long base; /* Base address for TLS */
+ unsigned long size; /* Size of base address */
+};
+
/* default value for data source */
#define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\
PERF_MEM_S(LVL, NA) |\
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 3a64499b0f5d..b62669cfe581 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -162,8 +162,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22,
PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23,
PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24,
+ PERF_SAMPLE_TLS_USER = 1U << 25,

- PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 26, /* non-ABI */
};

#define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT)
@@ -509,7 +510,7 @@ struct perf_event_attr {
*/
__u32 aux_watermark;
__u16 sample_max_stack;
- __u16 __reserved_2;
+ __u16 sample_tls_user; /* Size of TLS data to dump on samples */
__u32 aux_sample_size;
__u32 __reserved_3;

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 07de5cc2aa25..f848bf4be9bd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6926,6 +6926,45 @@ static u64 perf_ustack_task_size(struct pt_regs *regs)
return TASK_SIZE - addr;
}

+/*
+ * Get remaining task size from user tls pointer.
+ *
+ * Outputs the address to use for the dump to avoid doing
+ * this twice (prepare and output).
+ */
+static u64
+perf_utls_task_size(struct pt_regs *regs, u64 dump_size, u64 *tls_addr)
+{
+ struct perf_tls tls;
+ unsigned long addr;
+
+ *tls_addr = 0;
+
+ /* No regs, no tls pointer, no dump. */
+ if (!regs)
+ return 0;
+
+ perf_user_tls_pointer(&tls);
+
+ if (WARN_ONCE(tls.size > sizeof(addr), "perf: Bad TLS size.\n"))
+ return 0;
+
+ addr = 0;
+ arch_perf_out_copy_user(&addr, (void *)tls.base, tls.size);
+
+ if (addr < dump_size)
+ return 0;
+
+ addr -= dump_size;
+
+ if (!addr || addr >= TASK_SIZE)
+ return 0;
+
+ *tls_addr = addr;
+
+ return TASK_SIZE - addr;
+}
+
static u16
perf_sample_dump_size(u16 dump_size, u16 header_size, u64 task_size)
{
@@ -6997,6 +7036,43 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
}
}

+static void
+perf_output_sample_utls(struct perf_output_handle *handle, u64 addr,
+ u64 dump_size, struct pt_regs *regs)
+{
+ /* Case of a kernel thread, nothing to dump */
+ if (!regs) {
+ u64 size = 0;
+ perf_output_put(handle, size);
+ } else {
+ unsigned int rem;
+ u64 dyn_size;
+
+ /*
+ * We dump:
+ * static size
+ * - the size requested by user or the best one we can fit
+ * in to the sample max size
+ * data
+ * - user tls dump data
+ * dynamic size
+ * - the actual dumped size
+ */
+
+ /* Static size. */
+ perf_output_put(handle, dump_size);
+
+ /* Data. */
+ rem = __output_copy_user(handle, (void *)addr, dump_size);
+ dyn_size = dump_size - rem;
+
+ perf_output_skip(handle, rem);
+
+ /* Dynamic size. */
+ perf_output_put(handle, dyn_size);
+ }
+}
+
static unsigned long perf_prepare_sample_aux(struct perf_event *event,
struct perf_sample_data *data,
size_t size)
@@ -7474,6 +7550,13 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
perf_output_put(handle, data->code_page_size);

+ if (sample_type & PERF_SAMPLE_TLS_USER) {
+ perf_output_sample_utls(handle,
+ data->tls_addr,
+ data->tls_user_size,
+ data->regs_user.regs);
+ }
+
if (sample_type & PERF_SAMPLE_AUX) {
perf_output_put(handle, data->aux_size);

@@ -7759,6 +7842,19 @@ void perf_prepare_sample(struct perf_sample_data *data,
data->sample_flags |= PERF_SAMPLE_STACK_USER;
}

+ if (filtered_sample_type & PERF_SAMPLE_TLS_USER) {
+ u16 tls_size = event->attr.sample_tls_user;
+ u64 task_size = perf_utls_task_size(data->regs_user.regs,
+ tls_size,
+ &data->tls_addr);
+
+ tls_size = perf_prepare_dump_data(data, event, regs,
+ tls_size, task_size);
+
+ data->tls_user_size = tls_size;
+ data->sample_flags |= PERF_SAMPLE_TLS_USER;
+ }
+
if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
data->weight.full = 0;
data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
@@ -12159,7 +12255,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,

attr->size = size;

- if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
+ if (attr->__reserved_1 || attr->__reserved_3)
return -EINVAL;

if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -12225,6 +12321,13 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
return -EINVAL;
}

+ if (attr->sample_type & PERF_SAMPLE_TLS_USER) {
+ if (!arch_perf_have_user_tls_dump())
+ return -ENOSYS;
+ else if (!IS_ALIGNED(attr->sample_tls_user, sizeof(u64)))
+ return -EINVAL;
+ }
+
if (!attr->sample_max_stack)
attr->sample_max_stack = sysctl_perf_event_max_stack;

diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 5150d5f84c03..b42747b1eb04 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -243,4 +243,20 @@ static inline bool arch_perf_have_user_stack_dump(void)
#define perf_user_stack_pointer(regs) 0
#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */

+#ifdef CONFIG_HAVE_PERF_USER_TLS_DUMP
+static inline bool arch_perf_have_user_tls_dump(void)
+{
+ return true;
+}
+
+#define perf_user_tls_pointer(tls) arch_perf_user_tls_pointer(tls)
+#else
+static inline bool arch_perf_have_user_tls_dump(void)
+{
+ return false;
+}
+
+#define perf_user_tls_pointer(tls) memset(tls, 0, sizeof(*tls))
+#endif /* CONFIG_HAVE_PERF_USER_TLS_DUMP */
+
#endif /* _KERNEL_EVENTS_INTERNAL_H */
--
2.34.1