[tip: perf/urgent] perf/core: Add PERF_SAMPLE_CGROUP feature

From: tip-bot2 for Namhyung Kim
Date: Sat Apr 04 2020 - 04:43:50 EST


The following commit has been merged into the perf/urgent branch of tip:

Commit-ID: 6546b19f95acc986807de981402bbac6b3a94b0f
Gitweb: https://git.kernel.org/tip/6546b19f95acc986807de981402bbac6b3a94b0f
Author: Namhyung Kim <namhyung@xxxxxxxxxx>
AuthorDate: Wed, 25 Mar 2020 21:45:29 +09:00
Committer: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
CommitterDate: Fri, 27 Mar 2020 10:41:44 -03:00

perf/core: Add PERF_SAMPLE_CGROUP feature

The PERF_SAMPLE_CGROUP bit is to save (perf_event) cgroup information in
the sample. It will add a 64-bit id to identify current cgroup and it's
the file handle in the cgroup file system. Userspace should use this
information with PERF_RECORD_CGROUP event to match which cgroup it
belongs.

I put it before PERF_SAMPLE_AUX for simplicity since it just needs a
64-bit word. But if we want bigger samples, I can work on that
direction too.

Committer testing:

$ pahole perf_sample_data | grep -w cgroup -B5 -A5
/* --- cacheline 4 boundary (256 bytes) was 56 bytes ago --- */
struct perf_regs regs_intr; /* 312 16 */
/* --- cacheline 5 boundary (320 bytes) was 8 bytes ago --- */
u64 stack_user_size; /* 328 8 */
u64 phys_addr; /* 336 8 */
u64 cgroup; /* 344 8 */

/* size: 384, cachelines: 6, members: 22 */
/* padding: 32 */
};
$

Signed-off-by: Namhyung Kim <namhyung@xxxxxxxxxx>
Tested-by: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
Acked-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Acked-by: Tejun Heo <tj@xxxxxxxxxx>
Cc: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
Cc: Jiri Olsa <jolsa@xxxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Mark Rutland <mark.rutland@xxxxxxx>
Cc: Zefan Li <lizefan@xxxxxxxxxx>
Link: http://lore.kernel.org/lkml/20200325124536.2800725-3-namhyung@xxxxxxxxxx
Signed-off-by: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
---
include/linux/perf_event.h | 1 +
include/uapi/linux/perf_event.h | 3 ++-
init/Kconfig | 3 ++-
kernel/events/core.c | 22 ++++++++++++++++++++++
4 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8768a39..9c3e761 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1020,6 +1020,7 @@ struct perf_sample_data {
u64 stack_user_size;

u64 phys_addr;
+ u64 cgroup;
} ____cacheline_aligned;

/* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index de95f6c..7b2d6fc 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -142,8 +142,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_REGS_INTR = 1U << 18,
PERF_SAMPLE_PHYS_ADDR = 1U << 19,
PERF_SAMPLE_AUX = 1U << 20,
+ PERF_SAMPLE_CGROUP = 1U << 21,

- PERF_SAMPLE_MAX = 1U << 21, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 22, /* non-ABI */

__PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */
};
diff --git a/init/Kconfig b/init/Kconfig
index 20a6ac3..7766b06 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1027,7 +1027,8 @@ config CGROUP_PERF
help
This option extends the perf per-cpu mode to restrict monitoring
to threads which belong to the cgroup specified and run on the
- designated cpu.
+ designated cpu. Or this can be used to have cgroup ID in samples
+ so that it can monitor performance events among cgroups.

Say N if unsure.

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 994932d..1569979 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1862,6 +1862,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
size += sizeof(data->phys_addr);

+ if (sample_type & PERF_SAMPLE_CGROUP)
+ size += sizeof(data->cgroup);
+
event->header_size = size;
}

@@ -6867,6 +6870,9 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
perf_output_put(handle, data->phys_addr);

+ if (sample_type & PERF_SAMPLE_CGROUP)
+ perf_output_put(handle, data->cgroup);
+
if (sample_type & PERF_SAMPLE_AUX) {
perf_output_put(handle, data->aux_size);

@@ -7066,6 +7072,16 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);

+#ifdef CONFIG_CGROUP_PERF
+ if (sample_type & PERF_SAMPLE_CGROUP) {
+ struct cgroup *cgrp;
+
+ /* protected by RCU */
+ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
+ data->cgroup = cgroup_id(cgrp);
+ }
+#endif
+
if (sample_type & PERF_SAMPLE_AUX) {
u64 size;

@@ -11264,6 +11280,12 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,

if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
ret = perf_reg_validate(attr->sample_regs_intr);
+
+#ifndef CONFIG_CGROUP_PERF
+ if (attr->sample_type & PERF_SAMPLE_CGROUP)
+ return -EINVAL;
+#endif
+
out:
return ret;