[PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event

From: Namhyung Kim
Date: Thu Dec 19 2019 - 23:33:26 EST

Next message: Namhyung Kim: "[PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature"
Previous message: Namhyung Kim: "[PATCHSET 0/9] perf: Improve cgroup profiling (v2)"
In reply to: Namhyung Kim: "[PATCHSET 0/9] perf: Improve cgroup profiling (v2)"
Next in thread: Peter Zijlstra: "Re: [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

To support cgroup tracking, add a PERF_RECORD_CGROUP event that saves the
link between a cgroup path and its inode number. Also add the attr.cgroup
bit so that cgroup tracking can be enabled from userspace.

This event will be generated when a new cgroup becomes active.
Userspace might need to synthesize those events for existing cgroups.

Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: Li Zefan <lizefan@xxxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Adrian Hunter <adrian.hunter@xxxxxxxxx>
Signed-off-by: Namhyung Kim <namhyung@xxxxxxxxxx>
---
include/uapi/linux/perf_event.h | 14 +++-
kernel/events/core.c | 112 ++++++++++++++++++++++++++++++++
2 files changed, 125 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 377d794d3105..7bae2d3380a6 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -377,7 +377,8 @@ struct perf_event_attr {
ksymbol : 1, /* include ksymbol events */
bpf_event : 1, /* include bpf events */
aux_output : 1, /* generate AUX records instead of events */
- __reserved_1 : 32;
+ cgroup : 1, /* include cgroup events */
+ __reserved_1 : 31;

union {
__u32 wakeup_events; /* wakeup every n events */
@@ -1006,6 +1007,17 @@ enum perf_event_type {
*/
PERF_RECORD_BPF_EVENT = 18,

+ /*
+ * struct {
+ * struct perf_event_header header;
+ * u64 id;
+ * u64 path_len;
+ * char path[];
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_CGROUP = 19,
+
PERF_RECORD_MAX, /* non-ABI */
};

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4ff86d57f9e5..9bcb2b552acc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -386,6 +386,7 @@ static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
+static atomic_t nr_cgroup_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -4455,6 +4456,8 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_comm_events);
if (event->attr.namespaces)
atomic_dec(&nr_namespaces_events);
+ if (event->attr.cgroup)
+ atomic_dec(&nr_cgroup_events);
if (event->attr.task)
atomic_dec(&nr_task_events);
if (event->attr.freq)
@@ -7564,6 +7567,106 @@ void perf_event_namespaces(struct task_struct *task)
NULL);
}

+/*
+ * cgroup tracking
+ */
+#ifdef CONFIG_CGROUPS
+
+struct perf_cgroup_event { /* state passed to perf_event_cgroup_output() through perf_iterate_sb() */
+ char *path; /* zero-padded path string, copied out right after event_id */
+ struct { /* fixed-size part of the PERF_RECORD_CGROUP record */
+ struct perf_event_header header;
+ u64 id; /* filled from cgroup_id() */
+ u64 path_len; /* bytes of path emitted, including NUL and zero padding */
+ char path[]; /* flexible array member: adds no storage; string comes from ->path */
+ } event_id;
+};
+
+static int perf_event_cgroup_match(struct perf_event *event)
+{ /* filter: nonzero iff this event's attr.cgroup bit is set */
+ return event->attr.cgroup;
+}
+
+static void perf_event_cgroup_output(struct perf_event *event, void *data)
+{ /* write one PERF_RECORD_CGROUP record for @event; @data is a struct perf_cgroup_event */
+ struct perf_cgroup_event *cgroup_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ u16 header_size = cgroup_event->event_id.header.size; /* saved so it can be restored at out: */
+ int ret;
+
+ if (!perf_event_cgroup_match(event)) /* skip events that did not ask for cgroup records */
+ return;
+
+ perf_event_header__init_id(&cgroup_event->event_id.header,
+ &sample, event); /* presumably grows header.size by this event's sample_id size -- confirm */
+ ret = perf_output_begin(&handle, event,
+ cgroup_event->event_id.header.size);
+ if (ret) /* on failure nothing is written; only header.size is restored */
+ goto out;
+
+ perf_output_put(&handle, cgroup_event->event_id); /* fixed part: header, id, path_len */
+ __output_copy(&handle, cgroup_event->path,
+ cgroup_event->event_id.path_len); /* then path_len bytes of the padded path */
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ cgroup_event->event_id.header.size = header_size; /* cgroup_event belongs to the caller: undo our change */
+}
+
+void perf_event_cgroup(struct cgroup *cgrp)
+{ /* build a PERF_RECORD_CGROUP record for @cgrp and hand it to perf_iterate_sb() */
+ struct perf_cgroup_event cgroup_event;
+ char path_enomem[16] = "//enomem"; /* fallback path if kmalloc() fails; 16 bytes leaves room for the padding below */
+ char *pathname;
+ size_t size;
+
+ if (!atomic_read(&nr_cgroup_events)) /* no event with attr.cgroup is accounted: nothing to emit */
+ return;
+
+ cgroup_event = (struct perf_cgroup_event){
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_CGROUP,
+ .misc = 0,
+ .size = sizeof(cgroup_event.event_id), /* fixed part only; padded path size is added below */
+ },
+ .id = cgroup_id(cgrp),
+ },
+ };
+
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (pathname == NULL) {
+ cgroup_event.path = path_enomem;
+ } else {
+ /* leave sizeof(u64) bytes spare so the zero padding below always fits */
+ cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64)); /* NOTE(review): return value is not checked */
+ cgroup_event.path = pathname;
+ }
+
+ /*
+ * Since our buffer works in 8 byte units we need to align our string
+ * size to a multiple of 8. The padding bytes are explicitly set to
+ * zero to avoid leaking random bits to userspace.
+ */
+ size = strlen(cgroup_event.path) + 1; /* include the terminating NUL */
+ while (!IS_ALIGNED(size, sizeof(u64)))
+ cgroup_event.path[size++] = '\0';
+
+ cgroup_event.event_id.header.size += size;
+ cgroup_event.event_id.path_len = size;
+
+ perf_iterate_sb(perf_event_cgroup_output,
+ &cgroup_event,
+ NULL);
+
+ kfree(pathname); /* pathname is NULL here if the allocation failed */
+}
+
+#endif
+
/*
* mmap tracking
*/
@@ -10607,6 +10710,8 @@ static void account_event(struct perf_event *event)
atomic_inc(&nr_comm_events);
if (event->attr.namespaces)
atomic_inc(&nr_namespaces_events);
+ if (event->attr.cgroup)
+ atomic_inc(&nr_cgroup_events);
if (event->attr.task)
atomic_inc(&nr_task_events);
if (event->attr.freq)
@@ -12581,6 +12686,12 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
kfree(jc);
}

+static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
+{ /* .css_online callback: reports css->cgroup via perf_event_cgroup(); always returns 0 */
+ perf_event_cgroup(css->cgroup);
+ return 0;
+}
+
static int __perf_cgroup_move(void *info)
{
struct task_struct *task = info;
@@ -12602,6 +12713,7 @@ static void perf_cgroup_attach(struct cgroup_taskset *tset)
struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
+ .css_online = perf_cgroup_css_online,
.attach = perf_cgroup_attach,
/*
* Implicitly enable on dfl hierarchy so that perf events can
--
2.24.1.735.g03f4e72817-goog

Next message: Namhyung Kim: "[PATCH 2/9] perf/core: Add PERF_SAMPLE_CGROUP feature"
Previous message: Namhyung Kim: "[PATCHSET 0/9] perf: Improve cgroup profiling (v2)"
In reply to: Namhyung Kim: "[PATCHSET 0/9] perf: Improve cgroup profiling (v2)"
Next in thread: Peter Zijlstra: "Re: [PATCH 1/9] perf/core: Add PERF_RECORD_CGROUP event"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]