[RFC PATCH] perf: Container-aware tracing support

From: Aravinda Prasad
Date: Thu Jan 12 2017 - 07:11:29 EST


This RFC patch adds support for filtering container-specific
events when the perf tool is executed inside a container.

Unlike previous approaches, this approach lets the user
decide what is a container through a set of kernel configs.
The main reason for such an approach is the lack of a
container-unique identifier in the kernel and of a clear
definition of what constitutes a container; any combination
of namespaces can be considered a container.

Previous approaches mandated at least a PID namespace, a
cgroup namespace, or a perf namespace (newly introduced to
support container-aware tracing) to be part of a container.
However, based on the discussions in LKML, mandating a
namespace to be a part of a container is not acceptable.
Hence, this patch lets the user define a container
through a set of kernel configs.

This patch restricts the filtering of events to perf hardware
events with sample type set to PERF_SAMPLE_IDENTIFIER.
Further, this patch piggybacks on the cgroups support, i.e.,
the patch expects processes inside a container to be grouped
into a single perf_event cgroup.

However, if the approach of letting the user decide what a
container is proves acceptable, the filtering will be extended to
other events and will further be decoupled from grouping the
processes into a perf_event cgroup.

Limitation:
- Two different definitions of a container cannot co-exist.

Links to earlier approaches:
- https://lwn.net/Articles/695601/
- https://lwn.net/Articles/691298/
- https://lkml.org/lkml/2015/7/15/192

This patch is based on the 4.8 kernel.

Signed-off-by: Aravinda Prasad <aravinda@xxxxxxxxxxxxxxxxxx>
---
init/Kconfig | 64 ++++++++++++++++++++++++++++++++
kernel/events/core.c | 99 ++++++++++++++++++++++++++++++++++++++++++--------
2 files changed, 148 insertions(+), 15 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index cac3f09..48568f0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1720,6 +1720,70 @@ config DEBUG_PERF_USE_VMALLOC

Say N if unsure.

+config PERF_NS_TRACE
+ default n
+ bool "Container-aware tracing support"
+ depends on CGROUPS && NAMESPACES
+ help
+ Enable tracing support inside a container.
+
+ This allows filtering of container-specific events, without
+ any change in the user interface, when perf is invoked
+ within a container.
+
+ As the kernel has no concept of a container the user should
+ select from the below choice to let the kernel identify a container.
+
+ Say N if unsure.
+
+if PERF_NS_TRACE
+
+menu "Select the namespaces with which containers are created"
+
+config UTS_NS_TRACE
+ bool "UTS namespace"
+ depends on UTS_NS
+ default n
+ help
+ Select if containers are created with UTS namespace
+
+config IPC_NS_TRACE
+ bool "IPC namespace"
+ depends on IPC_NS
+ default n
+ help
+ Select if containers are created with IPC namespace
+
+config MNT_NS_TRACE
+ bool "Mount namespace"
+ default n
+ help
+ Select if containers are created with mount namespace
+
+config PID_NS_TRACE
+ bool "PID Namespaces"
+ default y
+ depends on PID_NS
+ help
+ Select if containers are created with PID namespace
+
+config NET_NS_TRACE
+ bool "Network namespace"
+ depends on NET_NS
+ default n
+ help
+ Select if containers are created with NET namespace
+
+config CGROUPS_NS_TRACE
+ bool "Cgroup namespace"
+ default y
+ help
+ Select if containers are created with cgroup namespace
+
+endmenu
+
+endif #PERF_NS_TRACE
+
endmenu

config VM_EVENT_COUNTERS
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fc9bb22..5920c9c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -802,23 +802,86 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
rcu_read_unlock();
}

+#ifdef CONFIG_PERF_NS_TRACE
+static inline bool is_container(void) /* true only if current is outside the init namespace of every enabled namespace option */
+{
+ bool flag = 0; /* stays 0 (not a container) when no namespace option is enabled */
+#ifdef CONFIG_PID_NS_TRACE
+ if (task_active_pid_ns(current) == &init_pid_ns)
+ return 0; /* current is still in the initial PID namespace */
+ else
+ flag = 1;
+#endif
+#ifdef CONFIG_UTS_NS_TRACE
+ if (current->nsproxy->uts_ns == &init_uts_ns)
+ return 0; /* current is still in the initial UTS namespace */
+ else
+ flag = 1;
+#endif
+#ifdef CONFIG_IPC_NS_TRACE
+ if (current->nsproxy->ipc_ns == &init_ipc_ns)
+ return 0; /* current is still in the initial IPC namespace */
+ else
+ flag = 1;
+#endif
+#ifdef CONFIG_MNT_NS_TRACE
+ if (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns) /* compared against init_task's mount namespace */
+ return 0;
+ else
+ flag = 1;
+#endif
+#ifdef CONFIG_NET_NS_TRACE
+ if (current->nsproxy->net_ns == &init_net)
+ return 0; /* current is still in the initial network namespace */
+ else
+ flag = 1;
+#endif
+#ifdef CONFIG_CGROUPS_NS_TRACE
+ if (current->nsproxy->cgroup_ns == &init_cgroup_ns)
+ return 0; /* current is still in the initial cgroup namespace */
+ else
+ flag = 1;
+#endif
+ return flag; /* 1 if at least one enabled check ran and none matched an init namespace */
+}
+#endif /* #ifdef CONFIG_PERF_NS_TRACE */
+
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
struct perf_event_attr *attr,
struct perf_event *group_leader)
{
struct perf_cgroup *cgrp;
struct cgroup_subsys_state *css;
- struct fd f = fdget(fd);
+ struct fd f;
int ret = 0;

- if (!f.file)
- return -EBADF;
+ if (fd != -1) {
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;

- css = css_tryget_online_from_dir(f.file->f_path.dentry,
- &perf_event_cgrp_subsys);
- if (IS_ERR(css)) {
- ret = PTR_ERR(css);
- goto out;
+ css = css_tryget_online_from_dir(f.file->f_path.dentry,
+ &perf_event_cgrp_subsys);
+ if (IS_ERR(css)) {
+ ret = PTR_ERR(css);
+ fdput(f);
+ return ret;
+ }
+#ifdef CONFIG_PERF_NS_TRACE
+ } else if (event->attach_state == PERF_ATTACH_TASK) {
+ /* Tracing on a PID. No need to set event->cgrp */
+ return ret;
+ } else if (is_container()) {
+ css = task_css(current, perf_event_cgrp_id);
+ if (!css || !css_tryget_online(css))
+ return -ENOENT;
+ } else {
+ /*
+ * perf invoked from global context and hence don't set
+ * event->cgrp as all the events should be included
+ */
+ return ret;
+#endif /* #ifdef CONFIG_PERF_NS_TRACE */
}

cgrp = container_of(css, struct perf_cgroup, css);
@@ -833,8 +896,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
perf_detach_cgroup(event);
ret = -EINVAL;
}
-out:
- fdput(f);
+ if (fd != -1)
+ fdput(f);
+
return ret;
}

@@ -9059,11 +9123,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (!has_branch_stack(event))
event->attr.branch_sample_type = 0;

- if (cgroup_fd != -1) {
- err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
- if (err)
- goto err_ns;
- }
+ err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+ if (err)
+ goto err_ns;

pmu = perf_init_event(event);
if (!pmu)
@@ -9404,6 +9466,13 @@ SYSCALL_DEFINE5(perf_event_open,
return -EACCES;
}

+#ifdef CONFIG_PERF_NS_TRACE
+ if (is_container() && !(attr.type == PERF_TYPE_HARDWARE &&
+ attr.sample_type == PERF_SAMPLE_IDENTIFIER)) {
+ return -EACCES;
+ }
+#endif
+
if (attr.freq) {
if (attr.sample_freq > sysctl_perf_event_sample_rate)
return -EINVAL;