[RFC PATCH 1/2] perf_events: add support for per-cpu per-cgroup monitoring
From: Stephane Eranian
Date: Tue Aug 31 2010 - 11:32:19 EST
This kernel patch adds the ability to filter monitoring based on
container groups (cgroups). This is for use in per-cpu mode only.
The patch adds perf_event_attr.cgroup, a boolean, to activate
the mode. The cgroup is designated by passing, perf_event_attr.cgroup_fd,
on opened file descriptor to the <mnt>/<cgroup>/perf_event.perf file.
Signed-off-by: Stephane Eranian <eranian@xxxxxxxxxx>
--
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3cb7d04..ed76357 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -618,6 +618,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,
unsigned short css_id(struct cgroup_subsys_state *css);
unsigned short css_depth(struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *cgroup_css_from_file(struct file *f, int id);
+
#else /* !CONFIG_CGROUPS */
static inline int cgroup_init_early(void) { return 0; }
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ccefff0..93f86b7 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -65,4 +65,8 @@ SUBSYS(net_cls)
SUBSYS(blkio)
#endif
+#ifdef CONFIG_PERF_EVENTS
+SUBSYS(perf)
+#endif
+
/* */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 000610c..9f7a645 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -215,8 +215,9 @@ struct perf_event_attr {
*/
precise_ip : 2, /* skid constraint */
mmap_data : 1, /* non-exec mmap data */
+ cgroup : 1, /* cgroup aggregation */
- __reserved_1 : 46;
+ __reserved_1 : 45;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -226,6 +227,8 @@ struct perf_event_attr {
__u32 bp_type;
__u64 bp_addr;
__u64 bp_len;
+
+ int cgroup_fd;
};
/*
@@ -463,6 +466,7 @@ enum perf_callchain_context {
#ifdef CONFIG_PERF_EVENTS
# include <asm/perf_event.h>
# include <asm/local64.h>
+# include <linux/cgroup.h>
#endif
struct perf_guest_info_callbacks {
@@ -657,6 +661,12 @@ struct swevent_hlist {
#define PERF_ATTACH_CONTEXT 0x01
#define PERF_ATTACH_GROUP 0x02
+#ifdef CONFIG_CGROUPS
+struct perf_cgroup {
+ struct cgroup_subsys_state css;
+};
+#endif
+
/**
* struct perf_event - performance event kernel representation:
*/
@@ -759,7 +769,9 @@ struct perf_event {
struct ftrace_event_call *tp_event;
struct event_filter *filter;
#endif
-
+#ifdef CONFIG_CGROUPS
+ struct perf_cgroup *css;
+#endif
#endif /* CONFIG_PERF_EVENTS */
};
@@ -806,6 +818,8 @@ struct perf_event_context {
u64 generation;
int pin_count;
struct rcu_head rcu_head;
+
+ int nr_cgroups;
};
/*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e5c5497..3e56354 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4722,6 +4722,23 @@ css_get_next(struct cgroup_subsys *ss, int id,
return ret;
}
+struct cgroup_subsys_state *cgroup_css_from_file(struct file *f, int id)
+{
+ struct cgroup *cgrp;
+
+ /* check in cgroup filesystem */
+ if (f->f_op != &cgroup_seqfile_operations)
+ return ERR_PTR(-EBADF);
+
+ if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
+ return ERR_PTR(-EINVAL);
+
+ /* get cgroup */
+ cgrp = __d_cgrp(f->f_dentry->d_parent);
+
+ return cgrp->subsys[id];
+}
+
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
struct cgroup *cont)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index d196412..01a85f7 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -49,6 +49,77 @@ static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
+#ifdef CONFIG_CGROUPS
+
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+ if (!task)
+ return NULL;
+ return container_of(task_subsys_state(task, perf_subsys_id),
+ struct perf_cgroup, css);
+}
+
+static inline
+struct perf_cgroup *perf_cgroup_from_cont(struct cgroup *cont)
+{
+ return container_of(cgroup_subsys_state(cont, perf_subsys_id),
+ struct perf_cgroup, css);
+}
+
+static inline bool
+perf_cgroup_match(struct perf_event *event, struct task_struct *task)
+{
+ struct perf_cgroup *css = perf_cgroup_from_task(task);
+ return !event->css || event->css == css;
+}
+
+static void *perf_get_cgroup(int fd)
+{
+ struct cgroup_subsys_state *css;
+ struct file *file;
+ int fput_needed;
+
+ file = fget_light(fd, &fput_needed);
+ if (!file)
+ return ERR_PTR(-EBADF);
+
+ css = cgroup_css_from_file(file, perf_subsys_id);
+ if (!IS_ERR(css))
+ css_get(css);
+
+ fput_light(file, fput_needed);
+
+ return css;
+}
+
+static inline void perf_put_cgroup(struct perf_event *event)
+{
+ if (event->css)
+ css_put(&event->css->css);
+}
+#else /* !CONFIG_CGROUP */
+static inline bool
+perf_cgroup_match(struct perf_event *event, struct task_struct *task)
+{
+ return true;
+}
+
+static inline void *perf_get_cgroup(int fd)
+{
+ return ERR_PTR(-ENOTSUPP);
+}
+
+static inline void perf_put_cgroup(struct perf_event *event)
+{}
+
+#endif
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+ return event->css != NULL;
+}
+
/*
* perf event paranoia level:
* -1 - not paranoid at all
@@ -301,6 +372,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list_add_tail(&event->group_entry, list);
}
+ if (is_cgroup_event(event))
+ ctx->nr_cgroups++;
+
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
if (event->attr.inherit_stat)
@@ -340,6 +414,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
+ if (is_cgroup_event(event))
+ ctx->nr_cgroups--;
+
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -403,9 +480,10 @@ static void perf_group_detach(struct perf_event *event)
}
static inline int
-event_filter_match(struct perf_event *event)
+event_filter_match(struct perf_event *event, struct task_struct *task)
{
- return event->cpu == -1 || event->cpu == smp_processor_id();
+ return (event->cpu == -1 || event->cpu == smp_processor_id())
+ && perf_cgroup_match(event, task);
}
static void
@@ -421,7 +499,7 @@ event_sched_out(struct perf_event *event,
* via read() for time_enabled, time_running
*/
if (event->state == PERF_EVENT_STATE_INACTIVE
- && !event_filter_match(event)) {
+ && !event_filter_match(event, current)) {
delta = ctx->time - event->tstamp_stopped;
event->tstamp_running += delta;
event->tstamp_stopped = ctx->time;
@@ -820,7 +898,7 @@ static void __perf_install_in_context(void *info)
add_event_to_ctx(event, ctx);
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event, current))
goto unlock;
/*
@@ -966,7 +1044,7 @@ static void __perf_event_enable(void *info)
goto unlock;
__perf_event_mark_enabled(event, ctx);
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event, current))
goto unlock;
/*
@@ -1209,71 +1287,6 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
}
}
-/*
- * Called from scheduler to remove the events of the current task,
- * with interrupts disabled.
- *
- * We stop each event and update the event value in event->count.
- *
- * This does not protect us against NMI, but disable()
- * sets the disabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * not restart the event.
- */
-void perf_event_task_sched_out(struct task_struct *task,
- struct task_struct *next)
-{
- struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
- struct perf_event_context *ctx = task->perf_event_ctxp;
- struct perf_event_context *next_ctx;
- struct perf_event_context *parent;
- int do_switch = 1;
-
- perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
-
- if (likely(!ctx || !cpuctx->task_ctx))
- return;
-
- rcu_read_lock();
- parent = rcu_dereference(ctx->parent_ctx);
- next_ctx = next->perf_event_ctxp;
- if (parent && next_ctx &&
- rcu_dereference(next_ctx->parent_ctx) == parent) {
- /*
- * Looks like the two contexts are clones, so we might be
- * able to optimize the context switch. We lock both
- * contexts and check that they are clones under the
- * lock (including re-checking that neither has been
- * uncloned in the meantime). It doesn't matter which
- * order we take the locks because no other cpu could
- * be trying to lock both of these tasks.
- */
- raw_spin_lock(&ctx->lock);
- raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
- if (context_equiv(ctx, next_ctx)) {
- /*
- * XXX do we need a memory barrier of sorts
- * wrt to rcu_dereference() of perf_event_ctxp
- */
- task->perf_event_ctxp = next_ctx;
- next->perf_event_ctxp = ctx;
- ctx->task = next;
- next_ctx->task = task;
- do_switch = 0;
-
- perf_event_sync_stat(ctx, next_ctx);
- }
- raw_spin_unlock(&next_ctx->lock);
- raw_spin_unlock(&ctx->lock);
- }
- rcu_read_unlock();
-
- if (do_switch) {
- ctx_sched_out(ctx, cpuctx, EVENT_ALL);
- cpuctx->task_ctx = NULL;
- }
-}
-
static void task_ctx_sched_out(struct perf_event_context *ctx,
enum event_type_t event_type)
{
@@ -1308,14 +1321,15 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
static void
ctx_pinned_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+ struct perf_cpu_context *cpuctx,
+ struct task_struct *task)
{
struct perf_event *event;
list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
if (event->state <= PERF_EVENT_STATE_OFF)
continue;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event, task))
continue;
if (group_can_go_on(event, cpuctx, 1))
@@ -1334,7 +1348,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
static void
ctx_flexible_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+ struct perf_cpu_context *cpuctx,
+ struct task_struct *task)
{
struct perf_event *event;
int can_add_hw = 1;
@@ -1347,7 +1362,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
* Listen to the 'cpu' scheduling filter constraint
* of events:
*/
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event, task))
continue;
if (group_can_go_on(event, cpuctx, can_add_hw))
@@ -1359,7 +1374,8 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ enum event_type_t event_type,
+ struct task_struct *task)
{
raw_spin_lock(&ctx->lock);
ctx->is_active = 1;
@@ -1375,11 +1391,11 @@ ctx_sched_in(struct perf_event_context *ctx,
* in order to give them the best chance of going on.
*/
if (event_type & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, cpuctx);
+ ctx_pinned_sched_in(ctx, cpuctx, task);
/* Then walk through the lower prio flexible groups */
if (event_type & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, cpuctx);
+ ctx_flexible_sched_in(ctx, cpuctx, task);
perf_enable();
out:
@@ -1387,11 +1403,12 @@ ctx_sched_in(struct perf_event_context *ctx,
}
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ enum event_type_t event_type,
+ struct task_struct *task)
{
struct perf_event_context *ctx = &cpuctx->ctx;
- ctx_sched_in(ctx, cpuctx, event_type);
+ ctx_sched_in(ctx, cpuctx, event_type, task);
}
static void task_ctx_sched_in(struct task_struct *task,
@@ -1404,7 +1421,7 @@ static void task_ctx_sched_in(struct task_struct *task,
return;
if (cpuctx->task_ctx == ctx)
return;
- ctx_sched_in(ctx, cpuctx, event_type);
+ ctx_sched_in(ctx, cpuctx, event_type, task);
cpuctx->task_ctx = ctx;
}
/*
@@ -1438,15 +1455,90 @@ void perf_event_task_sched_in(struct task_struct *task)
*/
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
cpuctx->task_ctx = ctx;
perf_enable();
}
+/*
+ * Called from scheduler to remove the events of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each event and update the event value in event->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * not restart the event.
+ */
+void perf_event_task_sched_out(struct task_struct *task,
+ struct task_struct *next)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_event_context *ctx = task->perf_event_ctxp;
+ struct perf_event_context *next_ctx;
+ struct perf_event_context *parent;
+ int do_switch = 1;
+
+ perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+
+ /*
+ * if events have cgroups, then we switch out all per-cpu
+ * events, and reschedule only the ones for the cgroup to
+ * come
+ */
+ if (cpuctx->ctx.nr_cgroups > 0) {
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ cpu_ctx_sched_in(cpuctx, EVENT_ALL, next);
+ }
+ if (likely(!ctx || !cpuctx->task_ctx))
+ return;
+
+ rcu_read_lock();
+ parent = rcu_dereference(ctx->parent_ctx);
+ next_ctx = next->perf_event_ctxp;
+ if (parent && next_ctx &&
+ rcu_dereference(next_ctx->parent_ctx) == parent) {
+ /*
+ * Looks like the two contexts are clones, so we might be
+ * able to optimize the context switch. We lock both
+ * contexts and check that they are clones under the
+ * lock (including re-checking that neither has been
+ * uncloned in the meantime). It doesn't matter which
+ * order we take the locks because no other cpu could
+ * be trying to lock both of these tasks.
+ */
+ raw_spin_lock(&ctx->lock);
+ raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+ if (context_equiv(ctx, next_ctx)) {
+ /*
+ * XXX do we need a memory barrier of sorts
+ * wrt to rcu_dereference() of perf_event_ctxp
+ */
+ task->perf_event_ctxp = next_ctx;
+ next->perf_event_ctxp = ctx;
+ ctx->task = next;
+ next_ctx->task = task;
+ do_switch = 0;
+
+ perf_event_sync_stat(ctx, next_ctx);
+ }
+ raw_spin_unlock(&next_ctx->lock);
+ raw_spin_unlock(&ctx->lock);
+ }
+ rcu_read_unlock();
+
+ if (do_switch) {
+ ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+ cpuctx->task_ctx = NULL;
+ }
+}
+
+
#define MAX_INTERRUPTS (~0ULL)
static void perf_log_throttle(struct perf_event *event, int enable);
@@ -1579,7 +1671,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event, current))
continue;
hwc = &event->hw;
@@ -1660,7 +1752,7 @@ void perf_event_task_tick(struct task_struct *curr)
if (ctx)
rotate_ctx(ctx);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, curr);
if (ctx)
task_ctx_sched_in(curr, EVENT_FLEXIBLE);
perf_enable();
@@ -2132,6 +2224,9 @@ static void free_event(struct perf_event *event)
event->buffer = NULL;
}
+ if (is_cgroup_event(event))
+ perf_put_cgroup(event);
+
if (event->destroy)
event->destroy(event);
@@ -3764,7 +3859,7 @@ static int perf_event_task_match(struct perf_event *event)
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event, current))
return 0;
if (event->attr.comm || event->attr.mmap ||
@@ -3878,7 +3973,7 @@ static int perf_event_comm_match(struct perf_event *event)
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event, current))
return 0;
if (event->attr.comm)
@@ -3999,7 +4094,7 @@ static int perf_event_mmap_match(struct perf_event *event,
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event, current))
return 0;
if ((!executable && event->attr.mmap_data) ||
@@ -5031,12 +5126,32 @@ perf_event_alloc(struct perf_event_attr *attr,
const struct pmu *pmu;
struct perf_event *event;
struct hw_perf_event *hwc;
+ struct perf_cgroup *css = NULL;
long err;
event = kzalloc(sizeof(*event), gfpflags);
if (!event)
return ERR_PTR(-ENOMEM);
+ if (attr->cgroup) {
+ css = perf_get_cgroup(attr->cgroup_fd);
+ if (IS_ERR(css)) {
+ kfree(event);
+ return (void *)css;
+ }
+ /*
+ * all events in a group must monitor
+ * the same cgroup because a thread belongs
+ * to only one cgroup at a time
+ */
+ if (group_leader && group_leader->css != css) {
+ event->css = css;
+ perf_put_cgroup(event);
+ kfree(event);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
/*
* Single events are their own group leaders, with an
* empty sibling list:
@@ -5067,6 +5182,7 @@ perf_event_alloc(struct perf_event_attr *attr,
event->id = atomic64_inc_return(&perf_event_id);
event->state = PERF_EVENT_STATE_INACTIVE;
+ event->css = css;
if (!overflow_handler && parent_event)
overflow_handler = parent_event->overflow_handler;
@@ -5125,6 +5241,7 @@ done:
if (err) {
if (event->ns)
put_pid_ns(event->ns);
+ perf_put_cgroup(event);
kfree(event);
return ERR_PTR(err);
}
@@ -5320,6 +5437,10 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ /* cgroup reserved for system-wide */
+ if (attr.cgroup && pid != -1)
+ return -EINVAL;
+
event_fd = get_unused_fd_flags(O_RDWR);
if (event_fd < 0)
return event_fd;
@@ -6094,3 +6215,51 @@ static int __init perf_event_sysfs_init(void)
&perfclass_attr_group);
}
device_initcall(perf_event_sysfs_init);
+
+#ifdef CONFIG_CGROUPS
+static int perf_cgroup_read_map(struct cgroup *cgrp, struct cftype *cft,
+ struct cgroup_map_cb *cb)
+{
+ return 0;
+}
+
+static struct cftype perf_cgroup_files[] = {
+ { .name = "perf",
+ .read_map = perf_cgroup_read_map,
+ },
+};
+
+static struct cgroup_subsys_state *perf_cgroup_create(
+ struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ struct perf_cgroup *jc;
+
+ jc = vmalloc(sizeof(*jc));
+ if (!jc)
+ return ERR_PTR(-ENOMEM);
+ memset(jc, 0, sizeof(*jc));
+ return &jc->css;
+}
+
+static void perf_cgroup_destroy(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ vfree(perf_cgroup_from_cont(cont));
+}
+
+static int perf_cgroup_populate(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ return cgroup_add_files(cont, ss, perf_cgroup_files,
+ ARRAY_SIZE(perf_cgroup_files));
+}
+
+struct cgroup_subsys perf_subsys = {
+ .name = "perf_event",
+ .subsys_id = perf_subsys_id,
+ .create = perf_cgroup_create,
+ .destroy = perf_cgroup_destroy,
+ .populate = perf_cgroup_populate,
+ .early_init = 0,
+};
+#endif /* CONFIG_CGROUP */
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/