Re: [PATCH 1/2] perf_events: add cgroup support (v8)

From: Stephane Eranian
Date: Mon Feb 07 2011 - 15:30:19 EST


Peter,

I will try your changes and report back tomorrow.
Thanks.


On Mon, Feb 7, 2011 at 5:10 PM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> Compile tested only, depends on the cgroup::exit patch
>
> ---
> Subject: perf: Add cgroup support
> From: Stephane Eranian <eranian@xxxxxxxxxx>
> Date: Mon Feb 07 17:02:25 CET 2011
>
> This kernel patch adds the ability to filter monitoring based on
> container groups (cgroups). This is for use in per-cpu mode only.
>
> The cgroup to monitor is passed as a file descriptor in the pid
> argument to the syscall. The file descriptor must be opened to
> the cgroup name in the cgroup filesystem. For instance, if the
> cgroup name is foo and cgroupfs is mounted in /cgroup, then the
> file descriptor is opened to /cgroup/foo. Cgroup mode is
> activated by passing PERF_FLAG_PID_CGROUP in the flags argument
> to the syscall.
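>
> As a rough aside (not part of this patch), the cgroup itself can be
> set up from user space roughly as follows, assuming cgroup v1
> semantics, that cgroupfs is not yet mounted, and that workload_pid is
> a task to monitor; the /cgroup mount point and the "foo" name are only
> the examples used below:
>
> /* needs <sys/mount.h>, <sys/stat.h>, <stdio.h>, <errno.h> */
> if (mount("none", "/cgroup", "cgroup", 0, "perf_event"))
> 	perror("mount cgroupfs with the perf_event subsystem");
> if (mkdir("/cgroup/foo", 0755) && errno != EEXIST)
> 	perror("mkdir /cgroup/foo");
>
> /* move the workload into the cgroup */
> FILE *tasks = fopen("/cgroup/foo/tasks", "w");
> if (tasks) {
> 	fprintf(tasks, "%d\n", (int)workload_pid);
> 	fclose(tasks);
> }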
>
> For instance to measure in cgroup foo on CPU1 assuming
> cgroupfs is mounted under /cgroup:
>
> struct perf_event_attr attr;
> int cgroup_fd, fd;
>
> cgroup_fd = open("/cgroup/foo", O_RDONLY);
> fd = perf_event_open(&attr, cgroup_fd, 1, -1, PERF_FLAG_PID_CGROUP);
> close(cgroup_fd);
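>
> For completeness, a rough sketch (not part of this changelog) of
> reading the count back through the returned fd; it assumes the usual
> syscall(__NR_perf_event_open, ...) wrapper, that attr was zeroed and
> configured for e.g. a hardware cycles event, and that the default
> read_format is used (a single u64):
>
> /* needs <stdio.h>, <stdint.h>, <unistd.h>; e.g. before the call above:
>  *   memset(&attr, 0, sizeof(attr));
>  *   attr.type   = PERF_TYPE_HARDWARE;
>  *   attr.config = PERF_COUNT_HW_CPU_CYCLES;
>  *   attr.size   = sizeof(attr);
>  */
> uint64_t count;
>
> /* ... let the workload in cgroup foo run on CPU1 ... */
>
> if (read(fd, &count, sizeof(count)) == sizeof(count))
> 	printf("cycles for cgroup foo on CPU1: %llu\n",
> 	       (unsigned long long)count);
> close(fd);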
>
> Signed-off-by: Stephane Eranian <eranian@xxxxxxxxxx>
> [ added perf_cgroup_{exit,attach} ]
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
> LKML-Reference: <new-submission>
> ---
>  include/linux/cgroup.h        |    1 
>  include/linux/cgroup_subsys.h |    4 
>  include/linux/perf_event.h    |   33 +-
>  init/Kconfig                  |   10 
>  kernel/cgroup.c               |   23 +
>  kernel/perf_event.c           |  641 +++++++++++++++++++++++++++++++++++++++---
>  6 files changed, 665 insertions(+), 47 deletions(-)
>
> Index: linux-2.6/include/linux/cgroup.h
> ===================================================================
> --- linux-2.6.orig/include/linux/cgroup.h
> +++ linux-2.6/include/linux/cgroup.h
> @@ -627,6 +627,7 @@ bool css_is_ancestor(struct cgroup_subsy
>  /* Get id and depth of css */
>  unsigned short css_id(struct cgroup_subsys_state *css);
>  unsigned short css_depth(struct cgroup_subsys_state *css);
> +struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
>
>  #else /* !CONFIG_CGROUPS */
>
> Index: linux-2.6/include/linux/cgroup_subsys.h
> ===================================================================
> --- linux-2.6.orig/include/linux/cgroup_subsys.h
> +++ linux-2.6/include/linux/cgroup_subsys.h
> @@ -65,4 +65,8 @@ SUBSYS(net_cls)
>  SUBSYS(blkio)
>  #endif
>
> +#ifdef CONFIG_CGROUP_PERF
> +SUBSYS(perf)
> +#endif
> +
>  /* */
> Index: linux-2.6/include/linux/perf_event.h
> ===================================================================
> --- linux-2.6.orig/include/linux/perf_event.h
> +++ linux-2.6/include/linux/perf_event.h
> @@ -464,6 +464,7 @@ enum perf_callchain_context {
>
>  #define PERF_FLAG_FD_NO_GROUP	(1U << 0)
>  #define PERF_FLAG_FD_OUTPUT	(1U << 1)
> +#define PERF_FLAG_PID_CGROUP	(1U << 2) /* pid=cgroup id, per-cpu mode only */
>
>  #ifdef __KERNEL__
>  /*
> @@ -471,6 +472,7 @@ enum perf_callchain_context {
>  */
>
>  #ifdef CONFIG_PERF_EVENTS
> +# include <linux/cgroup.h>
>  # include <asm/perf_event.h>
>  # include <asm/local64.h>
>  #endif
> @@ -716,6 +718,22 @@ struct swevent_hlist {
>  #define PERF_ATTACH_GROUP	0x02
>  #define PERF_ATTACH_TASK	0x04
>
> +#ifdef CONFIG_CGROUP_PERF
> +/*
> + * perf_cgroup_info keeps track of time_enabled for a cgroup.
> + * This is a per-cpu dynamically allocated data structure.
> + */
> +struct perf_cgroup_info {
> +	u64 time;
> +	u64 timestamp;
> +};
> +
> +struct perf_cgroup {
> +	struct cgroup_subsys_state css;
> +	struct perf_cgroup_info *info;	/* timing info, one per cpu */
> +};
> +#endif
> +
>  /**
>  * struct perf_event - performance event kernel representation:
>  */
> @@ -832,6 +850,11 @@ struct perf_event {
>  	struct event_filter		*filter;
>  #endif
>
> +#ifdef CONFIG_CGROUP_PERF
> +	struct perf_cgroup		*cgrp; /* cgroup the event is attached to */
> +	int				cgrp_defer_enabled;
> +#endif
> +
>  #endif /* CONFIG_PERF_EVENTS */
>  };
>
> @@ -886,6 +909,7 @@ struct perf_event_context {
>  	u64				generation;
>  	int				pin_count;
>  	struct rcu_head			rcu_head;
> +	int				nr_cgroups; /* cgroup events present */
>  };
>
>  /*
> @@ -905,6 +929,9 @@ struct perf_cpu_context {
>  	struct list_head		rotation_list;
>  	int				jiffies_interval;
>  	struct pmu			*active_pmu;
> +#ifdef CONFIG_CGROUP_PERF
> +	struct perf_cgroup		*cgrp;
> +#endif
>  };
>
>  struct perf_output_handle {
> @@ -1040,11 +1067,11 @@ perf_sw_event(u32 event_id, u64 nr, int
>  	__perf_sw_event(event_id, nr, nmi, regs, addr);
>  }
>
> -extern atomic_t perf_task_events;
> +extern atomic_t perf_sched_events;
>
>  static inline void perf_event_task_sched_in(struct task_struct *task)
>  {
> -	COND_STMT(&perf_task_events, __perf_event_task_sched_in(task));
> +	COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task));
>  }
>
>  static inline
> @@ -1052,7 +1079,7 @@ void perf_event_task_sched_out(struct ta
>  {
>  	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
>
> -	COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next));
> +	COND_STMT(&perf_sched_events, __perf_event_task_sched_out(task, next));
>  }
>
>  extern void perf_event_mmap(struct vm_area_struct *vma);
> Index: linux-2.6/init/Kconfig
> ===================================================================
> --- linux-2.6.orig/init/Kconfig
> +++ linux-2.6/init/Kconfig
> @@ -683,6 +683,16 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
>  	  select this option (if, for some reason, they need to disable it
>  	  then noswapaccount does the trick).
>
> +config CGROUP_PERF
> +	bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
> +	depends on PERF_EVENTS && CGROUPS
> +	help
> +	  This option extends the per-cpu mode to restrict monitoring to
> +	  threads which belong to the specified cgroup and run on the
> +	  designated CPU.
> +
> +	  Say N if unsure.
> +
>  menuconfig CGROUP_SCHED
>  	bool "Group CPU scheduler"
>  	depends on EXPERIMENTAL
> Index: linux-2.6/kernel/cgroup.c
> ===================================================================
> --- linux-2.6.orig/kernel/cgroup.c
> +++ linux-2.6/kernel/cgroup.c
> @@ -4822,6 +4822,29 @@ css_get_next(struct cgroup_subsys *ss, i
>  	return ret;
>  }
>
> +/*
> + * get corresponding css from file open on cgroupfs directory
> + */
> +struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
> +{
> +	struct cgroup *cgrp;
> +	struct inode *inode;
> +	struct cgroup_subsys_state *css;
> +
> +	inode = f->f_dentry->d_inode;
> +	/* check in cgroup filesystem dir */
> +	if (inode->i_op != &cgroup_dir_inode_operations)
> +		return ERR_PTR(-EBADF);
> +
> +	if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
> +		return ERR_PTR(-EINVAL);
> +
> +	/* get cgroup */
> +	cgrp = __d_cgrp(f->f_dentry);
> +	css = cgrp->subsys[id];
> +	return css ? css : ERR_PTR(-ENOENT);
> +}
> +
>  #ifdef CONFIG_CGROUP_DEBUG
>  static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
>  						 struct cgroup *cont)
> Index: linux-2.6/kernel/perf_event.c
> ===================================================================
> --- linux-2.6.orig/kernel/perf_event.c
> +++ linux-2.6/kernel/perf_event.c
> @@ -111,13 +111,23 @@ static int cpu_function_call(int cpu, in
>  	return data.ret;
>  }
>
> +#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
> +		       PERF_FLAG_FD_OUTPUT  |\
> +		       PERF_FLAG_PID_CGROUP)
> +
>  enum event_type_t {
>  	EVENT_FLEXIBLE = 0x1,
>  	EVENT_PINNED = 0x2,
>  	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
>  };
>
> -atomic_t perf_task_events __read_mostly;
> +/*
> + * perf_sched_events : >0 events exist
> + * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
> + */
> +atomic_t perf_sched_events __read_mostly;
> +static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
> +
>  static atomic_t nr_mmap_events __read_mostly;
>  static atomic_t nr_comm_events __read_mostly;
>  static atomic_t nr_task_events __read_mostly;
> @@ -148,7 +158,11 @@ static void cpu_ctx_sched_out(struct per
>  			     enum event_type_t event_type);
>
>  static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
> -			     enum event_type_t event_type);
> +			     enum event_type_t event_type,
> +			     struct task_struct *task, int cgrp_sw);
> +
> +static void update_context_time(struct perf_event_context *ctx);
> +static u64 perf_event_time(struct perf_event *event);
>
>  void __weak perf_event_print_debug(void)	{ }
>
> @@ -162,6 +176,315 @@ static inline u64 perf_clock(void)
>  	return local_clock();
>  }
>
> +#ifdef CONFIG_CGROUP_PERF
> +
> +static inline struct perf_cgroup *
> +perf_cgroup_from_task(struct task_struct *task)
> +{
> +	return container_of(task_subsys_state(task, perf_subsys_id),
> +			struct perf_cgroup, css);
> +}
> +
> +static inline bool
> +perf_cgroup_match(struct perf_event *event, struct task_struct *task)
> +{
> +	struct perf_cgroup *cgrp = NULL;
> +	if (task)
> +		cgrp = perf_cgroup_from_task(task);
> +	return !event->cgrp || event->cgrp == cgrp;
> +}
> +
> +static inline void perf_get_cgroup(struct perf_event *event)
> +{
> +	css_get(&event->cgrp->css);
> +}
> +
> +static inline void perf_put_cgroup(struct perf_event *event)
> +{
> +	css_put(&event->cgrp->css);
> +}
> +
> +static inline void perf_detach_cgroup(struct perf_event *event)
> +{
> +	perf_put_cgroup(event);
> +	event->cgrp = NULL;
> +}
> +
> +static inline int is_cgroup_event(struct perf_event *event)
> +{
> +	return event->cgrp != NULL;
> +}
> +
> +static inline u64 perf_cgroup_event_time(struct perf_event *event)
> +{
> + Â Â Â struct perf_cgroup_info *t;
> +
> + Â Â Â t = per_cpu_ptr(event->cgrp->info, event->cpu);
> + Â Â Â return t->time;
> +}
> +
> +static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
> +{
> + Â Â Â struct perf_cgroup_info *info;
> + Â Â Â u64 now;
> +
> + Â Â Â now = perf_clock();
> +
> + Â Â Â info = this_cpu_ptr(cgrp->info);
> +
> + Â Â Â info->time += now - info->timestamp;
> + Â Â Â info->timestamp = now;
> +}
> +
> +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
> +{
> + Â Â Â struct perf_cgroup *cgrp_out = cpuctx->cgrp;
> + Â Â Â if (cgrp_out)
> + Â Â Â Â Â Â Â __update_cgrp_time(cgrp_out);
> +}
> +
> +static inline void update_cgrp_time_from_event(struct perf_event *event)
> +{
> + Â Â Â struct perf_cgroup *cgrp = perf_cgroup_from_task(current);
> + Â Â Â /*
> + Â Â Â Â* do not update time when cgroup is not active
> + Â Â Â Â*/
> + Â Â Â if (!event->cgrp || cgrp != event->cgrp)
> + Â Â Â Â Â Â Â return;
> +
> + Â Â Â __update_cgrp_time(event->cgrp);
> +}
> +
> +static inline void
> +perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
> +{
> + Â Â Â struct perf_cgroup *cgrp;
> + Â Â Â struct perf_cgroup_info *info;
> +
> + Â Â Â if (!task)
> + Â Â Â Â Â Â Â return;
> +
> + Â Â Â cgrp = perf_cgroup_from_task(task);
> + Â Â Â info = per_cpu_ptr(cgrp->info, smp_processor_id());
> + Â Â Â info->timestamp = now;
> +}
> +
> +#define PERF_CGROUP_SWOUT Â Â Â0x1 /* cgroup switch out every event */
> +#define PERF_CGROUP_SWIN Â Â Â 0x2 /* cgroup switch in events based on task */
> +
> +/*
> + * reschedule events based on the cgroup constraint of task.
> + *
> + * mode SWOUT : schedule out everything
> + * mode SWIN : schedule in based on cgroup for next
> + */
> +void perf_cgroup_switch(struct task_struct *task, int mode)
> +{
> + Â Â Â struct perf_cpu_context *cpuctx;
> + Â Â Â struct pmu *pmu;
> + Â Â Â unsigned long flags;
> +
> +	/*
> +	 * disable interrupts to avoid nr_cgroups changing
> +	 * under us via __perf_event_disable(). This also
> +	 * avoids preemption.
> +	 */
> +	local_irq_save(flags);
> +
> + Â Â Â /*
> + Â Â Â Â* we reschedule only in the presence of cgroup
> + Â Â Â Â* constrained events.
> + Â Â Â Â*/
> + Â Â Â rcu_read_lock();
> +
> + Â Â Â list_for_each_entry_rcu(pmu, &pmus, entry) {
> +
> + Â Â Â Â Â Â Â cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
> +
> + Â Â Â Â Â Â Â perf_pmu_disable(cpuctx->ctx.pmu);
> +
> + Â Â Â Â Â Â Â /*
> + Â Â Â Â Â Â Â Â* perf_cgroup_events says at least one
> + Â Â Â Â Â Â Â Â* context on this CPU has cgroup events.
> + Â Â Â Â Â Â Â Â*
> + Â Â Â Â Â Â Â Â* ctx->nr_cgroups reports the number of cgroup
> + Â Â Â Â Â Â Â Â* events for a context.
> + Â Â Â Â Â Â Â Â*/
> + Â Â Â Â Â Â Â if (cpuctx->ctx.nr_cgroups > 0) {
> +
> + Â Â Â Â Â Â Â Â Â Â Â if (mode & PERF_CGROUP_SWOUT)
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> +
> + Â Â Â Â Â Â Â Â Â Â Â if (mode & PERF_CGROUP_SWIN) {
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, 1);
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â cpuctx->cgrp = perf_cgroup_from_task(task);
> + Â Â Â Â Â Â Â Â Â Â Â }
> + Â Â Â Â Â Â Â }
> +
> + Â Â Â Â Â Â Â perf_pmu_enable(cpuctx->ctx.pmu);
> + Â Â Â }
> +
> + Â Â Â rcu_read_unlock();
> +
> + Â Â Â local_irq_restore(flags);
> +}
> +
> +static inline void perf_cgroup_sched_out(struct task_struct *task)
> +{
> + Â Â Â perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
> +}
> +
> +static inline void perf_cgroup_sched_in(struct task_struct *task)
> +{
> + Â Â Â perf_cgroup_switch(task, PERF_CGROUP_SWIN);
> +}
> +
> +static inline int perf_cgroup_connect(int fd, struct perf_event *event,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â struct perf_event_attr *attr,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â struct perf_event *group_leader)
> +{
> + Â Â Â struct perf_cgroup *cgrp;
> + Â Â Â struct cgroup_subsys_state *css;
> + Â Â Â struct file *file;
> + Â Â Â int ret = 0, fput_needed;
> +
> + Â Â Â file = fget_light(fd, &fput_needed);
> + Â Â Â if (!file)
> + Â Â Â Â Â Â Â return -EBADF;
> +
> + Â Â Â css = cgroup_css_from_dir(file, perf_subsys_id);
> + Â Â Â if (IS_ERR(css))
> + Â Â Â Â Â Â Â return PTR_ERR(css);
> +
> + Â Â Â cgrp = container_of(css, struct perf_cgroup, css);
> + Â Â Â event->cgrp = cgrp;
> +
> + Â Â Â /*
> + Â Â Â Â* all events in a group must monitor
> + Â Â Â Â* the same cgroup because a task belongs
> + Â Â Â Â* to only one perf cgroup at a time
> + Â Â Â Â*/
> + Â Â Â if (group_leader && group_leader->cgrp != cgrp) {
> + Â Â Â Â Â Â Â perf_detach_cgroup(event);
> + Â Â Â Â Â Â Â ret = -EINVAL;
> + Â Â Â } else {
> + Â Â Â Â Â Â Â /* must be done before we fput() the file */
> + Â Â Â Â Â Â Â perf_get_cgroup(event);
> + Â Â Â }
> + Â Â Â fput_light(file, fput_needed);
> + Â Â Â return ret;
> +}
> +
> +static inline void
> +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
> +{
> + Â Â Â struct perf_cgroup_info *t;
> + Â Â Â t = per_cpu_ptr(event->cgrp->info, event->cpu);
> + Â Â Â event->shadow_ctx_time = now - t->timestamp;
> +}
> +
> +static inline void
> +perf_cgroup_defer_enabled(struct perf_event *event, struct task_struct *task)
> +{
> + Â Â Â /*
> + Â Â Â Â* when the current task's perf cgroup does not match
> + Â Â Â Â* the event's, we need to remember to call the
> + Â Â Â Â* perf_mark_enable() function the first time a task with
> + Â Â Â Â* a matching perf cgroup is scheduled in.
> + Â Â Â Â*/
> + Â Â Â if (is_cgroup_event(event) && !perf_cgroup_match(event, task))
> + Â Â Â Â Â Â Â event->cgrp_defer_enabled = 1;
> +}
> +
> +static inline void
> +perf_cgroup_mark_enabled(struct perf_event *event,
> + Â Â Â Â Â Â Â Â Â Â Â Âstruct perf_event_context *ctx)
> +{
> + Â Â Â struct perf_event *sub;
> + Â Â Â u64 tstamp = perf_event_time(event);
> +
> + Â Â Â if (!event->cgrp_defer_enabled)
> + Â Â Â Â Â Â Â return;
> +
> + Â Â Â event->cgrp_defer_enabled = 0;
> +
> + Â Â Â event->tstamp_enabled = tstamp - event->total_time_enabled;
> + Â Â Â list_for_each_entry(sub, &event->sibling_list, group_entry) {
> + Â Â Â Â Â Â Â if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
> + Â Â Â Â Â Â Â Â Â Â Â sub->tstamp_enabled = tstamp - sub->total_time_enabled;
> + Â Â Â Â Â Â Â Â Â Â Â sub->cgrp_defer_enabled = 0;
> + Â Â Â Â Â Â Â }
> + Â Â Â }
> +}
> +#else /* !CONFIG_CGROUP_PERF */
> +
> +static inline bool
> +perf_cgroup_match(struct perf_event *event, struct task_struct *task)
> +{
> + Â Â Â return true;
> +}
> +
> +static inline void perf_detach_cgroup(struct perf_event *event)
> +{}
> +
> +static inline int is_cgroup_event(struct perf_event *event)
> +{
> + Â Â Â return 0;
> +}
> +
> +static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
> +{
> + Â Â Â return 0;
> +}
> +
> +static inline void update_cgrp_time_from_event(struct perf_event *event)
> +{}
> +
> +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
> +{}
> +
> +static inline void perf_cgroup_sched_out(struct task_struct *task)
> +{
> +}
> +
> +static inline void perf_cgroup_sched_in(struct task_struct *task)
> +{
> +}
> +
> +static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â struct perf_event_attr *attr,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â struct perf_event *group_leader)
> +{
> + Â Â Â return -EINVAL;
> +}
> +
> +static inline void
> +perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
> +{}
> +
> +void
> +perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
> +{}
> +
> +static inline void
> +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
> +{}
> +
> +static inline u64 perf_cgroup_event_time(struct perf_event *event)
> +{
> + Â Â Â return 0;
> +}
> +
> +static inline void
> +perf_cgroup_defer_enabled(struct perf_event *event, struct task_struct *task)
> +{}
> +
> +static inline void
> +perf_cgroup_mark_enabled(struct perf_event *event,
> + Â Â Â Â Â Â Â Â Â Â Â Âstruct perf_event_context *ctx)
> +{}
> +#endif
> +
> Âvoid perf_pmu_disable(struct pmu *pmu)
> Â{
> Â Â Â Âint *count = this_cpu_ptr(pmu->pmu_disable_count);
> @@ -343,6 +666,10 @@ static void update_context_time(struct p
> Âstatic u64 perf_event_time(struct perf_event *event)
> Â{
> Â Â Â Âstruct perf_event_context *ctx = event->ctx;
> +
> + Â Â Â if (is_cgroup_event(event))
> + Â Â Â Â Â Â Â return perf_cgroup_event_time(event);
> +
> Â Â Â Âreturn ctx ? ctx->time : 0;
> Â}
>
> @@ -357,9 +684,20 @@ static void update_event_times(struct pe
> Â Â Â Âif (event->state < PERF_EVENT_STATE_INACTIVE ||
> Â Â Â Â Â Âevent->group_leader->state < PERF_EVENT_STATE_INACTIVE)
> Â Â Â Â Â Â Â Âreturn;
> -
> - Â Â Â if (ctx->is_active)
> + Â Â Â /*
> + Â Â Â Â* in cgroup mode, time_enabled represents
> + Â Â Â Â* the time the event was enabled AND active
> + Â Â Â Â* tasks were in the monitored cgroup. This is
> + Â Â Â Â* independent of the activity of the context as
> + Â Â Â Â* there may be a mix of cgroup and non-cgroup events.
> + Â Â Â Â*
> + Â Â Â Â* That is why we treat cgroup events differently
> + Â Â Â Â* here.
> + Â Â Â Â*/
> + Â Â Â if (is_cgroup_event(event))
> Â Â Â Â Â Â Â Ârun_end = perf_event_time(event);
> + Â Â Â else if (ctx->is_active)
> + Â Â Â Â Â Â Â run_end = ctx->time;
> Â Â Â Âelse
> Â Â Â Â Â Â Â Ârun_end = event->tstamp_stopped;
>
> @@ -371,6 +709,7 @@ static void update_event_times(struct pe
> Â Â Â Â Â Â Â Ârun_end = perf_event_time(event);
>
> Â Â Â Âevent->total_time_running = run_end - event->tstamp_running;
> +
> Â}
>
> Â/*
> @@ -419,6 +758,17 @@ list_add_event(struct perf_event *event,
> Â Â Â Â Â Â Â Âlist_add_tail(&event->group_entry, list);
> Â Â Â Â}
>
> + Â Â Â if (is_cgroup_event(event)) {
> + Â Â Â Â Â Â Â ctx->nr_cgroups++;
> + Â Â Â Â Â Â Â /*
> + Â Â Â Â Â Â Â Â* one more event:
> + Â Â Â Â Â Â Â Â* - that has cgroup constraint on event->cpu
> + Â Â Â Â Â Â Â Â* - that may need work on context switch
> + Â Â Â Â Â Â Â Â*/
> + Â Â Â Â Â Â Â atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
> + Â Â Â Â Â Â Â jump_label_inc(&perf_sched_events);
> + Â Â Â }
> +
> Â Â Â Âlist_add_rcu(&event->event_entry, &ctx->event_list);
> Â Â Â Âif (!ctx->nr_events)
> Â Â Â Â Â Â Â Âperf_pmu_rotate_start(ctx->pmu);
> @@ -545,6 +895,12 @@ list_del_event(struct perf_event *event,
>
> Â Â Â Âevent->attach_state &= ~PERF_ATTACH_CONTEXT;
>
> + Â Â Â if (is_cgroup_event(event)) {
> + Â Â Â Â Â Â Â ctx->nr_cgroups--;
> + Â Â Â Â Â Â Â atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
> + Â Â Â Â Â Â Â jump_label_dec(&perf_sched_events);
> + Â Â Â }
> +
> Â Â Â Âctx->nr_events--;
> Â Â Â Âif (event->attr.inherit_stat)
> Â Â Â Â Â Â Â Âctx->nr_stat--;
> @@ -614,9 +970,10 @@ static void perf_group_detach(struct per
> Â}
>
> Âstatic inline int
> -event_filter_match(struct perf_event *event)
> +event_filter_match(struct perf_event *event, struct task_struct *task)
> Â{
> - Â Â Â return event->cpu == -1 || event->cpu == smp_processor_id();
> + Â Â Â return (event->cpu == -1 || event->cpu == smp_processor_id())
> + Â Â Â Â Â && perf_cgroup_match(event, task);
> Â}
>
> Âstatic void
> @@ -633,8 +990,8 @@ event_sched_out(struct perf_event *event
> Â Â Â Â * via read() for time_enabled, time_running:
> Â Â Â Â */
> Â Â Â Âif (event->state == PERF_EVENT_STATE_INACTIVE
> - Â Â Â Â Â && !event_filter_match(event)) {
> - Â Â Â Â Â Â Â delta = ctx->time - event->tstamp_stopped;
> + Â Â Â Â Â && !event_filter_match(event, current)) {
> + Â Â Â Â Â Â Â delta = tstamp - event->tstamp_stopped;
> Â Â Â Â Â Â Â Âevent->tstamp_running += delta;
> Â Â Â Â Â Â Â Âevent->tstamp_stopped = tstamp;
> Â Â Â Â}
> @@ -783,6 +1140,7 @@ static int __perf_event_disable(void *in
> Â Â Â Â */
> Â Â Â Âif (event->state >= PERF_EVENT_STATE_INACTIVE) {
> Â Â Â Â Â Â Â Âupdate_context_time(ctx);
> + Â Â Â Â Â Â Â update_cgrp_time_from_event(event);
> Â Â Â Â Â Â Â Âupdate_group_times(event);
> Â Â Â Â Â Â Â Âif (event == event->group_leader)
> Â Â Â Â Â Â Â Â Â Â Â Âgroup_sched_out(event, cpuctx, ctx);
> @@ -851,6 +1209,41 @@ void perf_event_disable(struct perf_even
> Â Â Â Âraw_spin_unlock_irq(&ctx->lock);
> Â}
>
> +static void perf_set_shadow_time(struct perf_event *event,
> +				 struct perf_event_context *ctx,
> +				 u64 tstamp)
> +{
> +	/*
> +	 * use the correct time source for the time snapshot
> +	 *
> +	 * We could get by without this by leveraging the
> +	 * fact that to get to this function, the caller
> +	 * has most likely already called update_context_time()
> +	 * and update_cgrp_time_xx() and thus both timestamps
> +	 * are identical (or very close). Given that tstamp is
> +	 * already adjusted for cgroup, we could say that:
> +	 *    tstamp - ctx->timestamp
> +	 * is equivalent to
> +	 *    tstamp - cgrp->timestamp.
> +	 *
> +	 * Then, in perf_output_read(), the calculation would
> +	 * work with no changes because:
> +	 * - event is guaranteed scheduled in
> +	 * - no scheduling out in between
> +	 * - thus the timestamp would be the same
> +	 *
> +	 * But this is a bit hairy.
> +	 *
> +	 * So instead, we have an explicit cgroup call to remain
> +	 * within the time source all along. We believe it
> +	 * is cleaner and simpler to understand.
> +	 */
> +	if (is_cgroup_event(event))
> +		perf_cgroup_set_shadow_time(event, tstamp);
> +	else
> +		event->shadow_ctx_time = tstamp - ctx->timestamp;
> +}
> +
> Âstatic int
> Âevent_sched_in(struct perf_event *event,
> Â Â Â Â Â Â Â Â struct perf_cpu_context *cpuctx,
> @@ -876,7 +1269,7 @@ event_sched_in(struct perf_event *event,
>
> Â Â Â Âevent->tstamp_running += tstamp - event->tstamp_stopped;
>
> - Â Â Â event->shadow_ctx_time = tstamp - ctx->timestamp;
> + Â Â Â perf_set_shadow_time(event, ctx, tstamp);
>
> Â Â Â Âif (!is_software_event(event))
> Â Â Â Â Â Â Â Âcpuctx->active_oncpu++;
> @@ -992,12 +1385,13 @@ static void add_event_to_ctx(struct perf
>
> Â Â Â Âlist_add_event(event, ctx);
> Â Â Â Âperf_group_attach(event);
> - Â Â Â event->tstamp_enabled = tstamp;
> Â Â Â Âevent->tstamp_running = tstamp;
> Â Â Â Âevent->tstamp_stopped = tstamp;
> + Â Â Â event->tstamp_enabled = tstamp;
> Â}
>
> -static void perf_event_context_sched_in(struct perf_event_context *ctx);
> +static void perf_event_context_sched_in(struct perf_event_context *ctx,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â struct task_struct *tsk);
>
> Â/*
> Â* Cross CPU call to install and enable a performance event
> @@ -1018,15 +1412,21 @@ static int Â__perf_install_in_context(vo
> Â Â Â Â * which do context switches with IRQs enabled.
> Â Â Â Â */
> Â Â Â Âif (ctx->task && !cpuctx->task_ctx)
> - Â Â Â Â Â Â Â perf_event_context_sched_in(ctx);
> + Â Â Â Â Â Â Â perf_event_context_sched_in(ctx, ctx->task);
>
> Â Â Â Âraw_spin_lock(&ctx->lock);
> Â Â Â Âctx->is_active = 1;
> Â Â Â Âupdate_context_time(ctx);
> + Â Â Â /*
> + Â Â Â Â* update cgrp time only if current cgrp
> + Â Â Â Â* matches event->cgrp. Must be done before
> + Â Â Â Â* calling add_event_to_ctx()
> + Â Â Â Â*/
> + Â Â Â update_cgrp_time_from_event(event);
>
> Â Â Â Âadd_event_to_ctx(event, ctx);
>
> - Â Â Â if (!event_filter_match(event))
> + Â Â Â if (!event_filter_match(event, current))
> Â Â Â Â Â Â Â Âgoto unlock;
>
> Â Â Â Â/*
> @@ -1160,10 +1560,19 @@ static int __perf_event_enable(void *inf
>
> Â Â Â Âif (event->state >= PERF_EVENT_STATE_INACTIVE)
> Â Â Â Â Â Â Â Âgoto unlock;
> +
> + Â Â Â /*
> + Â Â Â Â* set current task's cgroup time reference point
> + Â Â Â Â*/
> + Â Â Â perf_cgroup_set_timestamp(current, perf_clock());
> +
> Â Â Â Â__perf_event_mark_enabled(event, ctx);
>
> - Â Â Â if (!event_filter_match(event))
> + Â Â Â if (!event_filter_match(event, current)) {
> + Â Â Â Â Â Â Â if (is_cgroup_event(event))
> + Â Â Â Â Â Â Â Â Â Â Â perf_cgroup_defer_enabled(event, current);
> Â Â Â Â Â Â Â Âgoto unlock;
> + Â Â Â }
>
> Â Â Â Â/*
> Â Â Â Â * If the event is in a group and isn't the group leader,
> @@ -1292,6 +1701,7 @@ static void ctx_sched_out(struct perf_ev
> Â Â Â Âif (likely(!ctx->nr_events))
> Â Â Â Â Â Â Â Âgoto out;
> Â Â Â Âupdate_context_time(ctx);
> + Â Â Â update_cgrp_time_from_cpuctx(cpuctx);
>
> Â Â Â Âif (!ctx->nr_active)
> Â Â Â Â Â Â Â Âgoto out;
> @@ -1481,6 +1891,14 @@ void __perf_event_task_sched_out(struct
>
>  	for_each_task_context_nr(ctxn)
>  		perf_event_context_sched_out(task, ctxn, next);
> +
> +	/*
> +	 * if cgroup events exist on this CPU, then we need
> +	 * to check if we have to switch out PMU state.
> +	 * cgroup events are system-wide mode only.
> +	 */
> +	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
> +		perf_cgroup_sched_out(task);
>  }
>
> Âstatic void task_ctx_sched_out(struct perf_event_context *ctx,
> @@ -1509,16 +1927,21 @@ static void cpu_ctx_sched_out(struct per
>
> Âstatic void
> Âctx_pinned_sched_in(struct perf_event_context *ctx,
> - Â Â Â Â Â Â Â Â Â struct perf_cpu_context *cpuctx)
> + Â Â Â Â Â Â Â Â Â struct perf_cpu_context *cpuctx,
> + Â Â Â Â Â Â Â Â Â struct task_struct *task, int cgrp_sw)
> Â{
> Â Â Â Âstruct perf_event *event;
>
> Â Â Â Âlist_for_each_entry(event, &ctx->pinned_groups, group_entry) {
> Â Â Â Â Â Â Â Âif (event->state <= PERF_EVENT_STATE_OFF)
> Â Â Â Â Â Â Â Â Â Â Â Âcontinue;
> - Â Â Â Â Â Â Â if (!event_filter_match(event))
> + Â Â Â Â Â Â Â if (!event_filter_match(event, task))
> Â Â Â Â Â Â Â Â Â Â Â Âcontinue;
>
> + Â Â Â Â Â Â Â /* may need to reset tstamp_enabled */
> + Â Â Â Â Â Â Â if (is_cgroup_event(event))
> + Â Â Â Â Â Â Â Â Â Â Â perf_cgroup_mark_enabled(event, ctx);
> +
> Â Â Â Â Â Â Â Âif (group_can_go_on(event, cpuctx, 1))
> Â Â Â Â Â Â Â Â Â Â Â Âgroup_sched_in(event, cpuctx, ctx);
>
> @@ -1535,7 +1958,8 @@ ctx_pinned_sched_in(struct perf_event_co
>
> Âstatic void
> Âctx_flexible_sched_in(struct perf_event_context *ctx,
> - Â Â Â Â Â Â Â Â Â Â struct perf_cpu_context *cpuctx)
> + Â Â Â Â Â Â Â Â Â Â struct perf_cpu_context *cpuctx,
> + Â Â Â Â Â Â Â Â Â Â struct task_struct *task, int cgrp_sw)
> Â{
> Â Â Â Âstruct perf_event *event;
> Â Â Â Âint can_add_hw = 1;
> @@ -1548,9 +1972,13 @@ ctx_flexible_sched_in(struct perf_event_
> Â Â Â Â Â Â Â Â * Listen to the 'cpu' scheduling filter constraint
> Â Â Â Â Â Â Â Â * of events:
> Â Â Â Â Â Â Â Â */
> - Â Â Â Â Â Â Â if (!event_filter_match(event))
> + Â Â Â Â Â Â Â if (!event_filter_match(event, task))
> Â Â Â Â Â Â Â Â Â Â Â Âcontinue;
>
> + Â Â Â Â Â Â Â /* may need to reset tstamp_enabled */
> + Â Â Â Â Â Â Â if (is_cgroup_event(event))
> + Â Â Â Â Â Â Â Â Â Â Â perf_cgroup_mark_enabled(event, ctx);
> +
> Â Â Â Â Â Â Â Âif (group_can_go_on(event, cpuctx, can_add_hw)) {
> Â Â Â Â Â Â Â Â Â Â Â Âif (group_sched_in(event, cpuctx, ctx))
> Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Âcan_add_hw = 0;
> @@ -1561,36 +1989,41 @@ ctx_flexible_sched_in(struct perf_event_
> Âstatic void
> Âctx_sched_in(struct perf_event_context *ctx,
> Â Â Â Â Â Â struct perf_cpu_context *cpuctx,
> - Â Â Â Â Â Âenum event_type_t event_type)
> + Â Â Â Â Â Âenum event_type_t event_type,
> + Â Â Â Â Â Âstruct task_struct *task, int cgrp_sw)
> Â{
> + Â Â Â u64 now;
> +
> Â Â Â Âraw_spin_lock(&ctx->lock);
> Â Â Â Âctx->is_active = 1;
> Â Â Â Âif (likely(!ctx->nr_events))
> Â Â Â Â Â Â Â Âgoto out;
>
> - Â Â Â ctx->timestamp = perf_clock();
> -
> + Â Â Â now = perf_clock();
> + Â Â Â ctx->timestamp = now;
> + Â Â Â perf_cgroup_set_timestamp(task, now);
> Â Â Â Â/*
> Â Â Â Â * First go through the list and put on any pinned groups
> Â Â Â Â * in order to give them the best chance of going on.
> Â Â Â Â */
> Â Â Â Âif (event_type & EVENT_PINNED)
> - Â Â Â Â Â Â Â ctx_pinned_sched_in(ctx, cpuctx);
> + Â Â Â Â Â Â Â ctx_pinned_sched_in(ctx, cpuctx, task, cgrp_sw);
>
> Â Â Â Â/* Then walk through the lower prio flexible groups */
> Â Â Â Âif (event_type & EVENT_FLEXIBLE)
> - Â Â Â Â Â Â Â ctx_flexible_sched_in(ctx, cpuctx);
> + Â Â Â Â Â Â Â ctx_flexible_sched_in(ctx, cpuctx, task, cgrp_sw);
>
> Âout:
> Â Â Â Âraw_spin_unlock(&ctx->lock);
> Â}
>
> Âstatic void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
> - Â Â Â Â Â Â Â Â Â Â Â Â Â Âenum event_type_t event_type)
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Âenum event_type_t event_type,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Âstruct task_struct *task, int cgrp_sw)
> Â{
> Â Â Â Âstruct perf_event_context *ctx = &cpuctx->ctx;
>
> - Â Â Â ctx_sched_in(ctx, cpuctx, event_type);
> + Â Â Â ctx_sched_in(ctx, cpuctx, event_type, task, cgrp_sw);
> Â}
>
> Âstatic void task_ctx_sched_in(struct perf_event_context *ctx,
> @@ -1602,11 +2035,12 @@ static void task_ctx_sched_in(struct per
> Â Â Â Âif (cpuctx->task_ctx == ctx)
> Â Â Â Â Â Â Â Âreturn;
>
> - Â Â Â ctx_sched_in(ctx, cpuctx, event_type);
> + Â Â Â ctx_sched_in(ctx, cpuctx, event_type, NULL, 0);
> Â Â Â Âcpuctx->task_ctx = ctx;
> Â}
>
> -static void perf_event_context_sched_in(struct perf_event_context *ctx)
> +static void perf_event_context_sched_in(struct perf_event_context *ctx,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â struct task_struct *task)
> Â{
> Â Â Â Âstruct perf_cpu_context *cpuctx;
>
> @@ -1622,9 +2056,9 @@ static void perf_event_context_sched_in(
> Â Â Â Â */
> Â Â Â Âcpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>
> - Â Â Â ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
> - Â Â Â cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
> - Â Â Â ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
> + Â Â Â ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, 0);
> + Â Â Â cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, 0);
> + Â Â Â ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, 0);
>
> Â Â Â Âcpuctx->task_ctx = ctx;
>
> @@ -1657,8 +2091,15 @@ void __perf_event_task_sched_in(struct t
>  		if (likely(!ctx))
>  			continue;
>
> -		perf_event_context_sched_in(ctx);
> +		perf_event_context_sched_in(ctx, task);
>  	}
> +	/*
> +	 * if cgroup events exist on this CPU, then we need
> +	 * to check if we have to switch in PMU state.
> +	 * cgroup events are system-wide mode only.
> +	 */
> +	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
> +		perf_cgroup_sched_in(task);
>  }
>
> Â#define MAX_INTERRUPTS (~0ULL)
> @@ -1775,7 +2216,7 @@ static void perf_ctx_adjust_freq(struct
> Â Â Â Â Â Â Â Âif (event->state != PERF_EVENT_STATE_ACTIVE)
> Â Â Â Â Â Â Â Â Â Â Â Âcontinue;
>
> - Â Â Â Â Â Â Â if (!event_filter_match(event))
> + Â Â Â Â Â Â Â if (!event_filter_match(event, current))
> Â Â Â Â Â Â Â Â Â Â Â Âcontinue;
>
> Â Â Â Â Â Â Â Âhwc = &event->hw;
> @@ -1833,9 +2274,10 @@ static void perf_rotate_context(struct p
> Â Â Â Âstruct perf_event_context *ctx = NULL;
> Â Â Â Âint rotate = 0, remove = 1;
>
> - Â Â Â if (cpuctx->ctx.nr_events) {
> + Â Â Â ctx = &cpuctx->ctx;
> + Â Â Â if (ctx->nr_events) {
> Â Â Â Â Â Â Â Âremove = 0;
> - Â Â Â Â Â Â Â if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
> + Â Â Â Â Â Â Â if (ctx->nr_events != ctx->nr_active)
> Â Â Â Â Â Â Â Â Â Â Â Ârotate = 1;
> Â Â Â Â}
>
> @@ -1862,7 +2304,7 @@ static void perf_rotate_context(struct p
> Â Â Â Âif (ctx)
> Â Â Â Â Â Â Â Ârotate_ctx(ctx);
>
> - Â Â Â cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
> + Â Â Â cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current, 0);
> Â Â Â Âif (ctx)
> Â Â Â Â Â Â Â Âtask_ctx_sched_in(ctx, EVENT_FLEXIBLE);
>
> @@ -1941,7 +2383,7 @@ static void perf_event_enable_on_exec(st
>
> Â Â Â Âraw_spin_unlock(&ctx->lock);
>
> - Â Â Â perf_event_context_sched_in(ctx);
> + Â Â Â perf_event_context_sched_in(ctx, ctx->task);
> Âout:
> Â Â Â Âlocal_irq_restore(flags);
> Â}
> @@ -1968,6 +2410,7 @@ static void __perf_event_read(void *info
> Â Â Â Âraw_spin_lock(&ctx->lock);
> Â Â Â Âif (ctx->is_active)
> Â Â Â Â Â Â Â Âupdate_context_time(ctx);
> + Â Â Â update_cgrp_time_from_event(event);
> Â Â Â Âupdate_event_times(event);
> Â Â Â Âif (event->state == PERF_EVENT_STATE_ACTIVE)
> Â Â Â Â Â Â Â Âevent->pmu->read(event);
> @@ -1998,8 +2441,10 @@ static u64 perf_event_read(struct perf_e
> Â Â Â Â Â Â Â Â * (e.g., thread is blocked), in that case
> Â Â Â Â Â Â Â Â * we cannot update context time
> Â Â Â Â Â Â Â Â */
> - Â Â Â Â Â Â Â if (ctx->is_active)
> + Â Â Â Â Â Â Â if (ctx->is_active) {
> Â Â Â Â Â Â Â Â Â Â Â Âupdate_context_time(ctx);
> + Â Â Â Â Â Â Â Â Â Â Â update_cgrp_time_from_event(event);
> + Â Â Â Â Â Â Â }
> Â Â Â Â Â Â Â Âupdate_event_times(event);
> Â Â Â Â Â Â Â Âraw_spin_unlock_irqrestore(&ctx->lock, flags);
> Â Â Â Â}
> @@ -2384,7 +2829,7 @@ static void free_event(struct perf_event
>
> Â Â Â Âif (!event->parent) {
> Â Â Â Â Â Â Â Âif (event->attach_state & PERF_ATTACH_TASK)
> - Â Â Â Â Â Â Â Â Â Â Â jump_label_dec(&perf_task_events);
> + Â Â Â Â Â Â Â Â Â Â Â jump_label_dec(&perf_sched_events);
> Â Â Â Â Â Â Â Âif (event->attr.mmap || event->attr.mmap_data)
> Â Â Â Â Â Â Â Â Â Â Â Âatomic_dec(&nr_mmap_events);
> Â Â Â Â Â Â Â Âif (event->attr.comm)
> @@ -2400,6 +2845,9 @@ static void free_event(struct perf_event
> Â Â Â Â Â Â Â Âevent->buffer = NULL;
> Â Â Â Â}
>
> + Â Â Â if (is_cgroup_event(event))
> + Â Â Â Â Â Â Â perf_detach_cgroup(event);
> +
> Â Â Â Âif (event->destroy)
> Â Â Â Â Â Â Â Âevent->destroy(event);
>
> @@ -3984,7 +4432,7 @@ static int perf_event_task_match(struct
> Â Â Â Âif (event->state < PERF_EVENT_STATE_INACTIVE)
> Â Â Â Â Â Â Â Âreturn 0;
>
> - Â Â Â if (!event_filter_match(event))
> + Â Â Â if (!event_filter_match(event, current))
> Â Â Â Â Â Â Â Âreturn 0;
>
> Â Â Â Âif (event->attr.comm || event->attr.mmap ||
> @@ -4121,7 +4569,7 @@ static int perf_event_comm_match(struct
> Â Â Â Âif (event->state < PERF_EVENT_STATE_INACTIVE)
> Â Â Â Â Â Â Â Âreturn 0;
>
> - Â Â Â if (!event_filter_match(event))
> + Â Â Â if (!event_filter_match(event, current))
> Â Â Â Â Â Â Â Âreturn 0;
>
> Â Â Â Âif (event->attr.comm)
> @@ -4269,7 +4717,7 @@ static int perf_event_mmap_match(struct
> Â Â Â Âif (event->state < PERF_EVENT_STATE_INACTIVE)
> Â Â Â Â Â Â Â Âreturn 0;
>
> - Â Â Â if (!event_filter_match(event))
> + Â Â Â if (!event_filter_match(event, current))
> Â Â Â Â Â Â Â Âreturn 0;
>
> Â Â Â Âif ((!executable && event->attr.mmap_data) ||
> @@ -5289,6 +5737,7 @@ static void task_clock_event_read(struct
>
> Â Â Â Âif (!in_nmi()) {
> Â Â Â Â Â Â Â Âupdate_context_time(event->ctx);
> + Â Â Â Â Â Â Â update_cgrp_time_from_event(event);
> Â Â Â Â Â Â Â Âtime = event->ctx->time;
> Â Â Â Â} else {
> Â Â Â Â Â Â Â Âu64 now = perf_clock();
> @@ -5714,7 +6163,7 @@ perf_event_alloc(struct perf_event_attr
>
> Â Â Â Âif (!event->parent) {
> Â Â Â Â Â Â Â Âif (event->attach_state & PERF_ATTACH_TASK)
> - Â Â Â Â Â Â Â Â Â Â Â jump_label_inc(&perf_task_events);
> + Â Â Â Â Â Â Â Â Â Â Â jump_label_inc(&perf_sched_events);
> Â Â Â Â Â Â Â Âif (event->attr.mmap || event->attr.mmap_data)
> Â Â Â Â Â Â Â Â Â Â Â Âatomic_inc(&nr_mmap_events);
> Â Â Â Â Â Â Â Âif (event->attr.comm)
> @@ -5889,7 +6338,7 @@ SYSCALL_DEFINE5(perf_event_open,
> Â Â Â Âint err;
>
> Â Â Â Â/* for future expandability... */
> - Â Â Â if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
> + Â Â Â if (flags & ~PERF_FLAG_ALL)
> Â Â Â Â Â Â Â Âreturn -EINVAL;
>
> Â Â Â Âerr = perf_copy_attr(attr_uptr, &attr);
> @@ -5906,6 +6355,15 @@ SYSCALL_DEFINE5(perf_event_open,
> Â Â Â Â Â Â Â Â Â Â Â Âreturn -EINVAL;
> Â Â Â Â}
>
> + Â Â Â /*
> + Â Â Â Â* In cgroup mode, the pid argument is used to pass the fd
> + Â Â Â Â* opened to the cgroup directory in cgroupfs. The cpu argument
> + Â Â Â Â* designates the cpu on which to monitor threads from that
> + Â Â Â Â* cgroup.
> + Â Â Â Â*/
> + Â Â Â if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
> + Â Â Â Â Â Â Â return -EINVAL;
> +
> Â Â Â Âevent_fd = get_unused_fd_flags(O_RDWR);
> Â Â Â Âif (event_fd < 0)
> Â Â Â Â Â Â Â Âreturn event_fd;
> @@ -5923,7 +6381,7 @@ SYSCALL_DEFINE5(perf_event_open,
> Â Â Â Â Â Â Â Â Â Â Â Âgroup_leader = NULL;
> Â Â Â Â}
>
> - Â Â Â if (pid != -1) {
> + Â Â Â if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
> Â Â Â Â Â Â Â Âtask = find_lively_task_by_vpid(pid);
> Â Â Â Â Â Â Â Âif (IS_ERR(task)) {
> Â Â Â Â Â Â Â Â Â Â Â Âerr = PTR_ERR(task);
> @@ -5937,6 +6395,12 @@ SYSCALL_DEFINE5(perf_event_open,
> Â Â Â Â Â Â Â Âgoto err_task;
> Â Â Â Â}
>
> + Â Â Â if (flags & PERF_FLAG_PID_CGROUP) {
> + Â Â Â Â Â Â Â err = perf_cgroup_connect(pid, event, &attr, group_leader);
> + Â Â Â Â Â Â Â if (err)
> + Â Â Â Â Â Â Â Â Â Â Â goto err_alloc;
> + Â Â Â }
> +
> Â Â Â Â/*
> Â Â Â Â * Special case software events and allow them to be part of
> Â Â Â Â * any hardware group.
> @@ -6797,3 +7261,92 @@ static int __init perf_event_sysfs_init(
> Â Â Â Âreturn ret;
> Â}
> Âdevice_initcall(perf_event_sysfs_init);
> +
> +#ifdef CONFIG_CGROUP_PERF
> +static struct cgroup_subsys_state *perf_cgroup_create(
> + Â Â Â struct cgroup_subsys *ss, struct cgroup *cont)
> +{
> + Â Â Â struct perf_cgroup *jc;
> + Â Â Â struct perf_cgroup_info *t;
> + Â Â Â int c;
> +
> + Â Â Â jc = kmalloc(sizeof(*jc), GFP_KERNEL);
> + Â Â Â if (!jc)
> + Â Â Â Â Â Â Â return ERR_PTR(-ENOMEM);
> +
> + Â Â Â memset(jc, 0, sizeof(*jc));
> +
> + Â Â Â jc->info = alloc_percpu(struct perf_cgroup_info);
> + Â Â Â if (!jc->info) {
> + Â Â Â Â Â Â Â kfree(jc);
> + Â Â Â Â Â Â Â return ERR_PTR(-ENOMEM);
> + Â Â Â }
> +
> + Â Â Â for_each_possible_cpu(c) {
> + Â Â Â Â Â Â Â t = per_cpu_ptr(jc->info, c);
> + Â Â Â Â Â Â Â t->time = 0;
> + Â Â Â Â Â Â Â t->timestamp = 0;
> + Â Â Â }
> + Â Â Â return &jc->css;
> +}
> +
> +static void perf_cgroup_destroy(struct cgroup_subsys *ss,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â struct cgroup *cont)
> +{
> + Â Â Â struct perf_cgroup *jc;
> + Â Â Â jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
> + Â Â Â Â Â Â Â Â Â Â Â Â struct perf_cgroup, css);
> + Â Â Â free_percpu(jc->info);
> + Â Â Â kfree(jc);
> +}
> +
> +static int __perf_cgroup_move(void *info)
> +{
> + Â Â Â struct task_struct *task = info;
> + Â Â Â perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
> + Â Â Â return 0;
> +}
> +
> +static void perf_cgroup_move(struct task_struct *task)
> +{
> + Â Â Â task_function_call(task, __perf_cgroup_move, task);
> +}
> +
> +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> + Â Â Â Â Â Â Â struct cgroup *old_cgrp, struct task_struct *task,
> + Â Â Â Â Â Â Â bool threadgroup)
> +{
> + Â Â Â perf_cgroup_move(task);
> + Â Â Â if (threadgroup) {
> + Â Â Â Â Â Â Â struct task_struct *c;
> + Â Â Â Â Â Â Â rcu_read_lock();
> + Â Â Â Â Â Â Â list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
> + Â Â Â Â Â Â Â Â Â Â Â perf_cgroup_move(c);
> + Â Â Â Â Â Â Â }
> + Â Â Â Â Â Â Â rcu_read_unlock();
> + Â Â Â }
> +}
> +
> +static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
> +		struct cgroup *old_cgrp, struct task_struct *task)
> +{
> +	/*
> +	 * cgroup_exit() is called in the copy_process() failure path.
> +	 * Ignore this case since the task hasn't run yet; this avoids
> +	 * trying to poke a half-freed task state from generic code.
> +	 */
> +	if (!(task->flags & PF_EXITING))
> +		return;
> +
> +	perf_cgroup_move(task);
> +}
> +
> +struct cgroup_subsys perf_subsys = {
> + Â Â Â .name = "perf_event",
> + Â Â Â .subsys_id = perf_subsys_id,
> + Â Â Â .create = perf_cgroup_create,
> + Â Â Â .destroy = perf_cgroup_destroy,
> + Â Â Â .exit = perf_cgroup_exit,
> + Â Â Â .attach = perf_cgroup_attach,
> +};
> +#endif /* CONFIG_CGROUP_PERF */
>
>
>