Re: [External] Re: [PATCH v2 2/6] perf/core: Introduce percpu perf_cgroup

From: Chengming Zhou
Date: Wed Mar 23 2022 - 09:07:17 EST


On 2022/3/23 8:51 PM, Peter Zijlstra wrote:
> On Tue, Mar 22, 2022 at 08:08:30PM +0800, Chengming Zhou wrote:
>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 8b5cf2aedfe6..848a3bfa9513 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>
>> @@ -843,11 +845,21 @@ static void perf_cgroup_switch(struct task_struct *task)
>>  	 */
>>  	local_irq_save(flags);
>>
>> +	cgrp = perf_cgroup_from_task(task, NULL);
>> +	if (cgrp == __this_cpu_read(cpu_perf_cgroup))
>> +		goto out;
>> +
>> +	__this_cpu_write(cpu_perf_cgroup, cgrp);
>> +
>>  	list = this_cpu_ptr(&cgrp_cpuctx_list);
>>  	list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
>>  		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
>>
>>  		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>> +
>> +		if (cpuctx->cgrp == cgrp)
>> +			continue;
>> +
>>  		perf_pmu_disable(cpuctx->ctx.pmu);
>>
>>  		cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>
> This is just straight up broken.. you can't continue after taking a
> lock, that'll miss unlock.

Yes, Namhyung has already pointed this out; I will fix it in the next version.
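
Roughly, what I have in mind for the fix (untested, borrowing the goto next /
unlock pairing from your sketch below) is:

	list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

		perf_ctx_lock(cpuctx, cpuctx->task_ctx);

		/* already on the right cgroup, nothing to switch */
		if (cpuctx->cgrp == cgrp)
			goto next;

		perf_pmu_disable(cpuctx->ctx.pmu);

		cpu_ctx_sched_out(cpuctx, EVENT_ALL);
		cpuctx->cgrp = cgrp;
		cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);

		perf_pmu_enable(cpuctx->ctx.pmu);
next:
		/* reached on every iteration, so the lock above is always released */
		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
	}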

>
> Also, I really don't see the point of cpu_perf_cgroup, cpuctx->cgrp
> should be able to do this.

But the problem is that we can have two cpuctx on the percpu list. Do you
mean we should compare the perf_cgroup of the first cpuctx with the next
task's perf_cgroup?

Or should we delete the cgrp field in cpuctx and use this new percpu cpu_perf_cgroup instead?
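
If the former is what you mean, I guess the early check (untested, just to
make sure I understand the idea) would look something like:

	cgrp = perf_cgroup_from_task(task, NULL);

	list = this_cpu_ptr(&cgrp_cpuctx_list);

	/* peek at the first cpuctx on this CPU's list, if any */
	cpuctx = list_first_entry_or_null(list, struct perf_cpu_context,
					  cgrp_cpuctx_entry);

	/* assumes all cpuctx on the list were last switched to the same cgrp */
	if (cpuctx && cpuctx->cgrp == cgrp)
		goto out;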

>
> Also, perhaps merge this in the previous patch, I'm not sure it makes
> sense to split this.

Ok, will do. It was a single patch in the v1 RFC; I split it out to make the
review easier. I will put them back together in the next version.

Thanks.

>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -833,6 +833,7 @@ static DEFINE_PER_CPU(struct list_head,
>   */
>  static void perf_cgroup_switch(struct task_struct *task)
>  {
> +	struct perf_cgroup *cgrp;
>  	struct perf_cpu_context *cpuctx, *tmp;
>  	struct list_head *list;
>  	unsigned long flags;
> @@ -843,11 +844,20 @@ static void perf_cgroup_switch(struct ta
>  	 */
>  	local_irq_save(flags);
>
> +	cgrp = perf_cgroup_from_task(task, NULL);
> +
>  	list = this_cpu_ptr(&cgrp_cpuctx_list);
>  	list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
>  		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
>
> +		if (READ_ONCE(cpuctx->cgrp) == cgrp)
> +			continue;
> +
>  		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> +		if (cpuctx->cgrp == cgrp)
> +			goto next;
> +
>  		perf_pmu_disable(cpuctx->ctx.pmu);
>
>  		cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> @@ -855,50 +865,22 @@ static void perf_cgroup_switch(struct ta
>  		 * must not be done before ctxswout due
>  		 * to event_filter_match() in event_sched_out()
>  		 */
> -		cpuctx->cgrp = perf_cgroup_from_task(task,
> -						     &cpuctx->ctx);
> +		WRITE_ONCE(cpuctx->cgrp, cgrp);
>  		/*
>  		 * set cgrp before ctxsw in to allow
>  		 * event_filter_match() to not have to pass
>  		 * task around
> -		 * we pass the cpuctx->ctx to perf_cgroup_from_task()
> -		 * because cgroup events are only per-cpu
>  		 */
>  		cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
>
>  		perf_pmu_enable(cpuctx->ctx.pmu);
> +next:
>  		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>  	}
>
>  	local_irq_restore(flags);
>  }
>
> -static inline void perf_cgroup_sched_switch(struct task_struct *task,
> -					    struct task_struct *next)
> -{
> -	struct perf_cgroup *cgrp1;
> -	struct perf_cgroup *cgrp2 = NULL;
> -
> -	rcu_read_lock();
> -	/*
> -	 * we come here when we know perf_cgroup_events > 0
> -	 * we do not need to pass the ctx here because we know
> -	 * we are holding the rcu lock
> -	 */
> -	cgrp1 = perf_cgroup_from_task(task, NULL);
> -	cgrp2 = perf_cgroup_from_task(next, NULL);
> -
> -	/*
> -	 * only schedule out current cgroup events if we know
> -	 * that we are switching to a different cgroup. Otherwise,
> -	 * do no touch the cgroup events.
> -	 */
> -	if (cgrp1 != cgrp2)
> -		perf_cgroup_switch(task);
> -
> -	rcu_read_unlock();
> -}
> -
>  static int perf_cgroup_ensure_storage(struct perf_event *event,
>  				      struct cgroup_subsys_state *css)
>  {
> @@ -1062,11 +1044,6 @@ static inline void update_cgrp_time_from
>  {
>  }
>
> -static inline void perf_cgroup_sched_switch(struct task_struct *task,
> -					    struct task_struct *next)
> -{
> -}
> -
>  static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
>  				      struct perf_event_attr *attr,
>  				      struct perf_event *group_leader)
> @@ -1080,11 +1057,6 @@ perf_cgroup_set_timestamp(struct task_st
>  {
>  }
>
> -static inline void
> -perf_cgroup_sched_switch(struct task_struct *task, struct task_struct *next)
> -{
> -}
> -
>  static inline u64 perf_cgroup_event_time(struct perf_event *event)
>  {
>  	return 0;
> @@ -1104,6 +1076,10 @@ static inline void
>  perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
>  {
>  }
> +
> +static void perf_cgroup_switch(struct task_struct *task)
> +{
> +}
>  #endif
>
>  /*
> @@ -3625,7 +3601,7 @@ void __perf_event_task_sched_out(struct
>  	 * cgroup event are system-wide mode only
>  	 */
>  	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
> -		perf_cgroup_sched_switch(task, next);
> +		perf_cgroup_switch(next);
>  }
>
>  /*