Re: [PATCH v2] perf/core: install cgroup events to correct cpuctx

From: Peter Zijlstra
Date: Wed Mar 18 2020 - 15:33:53 EST


On Wed, Mar 18, 2020 at 07:05:35PM +0100, Peter Zijlstra wrote:
> On Wed, Mar 18, 2020 at 07:07:29AM +0000, Song Liu wrote:

> > Could you please share your thoughts on this? I think we cannot use current
> > in list_update_cgroup_event(), unless we call it on the target CPU.
>
> Bah, that cgroup crap is 'wrong'. It's pointless to track the
> cpuctx->cgrp state for disabled events.
>
> Let me see if I can unravel that crud.

This compiles, but I've no clue how to operate cgroups. Please test.
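
Completely untested sketch in case it helps whoever does: one way to poke
at the changed paths from userspace is perf_event_open(2) with
PERF_FLAG_PID_CGROUP, installing the event disabled and only then enabling
it -- that walks exactly the OFF -> INACTIVE transitions touched below.
The cgroup path and the "test" group are assumptions on my part (cgroup-v1
perf_event controller, group created beforehand), not something this patch
provides:

/* untested; assumes /sys/fs/cgroup/perf_event/test exists */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.size		= sizeof(attr),
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.disabled	= 1,	/* install in the OFF state first */
	};
	uint64_t count;
	int cgrp_fd, fd;

	/* with PERF_FLAG_PID_CGROUP, "pid" is a cgroup directory fd */
	cgrp_fd = open("/sys/fs/cgroup/perf_event/test", O_RDONLY);
	if (cgrp_fd < 0) {
		perror("open cgroup");
		return 1;
	}

	/* cgroup events are per-cpu events; pin this one to CPU0 */
	fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0, -1,
		     PERF_FLAG_PID_CGROUP);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	/* OFF -> INACTIVE; only counts while "test" tasks run on CPU0 */
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	sleep(1);
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("cycles: %llu\n", (unsigned long long)count);

	close(fd);
	close(cgrp_fd);
	return 0;
}

From the tooling side, something like:

  mkdir /sys/fs/cgroup/perf_event/test
  perf stat -e cycles -a -G test sleep 1

should exercise the same code.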

---
 kernel/events/core.c | 70 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 43 insertions(+), 27 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index ccf8d4fc6374..9f0713292cba 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -981,16 +981,10 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 	event->shadow_ctx_time = now - t->timestamp;
 }
 
-/*
- * Update cpuctx->cgrp so that it is set when first cgroup event is added and
- * cleared when last cgroup event is removed.
- */
 static inline void
-list_update_cgroup_event(struct perf_event *event,
-			 struct perf_event_context *ctx, bool add)
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
 {
 	struct perf_cpu_context *cpuctx;
-	struct list_head *cpuctx_entry;
 
 	if (!is_cgroup_event(event))
 		return;
@@ -1007,28 +1001,41 @@ list_update_cgroup_event(struct perf_event *event,
 	 * because if the first would mismatch, the second would not try again
 	 * and we would leave cpuctx->cgrp unset.
 	 */
-	if (add && !cpuctx->cgrp) {
+	if (ctx->is_active && !cpuctx->cgrp) {
 		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
 
 		if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
 			cpuctx->cgrp = cgrp;
 	}
 
-	if (add && ctx->nr_cgroups++)
+	if (ctx->nr_cgroups++)
 		return;
-	else if (!add && --ctx->nr_cgroups)
+
+	list_add(&cpuctx->cgrp_cpuctx_entry,
+			per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
+}
+
+static inline void
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
+{
+	struct perf_cpu_context *cpuctx;
+
+	if (!is_cgroup_event(event))
 		return;
 
-	/* no cgroup running */
-	if (!add)
+	/*
+	 * Because cgroup events are always per-cpu events,
+	 * @ctx == &cpuctx->ctx.
+	 */
+	cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
+
+	if (--ctx->nr_cgroups)
+		return;
+
+	if (ctx->is_active && cpuctx->cgrp)
 		cpuctx->cgrp = NULL;
 
-	cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
-	if (add)
-		list_add(cpuctx_entry,
-			 per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
-	else
-		list_del(cpuctx_entry);
+	list_del(&cpuctx->cgrp_cpuctx_entry);
 }
 
 #else /* !CONFIG_CGROUP_PERF */
@@ -1094,11 +1101,14 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
 }
 
 static inline void
-list_update_cgroup_event(struct perf_event *event,
-			 struct perf_event_context *ctx, bool add)
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
 {
 }
 
+static inline void
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
+{
+}
 #endif
 
 /*
@@ -1789,13 +1799,14 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 		add_event_to_groups(event, ctx);
 	}
 
-	list_update_cgroup_event(event, ctx, true);
-
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat++;
 
+	if (event->state > PERF_EVENT_STATE_OFF)
+		perf_cgroup_event_enable(event, ctx);
+
 	ctx->generation++;
 }
 
@@ -1971,8 +1982,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
-	list_update_cgroup_event(event, ctx, false);
-
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
@@ -1989,8 +1998,10 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	 * of error state is by explicit re-enabling
 	 * of the event
 	 */
-	if (event->state > PERF_EVENT_STATE_OFF)
+	if (event->state > PERF_EVENT_STATE_OFF) {
+		perf_cgroup_event_disable(event, ctx);
 		perf_event_set_state(event, PERF_EVENT_STATE_OFF);
+	}
 
 	ctx->generation++;
 }
@@ -2221,6 +2232,7 @@ event_sched_out(struct perf_event *event,
 
 	if (READ_ONCE(event->pending_disable) >= 0) {
 		WRITE_ONCE(event->pending_disable, -1);
+		perf_cgroup_event_disable(event, ctx);
 		state = PERF_EVENT_STATE_OFF;
 	}
 	perf_event_set_state(event, state);
@@ -2357,6 +2369,7 @@ static void __perf_event_disable(struct perf_event *event,
 		event_sched_out(event, cpuctx, ctx);
 
 	perf_event_set_state(event, PERF_EVENT_STATE_OFF);
+	perf_cgroup_event_disable(event, ctx);
 }
 
 /*
@@ -2740,7 +2753,7 @@ static int __perf_install_in_context(void *info)
 	}
 
 #ifdef CONFIG_CGROUP_PERF
-	if (is_cgroup_event(event)) {
+	if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
 		/*
 		 * If the current cgroup doesn't match the event's
 		 * cgroup, we should not try to schedule it.
@@ -2900,6 +2913,7 @@ static void __perf_event_enable(struct perf_event *event,
 		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 
 	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
+	perf_cgroup_event_enable(event, ctx);
 
 	if (!ctx->is_active)
 		return;
@@ -3609,8 +3623,10 @@ static int merge_sched_in(struct perf_event *event, void *data)
 	}
 
 	if (event->state == PERF_EVENT_STATE_INACTIVE) {
-		if (event->attr.pinned)
+		if (event->attr.pinned) {
+			perf_cgroup_event_disable(event, ctx);
 			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+		}
 
 		*can_add_hw = 0;
 		ctx->rotate_necessary = 1;