Re: [RFC v2] perf: Rewrite core context handling

From: Peter Zijlstra
Date: Fri Jun 17 2022 - 09:37:24 EST


On Mon, Jun 13, 2022 at 04:35:11PM +0200, Peter Zijlstra wrote:
> +/* XXX: No need of list now. Convert it to per-cpu variable */
> static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);

Something like so I suppose...
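The shape of the change, for the record: the per-cpu list of cpuctx entries
collapses into the single per-cpu perf_cpu_context this series introduces, so
the list walk becomes a plain this_cpu_ptr() access. A minimal sketch of the
pattern (fields elided; switch_sketch() is a stand-in name, cpu_context is the
per-cpu variable the series already has):

	static DEFINE_PER_CPU(struct perf_cpu_context, cpu_context);

	static void switch_sketch(void)
	{
		/* one context per CPU; nothing left to iterate */
		struct perf_cpu_context *cpuctx = this_cpu_ptr(&cpu_context);

		/* operate on cpuctx directly, no list_for_each_entry_safe() */
	}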

---
include/linux/perf_event.h |  1
kernel/events/core.c       | 70 ++++++++++++++-------------------------------
2 files changed, 22 insertions(+), 49 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -936,7 +936,6 @@ struct perf_cpu_context {

#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp;
- struct list_head cgrp_cpuctx_entry;
#endif

/*
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -829,55 +829,41 @@ perf_cgroup_set_timestamp(struct perf_cp
}
}

-/* XXX: No need of list now. Convert it to per-cpu variable */
-static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
-
/*
* reschedule events based on the cgroup constraint of task.
*/
static void perf_cgroup_switch(struct task_struct *task)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&cpu_context);
struct perf_cgroup *cgrp;
- struct perf_cpu_context *cpuctx, *tmp;
- struct list_head *list;
- unsigned long flags;

- /*
- * Disable interrupts and preemption to avoid this CPU's
- * cgrp_cpuctx_entry to change under us.
- */
- local_irq_save(flags);
-
cgrp = perf_cgroup_from_task(task, NULL);

- list = this_cpu_ptr(&cgrp_cpuctx_list);
- list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
- WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
- if (READ_ONCE(cpuctx->cgrp) == cgrp)
- continue;
-
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_ctx_disable(&cpuctx->ctx);
+ WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+ if (READ_ONCE(cpuctx->cgrp) == cgrp)
+ return;

- ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
- /*
- * must not be done before ctxswout due
- * to update_cgrp_time_from_cpuctx() in
- * ctx_sched_out()
- */
- cpuctx->cgrp = cgrp;
- /*
- * set cgrp before ctxsw in to allow
- * perf_cgroup_set_timestamp() in ctx_sched_in()
- * to not have to pass task around
- */
- ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_ctx_disable(&cpuctx->ctx);

- perf_ctx_enable(&cpuctx->ctx);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
+ ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+ /*
+ * must not be done before ctxswout due
+ * to update_cgrp_time_from_cpuctx() in
+ * ctx_sched_out()
+ */
+ cpuctx->cgrp = cgrp;
+ /*
+ * set cgrp before ctxsw in to allow
+ * perf_cgroup_set_timestamp() in ctx_sched_in()
+ * to not have to pass task around
+ */
+ ctx_sched_in(&cpuctx->ctx, EVENT_ALL);

- local_irq_restore(flags);
+ perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

static int perf_cgroup_ensure_storage(struct perf_event *event,
@@ -979,8 +965,6 @@ perf_cgroup_event_enable(struct perf_eve
return;

cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
- list_add(&cpuctx->cgrp_cpuctx_entry,
- per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
}

static inline void
@@ -1001,7 +985,6 @@ perf_cgroup_event_disable(struct perf_ev
return;

cpuctx->cgrp = NULL;
- list_del(&cpuctx->cgrp_cpuctx_entry);
}

#else /* !CONFIG_CGROUP_PERF */
@@ -2372,11 +2355,7 @@ static void perf_remove_from_context(str
* event_function_call() user.
*/
raw_spin_lock_irq(&ctx->lock);
- /*
- * Cgroup events are per-cpu events, and must IPI because of
- * cgrp_cpuctx_list.
- */
- if (!ctx->is_active && !is_cgroup_event(event)) {
+ if (!ctx->is_active) {
__perf_remove_from_context(event, this_cpu_ptr(&cpu_context),
ctx, (void *)flags);
raw_spin_unlock_irq(&ctx->lock);
@@ -2807,8 +2786,6 @@ perf_install_in_context(struct perf_even
* perf_event_attr::disabled events will not run and can be initialized
* without IPI. Except when this is the first event for the context, in
* that case we need the magic of the IPI to set ctx->is_active.
- * Similarly, cgroup events for the context also needs the IPI to
- * manipulate the cgrp_cpuctx_list.
*
* The IOC_ENABLE that is sure to follow the creation of a disabled
* event will issue the IPI and reprogram the hardware.
@@ -13301,9 +13278,6 @@ static void __init perf_event_init_all_c
INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));

-#ifdef CONFIG_CGROUP_PERF
- INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
-#endif
INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));

cpuctx = per_cpu_ptr(&cpu_context, cpu);
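
For readability, here is perf_cgroup_switch() reassembled from the hunk above
(same code, whitespace reflowed; the diff remains authoritative):

	static void perf_cgroup_switch(struct task_struct *task)
	{
		struct perf_cpu_context *cpuctx = this_cpu_ptr(&cpu_context);
		struct perf_cgroup *cgrp;

		cgrp = perf_cgroup_from_task(task, NULL);

		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
		if (READ_ONCE(cpuctx->cgrp) == cgrp)
			return;

		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
		perf_ctx_disable(&cpuctx->ctx);

		ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
		/*
		 * must not be done before ctxswout due
		 * to update_cgrp_time_from_cpuctx() in
		 * ctx_sched_out()
		 */
		cpuctx->cgrp = cgrp;
		/*
		 * set cgrp before ctxsw in to allow
		 * perf_cgroup_set_timestamp() in ctx_sched_in()
		 * to not have to pass task around
		 */
		ctx_sched_in(&cpuctx->ctx, EVENT_ALL);

		perf_ctx_enable(&cpuctx->ctx);
		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
	}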