Re: [PATCH v3] perf/core: Fix missing read event generation on task exit

From: James Clark

Date: Wed Mar 11 2026 - 06:39:33 EST




On 10/03/2026 6:25 pm, Ian Rogers wrote:
On Tue, Mar 3, 2026 at 6:29 AM Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:

I suppose the easiest fix is this, but urgh...

---
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 03ced7aad309..c38acf1e2f43 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3664,6 +3664,9 @@ static int context_equiv(struct perf_event_context *ctx1,
if (ctx1->pin_count || ctx2->pin_count)
return 0;

+ if (ctx1->nr_stat || ctx2->nr_stat)
+ return 0;
+

So, just for fun, I ran the original patch, James's bug description
and this proposed fix through an AI using Chris Mason's kernel review
prompts. I'm sharing the output as the do_exit comments seem
legitimate given the stack trace in James's report.

Does this introduce a significant performance regression for multithreaded
workloads? By forcing a full sched_out and sched_in of the PMU on every
thread switch instead of just swapping context pointers, it removes the
performance benefits of context_equiv() for inherited stats.

inherit_stat isn't default though. I suppose if you ask for more stuff it costs more, and I assume complete data but slightly slower is better than incomplete but faster.


Furthermore, does this patch actually address the concurrent ring buffer
corruption reported in the bug?

No, the top patch fixes that issue, not the bottom one.


The corruption occurs because perf_event_exit_event() (called during do_exit)
invokes perf_output_begin() on the parent's ring buffer from the exiting child's
CPU. Because ring buffers use local_t operations, which are not SMP-safe,
concurrent child exits corrupt the rb->nest tracking.

Since context_equiv() is only used during context switch
(perf_event_context_sched_out) and not during task exit, modifying it does
not prevent the concurrent ring buffer writes during do_exit.

Thanks,
Ian

/* If ctx1 is the parent of ctx2 */
if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
return 1;
@@ -3677,76 +3680,13 @@ static int context_equiv(struct perf_event_context *ctx1,
* hierarchy, see perf_event_init_context().
*/
if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
- ctx1->parent_gen == ctx2->parent_gen)
+ ctx1->parent_gen == ctx2->parent_gen)
return 1;

/* Unmatched */
return 0;
}

-static void __perf_event_sync_stat(struct perf_event *event,
- struct perf_event *next_event)
-{
- u64 value;
-
- if (!event->attr.inherit_stat)
- return;
-
- /*
- * Update the event value, we cannot use perf_event_read()
- * because we're in the middle of a context switch and have IRQs
- * disabled, which upsets smp_call_function_single(), however
- * we know the event must be on the current CPU, therefore we
- * don't need to use it.
- */
- perf_pmu_read(event);
-
- perf_event_update_time(event);
-
- /*
- * In order to keep per-task stats reliable we need to flip the event
- * values when we flip the contexts.
- */
- value = local64_read(&next_event->count);
- value = local64_xchg(&event->count, value);
- local64_set(&next_event->count, value);
-
- swap(event->total_time_enabled, next_event->total_time_enabled);
- swap(event->total_time_running, next_event->total_time_running);
-
- /*
- * Since we swizzled the values, update the user visible data too.
- */
- perf_event_update_userpage(event);
- perf_event_update_userpage(next_event);
-}
-
-static void perf_event_sync_stat(struct perf_event_context *ctx,
- struct perf_event_context *next_ctx)
-{
- struct perf_event *event, *next_event;
-
- if (!ctx->nr_stat)
- return;
-
- update_context_time(ctx);
-
- event = list_first_entry(&ctx->event_list,
- struct perf_event, event_entry);
-
- next_event = list_first_entry(&next_ctx->event_list,
- struct perf_event, event_entry);
-
- while (&event->event_entry != &ctx->event_list &&
- &next_event->event_entry != &next_ctx->event_list) {
-
- __perf_event_sync_stat(event, next_event);
-
- event = list_next_entry(event, event_entry);
- next_event = list_next_entry(next_event, event_entry);
- }
-}
-
static void perf_ctx_sched_task_cb(struct perf_event_context *ctx,
struct task_struct *task, bool sched_in)
{
@@ -3835,8 +3775,6 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
RCU_INIT_POINTER(next->perf_event_ctxp, ctx);

do_switch = 0;
-
- perf_event_sync_stat(ctx, next_ctx);
}
raw_spin_unlock(&next_ctx->lock);
raw_spin_unlock(&ctx->lock);