Re: [PATCH v3] perf/core: Fix missing read event generation on task exit
From: James Clark
Date: Wed Mar 04 2026 - 06:28:24 EST
On 03/03/2026 2:29 pm, Peter Zijlstra wrote:
Sorry, things got lost again :-(
On Mon, Feb 09, 2026 at 04:12:39PM +0000, James Clark wrote:
@@ -41,10 +41,11 @@ static void perf_output_get_handle(struct
perf_output_handle *handle)
{
struct perf_buffer *rb = handle->rb;
preempt_disable();
+ WARN_ON(handle->event->cpu != smp_processor_id());
---
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5b5cb620499e..f566ad55b4fb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -14086,7 +14086,7 @@ static void sync_child_event(struct perf_event *child_event,
u64 child_val;
if (child_event->attr.inherit_stat) {
- if (task && task != TASK_TOMBSTONE)
+ if (task && task != TASK_TOMBSTONE && event_filter_match(child_event))
perf_event_read_event(child_event, task);
}
Turns out I tested this before with "child_event->cpu ==
raw_smp_processor_id()" rather than using event_filter_match() so I missed
that the loop over all the events needs to be wrapped with
preempt_disable(). But that can't be done because perf_event_exit_event()
takes a mutex.
I don't think the preempt_disable() can cover any smaller region than the
entire loop; otherwise you can get rescheduled between
event_filter_match() checks, fail them all, and end up not writing any
event out at all.
Ooh, cute. That's annoying. This is specific to using per-task-per-cpu
buffers afaict, nevertheless those ought to work.
Also, I suspect your WARN would trigger before c418d8b4d7a4 as well, if
it wouldn't have been for that TOMBSTONE thing.
How to fix this mess... perhaps something like this, except now I worry
about the revoke and remove_on_exec cases; do they want something?
---
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 03ced7aad309..d0aae76cabf4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4727,7 +4727,6 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
static void perf_remove_from_owner(struct perf_event *event);
static void perf_event_exit_event(struct perf_event *event,
struct perf_event_context *ctx,
- struct task_struct *task,
bool revoke);
/*
@@ -4755,7 +4754,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
modified = true;
- perf_event_exit_event(event, ctx, ctx->task, false);
+ perf_event_exit_event(event, ctx, false);
}
raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -12863,7 +12862,7 @@ static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
/*
* De-schedule the event and mark it REVOKED.
*/
- perf_event_exit_event(event, ctx, ctx->task, true);
+ perf_event_exit_event(event, ctx, true);
/*
* All _free_event() bits that rely on event->pmu:
@@ -14424,17 +14423,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
}
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
-static void sync_child_event(struct perf_event *child_event,
- struct task_struct *task)
+static void sync_child_event(struct perf_event *child_event)
{
struct perf_event *parent_event = child_event->parent;
u64 child_val;
- if (child_event->attr.inherit_stat) {
- if (task && task != TASK_TOMBSTONE)
- perf_event_read_event(child_event, task);
- }
-
child_val = perf_event_count(child_event, false);
/*
@@ -14450,7 +14443,6 @@ static void sync_child_event(struct perf_event *child_event,
static void
perf_event_exit_event(struct perf_event *event,
struct perf_event_context *ctx,
- struct task_struct *task,
bool revoke)
{
struct perf_event *parent_event = event->parent;
@@ -14476,7 +14468,7 @@ perf_event_exit_event(struct perf_event *event,
attach_state = READ_ONCE(event->attach_state);
if (attach_state & PERF_ATTACH_CHILD)
- sync_child_event(event, task);
+ sync_child_event(event);
}
if (revoke)
@@ -14517,7 +14509,7 @@ perf_event_exit_event(struct perf_event *event,
static void perf_event_exit_task_context(struct task_struct *task, bool exit)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
- struct perf_event *child_event, *next;
+ struct perf_event *event, *next;
ctx = perf_pin_task_context(task);
if (!ctx)
@@ -14564,11 +14556,21 @@ static void perf_event_exit_task_context(struct task_struct *task, bool exit)
* won't get any samples after PERF_RECORD_EXIT. We can however still
* get a few PERF_RECORD_READ events.
*/
- if (exit)
+ if (exit) {
perf_event_task(task, ctx, 0);
- list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry)
- perf_event_exit_event(child_event, ctx, exit ? task : NULL, false);
+ guard(raw_spinlock_irq)(&ctx->lock);
+ list_for_each_entry(event, &ctx->event_list, event_entry) {
+ if (event->attr.inherit_stat) {
+ if (task && task != TASK_TOMBSTONE &&
+ event_filter_match(event))
+ perf_event_read_event(event, task);
Doesn't this only work if you happened to have an event opened on every CPU? Otherwise you could get rescheduled onto a core without an event and the event_filter_match() will fail and you'd drop the sample read for that exit.
+ }
+ }
+ }
+
+ list_for_each_entry_safe(event, next, &ctx->event_list, event_entry)
+ perf_event_exit_event(event, ctx, false);
mutex_unlock(&ctx->mutex);
While debugging I also noticed another issue with these per-thread count
records. perf_event_exit_event() only does anything if the event has a
parent. But the context switch optimization means that sometimes threads
re-use the original event which has no parent. So randomly you get threads
that are missing from the output.
There is a comment that mentions this under the parent check:
if (parent_event) {
/*
* Do not destroy the 'original' grouping; because of the
* context switch optimization the original events could've
* ended up in a random child task.
But I'm not sure whether that was supposed to be worked around some other
way and is now broken, or whether it was a known limitation of the
implementation from the beginning. Right now it either randomly misses one
of the threads while including the main thread counts, or includes all the
threads but not the main thread counts if no context switch optimisation
was done.
The perf record docs don't say anything that would lead you to expect any
threads to be missing:
-s, --stat per thread counts
Ha! So commit bfbd3381e63a ("perf_counter: Implement more accurate per
task statistics") actually mentions that issue and tries to mitigate,
but I think I indeed missed a case.
I suppose the easiest fix is this, but urgh...
---
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 03ced7aad309..c38acf1e2f43 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3664,6 +3664,9 @@ static int context_equiv(struct perf_event_context *ctx1,
if (ctx1->pin_count || ctx2->pin_count)
return 0;
+ if (ctx1->nr_stat || ctx2->nr_stat)
+ return 0;
+
/* If ctx1 is the parent of ctx2 */
if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
return 1;
@@ -3677,76 +3680,13 @@ static int context_equiv(struct perf_event_context *ctx1,
* hierarchy, see perf_event_init_context().
*/
if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
- ctx1->parent_gen == ctx2->parent_gen)
+ ctx1->parent_gen == ctx2->parent_gen)
return 1;
/* Unmatched */
return 0;
}
-static void __perf_event_sync_stat(struct perf_event *event,
- struct perf_event *next_event)
-{
- u64 value;
-
- if (!event->attr.inherit_stat)
- return;
-
- /*
- * Update the event value, we cannot use perf_event_read()
- * because we're in the middle of a context switch and have IRQs
- * disabled, which upsets smp_call_function_single(), however
- * we know the event must be on the current CPU, therefore we
- * don't need to use it.
- */
- perf_pmu_read(event);
-
- perf_event_update_time(event);
-
- /*
- * In order to keep per-task stats reliable we need to flip the event
- * values when we flip the contexts.
- */
- value = local64_read(&next_event->count);
- value = local64_xchg(&event->count, value);
- local64_set(&next_event->count, value);
-
- swap(event->total_time_enabled, next_event->total_time_enabled);
- swap(event->total_time_running, next_event->total_time_running);
-
- /*
- * Since we swizzled the values, update the user visible data too.
- */
- perf_event_update_userpage(event);
- perf_event_update_userpage(next_event);
-}
-
-static void perf_event_sync_stat(struct perf_event_context *ctx,
- struct perf_event_context *next_ctx)
-{
- struct perf_event *event, *next_event;
-
- if (!ctx->nr_stat)
- return;
-
- update_context_time(ctx);
-
- event = list_first_entry(&ctx->event_list,
- struct perf_event, event_entry);
-
- next_event = list_first_entry(&next_ctx->event_list,
- struct perf_event, event_entry);
-
- while (&event->event_entry != &ctx->event_list &&
- &next_event->event_entry != &next_ctx->event_list) {
-
- __perf_event_sync_stat(event, next_event);
-
- event = list_next_entry(event, event_entry);
- next_event = list_next_entry(next_event, event_entry);
- }
-}
-
static void perf_ctx_sched_task_cb(struct perf_event_context *ctx,
struct task_struct *task, bool sched_in)
{
@@ -3835,8 +3775,6 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
do_switch = 0;
-
- perf_event_sync_stat(ctx, next_ctx);
}
raw_spin_unlock(&next_ctx->lock);
raw_spin_unlock(&ctx->lock);