Re: [PATCH RFC v2 8/8] selftests/perf: Add kselftest for remove_on_exec

From: Marco Elver
Date: Tue Mar 23 2021 - 05:53:50 EST


On Mon, Mar 22, 2021 at 05:42PM +0100, Peter Zijlstra wrote:
> On Mon, Mar 22, 2021 at 02:24:40PM +0100, Marco Elver wrote:
> > To make compatible with more recent libc, we'll need to fixup the tests
> > with the below.
>
> OK, that reprodiced things here, thanks!
>
> The below seems to not explode instantly.... it still has the
> alternative version in as well (and I think it might even work too, but
> the one I left in seems simpler).

Thanks! Unfortunately neither version worked if I tortured it a little
with this:

for x in {1..1000}; do ( tools/testing/selftests/perf_events/remove_on_exec & ); done

Which resulted in the 2 warnings:

WARNING: CPU: 1 PID: 795 at kernel/events/core.c:242 event_function+0xf3/0x100
WARNING: CPU: 1 PID: 795 at kernel/events/core.c:247 event_function+0xef/0x100

with efs->func==__perf_event_enable. I believe it's sufficient to add

mutex_lock(&parent_event->child_mutex);
list_del_init(&event->child_list);
mutex_unlock(&parent_event->child_mutex);

right before removing from context. With the version I have now (below
for completeness), extended torture with the above test results in no
more warnings and the test also passes.


I'd be happy to send a non-RFC v3 with all that squashed in. I'd need
your Signed-off-by for the diff you sent to proceed (and add your
Co-developed-by).

Thanks,
-- Marco

------ >8 ------

diff --git a/kernel/events/core.c b/kernel/events/core.c
index aa47e111435e..cea7c88fe131 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2165,8 +2165,9 @@ static void perf_group_detach(struct perf_event *event)
* If this is a sibling, remove it from its group.
*/
if (leader != event) {
+ leader->nr_siblings--;
list_del_init(&event->sibling_list);
- event->group_leader->nr_siblings--;
+ event->group_leader = event;
goto out;
}

@@ -2180,8 +2181,9 @@ static void perf_group_detach(struct perf_event *event)
if (sibling->event_caps & PERF_EV_CAP_SIBLING)
perf_remove_sibling_event(sibling);

- sibling->group_leader = sibling;
+ leader->nr_siblings--;
list_del_init(&sibling->sibling_list);
+ sibling->group_leader = sibling;

/* Inherit group flags from the previous leader */
sibling->group_caps = event->group_caps;
@@ -2358,10 +2360,19 @@ __perf_remove_from_context(struct perf_event *event,
static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
struct perf_event_context *ctx = event->ctx;
+ bool remove;

lockdep_assert_held(&ctx->mutex);

- event_function_call(event, __perf_remove_from_context, (void *)flags);
+ /*
+ * There is concurrency vs remove_on_exec().
+ */
+ raw_spin_lock_irq(&ctx->lock);
+ remove = (event->attach_state & PERF_ATTACH_CONTEXT);
+ raw_spin_unlock_irq(&ctx->lock);
+
+ if (remove)
+ event_function_call(event, __perf_remove_from_context, (void *)flags);

/*
* The above event_function_call() can NO-OP when it hits
@@ -4198,41 +4209,68 @@ static void perf_event_enable_on_exec(int ctxn)
static void perf_remove_from_owner(struct perf_event *event);
static void perf_event_exit_event(struct perf_event *child_event,
struct perf_event_context *child_ctx,
- struct task_struct *child);
+ struct task_struct *child,
+ bool removed);

/*
* Removes all events from the current task that have been marked
* remove-on-exec, and feeds their values back to parent events.
*/
-static void perf_event_remove_on_exec(void)
+static void perf_event_remove_on_exec(int ctxn)
{
- int ctxn;
+ struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_event *event, *next;
+ LIST_HEAD(free_list);
+ unsigned long flags;
+ bool modified = false;

- for_each_task_context_nr(ctxn) {
- struct perf_event_context *ctx;
- struct perf_event *event, *next;
+ ctx = perf_pin_task_context(current, ctxn);
+ if (!ctx)
+ return;

- ctx = perf_pin_task_context(current, ctxn);
- if (!ctx)
+ mutex_lock(&ctx->mutex);
+
+ if (WARN_ON_ONCE(ctx->task != current))
+ goto unlock;
+
+ list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
+ struct perf_event *parent_event = event->parent;
+
+ if (!event->attr.remove_on_exec)
continue;
- mutex_lock(&ctx->mutex);

- list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
- if (!event->attr.remove_on_exec)
- continue;
+ if (!is_kernel_event(event))
+ perf_remove_from_owner(event);

- if (!is_kernel_event(event))
- perf_remove_from_owner(event);
- perf_remove_from_context(event, DETACH_GROUP);
+ modified = true;
+
+ if (parent_event) {
/*
- * Remove the event and feed back its values to the
- * parent event.
+ * Remove event from parent, to avoid race where the
+ * parent concurrently iterates through its children to
+ * enable, disable, or otherwise modify an event.
*/
- perf_event_exit_event(event, ctx, current);
+ mutex_lock(&parent_event->child_mutex);
+ list_del_init(&event->child_list);
+ mutex_unlock(&parent_event->child_mutex);
}
- mutex_unlock(&ctx->mutex);
- put_ctx(ctx);
+
+ perf_remove_from_context(event, !!event->parent * DETACH_GROUP);
+ perf_event_exit_event(event, ctx, current, true);
}
+
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+ if (modified)
+ clone_ctx = unclone_ctx(ctx);
+ --ctx->pin_count;
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+unlock:
+ mutex_unlock(&ctx->mutex);
+
+ put_ctx(ctx);
+ if (clone_ctx)
+ put_ctx(clone_ctx);
}

struct perf_read_data {
@@ -7581,20 +7619,18 @@ void perf_event_exec(void)
struct perf_event_context *ctx;
int ctxn;

- rcu_read_lock();
for_each_task_context_nr(ctxn) {
- ctx = current->perf_event_ctxp[ctxn];
- if (!ctx)
- continue;
-
perf_event_enable_on_exec(ctxn);
+ perf_event_remove_on_exec(ctxn);

- perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
- true);
+ rcu_read_lock();
+ ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+ if (ctx) {
+ perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
+ NULL, true);
+ }
+ rcu_read_unlock();
}
- rcu_read_unlock();
-
- perf_event_remove_on_exec();
}

struct remote_output {
@@ -12472,7 +12508,7 @@ static void sync_child_event(struct perf_event *child_event,
static void
perf_event_exit_event(struct perf_event *child_event,
struct perf_event_context *child_ctx,
- struct task_struct *child)
+ struct task_struct *child, bool removed)
{
struct perf_event *parent_event = child_event->parent;

@@ -12489,12 +12525,15 @@ perf_event_exit_event(struct perf_event *child_event,
* and being thorough is better.
*/
raw_spin_lock_irq(&child_ctx->lock);
- WARN_ON_ONCE(child_ctx->is_active);
+ if (!removed) {
+ WARN_ON_ONCE(child_ctx->is_active);

- if (parent_event)
- perf_group_detach(child_event);
- list_del_event(child_event, child_ctx);
- perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
+ if (parent_event)
+ perf_group_detach(child_event);
+ list_del_event(child_event, child_ctx);
+ }
+ if (child_event->state >= PERF_EVENT_STATE_EXIT)
+ perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
raw_spin_unlock_irq(&child_ctx->lock);

/*
@@ -12580,7 +12619,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
perf_event_task(child, child_ctx, 0);

list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- perf_event_exit_event(child_event, child_ctx, child);
+ perf_event_exit_event(child_event, child_ctx, child, false);

mutex_unlock(&child_ctx->mutex);