Re: [PATCH] perf: Fix missing SIGTRAPs due to pending_disable abuse

From: Peter Zijlstra
Date: Wed Oct 05 2022 - 03:37:27 EST


On Tue, Oct 04, 2022 at 07:33:55PM +0200, Marco Elver wrote:
> It looks reasonable, but obviously needs to pass tests. :-)

Ikr :-)

> Also, see comment below (I think you're still turning signals
> asynchronous, which we shouldn't do).

Indeed so; I tried fixing that this morning, but so far that doesn't
seem to want to actually cure things :/ I'll need to stomp on this
harder.

Current hackery below. The main difference is that instead of trying to
restart the irq_work on sched_in, sched_out will now queue a task-work.

The event scheduling is done from 'regular' IRQ context and as such
there should be a return-to-userspace for the relevant task in the
immediate future (either directly or after scheduling).

Alas, something still isn't right...

---
include/linux/perf_event.h | 9 ++--
kernel/events/core.c | 115 ++++++++++++++++++++++++++++-----------------
2 files changed, 79 insertions(+), 45 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 853f64b6c8c2..f15726a6c127 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -756,11 +756,14 @@ struct perf_event {
struct fasync_struct *fasync;

/* delayed work for NMIs and such */
- int pending_wakeup;
- int pending_kill;
- int pending_disable;
+ unsigned int pending_wakeup :1;
+ unsigned int pending_disable :1;
+ unsigned int pending_sigtrap :1;
+ unsigned int pending_kill :3;
+
unsigned long pending_addr; /* SIGTRAP */
struct irq_work pending;
+ struct callback_head pending_sig;

atomic_t event_limit;

diff --git a/kernel/events/core.c b/kernel/events/core.c
index b981b879bcd8..e28257fb6f00 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -54,6 +54,7 @@
#include <linux/highmem.h>
#include <linux/pgtable.h>
#include <linux/buildid.h>
+#include <linux/task_work.h>

#include "internal.h"

@@ -2276,11 +2277,19 @@ event_sched_out(struct perf_event *event,
event->pmu->del(event, 0);
event->oncpu = -1;

- if (READ_ONCE(event->pending_disable) >= 0) {
- WRITE_ONCE(event->pending_disable, -1);
+ if (event->pending_disable) {
+ event->pending_disable = 0;
perf_cgroup_event_disable(event, ctx);
state = PERF_EVENT_STATE_OFF;
}
+
+ if (event->pending_sigtrap) {
+ if (state != PERF_EVENT_STATE_OFF)
+ task_work_add(current, &event->pending_sig, TWA_NONE);
+ else
+ event->pending_sigtrap = 0;
+ }
+
perf_event_set_state(event, state);

if (!is_software_event(event))
@@ -2471,8 +2480,7 @@ EXPORT_SYMBOL_GPL(perf_event_disable);

void perf_event_disable_inatomic(struct perf_event *event)
{
- WRITE_ONCE(event->pending_disable, smp_processor_id());
- /* can fail, see perf_pending_event_disable() */
+ event->pending_disable = 1;
irq_work_queue(&event->pending);
}

@@ -6448,47 +6456,40 @@ static void perf_sigtrap(struct perf_event *event)
event->attr.type, event->attr.sig_data);
}

-static void perf_pending_event_disable(struct perf_event *event)
+/*
+ * Deliver the pending work in-event-context or follow the context.
+ */
+static void __perf_pending_event(struct perf_event *event)
{
- int cpu = READ_ONCE(event->pending_disable);
+ int cpu = READ_ONCE(event->oncpu);

+ /*
+ * If the event isn't running; we done. event_sched_in() will restart
+ * the irq_work when needed.
+ */
if (cpu < 0)
return;

+ /*
+ * Yay, we hit home and are in the context of the event.
+ */
if (cpu == smp_processor_id()) {
- WRITE_ONCE(event->pending_disable, -1);
-
- if (event->attr.sigtrap) {
+ if (event->pending_sigtrap) {
+ event->pending_sigtrap = 0;
perf_sigtrap(event);
- atomic_set_release(&event->event_limit, 1); /* rearm event */
- return;
}
-
- perf_event_disable_local(event);
- return;
+ if (event->pending_disable) {
+ event->pending_disable = 0;
+ perf_event_disable_local(event);
+ }
}

/*
- * CPU-A CPU-B
- *
- * perf_event_disable_inatomic()
- * @pending_disable = CPU-A;
- * irq_work_queue();
- *
- * sched-out
- * @pending_disable = -1;
- *
- * sched-in
- * perf_event_disable_inatomic()
- * @pending_disable = CPU-B;
- * irq_work_queue(); // FAILS
- *
- * irq_work_run()
- * perf_pending_event()
- *
- * But the event runs on CPU-B and wants disabling there.
+ * Requeue if there's still any pending work left, make sure to follow
+ * where the event went.
*/
- irq_work_queue_on(&event->pending, cpu);
+ if (event->pending_disable || event->pending_sigtrap)
+ irq_work_queue_on(&event->pending, cpu);
}

static void perf_pending_event(struct irq_work *entry)
@@ -6496,19 +6497,43 @@ static void perf_pending_event(struct irq_work *entry)
struct perf_event *event = container_of(entry, struct perf_event, pending);
int rctx;

- rctx = perf_swevent_get_recursion_context();
/*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
+ rctx = perf_swevent_get_recursion_context();

- perf_pending_event_disable(event);
-
+ /*
+ * The wakeup isn't bound to the context of the event -- it can happen
+ * irrespective of where the event is.
+ */
if (event->pending_wakeup) {
event->pending_wakeup = 0;
perf_event_wakeup(event);
}

+ __perf_pending_event(event);
+
+ if (rctx >= 0)
+ perf_swevent_put_recursion_context(rctx);
+}
+
+static void perf_pending_sig(struct callback_head *head)
+{
+ struct perf_event *event = container_of(head, struct perf_event, pending_sig);
+ int rctx;
+
+ /*
+ * If we 'fail' here, that's OK, it means recursion is already disabled
+ * and we won't recurse 'further'.
+ */
+ rctx = perf_swevent_get_recursion_context();
+
+ if (event->pending_sigtrap) {
+ event->pending_sigtrap = 0;
+ perf_sigtrap(event);
+ }
+
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
}
@@ -9227,11 +9252,20 @@ static int __perf_event_overflow(struct perf_event *event,
if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1;
event->pending_kill = POLL_HUP;
- event->pending_addr = data->addr;
-
perf_event_disable_inatomic(event);
}

+ if (event->attr.sigtrap) {
+ /*
+ * Should not be able to return to user space without processing
+ * pending_sigtrap (kernel events can overflow multiple times).
+ */
+ WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
+ event->pending_sigtrap = 1;
+ event->pending_addr = data->addr;
+ irq_work_queue(&event->pending);
+ }
+
READ_ONCE(event->overflow_handler)(event, data, regs);

if (*perf_event_fasync(event) && event->pending_kill) {
@@ -11560,8 +11594,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,


init_waitqueue_head(&event->waitq);
- event->pending_disable = -1;
init_irq_work(&event->pending, perf_pending_event);
+ init_task_work(&event->pending_sig, perf_pending_sig);

mutex_init(&event->mmap_mutex);
raw_spin_lock_init(&event->addr_filters.lock);
@@ -11583,9 +11617,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (parent_event)
event->event_caps = parent_event->event_caps;

- if (event->attr.sigtrap)
- atomic_set(&event->event_limit, 1);
-
if (task) {
event->attach_state = PERF_ATTACH_TASK;
/*