Re: [PATCH v2] tracing/perf: Move rcu_irq_enter/exit_irqson() to perf trace point hook

From: Peter Zijlstra
Date: Tue Feb 11 2020 - 12:30:31 EST


On Tue, Feb 11, 2020 at 11:18:28AM -0500, Steven Rostedt wrote:
> On Tue, 11 Feb 2020 16:34:52 +0100
> Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> > > + if (unlikely(in_nmi()))
> > > + goto out;
> >
> > unless I'm mistaken, we can simply do rcu_nmi_enter() in this case, and
> > rcu_nmi_exit() on the other end.
> >
> > > + rcu_irq_enter_irqson();
>
> The thing is, I don't think this can ever happen.

Per your own argument it should be true in the trace_preempt_off()
tracepoint from nmi_enter():

<idle>
// rcu_is_watching() := false
<NMI>
nmi_enter()
preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET)
__preempt_count_add()
// in_nmi() := true
preempt_latency_start()
// .oO -- we'll never hit this tracepoint because idle has
// preempt_count() == 1, so we'll have:
// preempt_count() != val

// .oO-2 we should be able to this this when the NMI
// hits userspace and we have NOHZ_FULL on.
rcu_nmi_enter() // function tracer


But the point is, if we ever do hit it, rcu_nmi_enter() is the right
thing to call when '!rcu_is_watching() && in_nmi()', because, as per the
rcu_nmi_enter() comments, that function fully supports nested NMIs.

How about something like so? And then you get to use the same in
__trace_stack() or something, and anywhere else you're doing this dance.

---

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index da0af631ded5..e987236abf95 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -89,4 +89,52 @@ extern void irq_exit(void);
arch_nmi_exit(); \
} while (0)

+/*
+ * Tracing vs rcu_is_watching()
+ * ----------------------------
+ *
+ * tracepoints and function-tracing can happen when RCU isn't watching (idle,
+ * or early IRQ/NMI entry).
+ *
+ * When it happens during idle or early during IRQ entry, tracing will have
+ * to inform RCU that it ought to pay attention, this is done by calling
+ * rcu_irq_enter_irqon().
+ *
+ * On NMI entry, we must be very careful that tracing only happens after we've
+ * incremented preempt_count(), otherwise we cannot tell we're in NMI and take
+ * the special path.
+ */
+
+#define __TR_IRQ 1
+#define __TR_NMI 2
+
+#define trace_rcu_enter() \
+({ \
+ unsigned long state = 0; \
+ if (!rcu_is_watching()) { \
+ if (in_nmi()) { \
+ state = __TR_NMI; \
+ rcu_nmi_enter(); \
+ } else { \
+ state = __TR_IRQ; \
+ rcu_irq_enter_irqson(); \
+ } \
+ } \
+ state; \
+})
+
+#define trace_rcu_exit(state) \
+do { \
+ switch (state) { \
+ case __TR_IRQ: \
+ rcu_irq_exit_irqson(); \
+ break; \
+ case __IRQ_NMI: \
+ rcu_nmi_exit(); \
+ break; \
+ default: \
+ break; \
+ } \
+} while (0)
+
#endif /* LINUX_HARDIRQ_H */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e453589da97c..8f34592a1355 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8950,6 +8950,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
{
struct perf_sample_data data;
struct perf_event *event;
+ unsigned long rcu_flags;

struct perf_raw_record raw = {
.frag = {
@@ -8961,6 +8962,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
perf_sample_data_init(&data, 0, 0);
data.raw = &raw;

+ rcu_flags = trace_rcu_enter();
+
perf_trace_buf_update(record, event_type);

hlist_for_each_entry_rcu(event, head, hlist_entry) {
@@ -8996,6 +8999,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
}

perf_swevent_put_recursion_context(rctx);
+
+ trace_rcu_exit(rcu_flags);
}
EXPORT_SYMBOL_GPL(perf_tp_event);

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 45f79bcc3146..62f87807bbe6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3781,7 +3781,7 @@ static inline void preempt_latency_start(int val)
}
}

-void preempt_count_add(int val)
+void notrace preempt_count_add(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -3813,7 +3813,7 @@ static inline void preempt_latency_stop(int val)
trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
}

-void preempt_count_sub(int val)
+void notrace preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*