Re: [PATCH v2] tracing/perf: Move rcu_irq_enter/exit_irqson() to perf trace point hook
From: Mathieu Desnoyers
Date: Tue Feb 11 2020 - 12:35:25 EST
----- On Feb 11, 2020, at 12:29 PM, Peter Zijlstra peterz@xxxxxxxxxxxxx wrote:
> On Tue, Feb 11, 2020 at 11:18:28AM -0500, Steven Rostedt wrote:
>> On Tue, 11 Feb 2020 16:34:52 +0100
>> Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>>
>> > > + if (unlikely(in_nmi()))
>> > > + goto out;
>> >
>> > unless I'm mistaken, we can simply do rcu_nmi_enter() in this case, and
>> > rcu_nmi_exit() on the other end.
>> >
>> > > + rcu_irq_enter_irqson();
>>
>> The thing is, I don't think this can ever happen.
>
> Per your own argument it should be true in the trace_preempt_off()
> tracepoint from nmi_enter():
>
> <idle>
> // rcu_is_watching() := false
> <NMI>
> nmi_enter()
> preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET)
> __preempt_count_add()
> // in_nmi() := true
> preempt_latency_start()
> // .oO -- we'll never hit this tracepoint because idle has
> // preempt_count() == 1, so we'll have:
> // preempt_count() != val
>
> // .oO-2 we should be able to this this when the NMI
> // hits userspace and we have NOHZ_FULL on.
> rcu_nmi_enter() // function tracer
>
>
> But the point is, if we ever do hit it, rcu_nmi_enter() is the right
> thing to call when '!rcu_is_watching() && in_nmi()', because, as per the
> rcu_nmi_enter() comments, that function fully supports nested NMIs.
>
> How about something like so? And then you get to use the same in
> __trace_stack() or something, and anywhere else you're doing this dance.
>
> ---
>
> diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
> index da0af631ded5..e987236abf95 100644
> --- a/include/linux/hardirq.h
> +++ b/include/linux/hardirq.h
> @@ -89,4 +89,52 @@ extern void irq_exit(void);
> arch_nmi_exit(); \
> } while (0)
>
> +/*
> + * Tracing vs rcu_is_watching()
> + * ----------------------------
> + *
> + * tracepoints and function-tracing can happen when RCU isn't watching (idle,
> + * or early IRQ/NMI entry).
> + *
> + * When it happens during idle or early during IRQ entry, tracing will have
> + * to inform RCU that it ought to pay attention, this is done by calling
> + * rcu_irq_enter_irqon().
> + *
> + * On NMI entry, we must be very careful that tracing only happens after we've
> + * incremented preempt_count(), otherwise we cannot tell we're in NMI and take
> + * the special path.
> + */
> +
> +#define __TR_IRQ 1
> +#define __TR_NMI 2
Minor nits:
Why not make these an enum ?
> +
> +#define trace_rcu_enter() \
> +({ \
> + unsigned long state = 0; \
> + if (!rcu_is_watching()) { \
> + if (in_nmi()) { \
> + state = __TR_NMI; \
> + rcu_nmi_enter(); \
> + } else { \
> + state = __TR_IRQ; \
> + rcu_irq_enter_irqson(); \
> + } \
> + } \
> + state; \
> +})
> +
> +#define trace_rcu_exit(state) \
> +do { \
> + switch (state) { \
> + case __TR_IRQ: \
> + rcu_irq_exit_irqson(); \
> + break; \
> + case __IRQ_NMI: \
> + rcu_nmi_exit(); \
> + break; \
> + default: \
> + break; \
> + } \
> +} while (0)
And convert these into static inline functions ?
Thanks,
Mathieu
> +
> #endif /* LINUX_HARDIRQ_H */
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index e453589da97c..8f34592a1355 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -8950,6 +8950,7 @@ void perf_tp_event(u16 event_type, u64 count, void
> *record, int entry_size,
> {
> struct perf_sample_data data;
> struct perf_event *event;
> + unsigned long rcu_flags;
>
> struct perf_raw_record raw = {
> .frag = {
> @@ -8961,6 +8962,8 @@ void perf_tp_event(u16 event_type, u64 count, void
> *record, int entry_size,
> perf_sample_data_init(&data, 0, 0);
> data.raw = &raw;
>
> + rcu_flags = trace_rcu_enter();
> +
> perf_trace_buf_update(record, event_type);
>
> hlist_for_each_entry_rcu(event, head, hlist_entry) {
> @@ -8996,6 +8999,8 @@ void perf_tp_event(u16 event_type, u64 count, void
> *record, int entry_size,
> }
>
> perf_swevent_put_recursion_context(rctx);
> +
> + trace_rcu_exit(rcu_flags);
> }
> EXPORT_SYMBOL_GPL(perf_tp_event);
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 45f79bcc3146..62f87807bbe6 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3781,7 +3781,7 @@ static inline void preempt_latency_start(int val)
> }
> }
>
> -void preempt_count_add(int val)
> +void notrace preempt_count_add(int val)
> {
> #ifdef CONFIG_DEBUG_PREEMPT
> /*
> @@ -3813,7 +3813,7 @@ static inline void preempt_latency_stop(int val)
> trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
> }
>
> -void preempt_count_sub(int val)
> +void notrace preempt_count_sub(int val)
> {
> #ifdef CONFIG_DEBUG_PREEMPT
> /*
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com