Re: [RFC PATCH 2/2] trace: emit tracepoint in preempt and irqs off tracer
From: Mathieu Desnoyers
Date: Tue Oct 06 2015 - 22:54:25 EST
----- On Oct 6, 2015, at 7:56 PM, Mathieu Desnoyers mathieu.desnoyers@xxxxxxxxxxxx wrote:
> Allow Ftrace, Perf, and LTTng to hook on the irqsoff tracer and trace
> longest irqsoff and preemptoff critical sections, or those beyond a
> specified threshold.
>
> This requires introducing a __trace_<name>() tracepoint static inline
> which bypass static jump labels, because calling a breakpoint from the
> irqsoff tracer triggers machine reboots due to recursive calls into the
> tracer.
Actually, the static jump label bypass is not needed here, thanks to
data->disabled which skips nested events. I was seeing the machine
reboots in earlier attempts to add a tracepoint elsewhere in the
irqsoff tracer (before the delta checks). We should be OK with the
static jump label after all.
I'll simplify that part of the patch before resubmitting.
Thanks,
Mathieu
>
> How to setup the irqsoff tracer for this use-case (needed for use with
> Ftrace, Perf, or LTTng): if the tracing_thread value is set, all
> durations beyond the specified threshold will emit a tracepoint. If
> unset, then the max duration will be tracked. The current max duration
> can be reset using the tracing_max_latency file.
>
> echo 0 > /sys/kernel/debug/tracing/options/function-trace # don't need function
> tracing
> echo preemptirqsoff > /sys/kernel/debug/tracing/current_tracer
> echo 2 > /sys/kernel/debug/tracing/tracing_thresh # set threshold (e.g. 2
> Âs)
> echo 1 > /sys/kernel/debug/tracing/tracing_on
>
> The following two commands can also be useful to tweak the
> irqsoff tracer:
> echo 0 > /sys/kernel/debug/tracing/tracing_max_latency # reset max latency
> echo 0 > /sys/kernel/debug/tracing/tracing_thresh # disable threshold
>
> *** An example usage of Ftrace hooking on this tracepoint: ***
>
> echo 1 > /sys/kernel/debug/tracing/events/core/core_critical_timing_hit/enable
> cat /sys/kernel/debug/tracing/trace_pipe
> [...]
> cat-2422 1d..2 66157455us : core_critical_timing_hit: ip=0xffffffff81101cae
> parent_ip=0xffffffff818d38e7 delta_ns=5078 flags=6 preempt_cnt=0
>
> *** An example usage of Perf hooking on this tracepoint: ***
>
> perf record -g -e core:core_critical_timing_hit -c 1 sleep 30
> perf report -g
>
> + 100.00% 100.00% sleep [kernel.vmlinux] [k] check_critical_timing
> + 92.75% 0.00% sleep [kernel.vmlinux] [k] preempt_count_sub
> + 92.75% 0.00% sleep [kernel.vmlinux] [k] trace_preempt_on
> + 65.22% 0.00% sleep [kernel.vmlinux] [k] page_fault
> + 65.22% 0.00% sleep [kernel.vmlinux] [k] do_page_fault
> + 63.77% 0.00% sleep [kernel.vmlinux] [k] __do_page_fault
> + 63.77% 0.00% sleep [kernel.vmlinux] [k] handle_mm_fault
> + 56.52% 0.00% sleep [kernel.vmlinux] [k] _raw_spin_unlock
> + 30.43% 0.00% sleep ld-2.19.so [.] _dl_sysdep_start
> [...]
>
> *** An example usage of LTTng hooking on this tracepoint: ***
>
> (as root)
> lttng-sessiond -d
>
> (then, as root or tracing group)
> lttng create --live
> lttng enable-event -k 'core_*'
> lttng start
> lttng view
> [...]
> [17:34:17.090669588] (+1.569689755) compudjdev core_critical_timing_hit: {
> cpu_id = 3 }, { ip = 0xFFFFFFFF81102135, parent_ip = 0xFFFFFFFFA0798E13,
> delta_ns = 252994, irqs_disabled = 0, preempt_disabled = 1, in_softirq = 0,
> in_irq = 0, in_nmi = 0 }
>
> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
> CC: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> CC: Steven Rostedt <rostedt@xxxxxxxxxxx>
> CC: Ingo Molnar <mingo@xxxxxxxxxx>
> CC: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> ---
> include/linux/tracepoint.h | 41 +++++++++++++++++++++++++++++-----
> include/trace/events/core.h | 52 ++++++++++++++++++++++++++++++++++++++++++++
> kernel/trace/trace_irqsoff.c | 6 +++++
> 3 files changed, 94 insertions(+), 5 deletions(-)
> create mode 100644 include/trace/events/core.h
>
> diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
> index c8e3030..17f5a47 100644
> --- a/include/linux/tracepoint.h
> +++ b/include/linux/tracepoint.h
> @@ -95,6 +95,23 @@ extern void syscall_unregfunc(void);
>
> #define TRACE_DEFINE_ENUM(x)
>
> +#ifdef CONFIG_LOCKDEP
> +static inline void check_trace_lockdep_rcu(struct tracepoint_func __rcu *funcs,
> + int condition)
> +{
> + if (!condition)
> + return;
> + rcu_read_lock_sched_notrace();
> + rcu_dereference_sched(funcs);
> + rcu_read_unlock_sched_notrace();
> +}
> +#else
> +static inline void check_trace_lockdep_rcu(struct tracepoint_func __rcu *funcs,
> + int condition)
> +{
> +}
> +#endif
> +
> #endif /* _LINUX_TRACEPOINT_H */
>
> /*
> @@ -172,6 +189,11 @@ extern void syscall_unregfunc(void);
> * instrumentation. This lets us find RCU issues triggered with tracepoints
> * even when this tracepoint is off. This code has no purpose other than
> * poking RCU a bit.
> + *
> + * trace_<name>() should usually be used for instrumentation.
> + *
> + * __trace_<name>() can be used in core instrumentation where recursion into
> + * the breakpoint handler is unwanted.
> */
> #define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
> extern struct tracepoint __tracepoint_##name; \
> @@ -182,11 +204,18 @@ extern void syscall_unregfunc(void);
> TP_PROTO(data_proto), \
> TP_ARGS(data_args), \
> TP_CONDITION(cond),,); \
> - if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \
> - rcu_read_lock_sched_notrace(); \
> - rcu_dereference_sched(__tracepoint_##name.funcs);\
> - rcu_read_unlock_sched_notrace(); \
> - } \
> + check_trace_lockdep_rcu(__tracepoint_##name.funcs, \
> + TP_CONDITION(cond)); \
> + } \
> + static inline void __trace_##name(proto) \
> + { \
> + if (static_key_enabled(&__tracepoint_##name.key)) \
> + __DO_TRACE(&__tracepoint_##name, \
> + TP_PROTO(data_proto), \
> + TP_ARGS(data_args), \
> + TP_CONDITION(cond),,); \
> + check_trace_lockdep_rcu(__tracepoint_##name.funcs, \
> + TP_CONDITION(cond)); \
> } \
> __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \
> PARAMS(cond), PARAMS(data_proto), PARAMS(data_args)) \
> @@ -239,6 +268,8 @@ extern void syscall_unregfunc(void);
> #define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
> static inline void trace_##name(proto) \
> { } \
> + static inline void __trace_##name(proto) \
> + { } \
> static inline void trace_##name##_rcuidle(proto) \
> { } \
> static inline int \
> diff --git a/include/trace/events/core.h b/include/trace/events/core.h
> new file mode 100644
> index 0000000..b2c130c
> --- /dev/null
> +++ b/include/trace/events/core.h
> @@ -0,0 +1,52 @@
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM core
> +
> +#if !defined(_TRACE_CORE_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_CORE_H
> +
> +#include <linux/tracepoint.h>
> +
> +/*
> + * Tracepoint for critical timings.
> + */
> +DECLARE_EVENT_CLASS(core_critical_timing,
> +
> + TP_PROTO(unsigned long ip, unsigned long parent_ip,
> + unsigned long flags, int preempt_cnt,
> + cycles_t delta_ns),
> +
> + TP_ARGS(ip, parent_ip, flags, preempt_cnt, delta_ns),
> +
> + TP_STRUCT__entry(
> + __field( unsigned long, ip )
> + __field( unsigned long, parent_ip )
> + __field( cycles_t, delta_ns )
> + __field( unsigned long, flags )
> + __field( int, preempt_cnt )
> + ),
> +
> + TP_fast_assign(
> + __entry->ip = ip;
> + __entry->parent_ip = parent_ip;
> + __entry->delta_ns = delta_ns;
> + __entry->flags = flags;
> + __entry->preempt_cnt = preempt_cnt;
> + ),
> +
> + TP_printk("ip=0x%lx parent_ip=0x%lx delta_ns=%llu flags=%lu preempt_cnt=%d",
> + __entry->ip, __entry->parent_ip,
> + (unsigned long long) __entry->delta_ns,
> + __entry->flags, __entry->preempt_cnt)
> +);
> +
> +DEFINE_EVENT(core_critical_timing, core_critical_timing_hit,
> + TP_PROTO(unsigned long ip, unsigned long parent_ip,
> + unsigned long flags, int preempt_cnt,
> + cycles_t delta_ns),
> + TP_ARGS(ip, parent_ip, flags, preempt_cnt, delta_ns)
> +);
> +
> +#endif /* _TRACE_CORE_H */
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
> index 8523ea3..fad8949 100644
> --- a/kernel/trace/trace_irqsoff.c
> +++ b/kernel/trace/trace_irqsoff.c
> @@ -14,6 +14,9 @@
> #include <linux/module.h>
> #include <linux/ftrace.h>
>
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/core.h>
> +
> #include "trace.h"
>
> static struct trace_array *irqsoff_trace __read_mostly;
> @@ -338,6 +341,9 @@ check_critical_timing(struct trace_array *tr,
> /* Skip 5 functions to get to the irq/preempt enable function */
> __trace_stack(tr, flags, 5, pc);
>
> + __trace_core_critical_timing_hit(CALLER_ADDR0, parent_ip, flags, pc,
> + delta);
> +
> if (data->critical_sequence != max_sequence)
> goto out_unlock;
>
> --
> 2.1.4
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/