Re: [PATCH] Add support for disabling Intel PT trace in ftrace

From: Steven Rostedt
Date: Tue Nov 22 2016 - 17:38:02 EST


On Fri, 18 Nov 2016 08:55:24 -0800
Andi Kleen <andi@xxxxxxxxxxxxxx> wrote:

> From: Andi Kleen <ak@xxxxxxxxxxxxxxx>
>
> ftrace has powerful trigger functions. Intel PT on modern Intel CPUs
> can trace the execution flow.
>
> For debugging I found it useful to disable the PT trace from ftrace
> triggers, for example when specific kernel functions that indicate a
> problem are hit. Then we can see the exact execution trace up to that
> point.
>
> This patch adds a "ptoff" ftrace trigger/function that disables the trace
> on the current function. The PT trace still has to be set up with perf
>
> % perf record -e intel_pt// -a ... &
> % cd /sys/kernel/debug/tracing
> % echo do_page_fault:ptoff > set_ftrace_filter
> ...
> % cd -
> % kill %1
> % perf script --itrace=i0ns
>
> I only implemented local disabling. Enabling would be much more
> complicated and would require a blacklist of functions to avoid
> recursion. Global disabling with IPIs would be possible, but would also
> risk some deadlock scenarios. Local disabling is very easy and can be
> done without accessing any special state, so there are no such
> problems, and it is usually good enough for debugging purposes. The
> trace can always be re-enabled from perf.
>
> This patch adds "ptoff" both as ftrace trigger and ftrace functions.
> This makes it work from "set_ftrace_filter" and through the trigger
> field of trace points.
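
For the trigger-file side, usage should presumably mirror the existing
traceoff trigger shown in the readme text further down, e.g. (untested;
the event and the filter field are taken from that readme example):

% cd /sys/kernel/debug/tracing
% echo ptoff > events/block/block_unplug/trigger
% echo 'ptoff:1 if nr_rq > 1' > events/block/block_unplug/trigger
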
>
> The PT driver exports a pt_disable() function for this, which can also
> be used for manual instrumentation.
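
For reference, manual instrumentation with the exported pt_disable()
would presumably look something like the sketch below (hypothetical
example; the structure and its "corrupted" flag are made up, only
pt_disable() and its declaration in <linux/perf_event.h> come from this
patch):

	#include <linux/kernel.h>
	#include <linux/perf_event.h>	/* pt_disable(), added by this patch */

	static void my_debug_check(struct my_state *s)
	{
		/*
		 * Stop the PT trace on this CPU as soon as the suspicious
		 * condition is seen, so the recorded trace ends right at
		 * the point of interest.
		 */
		if (s->corrupted) {
			pr_warn("my_state corrupted, stopping PT trace\n");
			pt_disable();
		}
	}
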
>
> Cc: tom.zanussi@xxxxxxxxxxxxxxx
> Cc: rostedt@xxxxxxxxxxx
> Cc: peterz@xxxxxxxxxxxxx
> Cc: alexander.shishkin@xxxxxxxxx
> Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
> ---
> Documentation/trace/ftrace.txt | 5 +++
> arch/x86/events/intel/pt.c | 16 ++++++++
> include/linux/perf_event.h | 2 +

Peter Z, Thomas, H. Peter, Ingo,

Are you guys fine with this change? If so, I can pull it into my tree.

-- Steve

> include/linux/trace_events.h | 1 +
> kernel/trace/trace.c | 6 +++
> kernel/trace/trace_events_trigger.c | 79 +++++++++++++++++++++++++++++++++++++
> kernel/trace/trace_functions.c | 58 +++++++++++++++++++++++++++
> 7 files changed, 167 insertions(+)
>
> diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
> index 185c39fea2a0..5dc8ec658678 100644
> --- a/Documentation/trace/ftrace.txt
> +++ b/Documentation/trace/ftrace.txt
> @@ -2549,6 +2549,11 @@ The following commands are supported:
> command, it only prints out the contents of the ring buffer for the
> CPU that executed the function that triggered the dump.
>
> +- ptoff
> + When the function is hit, disable the Intel PT trace. The Intel PT
> + trace has to be set up earlier with perf record -a -e intel_pt// ...
> + This disables the trace on the current CPU only.
> +
> trace_pipe
> ----------
>
> diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
> index c5047b8f777b..cf15881da9a5 100644
> --- a/arch/x86/events/intel/pt.c
> +++ b/arch/x86/events/intel/pt.c
> @@ -1455,3 +1455,19 @@ static __init int pt_init(void)
> return ret;
> }
> arch_initcall(pt_init);
> +
> +/*
> + * Disable the PT trace for debugging purposes.
> + */
> +void pt_disable(void)
> +{
> + u64 val;
> +
> + if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
> + return;
> +
> + rdmsrl_safe(MSR_IA32_RTIT_CTL, &val);
> + val &= ~RTIT_CTL_TRACEEN;
> + wrmsrl_safe(MSR_IA32_RTIT_CTL, val);
> +}
> +EXPORT_SYMBOL(pt_disable);
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 4741ecdb9817..a408d288298b 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -1387,4 +1387,6 @@ int perf_event_exit_cpu(unsigned int cpu);
> #define perf_event_exit_cpu NULL
> #endif
>
> +void pt_disable(void);
> +
> #endif /* _LINUX_PERF_EVENT_H */
> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> index be007610ceb0..4d2d4a1b738e 100644
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -391,6 +391,7 @@ enum event_trigger_type {
> ETT_EVENT_ENABLE = (1 << 3),
> ETT_EVENT_HIST = (1 << 4),
> ETT_HIST_ENABLE = (1 << 5),
> + ETT_PTOFF = (1 << 6),
> };
>
> extern int filter_match_preds(struct event_filter *filter, void *rec);
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 8696ce6bf2f6..e55405dce821 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -4082,6 +4082,9 @@ static const char readme_msg[] =
> #endif
> "\t\t dump\n"
> "\t\t cpudump\n"
> +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
> + "\t\t ptoff\n"
> +#endif
> "\t example: echo do_fault:traceoff > set_ftrace_filter\n"
> "\t echo do_trap:traceoff:3 > set_ftrace_filter\n"
> "\t The first one will disable tracing every time do_fault is hit\n"
> @@ -4175,6 +4178,9 @@ static const char readme_msg[] =
> #ifdef CONFIG_HIST_TRIGGERS
> "\t\t hist (see below)\n"
> #endif
> +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
> + "\t\t ptoff\t\t- Disable PT trace on current CPU\n"
> +#endif
> "\t example: echo traceoff > events/block/block_unplug/trigger\n"
> "\t echo traceoff:3 > events/block/block_unplug/trigger\n"
> "\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n"
> diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
> index a26ff1345784..b4ec8c417c12 100644
> --- a/kernel/trace/trace_events_trigger.c
> +++ b/kernel/trace/trace_events_trigger.c
> @@ -22,6 +22,7 @@
> #include <linux/ctype.h>
> #include <linux/mutex.h>
> #include <linux/slab.h>
> +#include <linux/perf_event.h>
>
> #include "trace.h"
>
> @@ -1044,6 +1045,83 @@ static struct event_command trigger_traceoff_cmd = {
> .set_filter = set_trigger_filter,
> };
>
> +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
> +
> +static void
> +ptoff_trigger(struct event_trigger_data *data, void *rec)
> +{
> + pt_disable();
> +}
> +
> +static void
> +ptoff_count_trigger(struct event_trigger_data *data, void *rec)
> +{
> + if (!data->count)
> + return;
> +
> + if (data->count != -1)
> + (data->count)--;
> +
> + ptoff_trigger(data, rec);
> +}
> +
> +static int
> +ptoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
> + struct event_trigger_data *data)
> +{
> + return event_trigger_print("ptoff", m, (void *)data->count,
> + data->filter_str);
> +}
> +
> +static struct event_trigger_ops ptoff_trigger_ops = {
> + .func = ptoff_trigger,
> + .print = ptoff_trigger_print,
> + .init = event_trigger_init,
> + .free = event_trigger_free,
> +};
> +
> +static struct event_trigger_ops ptoff_count_trigger_ops = {
> + .func = ptoff_count_trigger,
> + .print = ptoff_trigger_print,
> + .init = event_trigger_init,
> + .free = event_trigger_free,
> +};
> +
> +static struct event_trigger_ops *
> +ptoff_get_trigger_ops(char *cmd, char *param)
> +{
> + return param ? &ptoff_count_trigger_ops : &ptoff_trigger_ops;
> +}
> +
> +static struct event_command trigger_ptoff_cmd = {
> + .name = "ptoff",
> + .trigger_type = ETT_PTOFF,
> + .func = event_trigger_callback,
> + .reg = register_trigger,
> + .unreg = unregister_trigger,
> + .get_trigger_ops = ptoff_get_trigger_ops,
> + .set_filter = set_trigger_filter,
> +};
> +
> +static __init int register_trigger_ptoff_cmd(void)
> +{
> + int ret;
> +
> + if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
> + return 0;
> +
> + ret = register_event_command(&trigger_ptoff_cmd);
> + WARN_ON(ret < 0);
> +
> + return ret;
> +}
> +
> +#else
> +
> +static inline int register_trigger_ptoff_cmd(void) { return 0; }
> +
> +#endif
> +
> #ifdef CONFIG_TRACER_SNAPSHOT
> static void
> snapshot_trigger(struct event_trigger_data *data, void *rec)
> @@ -1609,6 +1687,7 @@ __init int register_trigger_cmds(void)
> register_trigger_enable_disable_cmds();
> register_trigger_hist_enable_disable_cmds();
> register_trigger_hist_cmd();
> + register_trigger_ptoff_cmd();
>
> return 0;
> }
> diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
> index 0efa00d80623..80867e3166f7 100644
> --- a/kernel/trace/trace_functions.c
> +++ b/kernel/trace/trace_functions.c
> @@ -15,6 +15,7 @@
> #include <linux/ftrace.h>
> #include <linux/slab.h>
> #include <linux/fs.h>
> +#include <linux/perf_event.h>
>
> #include "trace.h"
>
> @@ -643,6 +644,57 @@ static struct ftrace_func_command ftrace_cpudump_cmd = {
> .func = ftrace_cpudump_callback,
> };
>
> +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
> +static void
> +ftrace_ptoff_probe(unsigned long ip, unsigned long parent_ip, void **data)
> +{
> + if (update_count(data))
> + pt_disable();
> +}
> +
> +static int
> +ftrace_ptoff_print(struct seq_file *m, unsigned long ip,
> + struct ftrace_probe_ops *ops, void *data)
> +{
> + return ftrace_probe_print("ptoff", m, ip, data);
> +}
> +
> +static struct ftrace_probe_ops ptoff_probe_ops = {
> + .func = ftrace_ptoff_probe,
> + .print = ftrace_ptoff_print,
> +};
> +
> +static int
> +ftrace_ptoff_callback(struct ftrace_hash *hash,
> + char *glob, char *cmd, char *param, int enable)
> +{
> + struct ftrace_probe_ops *ops;
> +
> + ops = &ptoff_probe_ops;
> +
> + /* Disabling more than once makes no sense, so force a count of 1. */
> + return ftrace_trace_probe_callback(ops, hash, glob, cmd,
> + "1", enable);
> +}
> +
> +static struct ftrace_func_command ftrace_ptoff_cmd = {
> + .name = "ptoff",
> + .func = ftrace_ptoff_callback,
> +};
> +
> +static int register_ptoff_command(void)
> +{
> + if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
> + return 0;
> + return register_ftrace_command(&ftrace_ptoff_cmd);
> +}
> +
> +#else
> +
> +static inline int register_ptoff_command(void) { return 0; }
> +
> +#endif
> +
> static int __init init_func_cmd_traceon(void)
> {
> int ret;
> @@ -667,8 +719,14 @@ static int __init init_func_cmd_traceon(void)
> if (ret)
> goto out_free_dump;
>
> + ret = register_ptoff_command();
> + if (ret)
> + goto out_free_cpudump;
> +
> return 0;
>
> + out_free_cpudump:
> + unregister_ftrace_command(&ftrace_cpudump_cmd);
> out_free_dump:
> unregister_ftrace_command(&ftrace_dump_cmd);
> out_free_stacktrace: