Re: [PATCH] perf/ftrace : Fix repetitious traces when specify a target task

From: Peter Zijlstra
Date: Tue Oct 10 2017 - 09:05:08 EST


On Tue, Oct 10, 2017 at 01:33:21PM +0200, Peter Zijlstra wrote:
> But now you've got me looking at 75e8387685f6, which also looks
> completely insane.

The reason I insta stumbled on that patch is that it only addresses the
ftrace situation and doesn't mention the other _5_ places that use this
interface. It doesn't explain why those don't have the problem and if
not, why their solution doesn't work for ftrace.

So all (well syscall and regular tracepoints, didn't check the others)
avoid that problem by simply not registering multiple times at the
tracepoint. Tracepoints use tp_event->perf_refcount and the syscall
things use sys_perf_refcount_{enter,exit} for that.

Doing the same for function trace looks a little something like the
below (after reverting 75e8387685f6)

Except the below doesn't compile because of
kernel/trace/trace_event_filter.c, which is where I lost the plot.

Steve, can we make the below work?

---
include/linux/ftrace.h | 51 --------------------------------------
include/linux/perf_event.h | 3 --
include/linux/trace_events.h | 2 -
kernel/trace/ftrace.c | 53 +++-------------------------------------
kernel/trace/trace_event_perf.c | 48 +++++++++++++++++-------------------
kernel/trace/trace_events.c | 2 -
kernel/trace/trace_kprobe.c | 2 -
kernel/trace/trace_syscalls.c | 4 ---
8 files changed, 29 insertions(+), 136 deletions(-)

--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -125,7 +125,7 @@ ftrace_func_t ftrace_ops_get_func(struct
enum {
FTRACE_OPS_FL_ENABLED = 1 << 0,
FTRACE_OPS_FL_DYNAMIC = 1 << 1,
- FTRACE_OPS_FL_PER_CPU = 1 << 2,
+
FTRACE_OPS_FL_SAVE_REGS = 1 << 3,
FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 4,
FTRACE_OPS_FL_RECURSION_SAFE = 1 << 5,
@@ -204,55 +204,6 @@ int register_ftrace_function(struct ftra
int unregister_ftrace_function(struct ftrace_ops *ops);
void clear_ftrace_function(void);

-/**
- * ftrace_function_local_enable - enable ftrace_ops on current cpu
- *
- * This function enables tracing on current cpu by decreasing
- * the per cpu control variable.
- * It must be called with preemption disabled and only on ftrace_ops
- * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption
- * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
- */
-static inline void ftrace_function_local_enable(struct ftrace_ops *ops)
-{
- if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
- return;
-
- (*this_cpu_ptr(ops->disabled))--;
-}
-
-/**
- * ftrace_function_local_disable - disable ftrace_ops on current cpu
- *
- * This function disables tracing on current cpu by increasing
- * the per cpu control variable.
- * It must be called with preemption disabled and only on ftrace_ops
- * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption
- * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
- */
-static inline void ftrace_function_local_disable(struct ftrace_ops *ops)
-{
- if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
- return;
-
- (*this_cpu_ptr(ops->disabled))++;
-}
-
-/**
- * ftrace_function_local_disabled - returns ftrace_ops disabled value
- * on current cpu
- *
- * This function returns value of ftrace_ops::disabled on current cpu.
- * It must be called with preemption disabled and only on ftrace_ops
- * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption
- * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
- */
-static inline int ftrace_function_local_disabled(struct ftrace_ops *ops)
-{
- WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU));
- return *this_cpu_ptr(ops->disabled);
-}
-
extern void ftrace_stub(unsigned long a0, unsigned long a1,
struct ftrace_ops *op, struct pt_regs *regs);

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -676,9 +676,6 @@ struct perf_event {
#ifdef CONFIG_EVENT_TRACING
struct trace_event_call *tp_event;
struct event_filter *filter;
-#ifdef CONFIG_FUNCTION_TRACER
- struct ftrace_ops ftrace_ops;
-#endif
#endif

#ifdef CONFIG_CGROUP_PERF
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -173,8 +173,6 @@ enum trace_reg {
TRACE_REG_PERF_UNREGISTER,
TRACE_REG_PERF_OPEN,
TRACE_REG_PERF_CLOSE,
- TRACE_REG_PERF_ADD,
- TRACE_REG_PERF_DEL,
#endif
};

--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -203,30 +203,6 @@ void clear_ftrace_function(void)
ftrace_trace_function = ftrace_stub;
}

-static void per_cpu_ops_disable_all(struct ftrace_ops *ops)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- *per_cpu_ptr(ops->disabled, cpu) = 1;
-}
-
-static int per_cpu_ops_alloc(struct ftrace_ops *ops)
-{
- int __percpu *disabled;
-
- if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
- return -EINVAL;
-
- disabled = alloc_percpu(int);
- if (!disabled)
- return -ENOMEM;
-
- ops->disabled = disabled;
- per_cpu_ops_disable_all(ops);
- return 0;
-}
-
static void ftrace_sync(struct work_struct *work)
{
/*
@@ -262,7 +238,7 @@ static ftrace_func_t ftrace_ops_get_list
* If this is a dynamic, RCU, or per CPU ops, or we force list func,
* then it needs to call the list anyway.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU |
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC |
FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC)
return ftrace_ops_list_func;

@@ -422,11 +398,6 @@ static int __register_ftrace_function(st
if (!core_kernel_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;

- if (ops->flags & FTRACE_OPS_FL_PER_CPU) {
- if (per_cpu_ops_alloc(ops))
- return -ENOMEM;
- }
-
add_ftrace_ops(&ftrace_ops_list, ops);

/* Always save the function, and reset at unregistering */
@@ -2727,11 +2698,6 @@ void __weak arch_ftrace_trampoline_free(
{
}

-static void per_cpu_ops_free(struct ftrace_ops *ops)
-{
- free_percpu(ops->disabled);
-}
-
static void ftrace_startup_enable(int command)
{
if (saved_ftrace_func != ftrace_trace_function) {
@@ -2833,7 +2799,7 @@ static int ftrace_shutdown(struct ftrace
* not currently active, we can just free them
* without synchronizing all CPUs.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU))
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC))
goto free_ops;

return 0;
@@ -2880,7 +2846,7 @@ static int ftrace_shutdown(struct ftrace
* The same goes for freeing the per_cpu data of the per_cpu
* ops.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC)) {
/*
* We need to do a hard force of sched synchronization.
* This is because we use preempt_disable() to do RCU, but
@@ -2903,9 +2869,6 @@ static int ftrace_shutdown(struct ftrace

free_ops:
arch_ftrace_trampoline_free(ops);
-
- if (ops->flags & FTRACE_OPS_FL_PER_CPU)
- per_cpu_ops_free(ops);
}

return 0;
@@ -6066,10 +6029,7 @@ __ftrace_ops_list_func(unsigned long ip,
* If any of the above fails then the op->func() is not executed.
*/
if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
- (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
- !ftrace_function_local_disabled(op)) &&
ftrace_ops_test(op, ip, regs)) {
-
if (FTRACE_WARN_ON(!op->func)) {
pr_warn("op=%p %pS\n", op, op);
goto out;
@@ -6127,10 +6087,7 @@ static void ftrace_ops_assist_func(unsig

preempt_disable_notrace();

- if (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
- !ftrace_function_local_disabled(op)) {
- op->func(ip, parent_ip, op, regs);
- }
+ op->func(ip, parent_ip, op, regs);

preempt_enable_notrace();
trace_clear_recursion(bit);
@@ -6154,7 +6111,7 @@ ftrace_func_t ftrace_ops_get_func(struct
* or does per cpu logic, then we need to call the assist handler.
*/
if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
- ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU))
+ ops->flags & (FTRACE_OPS_FL_RCU))
return ftrace_ops_assist_func;

return ops->func;
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -253,14 +253,12 @@ int perf_trace_add(struct perf_event *p_
list = this_cpu_ptr(pcpu_list);
hlist_add_head_rcu(&p_event->hlist_entry, list);

- return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
+ return 0;
}

void perf_trace_del(struct perf_event *p_event, int flags)
{
- struct trace_event_call *tp_event = p_event->tp_event;
hlist_del_rcu(&p_event->hlist_entry);
- tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}

void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
@@ -302,6 +300,10 @@ void perf_trace_buf_update(void *record,
NOKPROBE_SYMBOL(perf_trace_buf_update);

#ifdef CONFIG_FUNCTION_TRACER
+static DEFINE_MUTEX(ftrace_lock);
+static int ftrace_perf_refcount;
+static struct ftrace_ops ftrace_perf_ops;
+
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct pt_regs *pt_regs)
@@ -337,29 +339,31 @@ perf_ftrace_function_call(unsigned long

static int perf_ftrace_function_register(struct perf_event *event)
{
- struct ftrace_ops *ops = &event->ftrace_ops;
+ int ret = 0;

- ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
- ops->func = perf_ftrace_function_call;
- return register_ftrace_function(ops);
-}
+ mutex_lock(&ftrace_lock);
+ if (!ftrace_perf_refcount++) {
+ ftrace_perf_ops.flags = FTRACE_OPS_FL_RCU;
+ ftrace_perf_ops.func = perf_ftrace_function_call;
+ ret = register_ftrace_function(&ftrace_perf_ops);
+ }
+ mutex_unlock(&ftrace_lock);

-static int perf_ftrace_function_unregister(struct perf_event *event)
-{
- struct ftrace_ops *ops = &event->ftrace_ops;
- int ret = unregister_ftrace_function(ops);
- ftrace_free_filter(ops);
return ret;
}

-static void perf_ftrace_function_enable(struct perf_event *event)
+static int perf_ftrace_function_unregister(struct perf_event *event)
{
- ftrace_function_local_enable(&event->ftrace_ops);
-}
+ int ret = 0;

-static void perf_ftrace_function_disable(struct perf_event *event)
-{
- ftrace_function_local_disable(&event->ftrace_ops);
+ mutex_lock(&ftrace_lock);
+ if (!--ftrace_perf_refcount) {
+ ret = unregister_ftrace_function(&ftrace_perf_ops);
+ ftrace_free_filter(&ftrace_perf_ops);
+ }
+ mutex_unlock(&ftrace_lock);
+
+ return ret;
}

int perf_ftrace_event_register(struct trace_event_call *call,
@@ -376,12 +380,6 @@ int perf_ftrace_event_register(struct tr
return perf_ftrace_function_register(data);
case TRACE_REG_PERF_CLOSE:
return perf_ftrace_function_unregister(data);
- case TRACE_REG_PERF_ADD:
- perf_ftrace_function_enable(data);
- return 0;
- case TRACE_REG_PERF_DEL:
- perf_ftrace_function_disable(data);
- return 0;
}

return -EINVAL;
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -312,8 +312,6 @@ int trace_event_reg(struct trace_event_c
return 0;
case TRACE_REG_PERF_OPEN:
case TRACE_REG_PERF_CLOSE:
- case TRACE_REG_PERF_ADD:
- case TRACE_REG_PERF_DEL:
return 0;
#endif
}
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1266,8 +1266,6 @@ static int kprobe_register(struct trace_
return disable_trace_kprobe(tk, NULL);
case TRACE_REG_PERF_OPEN:
case TRACE_REG_PERF_CLOSE:
- case TRACE_REG_PERF_ADD:
- case TRACE_REG_PERF_DEL:
return 0;
#endif
}
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -775,8 +775,6 @@ static int syscall_enter_register(struct
return 0;
case TRACE_REG_PERF_OPEN:
case TRACE_REG_PERF_CLOSE:
- case TRACE_REG_PERF_ADD:
- case TRACE_REG_PERF_DEL:
return 0;
#endif
}
@@ -803,8 +801,6 @@ static int syscall_exit_register(struct
return 0;
case TRACE_REG_PERF_OPEN:
case TRACE_REG_PERF_CLOSE:
- case TRACE_REG_PERF_ADD:
- case TRACE_REG_PERF_DEL:
return 0;
#endif
}