Re: [PATCH 2/4] perf/ftrace: Fix function trace events

From: zhouchengming
Date: Wed Oct 11 2017 - 05:03:42 EST


On 2017/10/11 15:45, Peter Zijlstra wrote:
The function-trace<-> perf interface is a tad messed up. Where all
the other trace<-> perf interfaces use a single trace hook
registration and use per-cpu RCU based hlist to iterate the events,
function-trace actually needs multiple hook registrations in order to
minimize function entry patching when filters are present.

The end result is that we iterate events both on the trace hook and on
the hlist, which results in reporting events multiple times.

Since function-trace cannot use the regular scheme, fix it the other
way around, use singleton hlists.

Signed-off-by: Peter Zijlstra (Intel)<peterz@xxxxxxxxxxxxx>
---
include/linux/trace_events.h | 5 ++
kernel/trace/trace_event_perf.c | 82 ++++++++++++++++++++++++----------------
2 files changed, 55 insertions(+), 32 deletions(-)

--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -173,6 +173,11 @@ enum trace_reg {
TRACE_REG_PERF_UNREGISTER,
TRACE_REG_PERF_OPEN,
TRACE_REG_PERF_CLOSE,
+ /*
+ * These (ADD/DEL) use a 'boolean' return value, where 1 (true) means a
+ * custom action was taken and the default action is not to be
+ * performed.
+ */
TRACE_REG_PERF_ADD,
TRACE_REG_PERF_DEL,
#endif
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -240,27 +240,41 @@ void perf_trace_destroy(struct perf_even
int perf_trace_add(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
- struct hlist_head __percpu *pcpu_list;
- struct hlist_head *list;

- pcpu_list = tp_event->perf_events;
- if (WARN_ON_ONCE(!pcpu_list))
- return -EINVAL;
+ /*
+ * If TRACE_REG_PERF_ADD returns false; no custom action was performed
+ * and we need to take the default action of enqueueing our event on
+ * the right per-cpu hlist.
+ */
+ if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
+ struct hlist_head __percpu *pcpu_list;
+ struct hlist_head *list;
+
+ pcpu_list = tp_event->perf_events;
+ if (WARN_ON_ONCE(!pcpu_list))
+ return -EINVAL;

- if (!(flags& PERF_EF_START))
- p_event->hw.state = PERF_HES_STOPPED;
+ if (!(flags& PERF_EF_START))
+ p_event->hw.state = PERF_HES_STOPPED;

Don't we need to check the flags for ftrace perf_event?
So if we should put this outside the if (!tp_event->class->reg()) ?


- list = this_cpu_ptr(pcpu_list);
- hlist_add_head_rcu(&p_event->hlist_entry, list);
+ list = this_cpu_ptr(pcpu_list);
+ hlist_add_head_rcu(&p_event->hlist_entry, list);
+ }

Now we don't add perf_event to the pcpu_list, so we also can avoid
to alloc pcpu_list for function tp_event in perf_trace_event_reg().

Thanks.


- return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
+ return 0;
}

void perf_trace_del(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
- hlist_del_rcu(&p_event->hlist_entry);
- tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
+
+ /*
+ * If TRACE_REG_PERF_DEL returns false; no custom action was performed
+ * and we need to take the default action of dequeueing our event from
+ * the right per-cpu hlist.
+ */
+ if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
+ hlist_del_rcu(&p_event->hlist_entry);
}

void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
@@ -307,14 +321,24 @@ perf_ftrace_function_call(unsigned long
struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
struct ftrace_entry *entry;
- struct hlist_head *head;
+ struct perf_event *event;
+ struct hlist_head head;
struct pt_regs regs;
int rctx;

- head = this_cpu_ptr(event_function.perf_events);
- if (hlist_empty(head))
+ if ((unsigned long)ops->private != smp_processor_id())
return;

+ event = container_of(ops, struct perf_event, ftrace_ops);
+
+ /*
+ * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
+ * the perf code does is hlist_for_each_entry_rcu(), so we can
+ * get away with simply setting the @head.first pointer in order
+ * to create a singular list.
+ */
+ head.first =&event->hlist_entry;
+
#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
sizeof(u64)) - sizeof(u32))

@@ -330,7 +354,7 @@ perf_ftrace_function_call(unsigned long
entry->ip = ip;
entry->parent_ip = parent_ip;
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
- 1,&regs, head, NULL);
+ 1,&regs,&head, NULL);

#undef ENTRY_SIZE
}
@@ -339,8 +363,10 @@ static int perf_ftrace_function_register
{
struct ftrace_ops *ops =&event->ftrace_ops;

- ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
- ops->func = perf_ftrace_function_call;
+ ops->flags |= FTRACE_OPS_FL_RCU;
+ ops->func = perf_ftrace_function_call;
+ ops->private = (void *)(unsigned long)nr_cpu_ids;
+
return register_ftrace_function(ops);
}

@@ -352,19 +378,11 @@ static int perf_ftrace_function_unregist
return ret;
}

-static void perf_ftrace_function_enable(struct perf_event *event)
-{
- ftrace_function_local_enable(&event->ftrace_ops);
-}
-
-static void perf_ftrace_function_disable(struct perf_event *event)
-{
- ftrace_function_local_disable(&event->ftrace_ops);
-}
-
int perf_ftrace_event_register(struct trace_event_call *call,
enum trace_reg type, void *data)
{
+ struct perf_event *event = data;
+
switch (type) {
case TRACE_REG_REGISTER:
case TRACE_REG_UNREGISTER:
@@ -377,11 +395,11 @@ int perf_ftrace_event_register(struct tr
case TRACE_REG_PERF_CLOSE:
return perf_ftrace_function_unregister(data);
case TRACE_REG_PERF_ADD:
- perf_ftrace_function_enable(data);
- return 0;
+ event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
+ return 1;
case TRACE_REG_PERF_DEL:
- perf_ftrace_function_disable(data);
- return 0;
+ event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
+ return 1;
}

return -EINVAL;



.