Re: [BUG(?)] perf_events: combining multiple tracepoint events into a group produces no counts on member events
From: Frederic Weisbecker
Date: Wed Dec 01 2010 - 13:29:28 EST
On Wed, Dec 01, 2010 at 01:04:37PM +0100, Peter Zijlstra wrote:
> @@ -4833,7 +5101,8 @@ static int perf_event_set_filter(struct
> char *filter_str;
> int ret;
>
> - if (event->attr.type != PERF_TYPE_TRACEPOINT)
> + if (event->attr.type != PERF_TYPE_TRACEPOINT ||
> + event->attr.config == ~0ULL)
> return -EINVAL;
>
> filter_str = strndup_user(arg, PAGE_SIZE);
> @@ -4851,6 +5120,74 @@ static void perf_event_free_filter(struc
> ftrace_profile_free_filter(event);
> }
>
> +static int perf_event_add_tp(struct perf_event *event, int tp_id)
> +{
> + if (event->attr.type != PERF_TYPE_TRACEPOINT &&
Should it be || instead? See the suggested form below.
> + event->attr.config != ~0ULL)
> + return -EINVAL;
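That is, if the intent is that only the "meta" event (config == ~0ULL)
can have tracepoints added to it, I would have expected something like
this (just my reading of the intent, matching the perf_tp_event_inherit()
check below):

	if (event->attr.type != PERF_TYPE_TRACEPOINT ||
	    event->attr.config != ~0ULL)
		return -EINVAL;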
> +
> + return __perf_event_add_tp(event, tp_id);
> +}
> +
> +/*
> + * Called from the exit path, _after_ all events have been detached from it.
> + */
> +static void perf_tp_event_exit(struct task_struct *tsk)
> +{
> + struct perf_tp_idr *idr = tsk->perf_tp_idr;
> +
> + if (!idr)
> + return;
> +
> + idr_remove_all(&idr->idr);
> + idr_destroy(&idr->idr);
> +}
> +
> +static void perf_tp_event_delayed_put(struct task_struct *tsk)
> +{
> + struct perf_tp_idr *idr = tsk->perf_tp_idr;
> +
> + tsk->perf_tp_idr = NULL;
> + kfree(idr);
> +}
> +
> +static int perf_tp_inherit_idr(int id, void *p, void *data)
> +{
> + struct perf_event *child = data;
> +
> + return __perf_event_add_tp(child, id);
> +}
> +
> +static int perf_tp_event_inherit(struct perf_event *parent_event,
> + struct perf_event *child_event)
> +{
> + int ret;
> +
> + if (parent_event->attr.type != PERF_TYPE_TRACEPOINT ||
> + parent_event->attr.config != ~0ULL)
> + return 0;
> +
> + /*
> + * The child is not yet exposed, hence no need to serialize things
> + * on that side.
> + */
> + mutex_lock(&parent_event->tp_idr.lock);
> + ret = idr_for_each(&parent_event->tp_idr.idr,
> + perf_tp_inherit_idr,
> + child_event);
> + mutex_unlock(&parent_event->tp_idr.lock);
> +
> + return ret;
> +}
> +
> +static void perf_tp_event_init_task(struct task_struct *child)
> +{
> + /*
> + * Clear the idr pointer copied from the parent.
> + */
> + child->perf_tp_idr = NULL;
> +}
> +
> #else
>
> static inline void perf_tp_register(void)
> @@ -4866,6 +5203,29 @@ static void perf_event_free_filter(struc
> {
> }
>
> +static int perf_event_add_tp(struct perf_event *event, int tp_id)
> +{
> + return -ENOENT;
> +}
> +
> +static void perf_tp_event_exit(struct task_struct *tsk)
> +{
> +}
> +
> +static void perf_tp_event_delayed_put(struct task_struct *tsk)
> +{
> +}
> +
> +static int perf_tp_event_inherit(struct perf_event *parent_event,
> + struct perf_event *child_event)
> +{
> + return 0;
> +}
> +
> +static void perf_tp_event_init_task(struct task_struct *child)
> +{
> +}
> +
> #endif /* CONFIG_EVENT_TRACING */
>
> #ifdef CONFIG_HAVE_HW_BREAKPOINT
> @@ -5290,6 +5650,9 @@ perf_event_alloc(struct perf_event_attr
> INIT_LIST_HEAD(&event->sibling_list);
> init_waitqueue_head(&event->waitq);
> init_irq_work(&event->pending, perf_pending_event);
> +#ifdef CONFIG_EVENT_TRACING
> + perf_tp_idr_init(&event->tp_idr);
> +#endif
>
> mutex_init(&event->mmap_mutex);
>
> @@ -5308,6 +5671,7 @@ perf_event_alloc(struct perf_event_attr
>
> if (task) {
> event->attach_state = PERF_ATTACH_TASK;
> + event->hw.event_target = task;
> #ifdef CONFIG_HAVE_HW_BREAKPOINT
> /*
> * hw_breakpoint is a bit difficult here..
> @@ -5353,7 +5717,7 @@ perf_event_alloc(struct perf_event_attr
> if (err) {
> if (event->ns)
> put_pid_ns(event->ns);
> - kfree(event);
> + free_event(event);
> return ERR_PTR(err);
> }
>
> @@ -5694,7 +6058,6 @@ SYSCALL_DEFINE5(perf_event_open,
> }
>
> perf_install_in_context(ctx, event, cpu);
> - ++ctx->generation;
> mutex_unlock(&ctx->mutex);
>
> event->owner = current;
> @@ -5763,7 +6126,6 @@ perf_event_create_kernel_counter(struct
> WARN_ON_ONCE(ctx->parent_ctx);
> mutex_lock(&ctx->mutex);
> perf_install_in_context(ctx, event, cpu);
> - ++ctx->generation;
> mutex_unlock(&ctx->mutex);
>
> return event;
> @@ -5936,6 +6298,8 @@ void perf_event_exit_task(struct task_st
>
> for_each_task_context_nr(ctxn)
> perf_event_exit_task_context(child, ctxn);
> +
> + perf_tp_event_exit(child);
> }
>
> static void perf_free_event(struct perf_event *event,
> @@ -5998,6 +6362,8 @@ void perf_event_delayed_put(struct task_
>
> for_each_task_context_nr(ctxn)
> WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
> +
> + perf_tp_event_delayed_put(task);
> }
>
> /*
> @@ -6013,6 +6379,7 @@ inherit_event(struct perf_event *parent_
> {
> struct perf_event *child_event;
> unsigned long flags;
> + int ret;
>
> /*
> * Instead of creating recursive hierarchies of events,
> @@ -6030,6 +6397,13 @@ inherit_event(struct perf_event *parent_
> NULL);
> if (IS_ERR(child_event))
> return child_event;
> +
> + ret = perf_tp_event_inherit(parent_event, child_event);
> + if (ret) {
> + free_event(child_event);
> + return ERR_PTR(ret);
> + }
> +
> get_ctx(child_ctx);
>
> /*
> @@ -6134,9 +6508,7 @@ inherit_task_group(struct perf_event *ev
> child->perf_event_ctxp[ctxn] = child_ctx;
> }
>
> - ret = inherit_group(event, parent, parent_ctx,
> - child, child_ctx);
> -
> + ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
> if (ret)
> *inherited_all = 0;
>
> @@ -6157,9 +6529,6 @@ int perf_event_init_context(struct task_
>
> child->perf_event_ctxp[ctxn] = NULL;
>
> - mutex_init(&child->perf_event_mutex);
> - INIT_LIST_HEAD(&child->perf_event_list);
> -
> if (likely(!parent->perf_event_ctxp[ctxn]))
> return 0;
>
> @@ -6236,6 +6605,11 @@ int perf_event_init_task(struct task_str
> {
> int ctxn, ret;
>
> + mutex_init(&child->perf_event_mutex);
> + INIT_LIST_HEAD(&child->perf_event_list);
> +
> + perf_tp_event_init_task(child);
> +
> for_each_task_context_nr(ctxn) {
> ret = perf_event_init_context(child, ctxn);
> if (ret)
> Index: linux-2.6/kernel/trace/trace_event_perf.c
> ===================================================================
> --- linux-2.6.orig/kernel/trace/trace_event_perf.c
> +++ linux-2.6/kernel/trace/trace_event_perf.c
> @@ -8,6 +8,7 @@
> #include <linux/module.h>
> #include <linux/kprobes.h>
> #include "trace.h"
> +#include "trace_output.h"
>
> static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
>
> @@ -47,9 +48,7 @@ static int perf_trace_event_perm(struct
> static int perf_trace_event_init(struct ftrace_event_call *tp_event,
> struct perf_event *p_event)
> {
> - struct hlist_head __percpu *list;
> int ret;
> - int cpu;
>
> ret = perf_trace_event_perm(tp_event, p_event);
> if (ret)
> @@ -61,15 +60,6 @@ static int perf_trace_event_init(struct
>
> ret = -ENOMEM;
>
> - list = alloc_percpu(struct hlist_head);
> - if (!list)
> - goto fail;
> -
> - for_each_possible_cpu(cpu)
> - INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
> -
> - tp_event->perf_events = list;
> -
> if (!total_ref_count) {
> char __percpu *buf;
> int i;
> @@ -100,63 +90,40 @@ static int perf_trace_event_init(struct
> }
> }
>
> - if (!--tp_event->perf_refcount) {
> - free_percpu(tp_event->perf_events);
> - tp_event->perf_events = NULL;
> - }
> + --tp_event->perf_refcount;
>
> return ret;
> }
>
> -int perf_trace_init(struct perf_event *p_event)
> +int perf_trace_init(struct perf_event *p_event, int event_id)
> {
> struct ftrace_event_call *tp_event;
> - int event_id = p_event->attr.config;
> + struct trace_event *t_event;
> int ret = -EINVAL;
>
> + trace_event_read_lock();
> + t_event = ftrace_find_event(event_id);
> + if (!t_event)
> + goto out;
> +
> + tp_event = container_of(t_event, struct ftrace_event_call, event);
> +
> mutex_lock(&event_mutex);
> - list_for_each_entry(tp_event, &ftrace_events, list) {
> - if (tp_event->event.type == event_id &&
> - tp_event->class && tp_event->class->reg &&
> - try_module_get(tp_event->mod)) {
> - ret = perf_trace_event_init(tp_event, p_event);
> - if (ret)
> - module_put(tp_event->mod);
> - break;
> - }
> + if (tp_event->class && tp_event->class->reg &&
> + try_module_get(tp_event->mod)) {
> + ret = perf_trace_event_init(tp_event, p_event);
> + if (ret)
> + module_put(tp_event->mod);
> }
> mutex_unlock(&event_mutex);
> +out:
> + trace_event_read_unlock();
>
> return ret;
> }
>
> -int perf_trace_add(struct perf_event *p_event, int flags)
> -{
> - struct ftrace_event_call *tp_event = p_event->tp_event;
> - struct hlist_head __percpu *pcpu_list;
> - struct hlist_head *list;
> -
> - pcpu_list = tp_event->perf_events;
> - if (WARN_ON_ONCE(!pcpu_list))
> - return -EINVAL;
> -
> - if (!(flags & PERF_EF_START))
> - p_event->hw.state = PERF_HES_STOPPED;
> -
> - list = this_cpu_ptr(pcpu_list);
> - hlist_add_head_rcu(&p_event->hlist_entry, list);
> -
> - return 0;
> -}
> -
> -void perf_trace_del(struct perf_event *p_event, int flags)
> -{
> - hlist_del_rcu(&p_event->hlist_entry);
> -}
> -
> -void perf_trace_destroy(struct perf_event *p_event)
> +static void __perf_trace_destroy(struct ftrace_event_call *tp_event)
> {
> - struct ftrace_event_call *tp_event = p_event->tp_event;
> int i;
>
> mutex_lock(&event_mutex);
> @@ -171,9 +138,6 @@ void perf_trace_destroy(struct perf_even
> */
> tracepoint_synchronize_unregister();
>
> - free_percpu(tp_event->perf_events);
> - tp_event->perf_events = NULL;
> -
> if (!--total_ref_count) {
> for (i = 0; i < PERF_NR_CONTEXTS; i++) {
> free_percpu(perf_trace_buf[i]);
> @@ -185,6 +149,27 @@ void perf_trace_destroy(struct perf_even
> mutex_unlock(&event_mutex);
> }
>
> +void perf_trace_destroy(struct perf_event *p_event)
> +{
> + __perf_trace_destroy(p_event->tp_event);
> +}
Is this perf_trace_destroy() wrapper a leftover? It doesn't seem to be used anymore.
Other than that, the whole thing looks good. Maybe I need to take
one more look at the trace_output.c changes, but the conversion
from hlist to idr seemed fine at a glance.
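IOW, if I read the conversion right, the old per-cpu hlist walk from
perf_trace_add()/perf_tp_event() turns into an idr lookup keyed by the
trace event id. Roughly what I have in mind (a sketch of my understanding,
not the literal patch; perf_tp_idr with its ->idr member is taken from the
hunks above):

	struct perf_tp_idr *tp_idr = tsk->perf_tp_idr;
	struct perf_event *event;

	if (!tp_idr)
		return;

	rcu_read_lock();
	/* the trace event id is the idr key, the perf event is the payload */
	event = idr_find(&tp_idr->idr, tp_id);
	if (event)
		perf_swevent_event(event, count, 1, &data, regs);
	rcu_read_unlock();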
Adding Steve in Cc.