Re: [PATCH v5 4/4]: perf/core: complete replacement of lists with rb trees for pinned and flexible groups in perf_event_context
From: Alexey Budankov
Date: Tue Jul 18 2017 - 04:41:09 EST
Hi,
Are there any new comments so far? Could you please suggest the next steps?
Thanks,
Alexey
On 10.07.2017 16:03, Alexey Budankov wrote:
> perf/core: complete replacement of lists with rb trees for pinned and
> flexible groups in perf_event_context
>
> By default, the userspace perf tool opens per-cpu task-bound events
> when sampling, so for N logical events requested by the user, the tool
> will open N * NR_CPUS events.
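>
> For example (illustrative numbers): sampling two events, e.g.
> 'perf record -e cycles,instructions', on a 272-CPU system opens
> 2 * 272 = 544 events, of which at most 2 are relevant on any given CPU.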
>
> In the kernel, we mux events with a hrtimer, periodically rotating the
> flexible group list and trying to schedule each group in turn. We skip
> groups whose cpu filter doesn't match. So when we get unlucky, we can
> walk N * (NR_CPUS - 1) groups pointlessly for each hrtimer invocation.
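>
> Schematically, the per-tick walk before this series looks like the
> sketch below (simplified pseudo of the list-based flexible scheduling
> path, not the exact kernel code; locking and the group_sched_in()
> arguments are elided):
>
> 	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
> 		/*
> 		 * Reject groups bound to some other CPU; for per-cpu
> 		 * task-bound events this filters out NR_CPUS - 1 of
> 		 * every NR_CPUS copies, so most iterations are wasted.
> 		 */
> 		if (event->cpu != -1 && event->cpu != smp_processor_id())
> 			continue;
> 		group_sched_in(event, cpuctx, ctx);
> 	}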
>
> This has been observed to result in significant overhead when running
> the STREAM benchmark on 272-core Xeon Phi systems.
>
> One way to avoid this is to place our events into an rb tree sorted by
> CPU filter, so that our hrtimer can skip to the current CPU's
> list and ignore everything else.
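>
> With the tree keyed by event->cpu, finding the current CPU's groups
> is a plain binary search; a minimal sketch (hypothetical helper, not
> part of this patch) of how the hrtimer path can locate the per-CPU
> list head:
>
> 	static struct perf_event *
> 	perf_event_groups_first(struct rb_root *groups, int cpu)
> 	{
> 		struct rb_node *node = groups->rb_node;
> 		struct perf_event *node_event;
>
> 		while (node) {
> 			node_event = container_of(node, struct perf_event,
> 						  group_node);
> 			if (cpu < node_event->cpu)
> 				node = node->rb_left;
> 			else if (cpu > node_event->cpu)
> 				node = node->rb_right;
> 			else
> 				/* head of this CPU's group_list */
> 				return node_event;
> 		}
> 		return NULL;
> 	}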
>
> This patch completes the replacement of lists with rb trees for
> pinned and flexible groups.
>
> The patch set was tested on Xeon Phi using perf_fuzzer and tests
> from here: https://github.com/deater/perf_event_tests
>
> The full patch set (v1-4) is attached for convenience.
>
> Branch revision:
> * perf/core 007b811b4041989ec2dc91b9614aa2c41332723e
> Merge tag 'perf-core-for-mingo-4.13-20170719' of
> git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core
>
> Signed-off-by: Alexey Budankov <alexey.budankov@xxxxxxxxxxxxxxx>
> ---
> include/linux/perf_event.h | 20 +---------
> kernel/events/core.c | 94 ++++++++++++++++------------------------------
> 2 files changed, 34 insertions(+), 80 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 7b2cddf..8e1967f 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -603,13 +603,6 @@ struct perf_event {
> */
> struct list_head group_list;
> /*
> - * Entry into the group_list list above;
> - * the entry may be attached to the self group_list list above
> - * in case the event is directly attached to the tree;
> - */
> - struct list_head group_list_entry;
> -
> - /*
> * We need storage to track the entries in perf_pmu_migrate_context; we
> * cannot use the event_entry because of RCU and we want to keep the
> * group in tact which avoids us using the other two entries.
> @@ -749,15 +742,6 @@ struct perf_event {
> #endif /* CONFIG_PERF_EVENTS */
> };
>
> -/*
> - * event groups keep group leader events arranged as an rb tree with
> - * event->cpu key and as a list for the whole tree iterations;
> - */
> -struct perf_event_groups {
> - struct list_head list;
> - struct rb_root tree;
> -};
> -
> /**
> * struct perf_event_context - event context structure
> *
> @@ -778,8 +762,8 @@ struct perf_event_context {
> struct mutex mutex;
>
> struct list_head active_ctx_list;
> - struct perf_event_groups pinned_groups;
> - struct perf_event_groups flexible_groups;
> + struct rb_root pinned_groups;
> + struct rb_root flexible_groups;
> struct list_head event_list;
> int nr_events;
> int nr_active;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index bddcb87..5142434 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1466,7 +1466,7 @@ static enum event_type_t get_event_type(struct perf_event *event)
> * Extract pinned or flexible groups from the context
> * based on event attrs bits;
> */
> -static struct perf_event_groups *
> +static struct rb_root *
> get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
> {
> if (event->attr.pinned)
> @@ -1476,11 +1476,11 @@ get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
> }
>
> static void
> -perf_event_groups_insert(struct perf_event_groups *groups,
> +perf_event_groups_insert(struct rb_root *groups,
> struct perf_event *event);
>
> static void
> -perf_event_groups_delete(struct perf_event_groups *groups,
> +perf_event_groups_delete(struct rb_root *groups,
> struct perf_event *event);
>
> /*
> @@ -1490,7 +1490,7 @@ perf_event_groups_delete(struct perf_event_groups *groups,
> static void
> add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
> {
> - struct perf_event_groups *groups;
> + struct rb_root *groups;
>
> groups = get_event_groups(event, ctx);
> perf_event_groups_insert(groups, event);
> @@ -1502,48 +1502,28 @@ add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
> static void
> del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
> {
> - struct perf_event_groups *groups;
> + struct rb_root *groups;
>
> groups = get_event_groups(event, ctx);
> perf_event_groups_delete(groups, event);
> }
>
> /*
> - * Helper function to test if event groups are empty;
> - */
> -static int
> -perf_event_groups_empty(struct perf_event_groups *groups)
> -{
> - return list_empty(&groups->list);
> -}
> -
> -/*
> - * Helper function to Initialize event groups object;
> - */
> -static void
> -perf_event_groups_init(struct perf_event_groups *groups)
> -{
> - INIT_LIST_HEAD(&groups->list);
> - groups->tree = RB_ROOT;
> -}
> -
> -/*
> * Insert a group into a tree using event->cpu as a key. If event->cpu node
> * is already attached to the tree then the event is added to the attached
> * group's group_list list.
> */
> static void
> -perf_event_groups_insert(struct perf_event_groups *groups,
> - struct perf_event *event)
> +perf_event_groups_insert(struct rb_root *groups, struct perf_event *event)
> {
> struct rb_node **node;
> struct rb_node *parent;
> struct perf_event *node_event;
>
> WARN_ON_ONCE(!groups || !event);
> - WARN_ON_ONCE(!list_empty(&event->group_list_entry));
> + WARN_ON_ONCE(!list_empty(&event->group_entry));
>
> - node = &groups->tree.rb_node;
> + node = &groups->rb_node;
> parent = *node;
>
> while (*node) {
> @@ -1556,16 +1536,16 @@ perf_event_groups_insert(struct perf_event_groups *groups,
> } else if (event->cpu > node_event->cpu) {
> node = &parent->rb_right;
> } else {
> - list_add_tail(&event->group_list_entry,
> + list_add_tail(&event->group_entry,
> &node_event->group_list);
> return;
> }
> }
>
> - list_add_tail(&event->group_list_entry, &event->group_list);
> + list_add_tail(&event->group_entry, &event->group_list);
>
> rb_link_node(&event->group_node, parent, node);
> - rb_insert_color(&event->group_node, &groups->tree);
> + rb_insert_color(&event->group_node, groups);
> }
>
> /*
> @@ -1573,30 +1553,28 @@ perf_event_groups_insert(struct perf_event_groups *groups,
> * it also detaches all groups on the group's group_list list.
> */
> static void
> -perf_event_groups_delete(struct perf_event_groups *groups,
> - struct perf_event *event)
> +perf_event_groups_delete(struct rb_root *groups, struct perf_event *event)
> {
> struct perf_event *next;
>
> WARN_ON_ONCE(!event);
> - WARN_ON_ONCE(list_empty(&event->group_list_entry));
> + WARN_ON_ONCE(list_empty(&event->group_entry));
>
> - list_del_init(&event->group_list_entry);
> + list_del_init(&event->group_entry);
>
> if (!RB_EMPTY_NODE(&event->group_node)) {
> WARN_ON_ONCE(!groups);
> - if (!RB_EMPTY_ROOT(&groups->tree)) {
> + if (!RB_EMPTY_ROOT(groups)) {
> if (list_empty(&event->group_list)) {
> - rb_erase(&event->group_node, &groups->tree);
> + rb_erase(&event->group_node, groups);
> } else {
> next = list_first_entry(&event->group_list,
> - struct perf_event, group_list_entry);
> + struct perf_event, group_entry);
> list_replace_init(&event->group_list,
> &next->group_list);
> rb_replace_node(&event->group_node,
> - &next->group_node, &groups->tree);
> + &next->group_node, groups);
> }
> -
> }
> RB_CLEAR_NODE(&event->group_node);
> }
> @@ -1606,14 +1584,14 @@ perf_event_groups_delete(struct perf_event_groups *groups,
> * Find group list by a cpu key and rotate it.
> */
> static void
> -perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
> +perf_event_groups_rotate(struct rb_root *groups, int cpu)
> {
> struct rb_node *node;
> struct perf_event *node_event;
>
> WARN_ON_ONCE(!groups);
>
> - node = groups->tree.rb_node;
> + node = groups->rb_node;
>
> while (node) {
> node_event = container_of(node,
> @@ -1638,7 +1616,7 @@ perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
> typedef int(*perf_event_groups_iterate_f)(struct perf_event *, void *);
>
> static void
> -perf_event_groups_iterate_cpu(struct perf_event_groups *groups, int cpu,
> +perf_event_groups_iterate_cpu(struct rb_root *groups, int cpu,
> perf_event_groups_iterate_f callback, void *data)
> {
> struct rb_node *node;
> @@ -1646,7 +1624,7 @@ perf_event_groups_iterate_cpu(struct perf_event_groups *groups, int cpu,
>
> WARN_ON_ONCE(!groups);
>
> - node = groups->tree.rb_node;
> + node = groups->rb_node;
>
> while (node) {
> node_event = container_of(node,
> @@ -1658,7 +1636,7 @@ perf_event_groups_iterate_cpu(struct perf_event_groups *groups, int cpu,
> node = node->rb_right;
> } else {
> list_for_each_entry(event, &node_event->group_list,
> - group_list_entry)
> + group_entry)
> callback(event, data);
> break;
> }
> @@ -1670,26 +1648,20 @@ perf_event_groups_iterate_cpu(struct perf_event_groups *groups, int cpu,
> * Iteration stops if the callback returns non zero.
> */
> static int
> -perf_event_groups_iterate(struct perf_event_groups *groups,
> +perf_event_groups_iterate(struct rb_root *groups,
> perf_event_groups_iterate_f callback, void *data)
> {
> int ret = 0;
> - struct perf_event *event;
> + struct rb_node *node;
>
> - WARN_ON_ONCE(!groups);
> + struct perf_event *node_event, *event;
>
> - list_for_each_entry(event, &groups->list, group_list_entry) {
> - ret = callback(event, data);
> - if (ret)
> - break;
> - }
> -
> - /* will replace itration above in patch v5 4/4
> + WARN_ON_ONCE(!groups);
>
> for (node = rb_first(groups); node; node = rb_next(node)) {
> node_event = container_of(node, struct perf_event, group_node);
> list_for_each_entry(event, &node_event->group_list,
> - group_list_entry) {
> + group_entry) {
> WARN_ON_ONCE(!(event->cpu == node_event->cpu));
> ret = callback(event, data);
> if (ret) {
> @@ -1698,8 +1670,6 @@ perf_event_groups_iterate(struct perf_event_groups *groups,
> }
> }
>
> - */
> -
> return ret;
> }
>
> @@ -2965,7 +2935,9 @@ static void ctx_sched_out(struct perf_event_context *ctx,
> .cpuctx = cpuctx,
> .ctx = ctx
> };
> +
> int cpu = smp_processor_id();
> +
> lockdep_assert_held(&ctx->lock);
>
> if (likely(!ctx->nr_events)) {
> @@ -3399,7 +3371,6 @@ ctx_sched_in(struct perf_event_context *ctx,
> .ctx = ctx
> };
> int cpu = smp_processor_id();
> -
> lockdep_assert_held(&ctx->lock);
>
> if (likely(!ctx->nr_events))
> @@ -3490,7 +3461,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
> * However, if task's ctx is not carrying any pinned
> * events, no need to flip the cpuctx's events around.
> */
> - if (!perf_event_groups_empty(&ctx->pinned_groups))
> + if (!RB_EMPTY_ROOT(&ctx->pinned_groups))
> cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
> perf_event_sched_in(cpuctx, ctx, task, mux);
> perf_pmu_enable(ctx->pmu);
> @@ -4057,8 +4028,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
> raw_spin_lock_init(&ctx->lock);
> mutex_init(&ctx->mutex);
> INIT_LIST_HEAD(&ctx->active_ctx_list);
> - perf_event_groups_init(&ctx->pinned_groups);
> - perf_event_groups_init(&ctx->flexible_groups);
> + ctx->pinned_groups = RB_ROOT;
> + ctx->flexible_groups = RB_ROOT;
> INIT_LIST_HEAD(&ctx->event_list);
> atomic_set(&ctx->refcount, 1);
> }
> @@ -9695,7 +9666,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
> INIT_LIST_HEAD(&event->sibling_list);
> RB_CLEAR_NODE(&event->group_node);
> INIT_LIST_HEAD(&event->group_list);
> - INIT_LIST_HEAD(&event->group_list_entry);
> INIT_LIST_HEAD(&event->rb_entry);
> INIT_LIST_HEAD(&event->active_entry);
> INIT_LIST_HEAD(&event->addr_filters.list);
>