Re: [PATCH v6 1/3] perf/core: use rb trees for pinned/flexible groups

From: Alexey Budankov
Date: Thu Aug 03 2017 - 16:30:23 EST


On 03.08.2017 16:00, Peter Zijlstra wrote:
> On Wed, Aug 02, 2017 at 11:13:54AM +0300, Alexey Budankov wrote:
>> This patch moves event groups into rb tree sorted by CPU, so that
>> multiplexing hrtimer interrupt handler would be able skipping to the current
>> CPU's list and ignore groups allocated for the other CPUs.
>>
>> New API for manipulating event groups in the trees is implemented as well
>> as adoption on the API in the current implementation.
>>
>> Because perf_event_groups_iterate() API provides capability to execute
>> a callback for every event group in a tree, adoption of the API introduces
>> some code that packs and unpacks arguments of functions existing in
>> the implementation as well as adjustments of their calling signatures
>> e.g. ctx_pinned_sched_in(), ctx_flexible_sched_in() and inherit_task_group().
>
> This does not speak of why we need group_list.
>
>> Signed-off-by: Alexey Budankov <alexey.budankov@xxxxxxxxxxxxxxx>
>> ---
>> include/linux/perf_event.h | 18 ++-
>> kernel/events/core.c | 389 +++++++++++++++++++++++++++++++++------------
>> 2 files changed, 306 insertions(+), 101 deletions(-)
>>
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index a3b873f..282f121 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -572,6 +572,20 @@ struct perf_event {
>> */
>> struct list_head group_entry;
>> struct list_head sibling_list;
>> + /*
>> + * Node on the pinned or flexible tree located at the event context;
>> + * the node may be empty in case its event is not directly attached
>> + * to the tree but to group_list list of the event directly
>> + * attached to the tree;
>> + */
>> + struct rb_node group_node;
>> + /*
>> + * List keeps groups allocated for the same cpu;
>> + * the list may be empty in case its event is not directly
>> + * attached to the tree but to group_list list of the event directly
>> + * attached to the tree;
>> + */
>> + struct list_head group_list;
>>
>> /*
>> * We need storage to track the entries in perf_pmu_migrate_context; we
>> @@ -741,8 +755,8 @@ struct perf_event_context {
>> struct mutex mutex;
>>
>> struct list_head active_ctx_list;
>> - struct list_head pinned_groups;
>> - struct list_head flexible_groups;
>> + struct rb_root pinned_groups;
>> + struct rb_root flexible_groups;
>> struct list_head event_list;
>> int nr_events;
>> int nr_active;
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 426c2ff..0a4f619 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -1466,8 +1466,12 @@ static enum event_type_t get_event_type(struct perf_event *event)
>> return event_type;
>> }
>>
>> -static struct list_head *
>> -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
>> +/*
>> + * Extract pinned or flexible groups from the context
>> + * based on event attrs bits;
>> + */
>> +static struct rb_root *
>> +get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
>> {
>> if (event->attr.pinned)
>> return &ctx->pinned_groups;
>> @@ -1475,6 +1479,160 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
>> return &ctx->flexible_groups;
>> }
>>
>> +static void
>> +perf_event_groups_insert(struct rb_root *groups,
>> + struct perf_event *event);
>> +
>> +static void
>> +perf_event_groups_delete(struct rb_root *groups,
>> + struct perf_event *event);
>
> Can't we do away with these fwd declarations by simple reordering of the
> function definitions?

Ok. I will clean this up.

>
>> +/*
>> + * Helper function to insert event into the pinned or
>> + * flexible groups;
>> + */
>> +static void
>> +add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
>> +{
>> + struct rb_root *groups;
>> +
>> + groups = get_event_groups(event, ctx);
>> + perf_event_groups_insert(groups, event);
>> +}
>> +
>> +/*
>> + * Helper function to delete event from its groups;
>> + */
>> +static void
>> +del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
>> +{
>> + struct rb_root *groups;
>> +
>> + groups = get_event_groups(event, ctx);
>> + perf_event_groups_delete(groups, event);
>> +}
>> +
>> +/*
>> + * Insert a group into a tree using event->cpu as a key. If event->cpu node
>> + * is already attached to the tree then the event is added to the attached
>> + * group's group_list list.
>> + */
>> +static void
>> +perf_event_groups_insert(struct rb_root *groups,
>> + struct perf_event *event)
>> +{
>> + struct rb_node **node;
>> + struct rb_node *parent;
>> + struct perf_event *node_event;
>> +
>> + node = &groups->rb_node;
>> + parent = *node;
>> +
>> + while (*node) {
>> + parent = *node;
>> + node_event = container_of(*node,
>> + struct perf_event, group_node);
>> +
>> + if (event->cpu < node_event->cpu) {
>> + node = &parent->rb_left;
>> + } else if (event->cpu > node_event->cpu) {
>> + node = &parent->rb_right;
>
> I would much prefer you use a comparator like:

Ok. I Will do.

>
> static always_inline int
> perf_event_less(struct perf_event *left, struct perf_event *right)
> {
> if (left->cpu < right_cpu)
> return 1;
>
> return 0;
> }
>
> That way we can add additional order. In specific ARM also wants things
> ordered on PMU for their big.LITTLE stuff.
>
>> + } else {
>> + list_add_tail(&event->group_entry,
>> + &node_event->group_list);
>> + return;
>
> Urgh, so this is what you want that list for... why not keep duplicates
> in the tree itself and iterate that?
>
>> + }
>> + }
>> +
>> + list_add_tail(&event->group_entry, &event->group_list);
>> +
>> + rb_link_node(&event->group_node, parent, node);
>> + rb_insert_color(&event->group_node, groups);
>> +}
>
>> +/*
>> + * Find group list by a cpu key and rotate it.
>> + */
>> +static void
>> +perf_event_groups_rotate(struct rb_root *groups, int cpu)
>> +{
>> + struct rb_node *node;
>> + struct perf_event *node_event;
>> +
>> + node = groups->rb_node;
>> +
>> + while (node) {
>> + node_event = container_of(node,
>> + struct perf_event, group_node);
>> +
>> + if (cpu < node_event->cpu) {
>> + node = node->rb_left;
>> + } else if (cpu > node_event->cpu) {
>> + node = node->rb_right;
>> + } else {
>> + list_rotate_left(&node_event->group_list);
>> + break;
>> + }
>> + }
>> +}
>
> Ah, you worry about how to rotate inside a tree?

Exactly.

>
> You can do that by adding (run)time based ordering, and you'll end up
> with a runtime based scheduler.

Do you mean replacing a CPU indexed rb_tree of lists with
an CPU indexed rb_tree of counter indexed rb_trees?

>
> A trivial variant keeps a simple counter per tree that is incremented
> for each rotation. That should end up with the events ordered exactly
> like the list. And if you have that comparator like above, expressing
> that additional ordering becomes simple ;-)
>
> Something like:
>
> struct group {
> u64 vtime;
> rb_tree tree;
> };
>
> bool event_less(left, right)
> {
> if (left->cpu < right->cpu)
> return true;
>
> if (left->cpu > right_cpu)
> return false;
>
> if (left->vtime < right->vtime)
> return true;
>
> return false;
> }
>
> insert_group(group, event, tail)
> {
> if (tail)
> event->vtime = ++group->vtime;
>
> tree_insert(&group->root, event);
> }
>
> Then every time you use insert_group(.tail=1) it goes to the end of that
> CPU's 'list'.
>

Could you elaborate more on how to implement rotation?

Do you mean the rotation of rb_tree? So that the iteration order
of the counter indexed rb_tree would coincide with iteration
order of a list after rotation?

And I then still need struct rb_tree group_tree in perf_event structure:

+ /*
+ * Node on the pinned or flexible tree located at the event context;
+ * the node may be empty in case its event is not directly attached
+ * to the tree but to group_list list of the event directly
+ * attached to the tree;
+ */
+ struct rb_node group_node;
+ /*
+ * List keeps groups allocated for the same cpu;
+ * the list may be empty in case its event is not directly
+ * attached to the tree but to group_list list of the event directly
+ * attached to the tree;
+ */
+ struct rb_tree group_tree;

>
> The added benefit is that it then becomes fairly simple to improve upon
> the RR scheduling, which suffers a bunch of boundary conditions where
> the task runtimes mis-align with the rotation window.
>
>> +typedef int(*perf_event_groups_iterate_f)(struct perf_event *, void *);
>
> We already have perf_iterate_f, the only difference appears to be that
> this has a return value. Surely these can be unified.

Ok. I will unify at some point.

>
>> +/*
>> + * Iterate event groups and call provided callback for every group in the tree.
>> + * Iteration stops if the callback returns non zero.
>> + */
>> +static int
>> +perf_event_groups_iterate(struct rb_root *groups,
>> + perf_event_groups_iterate_f callback, void *data)
>> +{
>> + int ret = 0;
>> + struct rb_node *node;
>> + struct perf_event *node_event, *event;
>
> In general we prefer variable definitions to be ordered on line length,
> longest first. So the exact opposite of what you have here.

Accepted.

>
>> +
>> + for (node = rb_first(groups); node; node = rb_next(node)) {
>> + node_event = container_of(node, struct perf_event, group_node);
>> + list_for_each_entry(event, &node_event->group_list,
>> + group_entry) {
>> + ret = callback(event, data);
>> + if (ret) {
>> + return ret;
>> + }
>> + }
>> + }
>> +
>> + return ret;
>> +}
>> +
>> /*
>> * Add a event from the lists for its context.
>> * Must be called with ctx->mutex and ctx->lock held.
>
>> @@ -1869,6 +2023,22 @@ group_sched_out(struct perf_event *group_event,
>> cpuctx->exclusive = 0;
>> }
>>
>> +struct group_sched_params {
>> + struct perf_cpu_context *cpuctx;
>> + struct perf_event_context *ctx;
>> + int can_add_hw;
>> +};
>> +
>> +static int
>> +group_sched_out_callback(struct perf_event *event, void *data)
>> +{
>> + struct group_sched_params *params = data;
>> +
>> + group_sched_out(event, params->cpuctx, params->ctx);
>> +
>> + return 0;
>> +}
>
> Right, C sucks.. or possibly you've chosen the wrong pattern.

Consequences of new API integration with a callback in signature :)

>
> So the alternative is something like:
>
> #define for_each_group_event(event, group, cpu, pmu, field) \
> for (event = rb_entry_safe(group_first(group, cpu, pmu),\
> typeof(*event), field); \
> event && event->cpu == cpu && event->pmu == pmu; \
> event = rb_entry_safe(rb_next(&event->field), \
> typeof(*event), field))
>
>
> And then you can write things like:
>
> for_each_group_event(event, group, cpu, pmu)
> group_sched_out(event, cpuctx, ctx);
>
>
>> +
>> #define DETACH_GROUP 0x01UL
>>
>> /*
>> @@ -2712,7 +2882,10 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>> enum event_type_t event_type)
>> {
>> int is_active = ctx->is_active;
>> - struct perf_event *event;
>> + struct group_sched_params params = {
>> + .cpuctx = cpuctx,
>> + .ctx = ctx
>> + };
>>
>> lockdep_assert_held(&ctx->lock);
>>
>> @@ -2759,13 +2932,13 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>>
>> perf_pmu_disable(ctx->pmu);
>> if (is_active & EVENT_PINNED) {
>> - list_for_each_entry(event, &ctx->pinned_groups, group_entry)
>> - group_sched_out(event, cpuctx, ctx);
>> + perf_event_groups_iterate(&ctx->pinned_groups,
>> + group_sched_out_callback, &params);
>
> So here I would expect to not iterate events where event->cpu !=
> smp_processor_id() (and ideally not where event->pmu != ctx->pmu).
>

We still need to iterate thru all groups on thread context switch in
and out as well as iterate thru cpu == -1 list (software events) additionally
to smp_processor_id() list from multiplexing timer interrupt handler.

>> }
>>
>> if (is_active & EVENT_FLEXIBLE) {
>> - list_for_each_entry(event, &ctx->flexible_groups, group_entry)
>> - group_sched_out(event, cpuctx, ctx);
>> + perf_event_groups_iterate(&ctx->flexible_groups,
>> + group_sched_out_callback, &params);
>
> Idem.
>
>> }
>> perf_pmu_enable(ctx->pmu);
>> }
>
>
> I think the rest of the patch is just plumbing to make the above useful.
> Let me know if I missed something of value.
>