[PATCH v5 4/4]: perf/core: complete the replacement of lists with rb trees for pinned and flexible groups in perf_event_context

From: Alexey Budankov
Date: Mon Jul 10 2017 - 09:04:26 EST


perf/core: complete the replacement of lists with rb trees for pinned
and flexible groups in perf_event_context

By default, the userspace perf tool opens per-cpu task-bound events
when sampling, so for N logical events requested by the user, the tool
will open N * NR_CPUS events.

In the kernel, we mux events with a hrtimer, periodically rotating the
flexible group list and trying to schedule each group in turn. We skip
groups whose cpu filter doesn't match. So when we get unlucky, we can
walk N * (NR_CPUS - 1) groups pointlessly for each hrtimer invocation.

This has been observed to result in significant overhead when running
the STREAM benchmark on 272-core Xeon Phi systems.

One way to avoid this is to place our events into an rb tree sorted by
CPU filter, so that our hrtimer can skip to the current CPU's
list and ignore everything else.

This patch completes the replacement of lists with rb trees for
pinned and flexible groups.

The patch set was tested on Xeon Phi using perf_fuzzer and tests
from here: https://github.com/deater/perf_event_tests

The full patch set (v5, patches 1-4) is attached for convenience.

Branch revision:
* perf/core 007b811b4041989ec2dc91b9614aa2c41332723e
Merge tag 'perf-core-for-mingo-4.13-20170719' of
git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core

Signed-off-by: Alexey Budankov <alexey.budankov@xxxxxxxxxxxxxxx>
---
include/linux/perf_event.h | 20 +---------
kernel/events/core.c | 94 ++++++++++++++++------------------------------
2 files changed, 34 insertions(+), 80 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7b2cddf..8e1967f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -603,13 +603,6 @@ struct perf_event {
*/
struct list_head group_list;
/*
- * Entry into the group_list list above;
- * the entry may be attached to the self group_list list above
- * in case the event is directly attached to the tree;
- */
- struct list_head group_list_entry;
-
- /*
* We need storage to track the entries in perf_pmu_migrate_context; we
* cannot use the event_entry because of RCU and we want to keep the
* group in tact which avoids us using the other two entries.
@@ -749,15 +742,6 @@ struct perf_event {
#endif /* CONFIG_PERF_EVENTS */
};

-/*
- * event groups keep group leader events arranged as an rb tree with
- * event->cpu key and as a list for the whole tree iterations;
- */
-struct perf_event_groups {
- struct list_head list;
- struct rb_root tree;
-};
-
/**
* struct perf_event_context - event context structure
*
@@ -778,8 +762,8 @@ struct perf_event_context {
struct mutex mutex;

struct list_head active_ctx_list;
- struct perf_event_groups pinned_groups;
- struct perf_event_groups flexible_groups;
+ struct rb_root pinned_groups;
+ struct rb_root flexible_groups;
struct list_head event_list;
int nr_events;
int nr_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bddcb87..5142434 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1466,7 +1466,7 @@ static enum event_type_t get_event_type(struct perf_event *event)
* Extract pinned or flexible groups from the context
* based on event attrs bits;
*/
-static struct perf_event_groups *
+static struct rb_root *
get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
{
if (event->attr.pinned)
@@ -1476,11 +1476,11 @@ get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
}

static void
-perf_event_groups_insert(struct perf_event_groups *groups,
+perf_event_groups_insert(struct rb_root *groups,
struct perf_event *event);

static void
-perf_event_groups_delete(struct perf_event_groups *groups,
+perf_event_groups_delete(struct rb_root *groups,
struct perf_event *event);

/*
@@ -1490,7 +1490,7 @@ perf_event_groups_delete(struct perf_event_groups *groups,
static void
add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
{
- struct perf_event_groups *groups;
+ struct rb_root *groups;

groups = get_event_groups(event, ctx);
perf_event_groups_insert(groups, event);
@@ -1502,48 +1502,28 @@ add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
static void
del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
{
- struct perf_event_groups *groups;
+ struct rb_root *groups;

groups = get_event_groups(event, ctx);
perf_event_groups_delete(groups, event);
}

/*
- * Helper function to test if event groups are empty;
- */
-static int
-perf_event_groups_empty(struct perf_event_groups *groups)
-{
- return list_empty(&groups->list);
-}
-
-/*
- * Helper function to Initialize event groups object;
- */
-static void
-perf_event_groups_init(struct perf_event_groups *groups)
-{
- INIT_LIST_HEAD(&groups->list);
- groups->tree = RB_ROOT;
-}
-
-/*
* Insert a group into a tree using event->cpu as a key. If event->cpu node
* is already attached to the tree then the event is added to the attached
* group's group_list list.
*/
static void
-perf_event_groups_insert(struct perf_event_groups *groups,
- struct perf_event *event)
+perf_event_groups_insert(struct rb_root *groups, struct perf_event *event)
{
struct rb_node **node;
struct rb_node *parent;
struct perf_event *node_event;

WARN_ON_ONCE(!groups || !event);
- WARN_ON_ONCE(!list_empty(&event->group_list_entry));
+ WARN_ON_ONCE(!list_empty(&event->group_entry));

- node = &groups->tree.rb_node;
+ node = &groups->rb_node;
parent = *node;

while (*node) {
@@ -1556,16 +1536,16 @@ perf_event_groups_insert(struct perf_event_groups *groups,
} else if (event->cpu > node_event->cpu) {
node = &parent->rb_right;
} else {
- list_add_tail(&event->group_list_entry,
+ list_add_tail(&event->group_entry,
&node_event->group_list);
return;
}
}

- list_add_tail(&event->group_list_entry, &event->group_list);
+ list_add_tail(&event->group_entry, &event->group_list);

rb_link_node(&event->group_node, parent, node);
- rb_insert_color(&event->group_node, &groups->tree);
+ rb_insert_color(&event->group_node, groups);
}

/*
@@ -1573,30 +1553,28 @@ perf_event_groups_insert(struct perf_event_groups *groups,
* it also detaches all groups on the group's group_list list.
*/
static void
-perf_event_groups_delete(struct perf_event_groups *groups,
- struct perf_event *event)
+perf_event_groups_delete(struct rb_root *groups, struct perf_event *event)
{
struct perf_event *next;

WARN_ON_ONCE(!event);
- WARN_ON_ONCE(list_empty(&event->group_list_entry));
+ WARN_ON_ONCE(list_empty(&event->group_entry));

- list_del_init(&event->group_list_entry);
+ list_del_init(&event->group_entry);

if (!RB_EMPTY_NODE(&event->group_node)) {
WARN_ON_ONCE(!groups);
- if (!RB_EMPTY_ROOT(&groups->tree)) {
+ if (!RB_EMPTY_ROOT(groups)) {
if (list_empty(&event->group_list)) {
rb_erase(&event->group_node, &groups->tree);
} else {
next = list_first_entry(&event->group_list,
- struct perf_event, group_list_entry);
+ struct perf_event, group_entry);
list_replace_init(&event->group_list,
&next->group_list);
rb_replace_node(&event->group_node,
- &next->group_node, &groups->tree);
+ &next->group_node, groups);
}
-
}
RB_CLEAR_NODE(&event->group_node);
}
@@ -1606,14 +1584,14 @@ perf_event_groups_delete(struct perf_event_groups *groups,
* Find group list by a cpu key and rotate it.
*/
static void
-perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
+perf_event_groups_rotate(struct rb_root *groups, int cpu)
{
struct rb_node *node;
struct perf_event *node_event;

WARN_ON_ONCE(!groups);

- node = groups->tree.rb_node;
+ node = groups->rb_node;

while (node) {
node_event = container_of(node,
@@ -1638,7 +1616,7 @@ perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
typedef int(*perf_event_groups_iterate_f)(struct perf_event *, void *);

static void
-perf_event_groups_iterate_cpu(struct perf_event_groups *groups, int cpu,
+perf_event_groups_iterate_cpu(struct rb_root *groups, int cpu,
perf_event_groups_iterate_f callback, void *data)
{
struct rb_node *node;
@@ -1646,7 +1624,7 @@ perf_event_groups_iterate_cpu(struct perf_event_groups *groups, int cpu,

WARN_ON_ONCE(!groups);

- node = groups->tree.rb_node;
+ node = groups->rb_node;

while (node) {
node_event = container_of(node,
@@ -1658,7 +1636,7 @@ perf_event_groups_iterate_cpu(struct perf_event_groups *groups, int cpu,
node = node->rb_right;
} else {
list_for_each_entry(event, &node_event->group_list,
- group_list_entry)
+ group_entry)
callback(event, data);
break;
}
@@ -1670,26 +1648,20 @@ perf_event_groups_iterate_cpu(struct perf_event_groups *groups, int cpu,
* Iteration stops if the callback returns non zero.
*/
static int
-perf_event_groups_iterate(struct perf_event_groups *groups,
+perf_event_groups_iterate(struct rb_root *groups,
perf_event_groups_iterate_f callback, void *data)
{
int ret = 0;
- struct perf_event *event;
+ struct rb_node *node;

- WARN_ON_ONCE(!groups);
+ struct perf_event *node_event, *event;

- list_for_each_entry(event, &groups->list, group_list_entry) {
- ret = callback(event, data);
- if (ret)
- break;
- }
-
- /* will replace itration above in patch v5 4/4
+ WARN_ON_ONCE(!groups);

for (node = rb_first(groups); node; node = rb_next(node)) {
node_event = container_of(node, struct perf_event, group_node);
list_for_each_entry(event, &node_event->group_list,
- group_list_entry) {
+ group_entry) {
WARN_ON_ONCE(!(event->cpu == node_event->cpu));
ret = callback(event, data);
if (ret) {
@@ -1698,8 +1670,6 @@ perf_event_groups_iterate(struct perf_event_groups *groups,
}
}

- */
-
return ret;
}

@@ -2965,7 +2935,9 @@ static void ctx_sched_out(struct perf_event_context *ctx,
.cpuctx = cpuctx,
.ctx = ctx
};
+
int cpu = smp_processor_id();
+
lockdep_assert_held(&ctx->lock);

if (likely(!ctx->nr_events)) {
@@ -3399,7 +3371,6 @@ ctx_sched_in(struct perf_event_context *ctx,
.ctx = ctx
};
int cpu = smp_processor_id();
-
lockdep_assert_held(&ctx->lock);

if (likely(!ctx->nr_events))
@@ -3490,7 +3461,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* However, if task's ctx is not carrying any pinned
* events, no need to flip the cpuctx's events around.
*/
- if (!perf_event_groups_empty(&ctx->pinned_groups))
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups))
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
perf_event_sched_in(cpuctx, ctx, task, mux);
perf_pmu_enable(ctx->pmu);
@@ -4057,8 +4028,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
INIT_LIST_HEAD(&ctx->active_ctx_list);
- perf_event_groups_init(&ctx->pinned_groups);
- perf_event_groups_init(&ctx->flexible_groups);
+ ctx->pinned_groups = RB_ROOT;
+ ctx->flexible_groups = RB_ROOT;
INIT_LIST_HEAD(&ctx->event_list);
atomic_set(&ctx->refcount, 1);
}
@@ -9695,7 +9666,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->sibling_list);
RB_CLEAR_NODE(&event->group_node);
INIT_LIST_HEAD(&event->group_list);
- INIT_LIST_HEAD(&event->group_list_entry);
INIT_LIST_HEAD(&event->rb_entry);
INIT_LIST_HEAD(&event->active_entry);
INIT_LIST_HEAD(&event->addr_filters.list);
include/linux/perf_event.h | 55 +++--
kernel/events/core.c | 604 +++++++++++++++++++++++++++++++++------------
2 files changed, 491 insertions(+), 168 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24a6358..8e1967f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -550,6 +550,22 @@ struct pmu_event_list {
struct list_head list;
};

+struct perf_event_tstamp {
+ /*
+ * These are timestamps used for computing total_time_enabled
+ * and total_time_running when the event is in INACTIVE or
+ * ACTIVE state, measured in nanoseconds from an arbitrary point
+ * in time.
+ * enabled: the notional time when the event was enabled
+ * running: the notional time when the event was scheduled on
+ * stopped: in INACTIVE state, the notional time when the
+ * event was scheduled off.
+ */
+ u64 enabled;
+ u64 running;
+ u64 stopped;
+};
+
/**
* struct perf_event - performance event kernel representation:
*/
@@ -572,7 +588,20 @@ struct perf_event {
*/
struct list_head group_entry;
struct list_head sibling_list;
-
+ /*
+ * Node on the pinned or flexible tree located at the event context;
+ * the node may be empty in case its event is not directly attached
+ * to the tree but to group_list list of the event directly
+ * attached to the tree;
+ */
+ struct rb_node group_node;
+ /*
+ * List keeps groups allocated for the same cpu;
+ * the list may be empty in case its event is not directly
+ * attached to the tree but to group_list list of the event directly
+ * attached to the tree;
+ */
+ struct list_head group_list;
/*
* We need storage to track the entries in perf_pmu_migrate_context; we
* cannot use the event_entry because of RCU and we want to keep the
@@ -611,19 +640,11 @@ struct perf_event {
u64 total_time_running;

/*
- * These are timestamps used for computing total_time_enabled
- * and total_time_running when the event is in INACTIVE or
- * ACTIVE state, measured in nanoseconds from an arbitrary point
- * in time.
- * tstamp_enabled: the notional time when the event was enabled
- * tstamp_running: the notional time when the event was scheduled on
- * tstamp_stopped: in INACTIVE state, the notional time when the
- * event was scheduled off.
+ * tstamp points to the tstamp_data object below or to the object
+ * located at the event context;
*/
- u64 tstamp_enabled;
- u64 tstamp_running;
- u64 tstamp_stopped;
-
+ struct perf_event_tstamp *tstamp;
+ struct perf_event_tstamp tstamp_data;
/*
* timestamp shadows the actual context timing but it can
* be safely used in NMI interrupt context. It reflects the
@@ -741,8 +762,8 @@ struct perf_event_context {
struct mutex mutex;

struct list_head active_ctx_list;
- struct list_head pinned_groups;
- struct list_head flexible_groups;
+ struct rb_root pinned_groups;
+ struct rb_root flexible_groups;
struct list_head event_list;
int nr_events;
int nr_active;
@@ -758,6 +779,10 @@ struct perf_event_context {
*/
u64 time;
u64 timestamp;
+ /*
+ * Context cache for filtered out events;
+ */
+ struct perf_event_tstamp tstamp_data;

/*
* These fields let us detect when two contexts have both
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bc63f8d..2d02f75 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -555,11 +555,11 @@ void perf_sample_event_took(u64 sample_len_ns)
static atomic64_t perf_event_id;

static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
+ enum event_type_t event_type, int mux);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
- struct task_struct *task);
+ struct task_struct *task, int mux);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
@@ -701,6 +701,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
struct perf_cpu_context *cpuctx;
struct list_head *list;
unsigned long flags;
+ int mux = 0;

/*
* Disable interrupts and preemption to avoid this CPU's
@@ -716,7 +717,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
perf_pmu_disable(cpuctx->ctx.pmu);

if (mode & PERF_CGROUP_SWOUT) {
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL, mux);
/*
* must not be done before ctxswout due
* to event_filter_match() in event_sched_out()
@@ -735,7 +736,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
*/
cpuctx->cgrp = perf_cgroup_from_task(task,
&cpuctx->ctx);
- cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+ cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, mux);
}
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -865,10 +866,10 @@ perf_cgroup_mark_enabled(struct perf_event *event,

event->cgrp_defer_enabled = 0;

- event->tstamp_enabled = tstamp - event->total_time_enabled;
+ event->tstamp->enabled = tstamp - event->total_time_enabled;
list_for_each_entry(sub, &event->sibling_list, group_entry) {
if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
- sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ sub->tstamp->enabled = tstamp - sub->total_time_enabled;
sub->cgrp_defer_enabled = 0;
}
}
@@ -1378,6 +1379,9 @@ static void update_context_time(struct perf_event_context *ctx)

ctx->time += now - ctx->timestamp;
ctx->timestamp = now;
+
+ ctx->tstamp_data.running += ctx->time - ctx->tstamp_data.stopped;
+ ctx->tstamp_data.stopped = ctx->time;
}

static u64 perf_event_time(struct perf_event *event)
@@ -1419,16 +1423,16 @@ static void update_event_times(struct perf_event *event)
else if (ctx->is_active)
run_end = ctx->time;
else
- run_end = event->tstamp_stopped;
+ run_end = event->tstamp->stopped;

- event->total_time_enabled = run_end - event->tstamp_enabled;
+ event->total_time_enabled = run_end - event->tstamp->enabled;

if (event->state == PERF_EVENT_STATE_INACTIVE)
- run_end = event->tstamp_stopped;
+ run_end = event->tstamp->stopped;
else
run_end = perf_event_time(event);

- event->total_time_running = run_end - event->tstamp_running;
+ event->total_time_running = run_end - event->tstamp->running;

}

@@ -1458,8 +1462,12 @@ static enum event_type_t get_event_type(struct perf_event *event)
return event_type;
}

-static struct list_head *
-ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+/*
+ * Extract pinned or flexible groups from the context
+ * based on event attrs bits;
+ */
+static struct rb_root *
+get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
{
if (event->attr.pinned)
return &ctx->pinned_groups;
@@ -1467,6 +1475,204 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
return &ctx->flexible_groups;
}

+static void
+perf_event_groups_insert(struct rb_root *groups,
+ struct perf_event *event);
+
+static void
+perf_event_groups_delete(struct rb_root *groups,
+ struct perf_event *event);
+
+/*
+ * Helper function to insert event into the pinned or
+ * flexible groups;
+ */
+static void
+add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct rb_root *groups;
+
+ groups = get_event_groups(event, ctx);
+ perf_event_groups_insert(groups, event);
+}
+
+/*
+ * Helper function to delete event from its groups;
+ */
+static void
+del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct rb_root *groups;
+
+ groups = get_event_groups(event, ctx);
+ perf_event_groups_delete(groups, event);
+}
+
+/*
+ * Insert a group into a tree using event->cpu as a key. If event->cpu node
+ * is already attached to the tree then the event is added to the attached
+ * group's group_list list.
+ */
+static void
+perf_event_groups_insert(struct rb_root *groups, struct perf_event *event)
+{
+ struct rb_node **node;
+ struct rb_node *parent;
+ struct perf_event *node_event;
+
+ WARN_ON_ONCE(!groups || !event);
+ WARN_ON_ONCE(!list_empty(&event->group_entry));
+
+ node = &groups->rb_node;
+ parent = *node;
+
+ while (*node) {
+ parent = *node;
+ node_event = container_of(*node,
+ struct perf_event, group_node);
+
+ if (event->cpu < node_event->cpu) {
+ node = &parent->rb_left;
+ } else if (event->cpu > node_event->cpu) {
+ node = &parent->rb_right;
+ } else {
+ list_add_tail(&event->group_entry,
+ &node_event->group_list);
+ return;
+ }
+ }
+
+ list_add_tail(&event->group_entry, &event->group_list);
+
+ rb_link_node(&event->group_node, parent, node);
+ rb_insert_color(&event->group_node, groups);
+}
+
+/*
+ * Delete a group from a tree. If the group is directly attached to the tree
+ * it also detaches all groups on the group's group_list list.
+ */
+static void
+perf_event_groups_delete(struct rb_root *groups, struct perf_event *event)
+{
+ struct perf_event *next;
+
+ WARN_ON_ONCE(!event);
+ WARN_ON_ONCE(list_empty(&event->group_entry));
+
+ list_del_init(&event->group_entry);
+
+ if (!RB_EMPTY_NODE(&event->group_node)) {
+ WARN_ON_ONCE(!groups);
+ if (!RB_EMPTY_ROOT(groups)) {
+ if (list_empty(&event->group_list)) {
+ rb_erase(&event->group_node, groups);
+ } else {
+ next = list_first_entry(&event->group_list,
+ struct perf_event, group_entry);
+ list_replace_init(&event->group_list,
+ &next->group_list);
+ rb_replace_node(&event->group_node,
+ &next->group_node, groups);
+
+ }
+ }
+ RB_CLEAR_NODE(&event->group_node);
+ }
+}
+
+/*
+ * Find group list by a cpu key and rotate it.
+ */
+static void
+perf_event_groups_rotate(struct rb_root *groups, int cpu)
+{
+ struct rb_node *node;
+ struct perf_event *node_event;
+
+ WARN_ON_ONCE(!groups);
+
+ node = groups->rb_node;
+
+ while (node) {
+ node_event = container_of(node,
+ struct perf_event, group_node);
+
+ if (cpu < node_event->cpu) {
+ node = node->rb_left;
+ } else if (cpu > node_event->cpu) {
+ node = node->rb_right;
+ } else {
+ list_rotate_left(&node_event->group_list);
+ break;
+ }
+ }
+}
+
+/*
+ * Find group_list list by a cpu key and call provided callback for every
+ * group on the list.
+ */
+
+typedef int(*perf_event_groups_iterate_f)(struct perf_event *, void *);
+
+static void
+perf_event_groups_iterate_cpu(struct rb_root *groups, int cpu,
+ perf_event_groups_iterate_f callback, void *data)
+{
+ struct rb_node *node;
+ struct perf_event *event, *node_event;
+
+ WARN_ON_ONCE(!groups);
+
+ node = groups->rb_node;
+
+ while (node) {
+ node_event = container_of(node,
+ struct perf_event, group_node);
+
+ if (cpu < node_event->cpu) {
+ node = node->rb_left;
+ } else if (cpu > node_event->cpu) {
+ node = node->rb_right;
+ } else {
+ list_for_each_entry(event, &node_event->group_list,
+ group_entry)
+ callback(event, data);
+ break;
+ }
+ }
+}
+
+/*
+ * Iterate event groups and call provided callback for every group in the tree.
+ * Iteration stops if the callback returns non zero.
+ */
+static int
+perf_event_groups_iterate(struct rb_root *groups,
+ perf_event_groups_iterate_f callback, void *data)
+{
+ int ret = 0;
+ struct rb_node *node;
+ struct perf_event *event, *node_event;
+
+ WARN_ON_ONCE(!groups);
+
+ for (node = rb_first(groups); node; node = rb_next(node)) {
+ node_event = container_of(node, struct perf_event, group_node);
+ list_for_each_entry(event, &node_event->group_list,
+ group_entry) {
+ WARN_ON_ONCE(!(event->cpu == node_event->cpu));
+ ret = callback(event, data);
+ if (ret) {
+ return ret;
+ }
+ }
+ }
+
+ return ret;
+}
+
/*
* Add a event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
@@ -1485,12 +1691,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
* perf_group_detach can, at all times, locate all siblings.
*/
if (event->group_leader == event) {
- struct list_head *list;
-
event->group_caps = event->event_caps;
-
- list = ctx_group_list(event, ctx);
- list_add_tail(&event->group_entry, list);
+ add_event_to_groups(event, ctx);
}

list_update_cgroup_event(event, ctx, true);
@@ -1681,7 +1883,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
list_del_rcu(&event->event_entry);

if (event->group_leader == event)
- list_del_init(&event->group_entry);
+ del_event_from_groups(event, ctx);

update_group_times(event);

@@ -1701,7 +1903,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
static void perf_group_detach(struct perf_event *event)
{
struct perf_event *sibling, *tmp;
- struct list_head *list = NULL;

lockdep_assert_held(&event->ctx->lock);

@@ -1722,22 +1923,23 @@ static void perf_group_detach(struct perf_event *event)
goto out;
}

- if (!list_empty(&event->group_entry))
- list = &event->group_entry;
-
/*
* If this was a group event with sibling events then
* upgrade the siblings to singleton events by adding them
* to whatever list we are on.
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
- if (list)
- list_move_tail(&sibling->group_entry, list);
+
sibling->group_leader = sibling;

/* Inherit group flags from the previous leader */
sibling->group_caps = event->group_caps;

+ if (!list_empty(&event->group_entry)) {
+ list_del_init(&sibling->group_entry);
+ add_event_to_groups(sibling, event->ctx);
+ }
+
WARN_ON_ONCE(sibling->ctx != event->ctx);
}

@@ -1806,9 +2008,13 @@ event_sched_out(struct perf_event *event,
*/
if (event->state == PERF_EVENT_STATE_INACTIVE &&
!event_filter_match(event)) {
- delta = tstamp - event->tstamp_stopped;
- event->tstamp_running += delta;
- event->tstamp_stopped = tstamp;
+ delta = tstamp - event->tstamp->stopped;
+ event->tstamp->running += delta;
+ event->tstamp->stopped = tstamp;
+ if (event->tstamp != &event->tstamp_data) {
+ event->tstamp_data = *event->tstamp;
+ event->tstamp = &event->tstamp_data;
+ }
}

if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -1816,7 +2022,7 @@ event_sched_out(struct perf_event *event,

perf_pmu_disable(event->pmu);

- event->tstamp_stopped = tstamp;
+ event->tstamp->stopped = tstamp;
event->pmu->del(event, 0);
event->oncpu = -1;
event->state = PERF_EVENT_STATE_INACTIVE;
@@ -1861,6 +2067,22 @@ group_sched_out(struct perf_event *group_event,
cpuctx->exclusive = 0;
}

+struct group_sched_params {
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+ int can_add_hw;
+};
+
+static int
+group_sched_out_callback(struct perf_event *event, void *data)
+{
+ struct group_sched_params *params = data;
+
+ group_sched_out(event, params->cpuctx, params->ctx);
+
+ return 0;
+}
+
#define DETACH_GROUP 0x01UL

/*
@@ -2091,7 +2313,7 @@ event_sched_in(struct perf_event *event,
goto out;
}

- event->tstamp_running += tstamp - event->tstamp_stopped;
+ event->tstamp->running += tstamp - event->tstamp->stopped;

if (!is_software_event(event))
cpuctx->active_oncpu++;
@@ -2163,8 +2385,8 @@ group_sched_in(struct perf_event *group_event,
simulate = true;

if (simulate) {
- event->tstamp_running += now - event->tstamp_stopped;
- event->tstamp_stopped = now;
+ event->tstamp->running += now - event->tstamp->stopped;
+ event->tstamp->stopped = now;
} else {
event_sched_out(event, cpuctx, ctx);
}
@@ -2216,43 +2438,45 @@ static void add_event_to_ctx(struct perf_event *event,

list_add_event(event, ctx);
perf_group_attach(event);
- event->tstamp_enabled = tstamp;
- event->tstamp_running = tstamp;
- event->tstamp_stopped = tstamp;
+ event->tstamp->enabled = tstamp;
+ event->tstamp->running = tstamp;
+ event->tstamp->stopped = tstamp;
}

static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
+ enum event_type_t event_type, int mux);
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
- struct task_struct *task);
+ struct task_struct *task, int mux);

static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
enum event_type_t event_type)
{
+ int mux = 0;
+
if (!cpuctx->task_ctx)
return;

if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;

- ctx_sched_out(ctx, cpuctx, event_type);
+ ctx_sched_out(ctx, cpuctx, event_type, mux);
}

static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
- struct task_struct *task)
+ struct task_struct *task, int mux)
{
- cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+ cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task, mux);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, mux);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, mux);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, mux);
}

/*
@@ -2276,6 +2500,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
{
enum event_type_t ctx_event_type = event_type & EVENT_ALL;
bool cpu_event = !!(event_type & EVENT_CPU);
+ int mux = 0;

/*
* If pinned groups are involved, flexible groups also need to be
@@ -2296,11 +2521,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
* - otherwise, do nothing more.
*/
if (cpu_event)
- cpu_ctx_sched_out(cpuctx, ctx_event_type);
+ cpu_ctx_sched_out(cpuctx, ctx_event_type, mux);
else if (ctx_event_type & EVENT_PINNED)
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);

- perf_event_sched_in(cpuctx, task_ctx, current);
+ perf_event_sched_in(cpuctx, task_ctx, current, mux);
perf_pmu_enable(cpuctx->ctx.pmu);
}

@@ -2318,6 +2543,7 @@ static int __perf_install_in_context(void *info)
struct perf_event_context *task_ctx = cpuctx->task_ctx;
bool reprogram = true;
int ret = 0;
+ int mux = 0;

raw_spin_lock(&cpuctx->ctx.lock);
if (ctx->task) {
@@ -2344,7 +2570,7 @@ static int __perf_install_in_context(void *info)
}

if (reprogram) {
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx, get_event_type(event));
} else {
@@ -2463,10 +2689,10 @@ static void __perf_event_mark_enabled(struct perf_event *event)
u64 tstamp = perf_event_time(event);

event->state = PERF_EVENT_STATE_INACTIVE;
- event->tstamp_enabled = tstamp - event->total_time_enabled;
+ event->tstamp->enabled = tstamp - event->total_time_enabled;
list_for_each_entry(sub, &event->sibling_list, group_entry) {
if (sub->state >= PERF_EVENT_STATE_INACTIVE)
- sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ sub->tstamp->enabled = tstamp - sub->total_time_enabled;
}
}

@@ -2480,13 +2706,14 @@ static void __perf_event_enable(struct perf_event *event,
{
struct perf_event *leader = event->group_leader;
struct perf_event_context *task_ctx;
+ int mux = 0;

if (event->state >= PERF_EVENT_STATE_INACTIVE ||
event->state <= PERF_EVENT_STATE_ERROR)
return;

if (ctx->is_active)
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);

__perf_event_mark_enabled(event);

@@ -2496,7 +2723,7 @@ static void __perf_event_enable(struct perf_event *event,
if (!event_filter_match(event)) {
if (is_cgroup_event(event))
perf_cgroup_defer_enabled(event);
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
return;
}

@@ -2505,7 +2732,7 @@ static void __perf_event_enable(struct perf_event *event,
* then don't put it on unless the group is on.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
return;
}

@@ -2701,10 +2928,14 @@ EXPORT_SYMBOL_GPL(perf_event_refresh);

static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ enum event_type_t event_type, int mux)
{
int is_active = ctx->is_active;
- struct perf_event *event;
+ struct group_sched_params params = {
+ .cpuctx = cpuctx,
+ .ctx = ctx
+ };
+ int cpu = smp_processor_id();

lockdep_assert_held(&ctx->lock);

@@ -2751,13 +2982,27 @@ static void ctx_sched_out(struct perf_event_context *ctx,

perf_pmu_disable(ctx->pmu);
if (is_active & EVENT_PINNED) {
- list_for_each_entry(event, &ctx->pinned_groups, group_entry)
- group_sched_out(event, cpuctx, ctx);
+ if (mux) {
+ perf_event_groups_iterate_cpu(&ctx->pinned_groups, -1,
+ group_sched_out_callback, &params);
+ perf_event_groups_iterate_cpu(&ctx->pinned_groups, cpu,
+ group_sched_out_callback, &params);
+ } else {
+ perf_event_groups_iterate(&ctx->pinned_groups,
+ group_sched_out_callback, &params);
+ }
}

if (is_active & EVENT_FLEXIBLE) {
- list_for_each_entry(event, &ctx->flexible_groups, group_entry)
- group_sched_out(event, cpuctx, ctx);
+ if (mux) {
+ perf_event_groups_iterate_cpu(&ctx->flexible_groups, -1,
+ group_sched_out_callback, &params);
+ perf_event_groups_iterate_cpu(&ctx->flexible_groups, cpu,
+ group_sched_out_callback, &params);
+ } else {
+ perf_event_groups_iterate(&ctx->flexible_groups,
+ group_sched_out_callback, &params);
+ }
}
perf_pmu_enable(ctx->pmu);
}
@@ -3046,78 +3291,85 @@ void __perf_event_task_sched_out(struct task_struct *task,
* Called with IRQs disabled
*/
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ enum event_type_t event_type, int mux)
{
- ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
+ ctx_sched_out(&cpuctx->ctx, cpuctx, event_type, mux);
}

-static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static int
+ctx_pinned_sched_in(struct perf_event *event, void *data)
{
- struct perf_event *event;
+ struct group_sched_params *params = data;

- list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
- if (event->state <= PERF_EVENT_STATE_OFF)
- continue;
- if (!event_filter_match(event))
- continue;
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ return 0;
+ if (!event_filter_match(event)) {
+ if (event->tstamp != &params->ctx->tstamp_data)
+ event->tstamp = &params->ctx->tstamp_data;
+ return 0;
+ }

- /* may need to reset tstamp_enabled */
- if (is_cgroup_event(event))
- perf_cgroup_mark_enabled(event, ctx);
+ /* may need to reset tstamp_enabled */
+ if (is_cgroup_event(event))
+ perf_cgroup_mark_enabled(event, params->ctx);

- if (group_can_go_on(event, cpuctx, 1))
- group_sched_in(event, cpuctx, ctx);
+ if (group_can_go_on(event, params->cpuctx, 1))
+ group_sched_in(event, params->cpuctx, params->ctx);

- /*
- * If this pinned group hasn't been scheduled,
- * put it in error state.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE) {
- update_group_times(event);
- event->state = PERF_EVENT_STATE_ERROR;
- }
+ /*
+ * If this pinned group hasn't been scheduled,
+ * put it in error state.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ update_group_times(event);
+ event->state = PERF_EVENT_STATE_ERROR;
}
+
+ return 0;
}

-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static int
+ctx_flexible_sched_in(struct perf_event *event, void *data)
{
- struct perf_event *event;
- int can_add_hw = 1;
+ struct group_sched_params *params = data;

- list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
- /* Ignore events in OFF or ERROR state */
- if (event->state <= PERF_EVENT_STATE_OFF)
- continue;
- /*
- * Listen to the 'cpu' scheduling filter constraint
- * of events:
- */
- if (!event_filter_match(event))
- continue;
+ /* Ignore events in OFF or ERROR state */
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ return 0;
+ /*
+ * Listen to the 'cpu' scheduling filter constraint
+ * of events:
+ */
+ if (!event_filter_match(event)) {
+ if (event->tstamp != &params->ctx->tstamp_data)
+ event->tstamp = &params->ctx->tstamp_data;
+ return 0;
+ }

- /* may need to reset tstamp_enabled */
- if (is_cgroup_event(event))
- perf_cgroup_mark_enabled(event, ctx);
+ /* may need to reset tstamp_enabled */
+ if (is_cgroup_event(event))
+ perf_cgroup_mark_enabled(event, params->ctx);

- if (group_can_go_on(event, cpuctx, can_add_hw)) {
- if (group_sched_in(event, cpuctx, ctx))
- can_add_hw = 0;
- }
+ if (group_can_go_on(event, params->cpuctx, params->can_add_hw)) {
+ if (group_sched_in(event, params->cpuctx, params->ctx))
+ params->can_add_hw = 0;
}
+
+ return 0;
}

static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
- struct task_struct *task)
+ struct task_struct *task, int mux)
{
int is_active = ctx->is_active;
- u64 now;
+ struct group_sched_params params = {
+ .cpuctx = cpuctx,
+ .ctx = ctx
+ };
+ int cpu = smp_processor_id();

lockdep_assert_held(&ctx->lock);

@@ -3136,7 +3388,7 @@ ctx_sched_in(struct perf_event_context *ctx,

if (is_active & EVENT_TIME) {
/* start ctx time */
- now = perf_clock();
+ u64 now = perf_clock();
ctx->timestamp = now;
perf_cgroup_set_timestamp(task, ctx);
}
@@ -3145,27 +3397,56 @@ ctx_sched_in(struct perf_event_context *ctx,
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
- if (is_active & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, cpuctx);
+
+ if (is_active & EVENT_PINNED) {
+ if (mux) {
+ perf_event_groups_iterate_cpu(&ctx->pinned_groups,
+ -1, ctx_pinned_sched_in,
+ &params);
+ perf_event_groups_iterate_cpu(&ctx->pinned_groups,
+ cpu, ctx_pinned_sched_in,
+ &params);
+ } else {
+ perf_event_groups_iterate(&ctx->pinned_groups,
+ ctx_pinned_sched_in,
+ &params);
+ }
+ }

/* Then walk through the lower prio flexible groups */
- if (is_active & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, cpuctx);
+ if (is_active & EVENT_FLEXIBLE) {
+ if (mux) {
+ params.can_add_hw = 1;
+ perf_event_groups_iterate_cpu(&ctx->flexible_groups,
+ -1, ctx_flexible_sched_in,
+ &params);
+ params.can_add_hw = 1;
+ perf_event_groups_iterate_cpu(&ctx->flexible_groups,
+ cpu, ctx_flexible_sched_in,
+ &params);
+ } else {
+ params.can_add_hw = 1;
+ perf_event_groups_iterate(&ctx->flexible_groups,
+ ctx_flexible_sched_in,
+ &params);
+ }
+ }
}

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
- struct task_struct *task)
+ struct task_struct *task, int mux)
{
struct perf_event_context *ctx = &cpuctx->ctx;

- ctx_sched_in(ctx, cpuctx, event_type, task);
+ ctx_sched_in(ctx, cpuctx, event_type, task, mux);
}

static void perf_event_context_sched_in(struct perf_event_context *ctx,
struct task_struct *task)
{
struct perf_cpu_context *cpuctx;
+ int mux = 0;

cpuctx = __get_cpu_context(ctx);
if (cpuctx->task_ctx == ctx)
@@ -3181,9 +3462,9 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* However, if task's ctx is not carrying any pinned
* events, no need to flip the cpuctx's events around.
*/
- if (!list_empty(&ctx->pinned_groups))
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, ctx, task);
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups))
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
+ perf_event_sched_in(cpuctx, ctx, task, mux);
perf_pmu_enable(ctx->pmu);
perf_ctx_unlock(cpuctx, ctx);
}
@@ -3416,14 +3697,19 @@ static void rotate_ctx(struct perf_event_context *ctx)
* Rotate the first entry last of non-pinned groups. Rotation might be
* disabled by the inheritance code.
*/
- if (!ctx->rotate_disable)
- list_rotate_left(&ctx->flexible_groups);
+ if (!ctx->rotate_disable) {
+ int cpu = smp_processor_id();
+
+ perf_event_groups_rotate(&ctx->flexible_groups, -1);
+ perf_event_groups_rotate(&ctx->flexible_groups, cpu);
+ }
}

static int perf_rotate_context(struct perf_cpu_context *cpuctx)
{
struct perf_event_context *ctx = NULL;
int rotate = 0;
+ int mux = 1;

if (cpuctx->ctx.nr_events) {
if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
@@ -3442,15 +3728,15 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);

- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
if (ctx)
- ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE, mux);

rotate_ctx(&cpuctx->ctx);
if (ctx)
rotate_ctx(ctx);

- perf_event_sched_in(cpuctx, ctx, current);
+ perf_event_sched_in(cpuctx, ctx, current, mux);

perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -3502,6 +3788,7 @@ static void perf_event_enable_on_exec(int ctxn)
struct perf_event *event;
unsigned long flags;
int enabled = 0;
+ int mux = 0;

local_irq_save(flags);
ctx = current->perf_event_ctxp[ctxn];
@@ -3510,7 +3797,7 @@ static void perf_event_enable_on_exec(int ctxn)

cpuctx = __get_cpu_context(ctx);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
event_type |= get_event_type(event);
@@ -3523,7 +3810,7 @@ static void perf_event_enable_on_exec(int ctxn)
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, event_type);
} else {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
}
perf_ctx_unlock(cpuctx, ctx);

@@ -3743,8 +4030,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
INIT_LIST_HEAD(&ctx->active_ctx_list);
- INIT_LIST_HEAD(&ctx->pinned_groups);
- INIT_LIST_HEAD(&ctx->flexible_groups);
+ ctx->pinned_groups = RB_ROOT;
+ ctx->flexible_groups = RB_ROOT;
INIT_LIST_HEAD(&ctx->event_list);
atomic_set(&ctx->refcount, 1);
}
@@ -4843,8 +5130,8 @@ static void calc_timer_values(struct perf_event *event,

*now = perf_clock();
ctx_time = event->shadow_ctx_time + *now;
- *enabled = ctx_time - event->tstamp_enabled;
- *running = ctx_time - event->tstamp_running;
+ *enabled = ctx_time - event->tstamp->enabled;
+ *running = ctx_time - event->tstamp->running;
}

static void perf_event_init_userpage(struct perf_event *event)
@@ -9379,6 +9666,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->group_entry);
INIT_LIST_HEAD(&event->event_entry);
INIT_LIST_HEAD(&event->sibling_list);
+ RB_CLEAR_NODE(&event->group_node);
+ INIT_LIST_HEAD(&event->group_list);
INIT_LIST_HEAD(&event->rb_entry);
INIT_LIST_HEAD(&event->active_entry);
INIT_LIST_HEAD(&event->addr_filters.list);
@@ -9392,6 +9681,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
raw_spin_lock_init(&event->addr_filters.lock);

atomic_long_set(&event->refcount, 1);
+ event->tstamp = &event->tstamp_data;
event->cpu = cpu;
event->attr = *attr;
event->group_leader = group_leader;
@@ -10767,6 +11057,14 @@ static int inherit_group(struct perf_event *parent_event,
return 0;
}

+struct inherit_task_group_params {
+ struct task_struct *parent;
+ struct perf_event_context *parent_ctx;
+ struct task_struct *child;
+ int ctxn;
+ int inherited_all;
+};
+
/*
* Creates the child task context and tries to inherit the event-group.
*
@@ -10779,20 +11077,18 @@ static int inherit_group(struct perf_event *parent_event,
* - <0 on error
*/
static int
-inherit_task_group(struct perf_event *event, struct task_struct *parent,
- struct perf_event_context *parent_ctx,
- struct task_struct *child, int ctxn,
- int *inherited_all)
+inherit_task_group(struct perf_event *event, void *data)
{
int ret;
struct perf_event_context *child_ctx;
+ struct inherit_task_group_params *params = data;

if (!event->attr.inherit) {
- *inherited_all = 0;
+ params->inherited_all = 0;
return 0;
}

- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = params->child->perf_event_ctxp[params->ctxn];
if (!child_ctx) {
/*
* This is executed from the parent task context, so
@@ -10800,18 +11096,19 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* First allocate and initialize a context for the
* child.
*/
- child_ctx = alloc_perf_context(parent_ctx->pmu, child);
+ child_ctx = alloc_perf_context(params->parent_ctx->pmu,
+ params->child);
if (!child_ctx)
return -ENOMEM;

- child->perf_event_ctxp[ctxn] = child_ctx;
+ params->child->perf_event_ctxp[params->ctxn] = child_ctx;
}

- ret = inherit_group(event, parent, parent_ctx,
- child, child_ctx);
+ ret = inherit_group(event, params->parent, params->parent_ctx,
+ params->child, child_ctx);

if (ret)
- *inherited_all = 0;
+ params->inherited_all = 0;

return ret;
}
@@ -10823,11 +11120,15 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
{
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
- struct perf_event *event;
struct task_struct *parent = current;
- int inherited_all = 1;
unsigned long flags;
int ret = 0;
+ struct inherit_task_group_params params = {
+ .parent = parent,
+ .child = child,
+ .ctxn = ctxn,
+ .inherited_all = 1
+ };

if (likely(!parent->perf_event_ctxp[ctxn]))
return 0;
@@ -10840,6 +11141,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
if (!parent_ctx)
return 0;

+ params.parent_ctx = parent_ctx;
+
/*
* No need to check if parent_ctx != NULL here; since we saw
* it non-NULL earlier, the only reason for it to become NULL
@@ -10857,13 +11160,10 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
* We dont have to disable NMIs - we are only looking at
* the list, not manipulating it:
*/
- list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
- ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, &inherited_all);
- if (ret)
- goto out_unlock;
- }
-
+ ret = perf_event_groups_iterate(&parent_ctx->pinned_groups,
+ inherit_task_group, &params);
+ if (ret)
+ goto out_unlock;
/*
* We can't hold ctx->lock when iterating the ->flexible_group list due
* to allocations, but we need to prevent rotation because
@@ -10873,19 +11173,17 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
parent_ctx->rotate_disable = 1;
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

- list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
- ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, &inherited_all);
- if (ret)
- goto out_unlock;
- }
+ ret = perf_event_groups_iterate(&parent_ctx->flexible_groups,
+ inherit_task_group, &params);
+ if (ret)
+ goto out_unlock;

raw_spin_lock_irqsave(&parent_ctx->lock, flags);
parent_ctx->rotate_disable = 0;

child_ctx = child->perf_event_ctxp[ctxn];

- if (child_ctx && inherited_all) {
+ if (child_ctx && params.inherited_all) {
/*
* Mark the child context as a clone of the parent
* context, or of whatever the parent is a clone of.