[PATCH v9 2/2] perf/core: add rotation switch to skip to the current CPU's events list on mux interrupt

From: Alexey Budankov
Date: Thu Aug 31 2017 - 06:03:52 EST


This patch implements a rotation switch that makes the multiplexing
hrtimer interrupt handler skip directly to the current CPU's events
list, and adopts the switch throughout the existing implementation.

The value of the rotation switch is one of the two values defined by
the ROTATION_DISABLED and ROTATION_ENABLED macros.

The ctx->rotate_disable flag is renamed to ctx->rotation and now takes
the ROTATION_DISABLED and ROTATION_ENABLED values. Note that the
polarity is inverted: rotate_disable == 1 becomes
rotation == ROTATION_DISABLED.

The perf_event_groups_for_each_cpu() macro is introduced to iterate
through the groups list of a given CPU only, skipping groups allocated
for other CPUs.

Signed-off-by: Alexey Budankov <alexey.budankov@xxxxxxxxxxxxxxx>
---
include/linux/perf_event.h | 2 +-
kernel/events/core.c | 213 +++++++++++++++++++++++++++++----------------
2 files changed, 141 insertions(+), 74 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d2e58da..9970b4f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -756,7 +756,7 @@ struct perf_event_context {
int is_active;
int nr_stat;
int nr_freq;
- int rotate_disable;
+ int rotation;
atomic_t refcount;
struct task_struct *task;

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5ef0f05..7b67d17 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -430,6 +430,9 @@ static void update_perf_cpu_limits(void)
WRITE_ONCE(perf_sample_allowed_ns, tmp);
}

+#define ROTATION_DISABLED 0
+#define ROTATION_ENABLED 1
+
static int perf_rotate_context(struct perf_cpu_context *cpuctx);

int perf_proc_update_handler(struct ctl_table *table, int write,
@@ -556,11 +559,11 @@ void perf_sample_event_took(u64 sample_len_ns)
static atomic64_t perf_event_id;

static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
+ enum event_type_t event_type, int rotation);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
- struct task_struct *task);
+ struct task_struct *task, int rotation);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
@@ -717,7 +720,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
perf_pmu_disable(cpuctx->ctx.pmu);

if (mode & PERF_CGROUP_SWOUT) {
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL, ROTATION_DISABLED);
/*
* must not be done before ctxswout due
* to event_filter_match() in event_sched_out()
@@ -736,7 +739,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
*/
cpuctx->cgrp = perf_cgroup_from_task(task,
&cpuctx->ctx);
- cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+ cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, ROTATION_DISABLED);
}
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -1002,21 +1005,21 @@ list_update_cgroup_event(struct perf_event *event,
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
struct perf_cpu_context *cpuctx;
- int rotations = 0;
+ int rotation = 0;

WARN_ON(!irqs_disabled());

cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
- rotations = perf_rotate_context(cpuctx);
+ rotation = perf_rotate_context(cpuctx);

raw_spin_lock(&cpuctx->hrtimer_lock);
- if (rotations)
+ if (rotation)
hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
else
cpuctx->hrtimer_active = 0;
raw_spin_unlock(&cpuctx->hrtimer_lock);

- return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
+ return rotation ? HRTIMER_RESTART : HRTIMER_NORESTART;
}

static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
@@ -1396,6 +1399,17 @@ static u64 perf_event_time(struct perf_event *event)
return ctx ? ctx->time : 0;
}

+void perf_event_tstamp_update(struct perf_event *event)
+{
+ u64 tstamp, delta;
+
+ tstamp = perf_event_time(event);
+ delta = tstamp - event->tstamp_stopped;
+
+ event->tstamp_running += delta;
+ event->tstamp_stopped = tstamp;
+}
+
/*
* Update the total_time_enabled and total_time_running fields for a event.
*/
@@ -1410,6 +1424,9 @@ static void update_event_times(struct perf_event *event)
event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
return;

+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ perf_event_tstamp_update(event);
+
/*
* in cgroup mode, time_enabled represents
* the time the event was enabled AND active
@@ -1648,6 +1665,15 @@ perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
typeof(*event), node))

/*
+ * Iterate event groups with cpu == key.
+ */
+#define perf_event_groups_for_each_cpu(event, key, groups, node) \
+ for (event = perf_event_groups_first(groups, key); \
+ event && event->cpu == key; \
+ event = rb_entry_safe(rb_next(&event->node), \
+ typeof(*event), node))
+
+/*
* Add a event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
*/
@@ -1970,9 +1996,6 @@ event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
- u64 delta;
-
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);

@@ -1983,18 +2006,15 @@ event_sched_out(struct perf_event *event,
* via read() for time_enabled, time_running:
*/
if (event->state == PERF_EVENT_STATE_INACTIVE &&
- !event_filter_match(event)) {
- delta = tstamp - event->tstamp_stopped;
- event->tstamp_running += delta;
- event->tstamp_stopped = tstamp;
- }
+ !event_filter_match(event))
+ perf_event_tstamp_update(event);

if (event->state != PERF_EVENT_STATE_ACTIVE)
return;

perf_pmu_disable(event->pmu);

- event->tstamp_stopped = tstamp;
+ event->tstamp_stopped = perf_event_time(event);
event->pmu->del(event, 0);
event->oncpu = -1;
event->state = PERF_EVENT_STATE_INACTIVE;
@@ -2294,7 +2314,6 @@ group_sched_in(struct perf_event *group_event,
{
struct perf_event *event, *partial_group = NULL;
struct pmu *pmu = ctx->pmu;
- u64 now = ctx->time;
bool simulate = false;

if (group_event->state == PERF_EVENT_STATE_OFF)
@@ -2340,12 +2359,10 @@ group_sched_in(struct perf_event *group_event,
if (event == partial_group)
simulate = true;

- if (simulate) {
- event->tstamp_running += now - event->tstamp_stopped;
- event->tstamp_stopped = now;
- } else {
+ if (simulate)
+ perf_event_tstamp_update(event);
+ else
event_sched_out(event, cpuctx, ctx);
- }
}
event_sched_out(group_event, cpuctx, ctx);

@@ -2475,12 +2492,12 @@ static void add_event_to_ctx(struct perf_event *event,

static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
+ enum event_type_t event_type, int rotation);
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
- struct task_struct *task);
+ struct task_struct *task, int rotation);

static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
@@ -2492,19 +2509,19 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;

- ctx_sched_out(ctx, cpuctx, event_type);
+ ctx_sched_out(ctx, cpuctx, event_type, ROTATION_DISABLED);
}

static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
- struct task_struct *task)
+ struct task_struct *task, int rotation)
{
- cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+ cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task, rotation);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, rotation);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, rotation);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, rotation);
}

/*
@@ -2548,11 +2565,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
* - otherwise, do nothing more.
*/
if (cpu_event)
- cpu_ctx_sched_out(cpuctx, ctx_event_type);
+ cpu_ctx_sched_out(cpuctx, ctx_event_type, ROTATION_DISABLED);
else if (ctx_event_type & EVENT_PINNED)
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, ROTATION_DISABLED);

- perf_event_sched_in(cpuctx, task_ctx, current);
+ perf_event_sched_in(cpuctx, task_ctx, current, ROTATION_DISABLED);
perf_pmu_enable(cpuctx->ctx.pmu);
}

@@ -2596,7 +2613,7 @@ static int __perf_install_in_context(void *info)
}

if (reprogram) {
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME, ROTATION_DISABLED);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx, get_event_type(event));
} else {
@@ -2739,7 +2756,7 @@ static void __perf_event_enable(struct perf_event *event,
return;

if (ctx->is_active)
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME, ROTATION_DISABLED);

__perf_event_mark_enabled(event);

@@ -2749,7 +2766,7 @@ static void __perf_event_enable(struct perf_event *event,
if (!event_filter_match(event)) {
if (is_cgroup_event(event))
perf_cgroup_defer_enabled(event);
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, ROTATION_DISABLED);
return;
}

@@ -2758,7 +2775,7 @@ static void __perf_event_enable(struct perf_event *event,
* then don't put it on unless the group is on.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, ROTATION_DISABLED);
return;
}

@@ -2954,10 +2971,11 @@ EXPORT_SYMBOL_GPL(perf_event_refresh);

static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ enum event_type_t event_type, int rotation)
{
int is_active = ctx->is_active;
struct perf_event *event;
+ int sw = -1, cpu = smp_processor_id();

lockdep_assert_held(&ctx->lock);

@@ -3004,13 +3022,35 @@ static void ctx_sched_out(struct perf_event_context *ctx,

perf_pmu_disable(ctx->pmu);

- if (is_active & EVENT_PINNED)
- perf_event_groups_for_each(event, &ctx->pinned_groups, group_node)
- group_sched_out(event, cpuctx, ctx);
+ if (is_active & EVENT_PINNED) {
+ if (rotation == ROTATION_ENABLED) {
+ perf_event_groups_for_each_cpu(event, cpu,
+ &ctx->pinned_groups, group_node)
+ group_sched_out(event, cpuctx, ctx);
+ perf_event_groups_for_each_cpu(event, sw,
+ &ctx->pinned_groups, group_node)
+ group_sched_out(event, cpuctx, ctx);
+ } else {
+ perf_event_groups_for_each(event,
+ &ctx->pinned_groups, group_node)
+ group_sched_out(event, cpuctx, ctx);
+ }
+ }

- if (is_active & EVENT_FLEXIBLE)
- perf_event_groups_for_each(event, &ctx->flexible_groups, group_node)
- group_sched_out(event, cpuctx, ctx);
+ if (is_active & EVENT_FLEXIBLE) {
+ if (rotation == ROTATION_ENABLED) {
+ perf_event_groups_for_each_cpu(event, cpu,
+ &ctx->flexible_groups, group_node)
+ group_sched_out(event, cpuctx, ctx);
+ perf_event_groups_for_each_cpu(event, sw,
+ &ctx->flexible_groups, group_node)
+ group_sched_out(event, cpuctx, ctx);
+ } else {
+ perf_event_groups_for_each(event,
+ &ctx->flexible_groups, group_node)
+ group_sched_out(event, cpuctx, ctx);
+ }
+ }

perf_pmu_enable(ctx->pmu);
}
@@ -3299,37 +3339,63 @@ void __perf_event_task_sched_out(struct task_struct *task,
* Called with IRQs disabled
*/
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ enum event_type_t event_type, int rotation)
{
- ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
+ ctx_sched_out(&cpuctx->ctx, cpuctx, event_type, rotation);
}

static void
ctx_pinned_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+ struct perf_cpu_context *cpuctx, int rotation)
{
+ int sw = -1, cpu = smp_processor_id();
struct perf_event *event;

- perf_event_groups_for_each(event, &ctx->pinned_groups, group_node)
- pinned_group_sched_in(event, ctx, cpuctx);
+ if (rotation == ROTATION_ENABLED) {
+ perf_event_groups_for_each_cpu(event, sw,
+ &ctx->pinned_groups, group_node)
+ pinned_group_sched_in(event, ctx, cpuctx);
+ perf_event_groups_for_each_cpu(event, cpu,
+ &ctx->pinned_groups, group_node)
+ pinned_group_sched_in(event, ctx, cpuctx);
+ } else {
+ perf_event_groups_for_each(event,
+ &ctx->pinned_groups, group_node)
+ pinned_group_sched_in(event,ctx, cpuctx);
+ }
}

static void
ctx_flexible_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+ struct perf_cpu_context *cpuctx, int rotation)
{
+ int sw = -1, cpu = smp_processor_id();
struct perf_event *event;
int can_add_hw = 1;

- perf_event_groups_for_each(event, &ctx->flexible_groups, group_node)
- flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw);
+ if (rotation == ROTATION_ENABLED) {
+ perf_event_groups_for_each_cpu(event, sw,
+ &ctx->flexible_groups, group_node)
+ flexible_group_sched_in(event, ctx, cpuctx,
+ &can_add_hw);
+ can_add_hw = 1;
+ perf_event_groups_for_each_cpu(event, cpu,
+ &ctx->flexible_groups, group_node)
+ flexible_group_sched_in(event, ctx, cpuctx,
+ &can_add_hw);
+ } else {
+ perf_event_groups_for_each(event,
+ &ctx->flexible_groups, group_node)
+ flexible_group_sched_in(event, ctx, cpuctx,
+ &can_add_hw);
+ }
}

static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
- struct task_struct *task)
+ struct task_struct *task, int rotation)
{
int is_active = ctx->is_active;

@@ -3359,20 +3425,20 @@ ctx_sched_in(struct perf_event_context *ctx,
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, cpuctx);
+ ctx_pinned_sched_in(ctx, cpuctx, rotation);

/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, cpuctx);
+ ctx_flexible_sched_in(ctx, cpuctx, rotation);
}

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
- struct task_struct *task)
+ struct task_struct *task, int rotation)
{
struct perf_event_context *ctx = &cpuctx->ctx;

- ctx_sched_in(ctx, cpuctx, event_type, task);
+ ctx_sched_in(ctx, cpuctx, event_type, task, rotation);
}

static void perf_event_context_sched_in(struct perf_event_context *ctx,
@@ -3402,8 +3468,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* events, no need to flip the cpuctx's events around.
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, ctx, task);
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, ROTATION_DISABLED);
+ perf_event_sched_in(cpuctx, ctx, task, ROTATION_DISABLED);
perf_pmu_enable(ctx->pmu);

unlock:
@@ -3638,7 +3704,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
* Rotate the first entry last of non-pinned groups. Rotation might be
* disabled by the inheritance code.
*/
- if (!ctx->rotate_disable) {
+ if (ctx->rotation == ROTATION_ENABLED) {
int sw = -1, cpu = smp_processor_id();

perf_event_groups_rotate(&ctx->flexible_groups, sw);
@@ -3649,40 +3715,40 @@ static void rotate_ctx(struct perf_event_context *ctx)
static int perf_rotate_context(struct perf_cpu_context *cpuctx)
{
struct perf_event_context *ctx = NULL;
- int rotate = 0;
+ int rotation = ROTATION_DISABLED;

if (cpuctx->ctx.nr_events) {
if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
- rotate = 1;
+ rotation = ROTATION_ENABLED;
}

ctx = cpuctx->task_ctx;
if (ctx && ctx->nr_events) {
if (ctx->nr_events != ctx->nr_active)
- rotate = 1;
+ rotation = ROTATION_ENABLED;
}

- if (!rotate)
+ if (rotation == ROTATION_DISABLED)
goto done;

perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);

- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, rotation);
if (ctx)
- ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE, rotation);

rotate_ctx(&cpuctx->ctx);
if (ctx)
rotate_ctx(ctx);

- perf_event_sched_in(cpuctx, ctx, current);
+ perf_event_sched_in(cpuctx, ctx, current, rotation);

perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
done:

- return rotate;
+ return rotation;
}

void perf_event_task_tick(void)
@@ -3736,7 +3802,7 @@ static void perf_event_enable_on_exec(int ctxn)

cpuctx = __get_cpu_context(ctx);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME, ROTATION_DISABLED);
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
event_type |= get_event_type(event);
@@ -3749,7 +3815,7 @@ static void perf_event_enable_on_exec(int ctxn)
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, event_type);
} else {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, ROTATION_DISABLED);
}
perf_ctx_unlock(cpuctx, ctx);

@@ -3986,6 +4052,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
perf_event_groups_init(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
atomic_set(&ctx->refcount, 1);
+ ctx->rotation = ROTATION_ENABLED;
}

static struct perf_event_context *
@@ -11162,7 +11229,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
* rotate_ctx() will change the list from interrupt context.
*/
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
- parent_ctx->rotate_disable = 1;
+ parent_ctx->rotation = ROTATION_DISABLED;
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

perf_event_groups_for_each(event, &parent_ctx->flexible_groups, group_node) {
@@ -11173,7 +11240,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
}

raw_spin_lock_irqsave(&parent_ctx->lock, flags);
- parent_ctx->rotate_disable = 0;
+ parent_ctx->rotation = ROTATION_ENABLED;

child_ctx = child->perf_event_ctxp[ctxn];