Re: [PATCH v7 2/2] perf/core: add mux switch to skip to the current CPU's events list on mux interrupt
From: Alexander Shishkin
Date: Wed Aug 23 2017 - 08:03:06 EST
Alexey Budankov <alexey.budankov@xxxxxxxxxxxxxxx> writes:
> This patch implements mux switch that triggers skipping to the
> current CPU's events list at multiplexing hrtimer interrupt
> handler as well as adoption of the switch in the existing
> implementation.
>
> perf_event_groups_iterate_cpu() API is introduced to implement
> iteration thru the certain CPU groups list skipping groups
"through"
> allocated for the other CPUs.
>
> Signed-off-by: Alexey Budankov <alexey.budankov@xxxxxxxxxxxxxxx>
> ---
> kernel/events/core.c | 193 ++++++++++++++++++++++++++++++++++++---------------
> 1 file changed, 137 insertions(+), 56 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 08ccfb2..aeb0f81 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -556,11 +556,11 @@ void perf_sample_event_took(u64 sample_len_ns)
> static atomic64_t perf_event_id;
>
> static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
> - enum event_type_t event_type);
> + enum event_type_t event_type, int mux);
>
> static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
> enum event_type_t event_type,
> - struct task_struct *task);
> + struct task_struct *task, int mux);
>
> static void update_context_time(struct perf_event_context *ctx);
> static u64 perf_event_time(struct perf_event *event);
> @@ -702,6 +702,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
> struct perf_cpu_context *cpuctx;
> struct list_head *list;
> unsigned long flags;
> + int mux = 0;
>
> /*
> * Disable interrupts and preemption to avoid this CPU's
> @@ -717,7 +718,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
> perf_pmu_disable(cpuctx->ctx.pmu);
>
> if (mode & PERF_CGROUP_SWOUT) {
> - cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> + cpu_ctx_sched_out(cpuctx, EVENT_ALL, mux);
> /*
> * must not be done before ctxswout due
> * to event_filter_match() in event_sched_out()
> @@ -736,7 +737,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
> */
> cpuctx->cgrp = perf_cgroup_from_task(task,
> &cpuctx->ctx);
> - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
> + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, mux);
'mux' is always zero in this function, isn't it?
> }
> perf_pmu_enable(cpuctx->ctx.pmu);
> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> @@ -1613,8 +1614,16 @@ perf_event_groups_rotate(struct rb_root *groups, int cpu)
> */
> #define perf_event_groups_for_each(event, iter, tree, node, list, link) \
> for (iter = rb_first(tree); iter; iter = rb_next(iter)) \
> - list_for_each_entry(event, &(rb_entry(iter, \
> - typeof(*event), node)->list), link)
> + list_for_each_entry(event, &(rb_entry(iter, \
> + typeof(*event), node)->list), link)
Is this an indentation change? What is it doing here?
> +
> +/*
> + * Iterate event groups related to specific cpu.
> + */
> +#define perf_event_groups_for_each_cpu(event, cpu, tree, list, link) \
> + list = perf_event_groups_get_list(tree, cpu); \
> + if (list) \
> + list_for_each_entry(event, list, link)
...or not, if there's no list.
>
> /*
> * Add a event from the lists for its context.
> @@ -2397,36 +2406,38 @@ static void add_event_to_ctx(struct perf_event *event,
>
> static void ctx_sched_out(struct perf_event_context *ctx,
> struct perf_cpu_context *cpuctx,
> - enum event_type_t event_type);
> + enum event_type_t event_type, int mux);
> static void
> ctx_sched_in(struct perf_event_context *ctx,
> struct perf_cpu_context *cpuctx,
> enum event_type_t event_type,
> - struct task_struct *task);
> + struct task_struct *task, int mux);
>
> static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
> struct perf_event_context *ctx,
> enum event_type_t event_type)
> {
> + int mux = 0;
> +
> if (!cpuctx->task_ctx)
> return;
>
> if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
> return;
>
> - ctx_sched_out(ctx, cpuctx, event_type);
> + ctx_sched_out(ctx, cpuctx, event_type, mux);
Just use 0.
> }
>
> static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
> struct perf_event_context *ctx,
> - struct task_struct *task)
> + struct task_struct *task, int mux)
> {
> - cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
> + cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task, mux);
> if (ctx)
> - ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
> - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
> + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, mux);
> + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, mux);
> if (ctx)
> - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
> + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, mux);
> }
>
> /*
> @@ -2450,6 +2461,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
> {
> enum event_type_t ctx_event_type = event_type & EVENT_ALL;
> bool cpu_event = !!(event_type & EVENT_CPU);
> + int mux = 0;
>
> /*
> * If pinned groups are involved, flexible groups also need to be
> @@ -2470,11 +2482,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
> * - otherwise, do nothing more.
> */
> if (cpu_event)
> - cpu_ctx_sched_out(cpuctx, ctx_event_type);
> + cpu_ctx_sched_out(cpuctx, ctx_event_type, mux);
> else if (ctx_event_type & EVENT_PINNED)
> - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
> + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
>
> - perf_event_sched_in(cpuctx, task_ctx, current);
> + perf_event_sched_in(cpuctx, task_ctx, current, mux);
Also mux==0 in all cases in this function.
> perf_pmu_enable(cpuctx->ctx.pmu);
> }
>
> @@ -2491,7 +2503,7 @@ static int __perf_install_in_context(void *info)
> struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
> struct perf_event_context *task_ctx = cpuctx->task_ctx;
> bool reprogram = true;
> - int ret = 0;
> + int ret = 0, mux =0;
>
> raw_spin_lock(&cpuctx->ctx.lock);
> if (ctx->task) {
> @@ -2518,7 +2530,7 @@ static int __perf_install_in_context(void *info)
> }
>
> if (reprogram) {
> - ctx_sched_out(ctx, cpuctx, EVENT_TIME);
> + ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
> add_event_to_ctx(event, ctx);
> ctx_resched(cpuctx, task_ctx, get_event_type(event));
> } else {
> @@ -2655,13 +2667,14 @@ static void __perf_event_enable(struct perf_event *event,
> {
> struct perf_event *leader = event->group_leader;
> struct perf_event_context *task_ctx;
> + int mux = 0;
>
> if (event->state >= PERF_EVENT_STATE_INACTIVE ||
> event->state <= PERF_EVENT_STATE_ERROR)
> return;
>
> if (ctx->is_active)
> - ctx_sched_out(ctx, cpuctx, EVENT_TIME);
> + ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
>
> __perf_event_mark_enabled(event);
>
> @@ -2671,7 +2684,7 @@ static void __perf_event_enable(struct perf_event *event,
> if (!event_filter_match(event)) {
> if (is_cgroup_event(event))
> perf_cgroup_defer_enabled(event);
> - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
> + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
> return;
> }
>
> @@ -2680,7 +2693,7 @@ static void __perf_event_enable(struct perf_event *event,
> * then don't put it on unless the group is on.
> */
> if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
> - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
> + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
And here: 'mux' is always zero in this function, too.
> return;
> }
>
> @@ -2876,11 +2889,13 @@ EXPORT_SYMBOL_GPL(perf_event_refresh);
>
> static void ctx_sched_out(struct perf_event_context *ctx,
> struct perf_cpu_context *cpuctx,
> - enum event_type_t event_type)
> + enum event_type_t event_type, int mux)
> {
> int is_active = ctx->is_active;
> + struct list_head *group_list;
> struct perf_event *event;
> struct rb_node *node;
> + int sw = -1, cpu = smp_processor_id();
Same thing seems to be happening with 'sw': it never changes from -1.
> lockdep_assert_held(&ctx->lock);
>
> if (likely(!ctx->nr_events)) {
> @@ -2926,17 +2941,47 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>
> perf_pmu_disable(ctx->pmu);
>
> - if (is_active & EVENT_PINNED)
> - perf_event_groups_for_each(event, node,
> - &ctx->pinned_groups, group_node,
> - group_list, group_entry)
> - group_sched_out(event, cpuctx, ctx);
> + if (is_active & EVENT_PINNED) {
> + if (mux) {
So it's 'rotate', really.
> + perf_event_groups_for_each_cpu(event, cpu,
> + &ctx->pinned_groups,
> + group_list, group_entry) {
> + group_sched_out(event, cpuctx, ctx);
> + }
> + perf_event_groups_for_each_cpu(event, sw,
> + &ctx->pinned_groups,
> + group_list, group_entry) {
> + group_sched_out(event, cpuctx, ctx);
> + }
> + } else {
> + perf_event_groups_for_each(event, node,
> + &ctx->pinned_groups, group_node,
> + group_list, group_entry) {
> + group_sched_out(event, cpuctx, ctx);
> + }
> + }
> + }
>
> - if (is_active & EVENT_FLEXIBLE)
> - perf_event_groups_for_each(event, node,
> - &ctx->flexible_groups, group_node,
> - group_list, group_entry)
> - group_sched_out(event, cpuctx, ctx);
> + if (is_active & EVENT_FLEXIBLE) {
> + if (mux) {
> + perf_event_groups_for_each_cpu(event, cpu,
> + &ctx->flexible_groups,
> + group_list, group_entry) {
> + group_sched_out(event, cpuctx, ctx);
> + }
> + perf_event_groups_for_each_cpu(event, sw,
> + &ctx->flexible_groups,
> + group_list, group_entry) {
> + group_sched_out(event, cpuctx, ctx);
> + }
> + } else {
> + perf_event_groups_for_each(event, node,
> + &ctx->flexible_groups, group_node,
> + group_list, group_entry) {
> + group_sched_out(event, cpuctx, ctx);
> + }
> + }
> + }
>
> perf_pmu_enable(ctx->pmu);
> }
> @@ -3225,9 +3270,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
> * Called with IRQs disabled
> */
> static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
> - enum event_type_t event_type)
> + enum event_type_t event_type, int mux)
> {
> - ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
> + ctx_sched_out(&cpuctx->ctx, cpuctx, event_type, mux);
> }
>
> static void
> @@ -3287,11 +3332,13 @@ static void
> ctx_sched_in(struct perf_event_context *ctx,
> struct perf_cpu_context *cpuctx,
> enum event_type_t event_type,
> - struct task_struct *task)
> + struct task_struct *task, int mux)
> {
> int is_active = ctx->is_active;
> + struct list_head *group_list;
> struct perf_event *event;
> struct rb_node *node;
> + int sw = -1, cpu = smp_processor_id();
>
> lockdep_assert_held(&ctx->lock);
>
> @@ -3319,35 +3366,69 @@ ctx_sched_in(struct perf_event_context *ctx,
> * First go through the list and put on any pinned groups
> * in order to give them the best chance of going on.
> */
> - if (is_active & EVENT_PINNED)
> - perf_event_groups_for_each(event, node,
> - &ctx->pinned_groups, group_node,
> - group_list, group_entry)
> - ctx_pinned_sched_in(event, cpuctx, ctx);
> + if (is_active & EVENT_PINNED) {
> + if (mux) {
> + perf_event_groups_for_each_cpu(event, sw,
> + &ctx->pinned_groups,
> + group_list, group_entry) {
> + ctx_pinned_sched_in(event, cpuctx, ctx);
> + }
> + perf_event_groups_for_each_cpu(event, cpu,
> + &ctx->pinned_groups,
> + group_list, group_entry) {
> + ctx_pinned_sched_in(event, cpuctx, ctx);
> + }
> + } else {
> + perf_event_groups_for_each(event, node,
> + &ctx->pinned_groups, group_node,
> + group_list, group_entry) {
> + ctx_pinned_sched_in(event, cpuctx, ctx);
> + }
> + }
> + }
>
> /* Then walk through the lower prio flexible groups */
> if (is_active & EVENT_FLEXIBLE) {
> int can_add_hw = 1;
> - perf_event_groups_for_each(event, node,
> - &ctx->flexible_groups, group_node,
> - group_list, group_entry)
> - ctx_flexible_sched_in(event, cpuctx, ctx, &can_add_hw);
> + if (mux) {
> + perf_event_groups_for_each_cpu(event, sw,
> + &ctx->flexible_groups,
> + group_list, group_entry) {
> + ctx_flexible_sched_in(event, cpuctx,
> + ctx, &can_add_hw);
> + }
> + can_add_hw = 1;
> + perf_event_groups_for_each_cpu(event, cpu,
> + &ctx->flexible_groups,
> + group_list, group_entry) {
> + ctx_flexible_sched_in(event, cpuctx,
> + ctx, &can_add_hw);
> + }
> + } else {
> + perf_event_groups_for_each(event, node,
> + &ctx->flexible_groups, group_node,
> + group_list, group_entry) {
> + ctx_flexible_sched_in(event, cpuctx,
> + ctx, &can_add_hw);
> + }
> + }
> }
> }
>
> static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
> enum event_type_t event_type,
> - struct task_struct *task)
> + struct task_struct *task, int mux)
> {
> struct perf_event_context *ctx = &cpuctx->ctx;
>
> - ctx_sched_in(ctx, cpuctx, event_type, task);
> + ctx_sched_in(ctx, cpuctx, event_type, task, mux);
> }
>
> static void perf_event_context_sched_in(struct perf_event_context *ctx,
> struct task_struct *task)
> {
> struct perf_cpu_context *cpuctx;
> + int mux = 0;
>
> cpuctx = __get_cpu_context(ctx);
> if (cpuctx->task_ctx == ctx)
> @@ -3371,8 +3452,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
> * events, no need to flip the cpuctx's events around.
> */
> if (!RB_EMPTY_ROOT(&ctx->pinned_groups))
> - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
> - perf_event_sched_in(cpuctx, ctx, task);
> + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
> + perf_event_sched_in(cpuctx, ctx, task, mux);
> perf_pmu_enable(ctx->pmu);
>
> unlock:
> @@ -3618,7 +3699,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
> static int perf_rotate_context(struct perf_cpu_context *cpuctx)
> {
> struct perf_event_context *ctx = NULL;
> - int rotate = 0;
> + int rotate = 0, mux = 1;
>
> if (cpuctx->ctx.nr_events) {
> if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
> @@ -3637,15 +3718,15 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> perf_pmu_disable(cpuctx->ctx.pmu);
>
> - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
> + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
'mux' is always 1 here; just pass 1.