Re: [PATCH v7 2/2] perf/core: add mux switch to skip to the current CPU's events list on mux interrupt
From: Alexey Budankov
Date: Wed Aug 23 2017 - 14:12:44 EST
On 23.08.2017 14:54, Alexander Shishkin wrote:
> Alexey Budankov <alexey.budankov@xxxxxxxxxxxxxxx> writes:
>
>> This patch implements mux switch that triggers skipping to the
>> current CPU's events list at mulitplexing hrtimer interrupt
>> handler as well as adoption of the switch in the existing
>> implementation.
>>
>> perf_event_groups_iterate_cpu() API is introduced to implement
>> iteration thru the certain CPU groups list skipping groups
>
> "through"
>
>> allocated for the other CPUs.
>>
>> Signed-off-by: Alexey Budankov <alexey.budankov@xxxxxxxxxxxxxxx>
>> ---
>> kernel/events/core.c | 193 ++++++++++++++++++++++++++++++++++++---------------
>> 1 file changed, 137 insertions(+), 56 deletions(-)
>>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 08ccfb2..aeb0f81 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -556,11 +556,11 @@ void perf_sample_event_took(u64 sample_len_ns)
>> static atomic64_t perf_event_id;
>>
>> static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
>> - enum event_type_t event_type);
>> + enum event_type_t event_type, int mux);
>>
>> static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
>> enum event_type_t event_type,
>> - struct task_struct *task);
>> + struct task_struct *task, int mux);
>>
>> static void update_context_time(struct perf_event_context *ctx);
>> static u64 perf_event_time(struct perf_event *event);
>> @@ -702,6 +702,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>> struct perf_cpu_context *cpuctx;
>> struct list_head *list;
>> unsigned long flags;
>> + int mux = 0;
>>
>> /*
>> * Disable interrupts and preemption to avoid this CPU's
>> @@ -717,7 +718,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>> perf_pmu_disable(cpuctx->ctx.pmu);
>>
>> if (mode & PERF_CGROUP_SWOUT) {
>> - cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>> + cpu_ctx_sched_out(cpuctx, EVENT_ALL, mux);
>> /*
>> * must not be done before ctxswout due
>> * to event_filter_match() in event_sched_out()
>> @@ -736,7 +737,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>> */
>> cpuctx->cgrp = perf_cgroup_from_task(task,
>> &cpuctx->ctx);
>> - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
>> + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, mux);
>
> 'mux' is always zero in this function, isn't it?
>
>> }
>> perf_pmu_enable(cpuctx->ctx.pmu);
>> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> @@ -1613,8 +1614,16 @@ perf_event_groups_rotate(struct rb_root *groups, int cpu)
>> */
>> #define perf_event_groups_for_each(event, iter, tree, node, list, link) \
>> for (iter = rb_first(tree); iter; iter = rb_next(iter)) \
>> - list_for_each_entry(event, &(rb_entry(iter, \
>> - typeof(*event), node)->list), link)
>> + list_for_each_entry(event, &(rb_entry(iter, \
>> + typeof(*event), node)->list), link)
>
> Is this an indentation change? What is it doing here?
>
>> +
>> +/*
>> + * Iterate event groups related to specific cpu.
>> + */
>> +#define perf_event_groups_for_each_cpu(event, cpu, tree, list, link) \
>> + list = perf_event_groups_get_list(tree, cpu); \
>> + if (list) \
>> + list_for_each_entry(event, list, link)
>
> ..or not, if there's no list.
>
>>
>> /*
>> * Add a event from the lists for its context.
>> @@ -2397,36 +2406,38 @@ static void add_event_to_ctx(struct perf_event *event,
>>
>> static void ctx_sched_out(struct perf_event_context *ctx,
>> struct perf_cpu_context *cpuctx,
>> - enum event_type_t event_type);
>> + enum event_type_t event_type, int mux);
>> static void
>> ctx_sched_in(struct perf_event_context *ctx,
>> struct perf_cpu_context *cpuctx,
>> enum event_type_t event_type,
>> - struct task_struct *task);
>> + struct task_struct *task, int mux);
>>
>> static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
>> struct perf_event_context *ctx,
>> enum event_type_t event_type)
>> {
>> + int mux = 0;
>> +
>> if (!cpuctx->task_ctx)
>> return;
>>
>> if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
>> return;
>>
>> - ctx_sched_out(ctx, cpuctx, event_type);
>> + ctx_sched_out(ctx, cpuctx, event_type, mux);
>
> Just use 0.
Well, I intentionally made this ugly switch named over local variable
to add clarity into the code so I simply wonder - why do you prefer
the unnamed variant?
>
>> }
>>
>> static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
>> struct perf_event_context *ctx,
>> - struct task_struct *task)
>> + struct task_struct *task, int mux)
>> {
>> - cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
>> + cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task, mux);
>> if (ctx)
>> - ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
>> - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
>> + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, mux);
>> + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, mux);
>> if (ctx)
>> - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
>> + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, mux);
>> }
>>
>> /*
>> @@ -2450,6 +2461,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>> {
>> enum event_type_t ctx_event_type = event_type & EVENT_ALL;
>> bool cpu_event = !!(event_type & EVENT_CPU);
>> + int mux = 0;
>>
>> /*
>> * If pinned groups are involved, flexible groups also need to be
>> @@ -2470,11 +2482,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>> * - otherwise, do nothing more.
>> */
>> if (cpu_event)
>> - cpu_ctx_sched_out(cpuctx, ctx_event_type);
>> + cpu_ctx_sched_out(cpuctx, ctx_event_type, mux);
>> else if (ctx_event_type & EVENT_PINNED)
>> - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>> + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
>>
>> - perf_event_sched_in(cpuctx, task_ctx, current);
>> + perf_event_sched_in(cpuctx, task_ctx, current, mux);
>
> Also mux==0 in all cases in this function.
>
>> perf_pmu_enable(cpuctx->ctx.pmu);
>> }
>>
>> @@ -2491,7 +2503,7 @@ static int __perf_install_in_context(void *info)
>> struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
>> struct perf_event_context *task_ctx = cpuctx->task_ctx;
>> bool reprogram = true;
>> - int ret = 0;
>> + int ret = 0, mux =0;
>>
>> raw_spin_lock(&cpuctx->ctx.lock);
>> if (ctx->task) {
>> @@ -2518,7 +2530,7 @@ static int __perf_install_in_context(void *info)
>> }
>>
>> if (reprogram) {
>> - ctx_sched_out(ctx, cpuctx, EVENT_TIME);
>> + ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
>> add_event_to_ctx(event, ctx);
>> ctx_resched(cpuctx, task_ctx, get_event_type(event));
>> } else {
>> @@ -2655,13 +2667,14 @@ static void __perf_event_enable(struct perf_event *event,
>> {
>> struct perf_event *leader = event->group_leader;
>> struct perf_event_context *task_ctx;
>> + int mux = 0;
>>
>> if (event->state >= PERF_EVENT_STATE_INACTIVE ||
>> event->state <= PERF_EVENT_STATE_ERROR)
>> return;
>>
>> if (ctx->is_active)
>> - ctx_sched_out(ctx, cpuctx, EVENT_TIME);
>> + ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
>>
>> __perf_event_mark_enabled(event);
>>
>> @@ -2671,7 +2684,7 @@ static void __perf_event_enable(struct perf_event *event,
>> if (!event_filter_match(event)) {
>> if (is_cgroup_event(event))
>> perf_cgroup_defer_enabled(event);
>> - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
>> + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
>> return;
>> }
>>
>> @@ -2680,7 +2693,7 @@ static void __perf_event_enable(struct perf_event *event,
>> * then don't put it on unless the group is on.
>> */
>> if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
>> - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
>> + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
>
> And here.
>
>> return;
>> }
>>
>> @@ -2876,11 +2889,13 @@ EXPORT_SYMBOL_GPL(perf_event_refresh);
>>
>> static void ctx_sched_out(struct perf_event_context *ctx,
>> struct perf_cpu_context *cpuctx,
>> - enum event_type_t event_type)
>> + enum event_type_t event_type, int mux)
>> {
>> int is_active = ctx->is_active;
>> + struct list_head *group_list;
>> struct perf_event *event;
>> struct rb_node *node;
>> + int sw = -1, cpu = smp_processor_id();
>
> Same thing seems to be happening with 'sw'.
>
>> lockdep_assert_held(&ctx->lock);
>>
>> if (likely(!ctx->nr_events)) {
>> @@ -2926,17 +2941,47 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>>
>> perf_pmu_disable(ctx->pmu);
>>
>> - if (is_active & EVENT_PINNED)
>> - perf_event_groups_for_each(event, node,
>> - &ctx->pinned_groups, group_node,
>> - group_list, group_entry)
>> - group_sched_out(event, cpuctx, ctx);
>> + if (is_active & EVENT_PINNED) {
>> + if (mux) {
>
> So it's 'rotate', really.
Which 'rotate' do you mean? If the local variable from
perf_rotate_context() then yes - it may be passed to this function
call from there as mux value, but logically they are still different.
>
>> + perf_event_groups_for_each_cpu(event, cpu,
>> + &ctx->pinned_groups,
>> + group_list, group_entry) {
>> + group_sched_out(event, cpuctx, ctx);
>> + }
>> + perf_event_groups_for_each_cpu(event, sw,
>> + &ctx->pinned_groups,
>> + group_list, group_entry) {
>> + group_sched_out(event, cpuctx, ctx);
>> + }
>> + } else {
>> + perf_event_groups_for_each(event, node,
>> + &ctx->pinned_groups, group_node,
>> + group_list, group_entry) {
>> + group_sched_out(event, cpuctx, ctx);
>> + }
>> + }
>> + }
>>
>> - if (is_active & EVENT_FLEXIBLE)
>> - perf_event_groups_for_each(event, node,
>> - &ctx->flexible_groups, group_node,
>> - group_list, group_entry)
>> - group_sched_out(event, cpuctx, ctx);
>> + if (is_active & EVENT_FLEXIBLE) {
>> + if (mux) {
>> + perf_event_groups_for_each_cpu(event, cpu,
>> + &ctx->flexible_groups,
>> + group_list, group_entry) {
>> + group_sched_out(event, cpuctx, ctx);
>> + }
>> + perf_event_groups_for_each_cpu(event, sw,
>> + &ctx->flexible_groups,
>> + group_list, group_entry) {
>> + group_sched_out(event, cpuctx, ctx);
>> + }
>> + } else {
>> + perf_event_groups_for_each(event, node,
>> + &ctx->flexible_groups, group_node,
>> + group_list, group_entry) {
>> + group_sched_out(event, cpuctx, ctx);
>> + }
>> + }
>> + }
>>
>> perf_pmu_enable(ctx->pmu);
>> }
>> @@ -3225,9 +3270,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
>> * Called with IRQs disabled
>> */
>> static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
>> - enum event_type_t event_type)
>> + enum event_type_t event_type, int mux)
>> {
>> - ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
>> + ctx_sched_out(&cpuctx->ctx, cpuctx, event_type, mux);
>> }
>>
>> static void
>> @@ -3287,11 +3332,13 @@ static void
>> ctx_sched_in(struct perf_event_context *ctx,
>> struct perf_cpu_context *cpuctx,
>> enum event_type_t event_type,
>> - struct task_struct *task)
>> + struct task_struct *task, int mux)
>> {
>> int is_active = ctx->is_active;
>> + struct list_head *group_list;
>> struct perf_event *event;
>> struct rb_node *node;
>> + int sw = -1, cpu = smp_processor_id();
>>
>> lockdep_assert_held(&ctx->lock);
>>
>> @@ -3319,35 +3366,69 @@ ctx_sched_in(struct perf_event_context *ctx,
>> * First go through the list and put on any pinned groups
>> * in order to give them the best chance of going on.
>> */
>> - if (is_active & EVENT_PINNED)
>> - perf_event_groups_for_each(event, node,
>> - &ctx->pinned_groups, group_node,
>> - group_list, group_entry)
>> - ctx_pinned_sched_in(event, cpuctx, ctx);
>> + if (is_active & EVENT_PINNED) {
>> + if (mux) {
>> + perf_event_groups_for_each_cpu(event, sw,
>> + &ctx->pinned_groups,
>> + group_list, group_entry) {
>> + ctx_pinned_sched_in(event, cpuctx, ctx);
>> + }
>> + perf_event_groups_for_each_cpu(event, cpu,
>> + &ctx->pinned_groups,
>> + group_list, group_entry) {
>> + ctx_pinned_sched_in(event, cpuctx, ctx);
>> + }
>> + } else {
>> + perf_event_groups_for_each(event, node,
>> + &ctx->pinned_groups, group_node,
>> + group_list, group_entry) {
>> + ctx_pinned_sched_in(event, cpuctx, ctx);
>> + }
>> + }
>> + }
>>
>> /* Then walk through the lower prio flexible groups */
>> if (is_active & EVENT_FLEXIBLE) {
>> int can_add_hw = 1;
>> - perf_event_groups_for_each(event, node,
>> - &ctx->flexible_groups, group_node,
>> - group_list, group_entry)
>> - ctx_flexible_sched_in(event, cpuctx, ctx, &can_add_hw);
>> + if (mux) {
>> + perf_event_groups_for_each_cpu(event, sw,
>> + &ctx->flexible_groups,
>> + group_list, group_entry) {
>> + ctx_flexible_sched_in(event, cpuctx,
>> + ctx, &can_add_hw);
>> + }
>> + can_add_hw = 1;
>> + perf_event_groups_for_each_cpu(event, cpu,
>> + &ctx->flexible_groups,
>> + group_list, group_entry) {
>> + ctx_flexible_sched_in(event, cpuctx,
>> + ctx, &can_add_hw);
>> + }
>> + } else {
>> + perf_event_groups_for_each(event, node,
>> + &ctx->flexible_groups, group_node,
>> + group_list, group_entry) {
>> + ctx_flexible_sched_in(event, cpuctx,
>> + ctx, &can_add_hw);
>> + }
>> + }
>> }
>> }
>>
>> static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
>> enum event_type_t event_type,
>> - struct task_struct *task)
>> + struct task_struct *task, int mux)
>> {
>> struct perf_event_context *ctx = &cpuctx->ctx;
>>
>> - ctx_sched_in(ctx, cpuctx, event_type, task);
>> + ctx_sched_in(ctx, cpuctx, event_type, task, mux);
>> }
>>
>> static void perf_event_context_sched_in(struct perf_event_context *ctx,
>> struct task_struct *task)
>> {
>> struct perf_cpu_context *cpuctx;
>> + int mux = 0;
>>
>> cpuctx = __get_cpu_context(ctx);
>> if (cpuctx->task_ctx == ctx)
>> @@ -3371,8 +3452,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
>> * events, no need to flip the cpuctx's events around.
>> */
>> if (!RB_EMPTY_ROOT(&ctx->pinned_groups))
>> - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>> - perf_event_sched_in(cpuctx, ctx, task);
>> + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
>> + perf_event_sched_in(cpuctx, ctx, task, mux);
>> perf_pmu_enable(ctx->pmu);
>>
>> unlock:
>> @@ -3618,7 +3699,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
>> static int perf_rotate_context(struct perf_cpu_context *cpuctx)
>> {
>> struct perf_event_context *ctx = NULL;
>> - int rotate = 0;
>> + int rotate = 0, mux = 1;
>>
>> if (cpuctx->ctx.nr_events) {
>> if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
>> @@ -3637,15 +3718,15 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
>> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>> perf_pmu_disable(cpuctx->ctx.pmu);
>>
>> - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>> + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
>
> It's '1'.
>
>