Re: [PATCH] perf: Fix mux_interval hrtimer wreckage

From: Stephane Eranian
Date: Tue May 27 2014 - 10:09:53 EST


On Tue, May 20, 2014 at 11:02 AM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> Subject: perf: Fix mux_interval hrtimer wreckage
> From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Date: Tue May 20 10:09:32 CEST 2014
>
> Thomas stumbled over the hrtimer_forward_now() in
> perf_event_mux_interval_ms_store() and noticed its broken-ness.
>
> You cannot just change the expiry time of an active timer, it will
> destroy the red-black tree order and cause havoc.
>
> Change it to (re)start the timer instead, (re)starting a timer will
> dequeue and enqueue a timer and therefore preserve rb-tree order.
>
> Since we cannot enqueue remotely, wrap the thing in
> cpu_function_call(), this however mandates that we restrict ourselves
> to online cpus. Also serialize the entire setting so we don't get
> multiple concurrent threads trying to update to different values.
>
> Also fix a problem in perf_mux_hrtimer_restart(), checking against
> hrtimer_active() can actually loose us the timer when timer->state ==
> HRTIMER_STATE_CALLBACK and the callback has already decided NORESTART.
>
> Furthermore it doesn't make any sense to test
> hrtimer_callback_running() when we already tested hrtimer_active(),
> but with the above change, we explicitly must call it when
> callback_running.
>
> Lastly, rename a few functions:
>
> s/perf_cpu_hrtimer_/perf_mux_hrtimer_/ -- because I could not find
> the mux timer function
>
> s/\<hr\>/timer/ -- because that's the normal way of calling things.
>
> Fixes: 62b856397927 ("perf: Add sysfs entry to adjust multiplexing interval per PMU")
> Cc: Stephane Eranian <eranian@xxxxxxxxxx>
> Reported-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Signed-off-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Link: http://lkml.kernel.org/n/tip-ife5kqgnt7mviatc9fakz8wk@xxxxxxxxxxxxxx

So, I tested this patch on tip.git and it panics my kernels as soon as
I multiplex
events. For instance running:
$ perf stat -e cycles,cycles,cycles,cycles,cycles,cycles dd
if=/dev/urandom of=/dev/null count=10000000

It dies as shown below, so something is wrong in the setup of that hrtimer now.

[ 101.092337] RIP: 0010:[<ffffffffac06afad>] [<ffffffffac06afad>]
__run_hrtimer.isra.31+0x9d/0x100
[ 101.101231] RSP: 0000:ffff88021fa43ed8 EFLAGS: 00010002
[ 101.106537] RAX: 00000000cf46cf46 RBX: ffff88021fa55718 RCX: 0000000000000018
[ 101.113664] RDX: 000000000000cf46 RSI: 000000177bc14ea6 RDI: ffff88021fa4d0a0
[ 101.120807] RBP: ffff88021fa43ef8 R08: 000000177bfe4df9 R09: 00000000000004c4
[ 101.127949] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88021fa4d0a0
[ 101.135083] R13: ffff88021fa4d0e0 R14: 0000000000000001 R15: 7fffffffffffffff
[ 101.142201] FS: 0000000001e7e860(0063) GS:ffff88021fa40000(0000)
knlGS:0000000000000000
[ 101.150289] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 101.156035] CR2: 00007f1139eb9000 CR3: 0000000215d84000 CR4: 00000000000407e0
[ 101.163154] Stack:
[ 101.165171] ffff88021fa4d0e0 000000177bc12e1e ffff88021fa4d0a0
0000000000000001
[ 101.172652] ffff88021fa43f68 ffffffffac06b615 000000177bc12e1e
ffff88021fa4d1d8
[ 101.180113] ffff88021fa4d198 ffff88021fa4d158 0000000300000002
000000177bc12e1e
[ 101.187577] Call Trace:
[ 101.190020] <IRQ>
[ 101.191942] [<ffffffffac06b615>] hrtimer_interrupt+0xf5/0x230
[ 101.197986] [<ffffffffac02fd76>] local_apic_timer_interrupt+0x36/0x60
[ 101.204507] [<ffffffffac0303ee>] smp_apic_timer_interrupt+0x3e/0x60
[ 101.210864] [<ffffffffac5e068a>] apic_timer_interrupt+0x6a/0x70

> ---
> kernel/events/core.c | 67 ++++++++++++++++++++++++++++-----------------------
> 1 file changed, 38 insertions(+), 29 deletions(-)
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -45,9 +45,11 @@
>
> #include <asm/irq_regs.h>
>
> +typedef int (*remote_function_f)(void *);
> +
> struct remote_function_call {
> struct task_struct *p;
> - int (*func)(void *info);
> + remote_function_f func;
> void *info;
> int ret;
> };
> @@ -80,7 +82,7 @@ static void remote_function(void *data)
> * -EAGAIN - when the process moved away
> */
> static int
> -task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
> +task_function_call(struct task_struct *p, remote_function_f func, void *info)
> {
> struct remote_function_call data = {
> .p = p,
> @@ -104,7 +106,7 @@ task_function_call(struct task_struct *p
> *
> * returns: @func return value or -ENXIO when the cpu is offline
> */
> -static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
> +static int cpu_function_call(int cpu, remote_function_f func, void *info)
> {
> struct remote_function_call data = {
> .p = NULL,
> @@ -759,7 +761,7 @@ perf_cgroup_mark_enabled(struct perf_eve
> /*
> * function must be called with interrupts disbled
> */
> -static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
> +static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
> {
> struct perf_cpu_context *cpuctx;
> enum hrtimer_restart ret = HRTIMER_NORESTART;
> @@ -783,7 +785,7 @@ static enum hrtimer_restart perf_cpu_hrt
> }
>
> /* CPU is going down */
> -void perf_cpu_hrtimer_cancel(int cpu)
> +void perf_mux_hrtimer_cancel(int cpu)
> {
> struct perf_cpu_context *cpuctx;
> struct pmu *pmu;
> @@ -810,11 +812,11 @@ void perf_cpu_hrtimer_cancel(int cpu)
> local_irq_restore(flags);
> }
>
> -static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
> +static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
> {
> - struct hrtimer *hr = &cpuctx->hrtimer;
> + struct hrtimer *timer = &cpuctx->hrtimer;
> struct pmu *pmu = cpuctx->ctx.pmu;
> - int timer;
> + u64 interval;
>
> /* no multiplexing needed for SW PMU */
> if (pmu->task_ctx_nr == perf_sw_context)
> @@ -824,31 +826,32 @@ static void __perf_cpu_hrtimer_init(stru
> * check default is sane, if not set then force to
> * default interval (1/tick)
> */
> - timer = pmu->hrtimer_interval_ms;
> - if (timer < 1)
> - timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
> + interval = pmu->hrtimer_interval_ms;
> + if (interval < 1)
> + interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
>
> - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
> + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
>
> - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
> - hr->function = perf_cpu_hrtimer_handler;
> + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
> + timer->function = perf_mux_hrtimer_handler;
> }
>
> -static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
> +static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
> {
> - struct hrtimer *hr = &cpuctx->hrtimer;
> + struct hrtimer *timer = &cpuctx->hrtimer;
> struct pmu *pmu = cpuctx->ctx.pmu;
>
> /* not for SW PMU */
> if (pmu->task_ctx_nr == perf_sw_context)
> - return;
> + return 0;
>
> - if (hrtimer_active(hr))
> - return;
> + if (hrtimer_is_queued(timer))
> + return 0;
>
> - if (!hrtimer_callback_running(hr))
> - __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
> - 0, HRTIMER_MODE_REL_PINNED, 0);
> + __hrtimer_start_range_ns(timer, cpuctx->hrtimer_interval,
> + 0, HRTIMER_MODE_REL_PINNED, 0);
> +
> + return 0;
> }
>
> void perf_pmu_disable(struct pmu *pmu)
> @@ -1746,7 +1749,7 @@ group_sched_in(struct perf_event *group_
>
> if (event_sched_in(group_event, cpuctx, ctx)) {
> pmu->cancel_txn(pmu);
> - perf_cpu_hrtimer_restart(cpuctx);
> + perf_mux_hrtimer_restart(cpuctx);
> return -EAGAIN;
> }
>
> @@ -1793,7 +1796,7 @@ group_sched_in(struct perf_event *group_
>
> pmu->cancel_txn(pmu);
>
> - perf_cpu_hrtimer_restart(cpuctx);
> + perf_mux_hrtimer_restart(cpuctx);
>
> return -EAGAIN;
> }
> @@ -2061,7 +2064,7 @@ static int __perf_event_enable(void *inf
> */
> if (leader != event) {
> group_sched_out(leader, cpuctx, ctx);
> - perf_cpu_hrtimer_restart(cpuctx);
> + perf_mux_hrtimer_restart(cpuctx);
> }
> if (leader->attr.pinned) {
> update_group_times(leader);
> @@ -6396,6 +6399,8 @@ perf_event_mux_interval_ms_show(struct d
> return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
> }
>
> +static DEFINE_MUTEX(mux_interval_mutex);
> +
> static ssize_t
> perf_event_mux_interval_ms_store(struct device *dev,
> struct device_attribute *attr,
> @@ -6415,17 +6420,21 @@ perf_event_mux_interval_ms_store(struct
> if (timer == pmu->hrtimer_interval_ms)
> return count;
>
> + mutex_lock(&mux_interval_mutex);
> pmu->hrtimer_interval_ms = timer;
>
> /* update all cpuctx for this PMU */
> - for_each_possible_cpu(cpu) {
> + get_online_cpus();
> + for_each_online_cpu(cpu) {
> struct perf_cpu_context *cpuctx;
> cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
> cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
>
> - if (hrtimer_active(&cpuctx->hrtimer))
> - hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
> + cpu_function_call(cpu,
> + (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
> }
> + put_online_cpus();
> + mutex_unlock(&mux_interval_mutex);
>
> return count;
> }
> @@ -6531,7 +6540,7 @@ int perf_pmu_register(struct pmu *pmu, c
> cpuctx->ctx.type = cpu_context;
> cpuctx->ctx.pmu = pmu;
>
> - __perf_cpu_hrtimer_init(cpuctx, cpu);
> + __perf_mux_hrtimer_init(cpuctx, cpu);
>
> INIT_LIST_HEAD(&cpuctx->rotation_list);
> cpuctx->unique_pmu = pmu;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/