Re: [BUG] 2.6.37-rc3 massive interactivity regression on ARM

From: Venkatesh Pallipadi
Date: Thu Dec 09 2010 - 12:43:46 EST


On Thu, Dec 9, 2010 at 4:52 AM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> On Wed, 2010-12-08 at 15:31 -0800, Venkatesh Pallipadi wrote:
>
>> [PATCH] update_rq_clock() with irq_time should handle 64 bit wraparound
>>
>> update_rq_clock with IRQ_TIME_ACCOUNTING assumed 64 bit sched_clock that
>> practically will not wraparound. Change to code to handle wraparounds,
>> still preventing rq->clock_task from going backwards.
>
> All we should do is deal with the 64wrap, not filter backward motion, I
> came up with something like the below.

The original intent of this check
- if (rq->clock - irq_time > rq->clock_task)
was to filter the potential backward motion. As otherwise, downstream
users of clock_task can see
huge positive numbers from curr - prev calculations.

The same problem will be there with below code, with irq_delta >
delta, clock_task can go backwards which is not good.
+ delta -= irq_delta;
+ rq->clock_task += delta;

The reason for this is rq->clock and irqtime updates kind of happen
independently and specifically, if a rq->clock update happens while we
are in a softirq, we may have this case of going backwards on the next
update.

>
> I think for now the best solution to the early wrap problem for ARM is
> for them to select HAVE_UNSTABLE_SCHED_CLOCK, it will mostly deal with
> the short wrap by filtering out the occasional negative value.
>
> Then later we can look at cleaning/breaking-up the kernel/sched_clock.c
> code to provide smaller bits of functionality and possibly merging it
> with some of the clocksource code.
>

Yes. That should resolve the early wrap.


Thanks,
Venki

> ---
>  kernel/sched.c |   54 +++++++++++++++++++++++++-----------------------------
>  1 files changed, 25 insertions(+), 29 deletions(-)
>
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 8e885c1..0fb7de8 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -642,23 +642,19 @@ static inline struct task_group *task_group(struct task_struct *p)
>
>  #endif /* CONFIG_CGROUP_SCHED */
>
> -static u64 irq_time_cpu(int cpu);
> -static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
> +static void update_rq_clock_task(struct rq *rq, s64 delta);
>
>  inline void update_rq_clock(struct rq *rq)
>  {
> -       int cpu = cpu_of(rq);
> -       u64 irq_time;
> +       s64 delta;
>
>        if (rq->skip_clock_update)
>                return;
>
> -       rq->clock = sched_clock_cpu(cpu);
> -       irq_time = irq_time_cpu(cpu);
> -       if (rq->clock - irq_time > rq->clock_task)
> -               rq->clock_task = rq->clock - irq_time;
> +       delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
> +       rq->clock += delta;
> +       update_rq_clock_task(rq, delta);
>
> -       sched_irq_time_avg_update(rq, irq_time);
>  }
>
>  /*
> @@ -1817,14 +1813,6 @@ void disable_sched_clock_irqtime(void)
>        sched_clock_irqtime = 0;
>  }
>
> -static u64 irq_time_cpu(int cpu)
> -{
> -       if (!sched_clock_irqtime)
> -               return 0;
> -
> -       return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
> -}
> -
>  void account_system_vtime(struct task_struct *curr)
>  {
>        unsigned long flags;
> @@ -1855,25 +1843,33 @@ void account_system_vtime(struct task_struct *curr)
>  }
>  EXPORT_SYMBOL_GPL(account_system_vtime);
>
> -static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
> +static inline u64 irq_time_cpu(int cpu)
>  {
> -       if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
> -               u64 delta_irq = curr_irq_time - rq->prev_irq_time;
> -               rq->prev_irq_time = curr_irq_time;
> -               sched_rt_avg_update(rq, delta_irq);
> -       }
> +       return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
>  }
>
> -#else
> -
> -static u64 irq_time_cpu(int cpu)
> +static void update_rq_clock_task(struct rq *rq, s64 delta)
>  {
> -       return 0;
> +       s64 irq_delta;
> +
> +       irq_delta = irq_time_cpu(cpu_of(rq)) - rq->prev_irq_time;
> +       rq->prev_irq_time += irq_delta;
> +
> +       delta -= irq_delta;
> +       rq->clock_task += delta;
> +
> +       if (irq_delta && sched_feat(NONIRQ_POWER))
> +               sched_rt_avg_update(rq, irq_delta);
>  }
>
> -static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
> +#else /* CONFIG_IRQ_TIME_ACCOUNTING */
>
> -#endif
> +static inline void update_rq_clock_task(struct rq *rq, s64 delta)
> +{
> +       rq->clock_task += delta;
> +}
> +
> +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
>
>  #include "sched_idletask.c"
>  #include "sched_fair.c"
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/