Re: [PATCH 2/2 v4] sched: Rewrite per entity runnable load average tracking
From: Peter Zijlstra
Date: Mon Jul 28 2014 - 09:51:39 EST
> +static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> {
> + int decayed;
>
> + if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> + long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> + cfs_rq->avg.load_avg = subtract_until_zero(cfs_rq->avg.load_avg, r);
> + r *= LOAD_AVG_MAX;
> + cfs_rq->avg.load_sum = subtract_until_zero(cfs_rq->avg.load_sum, r);
> }
>
> + decayed = __update_load_avg(now, &cfs_rq->avg, cfs_rq->load.weight);
>
> +#ifndef CONFIG_64BIT
> + if (cfs_rq->avg.last_update_time != cfs_rq->load_last_update_time_copy) {
> + smp_wmb();
> + cfs_rq->load_last_update_time_copy = cfs_rq->avg.last_update_time;
> + }
> +#endif
>
> + return decayed;
> +}
So on every cfs_rq update we first process the 'pending' removals, then
decay and then store the current timestamp.
> +static inline void enqueue_entity_load_avg(struct sched_entity *se)
> {
> + struct sched_avg *sa = &se->avg;
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> + u64 now = cfs_rq_clock_task(cfs_rq);
> + int migrated = 0, decayed;
>
> + if (sa->last_update_time == 0) {
> + sa->last_update_time = now;
>
> + if (entity_is_task(se))
> + migrated = 1;
> }
> + else
> + __update_load_avg(now, sa, se->on_rq * se->load.weight);
>
> + decayed = update_cfs_rq_load_avg(now, cfs_rq);
>
> + if (migrated) {
> + cfs_rq->avg.load_avg += sa->load_avg;
> + cfs_rq->avg.load_sum += sa->load_sum;
> }
>
> + if (decayed || migrated)
> + update_tg_load_avg(cfs_rq);
> }
On enqueue we add ourselves to the cfs_rq, and assume the entity is
'current' wrt updates, since we updated it when we just pulled it from
the old rq.
> @@ -4551,18 +4382,34 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
> {
> struct sched_entity *se = &p->se;
> struct cfs_rq *cfs_rq = cfs_rq_of(se);
> + u64 last_update_time;
>
> /*
> + * Task on old CPU catches up with its old cfs_rq, and subtract itself from
> + * the cfs_rq (task must be off the queue now).
> */
> +#ifndef CONFIG_64BIT
> + u64 last_update_time_copy;
> +
> + do {
> + last_update_time_copy = cfs_rq->load_last_update_time_copy;
> + smp_rmb();
> + last_update_time = cfs_rq->avg.last_update_time;
> + } while (last_update_time != last_update_time_copy);
> +#else
> + last_update_time = cfs_rq->avg.last_update_time;
> +#endif
> + __update_load_avg(last_update_time, &se->avg, 0);
> + atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
> +
> + /*
> + * We are supposed to update the task to "current" time, so that it is up
> + * to date and ready to go to the new CPU/cfs_rq. But we have difficulty
> + * in getting what the current time is, so simply throw away the
> + * out-of-date time. This will result in the wakee task being less
> + * decayed, but giving the wakee more load does not sound bad.
> + */
> + se->avg.last_update_time = 0;
>
> /* We have migrated, no longer consider this task hot */
> se->exec_start = 0;
And here we try to make good on that assumption. The thing I worry
about is what happens if the machine is entirely idle...
What guarantees a semi up-to-date cfs_rq->avg.last_update_time?
Attachment:
pgpbxmdOVYlPn.pgp
Description: PGP signature