Re: [PATCH v3 0/5] sync a se with its cfs_rq when att(det)aching it

From: Byungchul Park
Date: Thu Aug 20 2015 - 01:26:49 EST


On Thu, Aug 20, 2015 at 03:17:21AM +0200, Peter Zijlstra wrote:
>
> I did something like this on top... please have a look at the XXX and
> integrate.

Yes, I will do it.

>
> ---
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2664,8 +2664,8 @@ static inline u64 cfs_rq_clock_task(stru
> /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
> static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> {
> - int decayed;
> struct sched_avg *sa = &cfs_rq->avg;
> + int decayed;
>
> if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> @@ -2695,15 +2695,16 @@ static inline int update_cfs_rq_load_avg
> static inline void update_load_avg(struct sched_entity *se, int update_tg)
> {
> struct cfs_rq *cfs_rq = cfs_rq_of(se);
> - int cpu = cpu_of(rq_of(cfs_rq));
> u64 now = cfs_rq_clock_task(cfs_rq);
> + int cpu = cpu_of(rq_of(cfs_rq));
>
> /*
> * Track task load average for carrying it to new CPU after migrated, and
> * track group sched_entity load average for task_h_load calc in migration
> */
> __update_load_avg(now, cpu, &se->avg,
> - se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
> + se->on_rq * scale_load_down(se->load.weight),
> + cfs_rq->curr == se, NULL);
>
> if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
> update_tg_load_avg(cfs_rq, 0);
> @@ -2718,9 +2719,10 @@ static void attach_entity_load_avg(struc
> * update here. we have to update it with prev cfs_rq just before changing
> * se's cfs_rq, and get here soon.
> */
> - if (se->avg.last_update_time)
> + if (se->avg.last_update_time) {
> __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
> - &se->avg, 0, 0, NULL);
> + &se->avg, 0, 0, NULL);
> + }
>
> se->avg.last_update_time = cfs_rq->avg.last_update_time;
> cfs_rq->avg.load_avg += se->avg.load_avg;
> @@ -2732,17 +2734,13 @@ static void attach_entity_load_avg(struc
> static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
> - &se->avg, se->on_rq * scale_load_down(se->load.weight),
> - cfs_rq->curr == se, NULL);
> + &se->avg, se->on_rq * scale_load_down(se->load.weight),
> + cfs_rq->curr == se, NULL);
>
> - cfs_rq->avg.load_avg =
> - max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
> - cfs_rq->avg.load_sum =
> - max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
> - cfs_rq->avg.util_avg =
> - max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
> - cfs_rq->avg.util_sum =
> - max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
> + cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
> + cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
> + cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
> + cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
> }
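
A note while integrating: the max_t() clamps are still wanted here, I
think, because the per-entity and per-cfs_rq sums can drift apart (e.g.
rounding during decay), so the subtraction could otherwise go negative.
A minimal userspace sketch of the pattern (illustrative only;
sub_clamped() is a made-up helper, not a kernel function):

	#include <stdio.h>

	/* mirrors the max_t(long, a - b, 0) pattern above */
	static long sub_clamped(long a, long b)
	{
		long d = a - b;
		return d > 0 ? d : 0;
	}

	int main(void)
	{
		/* drift can make the se part (103) exceed the cfs_rq part (100) */
		printf("%ld\n", sub_clamped(100, 103)); /* prints 0, not -3 */
		return 0;
	}
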
>
> /* Add the load generated by se into cfs_rq's load average */
> @@ -2751,14 +2749,14 @@ enqueue_entity_load_avg(struct cfs_rq *c
> {
> struct sched_avg *sa = &se->avg;
> u64 now = cfs_rq_clock_task(cfs_rq);
> - int migrated = 0, decayed;
> + int migrated, decayed;
>
> - if (sa->last_update_time == 0)
> - migrated = 1;
> - else
> + migrated = !sa->last_update_time;
> + if (!migrated) {
> __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
> se->on_rq * scale_load_down(se->load.weight),
> cfs_rq->curr == se, NULL);
> + }
>
> decayed = update_cfs_rq_load_avg(now, cfs_rq);
>
> @@ -2781,7 +2779,7 @@ dequeue_entity_load_avg(struct cfs_rq *c
> cfs_rq->runnable_load_avg =
> max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
> cfs_rq->runnable_load_sum =
> - max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
> + max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
> }
>
> /*
> @@ -2849,6 +2847,11 @@ static inline void
> dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
> static inline void remove_entity_load_avg(struct sched_entity *se) {}
>
> +static inline void
> +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
> +static inline void
> +detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
> +
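
Makes sense: with the #ifdef CONFIG_SMP removed around the call sites in
detach_task_cfs_rq()/attach_task_cfs_rq() below, these !CONFIG_SMP stubs
are what keep the build working.
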
> static inline int idle_balance(struct rq *rq)
> {
> return 0;
> @@ -7915,29 +7918,50 @@ prio_changed_fair(struct rq *rq, struct
> check_preempt_curr(rq, p, 0);
> }
>
> -static void detach_task_cfs_rq(struct task_struct *p, int queued)
> +static inline bool vruntime_normalized(struct task_struct *p)
> {
> + int queued = task_on_rq_queued(p);
> struct sched_entity *se = &p->se;
> - struct cfs_rq *cfs_rq = cfs_rq_of(se);
>
> /*
> - * Ensure the task's vruntime is normalized, so that after attaching
> - * back it to a cfs_rq the enqueue_entity() will do the right thing.
> - *
> * If it's queued, then the dequeue_entity(.flags=0) will already
> - * have normalized the vruntime, if it's !queued, then only when
> - * the task is sleeping it still has a non-normalized vruntime.
> - *
> + * have normalized the vruntime.
> + */
> + if (queued)
> + return true;
> +
> + /*
> * When !queued, vruntime of the task has usually NOT been normalized.
> * But there are some cases where it has already been normalized:
> - *
> - * - Moving a forked child which is waiting for being woken up by
> + */
> +
> + /*
> + * - a forked child which is waiting to be woken up by
> * wake_up_new_task().
> - * - Moving a task which has been woken up by try_to_wake_up() and
> + */
> + if (!se->sum_exec_runtime || p->state == TASK_WAKING)
> + return true;
> +
> + /*
> + * - a task which has been woken up by try_to_wake_up() and
> * waiting for actually being woken up by sched_ttwu_pending().
> + *
> + * XXX this appears wrong!! Check the history: we appear to always
> + * set queued and RUNNING under the same lock instance; this case
> + * might be from before TASK_WAKING?
> */
> - if (!queued && p->state != TASK_RUNNING &&
> - se->sum_exec_runtime && p->state != TASK_WAKING) {
> + if (p->state == TASK_RUNNING)
> + return true;
> +
> + return false;
> +}
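
The consolidation into vruntime_normalized() looks good to me. Just to
double-check the invariant it captures, here is a small userspace sketch
of the detach/attach rebasing (the toy_* structs and helpers below are
simplified stand-ins, illustrative only, not the kernel types):

	#include <stdbool.h>
	#include <stdio.h>

	struct toy_cfs_rq { long min_vruntime; };
	struct toy_se     { long vruntime; };

	/* detach: make vruntime cfs_rq-independent, unless already so */
	static void toy_detach(struct toy_se *se, struct toy_cfs_rq *rq,
			       bool normalized)
	{
		if (!normalized)
			se->vruntime -= rq->min_vruntime;
	}

	/* attach: rebase the normalized vruntime onto the new cfs_rq */
	static void toy_attach(struct toy_se *se, struct toy_cfs_rq *rq,
			       bool normalized)
	{
		if (!normalized)
			se->vruntime += rq->min_vruntime;
	}

	int main(void)
	{
		struct toy_cfs_rq a = { .min_vruntime = 1000 };
		struct toy_cfs_rq b = { .min_vruntime = 5000 };
		struct toy_se se = { .vruntime = 1200 }; /* 200 ahead of a */

		toy_detach(&se, &a, false); /* 1200 - 1000 = 200 */
		toy_attach(&se, &b, false); /* 200 + 5000 = 5200 */
		printf("vruntime on b: %ld\n", se.vruntime);
		return 0;
	}

The relative lag against min_vruntime is what survives the move, which
is why the cases where vruntime is already normalized must skip both
adjustments symmetrically.
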
> +
> +static void detach_task_cfs_rq(struct task_struct *p)
> +{
> + struct sched_entity *se = &p->se;
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> + if (!vruntime_normalized(p)) {
> /*
> * Fix up our vruntime so that the current sleep doesn't
> * cause 'unlimited' sleep bonus.
> @@ -7946,10 +7970,8 @@ static void detach_task_cfs_rq(struct ta
> se->vruntime -= cfs_rq->min_vruntime;
> }
>
> -#ifdef CONFIG_SMP
> /* Catch up with the cfs_rq and remove our load when we leave */
> detach_entity_load_avg(cfs_rq, se);
> -#endif
> }
>
> -static void attach_task_cfs_rq(struct task_struct *p, int queued)
> +static void attach_task_cfs_rq(struct task_struct *p)
> @@ -7965,49 +7987,23 @@ static void attach_task_cfs_rq(struct ta
> se->depth = se->parent ? se->parent->depth + 1 : 0;
> #endif
>
> -#ifdef CONFIG_SMP
> /* synchronize task with its cfs_rq */
> attach_entity_load_avg(cfs_rq, se);
> -#endif
>
> - /*
> - * Ensure the task has a non-normalized vruntime when attaching it
> - * to a cfs_rq, so that enqueue_entity() at wake-up time will do
> - * the right thing.
> - *
> - * If it's queued, then the enqueue_entity(.flags=0) makes the task
> - * has non-normalized vruntime, if it's !queued, then it still has
> - * a normalized vruntime.
> - *
> - * When !queued, a task usually should have a non-normalized vruntime
> - * after being attached to a cfs_rq. But there are some cases where
> - * we should keep it normalized:
> - *
> - * - Moving a forked child which is waiting for being woken up by
> - * wake_up_new_task().
> - * - Moving a task which has been woken up by try_to_wake_up() and
> - * waiting for actually being woken up by sched_ttwu_pending().
> - */
> - if (!queued && p->state != TASK_RUNNING &&
> - se->sum_exec_runtime && p->state != TASK_WAKING)
> + if (!vruntime_normalized(p))
> se->vruntime += cfs_rq->min_vruntime;
> }
>
> static void switched_from_fair(struct rq *rq, struct task_struct *p)
> {
> - detach_task_cfs_rq(p, task_on_rq_queued(p));
> + detach_task_cfs_rq(p);
> }
>
> -/*
> - * We switched to the sched_fair class.
> - */
> static void switched_to_fair(struct rq *rq, struct task_struct *p)
> {
> - int queued = task_on_rq_queued(p);
> -
> - attach_task_cfs_rq(p, queued);
> + attach_task_cfs_rq(p);
>
> - if (queued) {
> + if (task_on_rq_queued(p)) {
> /*
> * We were most likely switched from sched_rt, so
> * kick off the schedule if running, otherwise just see
> @@ -8056,8 +8052,7 @@ static void task_move_group_fair(struct
> {
> - detach_task_cfs_rq(p, queued);
> + detach_task_cfs_rq(p);
> set_task_rq(p, task_cpu(p));
> -
> - /* tell se's cfs_rq has been changed */
> + /* tell se's cfs_rq has been changed -- migrated */
> p->se.avg.last_update_time = 0;
> - attach_task_cfs_rq(p, queued);
> + attach_task_cfs_rq(p);
> }
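
On this last hunk: zeroing last_update_time matches what
enqueue_entity_load_avg() keys on above (migrated =
!sa->last_update_time), and attach_entity_load_avg() skips the catch-up
decay for exactly that case. Roughly, the convention is (a sketch, not
the kernel code; the toy_* names are made up):

	#include <stdint.h>

	struct toy_avg { uint64_t last_update_time; };

	/* stand-in for __update_load_avg()'s catch-up decay */
	static void toy_decay_to(struct toy_avg *sa, uint64_t now)
	{
		sa->last_update_time = now;
	}

	static void toy_attach_avg(struct toy_avg *sa, uint64_t rq_time)
	{
		/*
		 * last_update_time == 0 means "cfs_rq changed / migrated":
		 * there is no old timestamp to decay against, so only
		 * sync the clock and add the load as-is.
		 */
		if (sa->last_update_time)
			toy_decay_to(sa, rq_time);

		sa->last_update_time = rq_time;
	}

	int main(void)
	{
		struct toy_avg sa = { .last_update_time = 0 }; /* just migrated */
		toy_attach_avg(&sa, 12345); /* no decay, clock synced */
		return 0;
	}
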