Re: [RFC][PATCH 1/3] sched: Rewrite tg_shares_up

From: Paul Turner
Date: Thu Sep 02 2010 - 23:10:28 EST


On Sat, Aug 28, 2010 at 11:30 PM, Peter Zijlstra <a.p.zijlstra@xxxxxxxxx> wrote:
> By tracking a per-cpu load-avg for each cfs_rq and folding it into a
> global task_group load on each tick we can rework tg_shares_up to be
> strictly per-cpu.
>
> This should improve cpu-cgroup performance for smp systems
> significantly.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
> ---
>  include/linux/sched.h   |    2
>  kernel/sched.c          |  173 ++++++++++++------------------------------------
>  kernel/sched_debug.c    |   11 ++-
>  kernel/sched_fair.c     |  153 ++++++++++++++++++++++++++----------------
>  kernel/sched_features.h |    2
>  kernel/sysctl.c         |   17 ----
>  6 files changed, 147 insertions(+), 211 deletions(-)
>
> Index: linux-2.6/include/linux/sched.h
> ===================================================================
> --- linux-2.6.orig/include/linux/sched.h
> +++ linux-2.6/include/linux/sched.h
> @@ -1858,8 +1858,6 @@ static inline void wake_up_idle_cpu(int
>  extern unsigned int sysctl_sched_latency;
>  extern unsigned int sysctl_sched_min_granularity;
>  extern unsigned int sysctl_sched_wakeup_granularity;
> -extern unsigned int sysctl_sched_shares_ratelimit;
> -extern unsigned int sysctl_sched_shares_thresh;
>  extern unsigned int sysctl_sched_child_runs_first;
>
>  enum sched_tunable_scaling {
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -253,6 +253,8 @@ struct task_group {
>        /* runqueue "owned" by this group on each cpu */
>        struct cfs_rq **cfs_rq;
>        unsigned long shares;
> +
> +       atomic_t load_weight;
>  #endif
>
>  #ifdef CONFIG_RT_GROUP_SCHED
> @@ -359,15 +361,11 @@ struct cfs_rq {
>         */
>        unsigned long h_load;
>
> -       /*
> -        * this cpu's part of tg->shares
> -        */
> -       unsigned long shares;
> +       u64 load_avg;
> +       u64 load_period;
> +       u64 load_stamp;
>
> -       /*
> -        * load.weight at the time we set shares
> -        */
> -       unsigned long rq_weight;
> +       unsigned long load_contribution;
>  #endif
>  #endif
>  };
> @@ -793,20 +791,6 @@ late_initcall(sched_init_debug);
>  const_debug unsigned int sysctl_sched_nr_migrate = 32;
>
>  /*
> - * ratelimit for updating the group shares.
> - * default: 0.25ms
> - */
> -unsigned int sysctl_sched_shares_ratelimit = 250000;
> -unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
> -
> -/*
> - * Inject some fuzzyness into changing the per-cpu group shares
> - * this avoids remote rq-locks at the expense of fairness.
> - * default: 4
> - */
> -unsigned int sysctl_sched_shares_thresh = 4;
> -
> -/*
>  * period over which we average the RT time consumption, measured
>  * in ms.
>  *
> @@ -1351,6 +1335,12 @@ static inline void update_load_sub(struc
>        lw->inv_weight = 0;
>  }
>
> +static inline void update_load_set(struct load_weight *lw, unsigned long w)
> +{
> +       lw->weight = w;
> +       lw->inv_weight = 0;
> +}
> +
>  /*
>  * To aid in avoiding the subversion of "niceness" due to uneven distribution
>  * of tasks with abnormal "nice" values across CPUs the contribution that
> @@ -1539,97 +1529,44 @@ static unsigned long cpu_avg_load_per_ta
>
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>
> -static __read_mostly unsigned long __percpu *update_shares_data;
> -
> -static void __set_se_shares(struct sched_entity *se, unsigned long shares);
> -
> -/*
> - * Calculate and set the cpu's group shares.
> - */
> -static void update_group_shares_cpu(struct task_group *tg, int cpu,
> -                                   unsigned long sd_shares,
> -                                   unsigned long sd_rq_weight,
> -                                   unsigned long *usd_rq_weight)
> -{
> -       unsigned long shares, rq_weight;
> -       int boost = 0;
> -
> -       rq_weight = usd_rq_weight[cpu];
> -       if (!rq_weight) {
> -               boost = 1;
> -               rq_weight = NICE_0_LOAD;
> -       }
> -
> -       /*
> -        *             \Sum_j shares_j * rq_weight_i
> -        * shares_i =  -----------------------------
> -        *                  \Sum_j rq_weight_j
> -        */
> -       shares = (sd_shares * rq_weight) / sd_rq_weight;
> -       shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
> -
> -       if (abs(shares - tg->se[cpu]->load.weight) >
> -                       sysctl_sched_shares_thresh) {
> -               struct rq *rq = cpu_rq(cpu);
> -               unsigned long flags;
> -
> -               raw_spin_lock_irqsave(&rq->lock, flags);
> -               tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
> -               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
> -               __set_se_shares(tg->se[cpu], shares);
> -               raw_spin_unlock_irqrestore(&rq->lock, flags);
> -       }
> -}
> +static void update_cfs_load(struct cfs_rq *cfs_rq);
> +static void update_cfs_shares(struct cfs_rq *cfs_rq);
>
>  /*
> - * Re-compute the task group their per cpu shares over the given domain.
> - * This needs to be done in a bottom-up fashion because the rq weight of a
> - * parent group depends on the shares of its child groups.
> + * update tg->load_weight by folding this cpu's load_avg
>  */
>  static int tg_shares_up(struct task_group *tg, void *data)
>  {
> -       unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
> -       unsigned long *usd_rq_weight;
> -       struct sched_domain *sd = data;
> +       unsigned long load_avg;
> +       struct cfs_rq *cfs_rq;
>        unsigned long flags;
> -       int i;
> +       int cpu = (long)data;
> +       struct rq *rq;
>
> -       if (!tg->se[0])
> +       if (!tg->se[cpu])
>                return 0;
>
> -       local_irq_save(flags);
> -       usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
> -
> -       for_each_cpu(i, sched_domain_span(sd)) {
> -               weight = tg->cfs_rq[i]->load.weight;
> -               usd_rq_weight[i] = weight;
> -
> -               rq_weight += weight;
> -               /*
> -                * If there are currently no tasks on the cpu pretend there
> -                * is one of average load so that when a new task gets to
> -                * run here it will not get delayed by group starvation.
> -                */
> -               if (!weight)
> -                       weight = NICE_0_LOAD;
> +       rq = cpu_rq(cpu);
> +       cfs_rq = tg->cfs_rq[cpu];
>
> -               sum_weight += weight;
> -               shares += tg->cfs_rq[i]->shares;
> -       }
> +       raw_spin_lock_irqsave(&rq->lock, flags);
>
> -       if (!rq_weight)
> -               rq_weight = sum_weight;
> +       update_rq_clock(rq);
> +       update_cfs_load(cfs_rq);
>
> -       if ((!shares && rq_weight) || shares > tg->shares)
> -               shares = tg->shares;
> +       load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
> +       load_avg -= cfs_rq->load_contribution;
>
> -       if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
> -               shares = tg->shares;
> +       atomic_add(load_avg, &tg->load_weight);
> +       cfs_rq->load_contribution += load_avg;
>
> -       for_each_cpu(i, sched_domain_span(sd))
> -               update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
> +       /*
> +        * We need to update shares after updating tg->load_weight in
> +        * order to adjust the weight of groups with long running tasks.
> +        */
> +       update_cfs_shares(cfs_rq);
>
> -       local_irq_restore(flags);
> +       raw_spin_unlock_irqrestore(&rq->lock, flags);
>
>        return 0;
>  }
> @@ -1648,7 +1585,7 @@ static int tg_load_down(struct task_grou
>                load = cpu_rq(cpu)->load.weight;
>        } else {
>                load = tg->parent->cfs_rq[cpu]->h_load;
> -               load *= tg->cfs_rq[cpu]->shares;
> +               load *= tg->se[cpu]->load.weight;
>                load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
>        }
>
> @@ -1657,21 +1594,16 @@ static int tg_load_down(struct task_grou
>        return 0;
>  }
>
> -static void update_shares(struct sched_domain *sd)
> +static void update_shares(long cpu)
>  {
> -       s64 elapsed;
> -       u64 now;
> -
>        if (root_task_group_empty())
>                return;
>
> -       now = local_clock();
> -       elapsed = now - sd->last_update;
> +       /*
> +        * XXX: replace with an on-demand list
> +        */
>
> -       if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
> -               sd->last_update = now;
> -               walk_tg_tree(tg_nop, tg_shares_up, sd);
> -       }
> +       walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu);
>  }
>
>  static void update_h_load(long cpu)
> @@ -1681,7 +1613,7 @@ static void update_h_load(long cpu)
>
>  #else
>
> -static inline void update_shares(struct sched_domain *sd)
> +static inline void update_shares(int cpu)
>  {
>  }
>
> @@ -1806,15 +1738,6 @@ static void double_rq_unlock(struct rq *
>
>  #endif
>
> -#ifdef CONFIG_FAIR_GROUP_SCHED
> -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
> -{
> -#ifdef CONFIG_SMP
> -       cfs_rq->shares = shares;
> -#endif
> -}
> -#endif
> -
>  static void calc_load_account_idle(struct rq *this_rq);
>  static void update_sysctl(void);
>  static int get_update_sysctl_factor(void);
> @@ -5400,7 +5323,6 @@ static void update_sysctl(void)
>        SET_SYSCTL(sched_min_granularity);
>        SET_SYSCTL(sched_latency);
>        SET_SYSCTL(sched_wakeup_granularity);
> -       SET_SYSCTL(sched_shares_ratelimit);
>  #undef SET_SYSCTL
>  }
>
> @@ -7652,8 +7574,7 @@ static void init_tg_cfs_entry(struct tas
>                se->cfs_rq = parent->my_q;
>
>        se->my_q = cfs_rq;
> -       se->load.weight = tg->shares;
> -       se->load.inv_weight = 0;
> +       update_load_set(&se->load, tg->shares);

Given that the shares->load update is now instantaneous on enqueue/dequeue,
initializing this to 0 would result in saner sums across tg->se->load.
Only relevant for debug, though.
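e.g. (sketch, purely for saner debug output):

	se->my_q = cfs_rq;
	/* start at 0; the first enqueue-time share update sets the real weight */
	update_load_set(&se->load, 0);
	se->parent = parent;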

>        se->parent = parent;
>  }
>  #endif
> @@ -7746,10 +7667,6 @@ void __init sched_init(void)
>
>  #endif /* CONFIG_CGROUP_SCHED */
>
> -#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
> -       update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
> -                                           __alignof__(unsigned long));
> -#endif
>        for_each_possible_cpu(i) {
>                struct rq *rq;
>
> @@ -8317,8 +8234,7 @@ static void __set_se_shares(struct sched
>        if (on_rq)
>                dequeue_entity(cfs_rq, se, 0);
>
> -       se->load.weight = shares;
> -       se->load.inv_weight = 0;
> +       update_load_set(&se->load, shares);
>
>        if (on_rq)
>                enqueue_entity(cfs_rq, se, 0);
> @@ -8375,7 +8291,6 @@ int sched_group_set_shares(struct task_g
>                /*
>                 * force a rebalance
>                 */
> -               cfs_rq_set_shares(tg->cfs_rq[i], 0);
>                set_se_shares(tg->se[i], shares);

I think an update_cfs_shares() is wanted here instead; as it stands this
will potentially over-commit everything until we hit tg_shares_up() (e.g.
the long-running task case).

Ironically, the heavyweight full enqueue/dequeue in the __set_se_shares()
path will actually fix up the weights (ignoring the passed weight) for the
se->on_rq case.

I think both functions can be knocked out and replaced with a simple
<lock> <update load> <update shares> <unlock> sequence (see the sketch
below).

Although, for total correctness, this update should probably be hierarchical.
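Something like the following is what I have in mind (untested sketch of the
per-cpu loop, assuming tg->shares has already been updated under the existing
locking, and not yet hierarchical):

	for_each_possible_cpu(i) {
		struct rq *rq = cpu_rq(i);
		unsigned long flags;

		raw_spin_lock_irqsave(&rq->lock, flags);
		update_rq_clock(rq);
		/* fold the current load into load_avg, then recompute shares */
		update_cfs_load(tg->cfs_rq[i]);
		update_cfs_shares(tg->cfs_rq[i]);
		raw_spin_unlock_irqrestore(&rq->lock, flags);
	}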

>        }
>
> Index: linux-2.6/kernel/sched_debug.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_debug.c
> +++ linux-2.6/kernel/sched_debug.c
> @@ -204,13 +204,18 @@ void print_cfs_rq(struct seq_file *m, in
>                        SPLIT_NS(spread0));
>        SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
>        SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
> +       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_avg",
> +                       SPLIT_NS(cfs_rq->load_avg));
> +       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_period",
> +                       SPLIT_NS(cfs_rq->load_period));
> +       SEQ_printf(m, "  .%-30s: %ld\n", "load_contrib",
> +                       cfs_rq->load_contribution);
> +       SEQ_printf(m, "  .%-30s: %d\n", "load_tg",
> +                       atomic_read(&tg->load_weight));
>
>        SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",
>                        cfs_rq->nr_spread_over);
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> -#ifdef CONFIG_SMP
> -       SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
> -#endif
>        print_cfs_group_stats(m, cpu, cfs_rq->tg);
>  #endif
>  }
> Index: linux-2.6/kernel/sched_fair.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_fair.c
> +++ linux-2.6/kernel/sched_fair.c
> @@ -417,7 +417,6 @@ int sched_proc_update_handler(struct ctl
>        WRT_SYSCTL(sched_min_granularity);
>        WRT_SYSCTL(sched_latency);
>        WRT_SYSCTL(sched_wakeup_granularity);
> -       WRT_SYSCTL(sched_shares_ratelimit);
>  #undef WRT_SYSCTL
>
>        return 0;
> @@ -633,7 +632,6 @@ account_entity_enqueue(struct cfs_rq *cf
>                list_add(&se->group_node, &cfs_rq->tasks);
>        }
>        cfs_rq->nr_running++;
> -       se->on_rq = 1;
>  }
>
>  static void
> @@ -647,9 +645,84 @@ account_entity_dequeue(struct cfs_rq *cf
>                list_del_init(&se->group_node);
>        }
>        cfs_rq->nr_running--;
> -       se->on_rq = 0;
>  }
>
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +static void update_cfs_load(struct cfs_rq *cfs_rq)
> +{
> +       u64 period = sched_avg_period();

This is a pretty large history window; while it should overlap the
update period for obvious reasons, intuition suggests a smaller window
(e.g. 2 x sched_latency) would probably be preferable here in terms of
reducing over-commit and reducing convergence time.

I'll run some benchmarks and see how it impacts fairness.
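e.g. something along these lines as a strawman (both values are in ns, so
the substitution is direct):

	/* strawman: fold over ~2 scheduling latencies instead of sched_avg_period() */
	u64 period = 2ULL * sysctl_sched_latency;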

> +       u64 now = rq_of(cfs_rq)->clock;
> +       u64 delta = now - cfs_rq->load_stamp;
> +

Is it meaningful/useful to maintain cfs_rq->load for the rq->cfs_rq case?

> +       cfs_rq->load_stamp = now;
> +       cfs_rq->load_period += delta;
> +       cfs_rq->load_avg += delta * cfs_rq->load.weight;
> +
> +       while (cfs_rq->load_period > period) {
> +               /*
> +                * Inline assembly required to prevent the compiler
> +                * optimising this loop into a divmod call.
> +                * See __iter_div_u64_rem() for another example of this.
> +                */
> +               asm("" : "+rm" (cfs_rq->load_period));
> +               cfs_rq->load_period /= 2;
> +               cfs_rq->load_avg /= 2;
> +       }
> +}
> +
> +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
> +                           unsigned long weight)
> +{
> +       if (se->on_rq)
> +               account_entity_dequeue(cfs_rq, se);
> +
> +       update_load_set(&se->load, weight);
> +
> +       if (se->on_rq)
> +               account_entity_enqueue(cfs_rq, se);
> +}
> +
> +static void update_cfs_shares(struct cfs_rq *cfs_rq)
> +{
> +       struct task_group *tg;
> +       struct sched_entity *se;
> +       unsigned long load_weight, load, shares;
> +
> +       if (!cfs_rq)
> +               return;
> +
> +       tg = cfs_rq->tg;
> +       se = tg->se[cpu_of(rq_of(cfs_rq))];
> +       if (!se)
> +               return;
> +
> +       load = cfs_rq->load.weight;
> +
> +       load_weight = atomic_read(&tg->load_weight);
> +       load_weight -= cfs_rq->load_contribution;
> +       load_weight += load;
> +
> +       shares = (tg->shares * load);
> +       if (load_weight)
> +               shares /= load_weight;
> +
> +       if (shares < MIN_SHARES)
> +               shares = MIN_SHARES;
> +       if (shares > tg->shares)
> +               shares = tg->shares;
> +
> +       reweight_entity(cfs_rq_of(se), se, shares);
> +}
> +#else /* CONFIG_FAIR_GROUP_SCHED */
> +static inline void update_cfs_load(struct cfs_rq *cfs_rq)
> +{
> +}
> +
> +static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
> +{
> +}
> +#endif /* CONFIG_FAIR_GROUP_SCHED */
> +
>  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
>  #ifdef CONFIG_SCHEDSTATS
> @@ -771,7 +844,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
>         * Update run-time statistics of the 'current'.
>         */
>        update_curr(cfs_rq);
> +       update_cfs_load(cfs_rq);
>        account_entity_enqueue(cfs_rq, se);
> +       update_cfs_shares(group_cfs_rq(se));

Don't we want to be updating the queuing cfs_rq's shares here?

The owned cfs_rq's share proportion isn't going to change as a result
of being enqueued -- and is guaranteed to be hit by a previous queuing
cfs_rq update in the initial enqueue case.
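i.e. something like (sketch):

	update_curr(cfs_rq);
	update_cfs_load(cfs_rq);
	account_entity_enqueue(cfs_rq, se);
	/* update the cfs_rq we are being enqueued onto, not our owned queue */
	update_cfs_shares(cfs_rq);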

>
>        if (flags & ENQUEUE_WAKEUP) {
>                place_entity(cfs_rq, se, 0);
> @@ -782,6 +857,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
>        check_spread(cfs_rq, se);
>        if (se != cfs_rq->curr)
>                __enqueue_entity(cfs_rq, se);
> +       se->on_rq = 1;
>  }
>
>  static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
> @@ -825,8 +901,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
>
>        if (se != cfs_rq->curr)
>                __dequeue_entity(cfs_rq, se);
> +       se->on_rq = 0;
> +       update_cfs_load(cfs_rq);
>        account_entity_dequeue(cfs_rq, se);
>        update_min_vruntime(cfs_rq);
> +       update_cfs_shares(group_cfs_rq(se));
>
>        /*
>         * Normalize the entity after updating the min_vruntime because the
> @@ -1055,6 +1134,9 @@ enqueue_task_fair(struct rq *rq, struct
>                flags = ENQUEUE_WAKEUP;
>        }
>
> +       for_each_sched_entity(se)
> +               update_cfs_shares(group_cfs_rq(se));

If the queuing cfs_rq is used above then group_cfs_rq() is redundant here;
cfs_rq_of() can be used instead.

Also, the respective load should be updated here.
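e.g. (sketch, paired with the enqueue_entity change suggested above):

	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);

		update_cfs_load(cfs_rq);
		update_cfs_shares(cfs_rq);
	}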

> +
>        hrtick_update(rq);
>  }
>
> @@ -1071,12 +1153,16 @@ static void dequeue_task_fair(struct rq
>        for_each_sched_entity(se) {
>                cfs_rq = cfs_rq_of(se);
>                dequeue_entity(cfs_rq, se, flags);
> +
>                /* Don't dequeue parent if it has other entities besides us */
>                if (cfs_rq->load.weight)
>                        break;
>                flags |= DEQUEUE_SLEEP;
>        }
>
> +       for_each_sched_entity(se)
> +               update_cfs_shares(group_cfs_rq(se));
> +
>        hrtick_update(rq);
>  }
>
> @@ -1143,51 +1229,20 @@ static void task_waking_fair(struct rq *
>  * Adding load to a group doesn't make a group heavier, but can cause movement
>  * of group shares between cpus. Assuming the shares were perfectly aligned one
>  * can calculate the shift in shares.
> - *
> - * The problem is that perfectly aligning the shares is rather expensive, hence
> - * we try to avoid doing that too often - see update_shares(), which ratelimits
> - * this change.
> - *
> - * We compensate this by not only taking the current delta into account, but
> - * also considering the delta between when the shares were last adjusted and
> - * now.
> - *
> - * We still saw a performance dip, some tracing learned us that between
> - * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
> - * significantly. Therefore try to bias the error in direction of failing
> - * the affine wakeup.
> - *
>  */
> -static long effective_load(struct task_group *tg, int cpu,
> -               long wl, long wg)
> +static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
>  {
>        struct sched_entity *se = tg->se[cpu];
>
>        if (!tg->parent)
>                return wl;
>
> -       /*
> -        * By not taking the decrease of shares on the other cpu into
> -        * account our error leans towards reducing the affine wakeups.
> -        */
> -       if (!wl && sched_feat(ASYM_EFF_LOAD))
> -               return wl;
> -
>        for_each_sched_entity(se) {
>                long S, rw, s, a, b;
> -               long more_w;
>
> -               /*
> -                * Instead of using this increment, also add the difference
> -                * between when the shares were last updated and now.
> -                */
> -               more_w = se->my_q->load.weight - se->my_q->rq_weight;
> -               wl += more_w;
> -               wg += more_w;
> -
> -               S = se->my_q->tg->shares;
> -               s = se->my_q->shares;
> -               rw = se->my_q->rq_weight;
> +               S = tg->shares;
> +               s = se->load.weight;
> +               rw = se->my_q->load.weight;
>
>                a = S*(rw + wl);
>                b = S*rw + s*wg;
> @@ -1509,23 +1564,6 @@ select_task_rq_fair(struct rq *rq, struc
>                        sd = tmp;
>        }
>
> -#ifdef CONFIG_FAIR_GROUP_SCHED
> -       if (sched_feat(LB_SHARES_UPDATE)) {
> -               /*
> -                * Pick the largest domain to update shares over
> -                */
> -               tmp = sd;
> -               if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
> -                       tmp = affine_sd;
> -
> -               if (tmp) {
> -                       raw_spin_unlock(&rq->lock);
> -                       update_shares(tmp);
> -                       raw_spin_lock(&rq->lock);
> -               }
> -       }
> -#endif
> -
>        if (affine_sd) {
>                if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
>                        return select_idle_sibling(p, cpu);
> @@ -2980,7 +3018,6 @@ static int load_balance(int this_cpu, st
>        schedstat_inc(sd, lb_count[idle]);
>
>  redo:
> -       update_shares(sd);
>        group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
>                                   cpus, balance);
>
> @@ -3115,8 +3152,6 @@ out_one_pinned:
>        else
>                ld_moved = 0;
>  out:
> -       if (ld_moved)
> -               update_shares(sd);
>        return ld_moved;
>  }
>
> @@ -3510,6 +3545,8 @@ static void rebalance_domains(int cpu, e
>        int update_next_balance = 0;
>        int need_serialize;
>
> +       update_shares(cpu);
> +

This may not be frequent enough, especially in the dilated cpus-busy case.

>        for_each_domain(cpu, sd) {
>                if (!(sd->flags & SD_LOAD_BALANCE))
>                        continue;
> Index: linux-2.6/kernel/sched_features.h
> ===================================================================
> --- linux-2.6.orig/kernel/sched_features.h
> +++ linux-2.6/kernel/sched_features.h
> @@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
>  SCHED_FEAT(HRTICK, 0)
>  SCHED_FEAT(DOUBLE_TICK, 0)
>  SCHED_FEAT(LB_BIAS, 1)
> -SCHED_FEAT(LB_SHARES_UPDATE, 1)
> -SCHED_FEAT(ASYM_EFF_LOAD, 1)
>
>  /*
>  * Spin-wait on mutex acquisition when the mutex owner is running on
> Index: linux-2.6/kernel/sysctl.c
> ===================================================================
> --- linux-2.6.orig/kernel/sysctl.c
> +++ linux-2.6/kernel/sysctl.c
> @@ -307,15 +307,6 @@ static struct ctl_table kern_table[] = {
>                .extra2         = &max_wakeup_granularity_ns,
>        },
>        {
> -               .procname       = "sched_shares_ratelimit",
> -               .data           = &sysctl_sched_shares_ratelimit,
> -               .maxlen         = sizeof(unsigned int),
> -               .mode           = 0644,
> -               .proc_handler   = sched_proc_update_handler,
> -               .extra1         = &min_sched_shares_ratelimit,
> -               .extra2         = &max_sched_shares_ratelimit,
> -       },
> -       {
>                .procname       = "sched_tunable_scaling",
>                .data           = &sysctl_sched_tunable_scaling,
>                .maxlen         = sizeof(enum sched_tunable_scaling),
> @@ -325,14 +316,6 @@ static struct ctl_table kern_table[] = {
>                .extra2         = &max_sched_tunable_scaling,
>        },
>        {
> -               .procname       = "sched_shares_thresh",
> -               .data           = &sysctl_sched_shares_thresh,
> -               .maxlen         = sizeof(unsigned int),
> -               .mode           = 0644,
> -               .proc_handler   = proc_dointvec_minmax,
> -               .extra1         = &zero,
> -       },
> -       {
>                .procname       = "sched_migration_cost",
>                .data           = &sysctl_sched_migration_cost,
>                .maxlen         = sizeof(unsigned int),
>
>
>