[RFC][PATCH 1/3] sched: Rewrite tg_shares_up

From: Peter Zijlstra
Date: Sat Aug 28 2010 - 18:38:03 EST


By tracking a per-cpu load-avg for each cfs_rq and folding it into a
global task_group load on each tick we can rework tg_shares_up to be
strictly per-cpu.

This should improve cpu-cgroup performance for smp systems
significantly.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
---
include/linux/sched.h | 2
kernel/sched.c | 173 ++++++++++++------------------------------------
kernel/sched_debug.c | 11 ++-
kernel/sched_fair.c | 153 ++++++++++++++++++++++++++----------------
kernel/sched_features.h | 2
kernel/sysctl.c | 17 ----
6 files changed, 147 insertions(+), 211 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1858,8 +1858,6 @@ static inline void wake_up_idle_cpu(int
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_shares_ratelimit;
-extern unsigned int sysctl_sched_shares_thresh;
extern unsigned int sysctl_sched_child_runs_first;

enum sched_tunable_scaling {
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -253,6 +253,8 @@ struct task_group {
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
unsigned long shares;
+
+ atomic_t load_weight;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
@@ -359,15 +361,11 @@ struct cfs_rq {
*/
unsigned long h_load;

- /*
- * this cpu's part of tg->shares
- */
- unsigned long shares;
+ u64 load_avg;
+ u64 load_period;
+ u64 load_stamp;

- /*
- * load.weight at the time we set shares
- */
- unsigned long rq_weight;
+ unsigned long load_contribution;
#endif
#endif
};
@@ -793,20 +791,6 @@ late_initcall(sched_init_debug);
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-
-/*
* period over which we average the RT time consumption, measured
* in ms.
*
@@ -1351,6 +1335,12 @@ static inline void update_load_sub(struc
lw->inv_weight = 0;
}

+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+ lw->weight = w;
+ lw->inv_weight = 0;
+}
+
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1539,97 +1529,44 @@ static unsigned long cpu_avg_load_per_ta

#ifdef CONFIG_FAIR_GROUP_SCHED

-static __read_mostly unsigned long __percpu *update_shares_data;
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
- unsigned long sd_shares,
- unsigned long sd_rq_weight,
- unsigned long *usd_rq_weight)
-{
- unsigned long shares, rq_weight;
- int boost = 0;
-
- rq_weight = usd_rq_weight[cpu];
- if (!rq_weight) {
- boost = 1;
- rq_weight = NICE_0_LOAD;
- }
-
- /*
- * \Sum_j shares_j * rq_weight_i
- * shares_i = -----------------------------
- * \Sum_j rq_weight_j
- */
- shares = (sd_shares * rq_weight) / sd_rq_weight;
- shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-
- if (abs(shares - tg->se[cpu]->load.weight) >
- sysctl_sched_shares_thresh) {
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
- tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
- __set_se_shares(tg->se[cpu], shares);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
-}
+static void update_cfs_load(struct cfs_rq *cfs_rq);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);

/*
- * Re-compute the task group their per cpu shares over the given domain.
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
+ * update tg->load_weight by folding this cpu's load_avg
*/
static int tg_shares_up(struct task_group *tg, void *data)
{
- unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
- unsigned long *usd_rq_weight;
- struct sched_domain *sd = data;
+ unsigned long load_avg;
+ struct cfs_rq *cfs_rq;
unsigned long flags;
- int i;
+ int cpu = (long)data;
+ struct rq *rq;

- if (!tg->se[0])
+ if (!tg->se[cpu])
return 0;

- local_irq_save(flags);
- usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
-
- for_each_cpu(i, sched_domain_span(sd)) {
- weight = tg->cfs_rq[i]->load.weight;
- usd_rq_weight[i] = weight;
-
- rq_weight += weight;
- /*
- * If there are currently no tasks on the cpu pretend there
- * is one of average load so that when a new task gets to
- * run here it will not get delayed by group starvation.
- */
- if (!weight)
- weight = NICE_0_LOAD;
+ rq = cpu_rq(cpu);
+ cfs_rq = tg->cfs_rq[cpu];

- sum_weight += weight;
- shares += tg->cfs_rq[i]->shares;
- }
+ raw_spin_lock_irqsave(&rq->lock, flags);

- if (!rq_weight)
- rq_weight = sum_weight;
+ update_rq_clock(rq);
+ update_cfs_load(cfs_rq);

- if ((!shares && rq_weight) || shares > tg->shares)
- shares = tg->shares;
+ load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+ load_avg -= cfs_rq->load_contribution;

- if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
- shares = tg->shares;
+ atomic_add(load_avg, &tg->load_weight);
+ cfs_rq->load_contribution += load_avg;

- for_each_cpu(i, sched_domain_span(sd))
- update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
+ /*
+ * We need to update shares after updating tg->load_weight in
+ * order to adjust the weight of groups with long running tasks.
+ */
+ update_cfs_shares(cfs_rq);

- local_irq_restore(flags);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);

return 0;
}
@@ -1648,7 +1585,7 @@ static int tg_load_down(struct task_grou
load = cpu_rq(cpu)->load.weight;
} else {
load = tg->parent->cfs_rq[cpu]->h_load;
- load *= tg->cfs_rq[cpu]->shares;
+ load *= tg->se[cpu]->load.weight;
load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
}

@@ -1657,21 +1594,16 @@ static int tg_load_down(struct task_grou
return 0;
}

-static void update_shares(struct sched_domain *sd)
+static void update_shares(long cpu)
{
- s64 elapsed;
- u64 now;
-
if (root_task_group_empty())
return;

- now = local_clock();
- elapsed = now - sd->last_update;
+ /*
+ * XXX: replace with an on-demand list
+ */

- if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
- sd->last_update = now;
- walk_tg_tree(tg_nop, tg_shares_up, sd);
- }
+ walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu);
}

static void update_h_load(long cpu)
@@ -1681,7 +1613,7 @@ static void update_h_load(long cpu)

#else

-static inline void update_shares(struct sched_domain *sd)
+static inline void update_shares(int cpu)
{
}

@@ -1806,15 +1738,6 @@ static void double_rq_unlock(struct rq *

#endif

-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-#ifdef CONFIG_SMP
- cfs_rq->shares = shares;
-#endif
-}
-#endif
-
static void calc_load_account_idle(struct rq *this_rq);
static void update_sysctl(void);
static int get_update_sysctl_factor(void);
@@ -5400,7 +5323,6 @@ static void update_sysctl(void)
SET_SYSCTL(sched_min_granularity);
SET_SYSCTL(sched_latency);
SET_SYSCTL(sched_wakeup_granularity);
- SET_SYSCTL(sched_shares_ratelimit);
#undef SET_SYSCTL
}

@@ -7652,8 +7574,7 @@ static void init_tg_cfs_entry(struct tas
se->cfs_rq = parent->my_q;

se->my_q = cfs_rq;
- se->load.weight = tg->shares;
- se->load.inv_weight = 0;
+ update_load_set(&se->load, tg->shares);
se->parent = parent;
}
#endif
@@ -7746,10 +7667,6 @@ void __init sched_init(void)

#endif /* CONFIG_CGROUP_SCHED */

-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
- update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
- __alignof__(unsigned long));
-#endif
for_each_possible_cpu(i) {
struct rq *rq;

@@ -8317,8 +8234,7 @@ static void __set_se_shares(struct sched
if (on_rq)
dequeue_entity(cfs_rq, se, 0);

- se->load.weight = shares;
- se->load.inv_weight = 0;
+ update_load_set(&se->load, shares);

if (on_rq)
enqueue_entity(cfs_rq, se, 0);
@@ -8375,7 +8291,6 @@ int sched_group_set_shares(struct task_g
/*
* force a rebalance
*/
- cfs_rq_set_shares(tg->cfs_rq[i], 0);
set_se_shares(tg->se[i], shares);
}

Index: linux-2.6/kernel/sched_debug.c
===================================================================
--- linux-2.6.orig/kernel/sched_debug.c
+++ linux-2.6/kernel/sched_debug.c
@@ -204,13 +204,18 @@ void print_cfs_rq(struct seq_file *m, in
SPLIT_NS(spread0));
SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
+ SPLIT_NS(cfs_rq->load_avg));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
+ SPLIT_NS(cfs_rq->load_period));
+ SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
+ cfs_rq->load_contribution);
+ SEQ_printf(m, " .%-30s: %d\n", "load_tg",
+ atomic_read(&tg->load_weight));

SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
-#endif
print_cfs_group_stats(m, cpu, cfs_rq->tg);
#endif
}
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -417,7 +417,6 @@ int sched_proc_update_handler(struct ctl
WRT_SYSCTL(sched_min_granularity);
WRT_SYSCTL(sched_latency);
WRT_SYSCTL(sched_wakeup_granularity);
- WRT_SYSCTL(sched_shares_ratelimit);
#undef WRT_SYSCTL

return 0;
@@ -633,7 +632,6 @@ account_entity_enqueue(struct cfs_rq *cf
list_add(&se->group_node, &cfs_rq->tasks);
}
cfs_rq->nr_running++;
- se->on_rq = 1;
}

static void
@@ -647,9 +645,84 @@ account_entity_dequeue(struct cfs_rq *cf
list_del_init(&se->group_node);
}
cfs_rq->nr_running--;
- se->on_rq = 0;
}

+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void update_cfs_load(struct cfs_rq *cfs_rq)
+{
+ u64 period = sched_avg_period();
+ u64 now = rq_of(cfs_rq)->clock;
+ u64 delta = now - cfs_rq->load_stamp;
+
+ cfs_rq->load_stamp = now;
+ cfs_rq->load_period += delta;
+ cfs_rq->load_avg += delta * cfs_rq->load.weight;
+
+ while (cfs_rq->load_period > period) {
+ /*
+ * Inline assembly required to prevent the compiler
+ * optimising this loop into a divmod call.
+ * See __iter_div_u64_rem() for another example of this.
+ */
+ asm("" : "+rm" (cfs_rq->load_period));
+ cfs_rq->load_period /= 2;
+ cfs_rq->load_avg /= 2;
+ }
+}
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ unsigned long weight)
+{
+ if (se->on_rq)
+ account_entity_dequeue(cfs_rq, se);
+
+ update_load_set(&se->load, weight);
+
+ if (se->on_rq)
+ account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg;
+ struct sched_entity *se;
+ unsigned long load_weight, load, shares;
+
+ if (!cfs_rq)
+ return;
+
+ tg = cfs_rq->tg;
+ se = tg->se[cpu_of(rq_of(cfs_rq))];
+ if (!se)
+ return;
+
+ load = cfs_rq->load.weight;
+
+ load_weight = atomic_read(&tg->load_weight);
+ load_weight -= cfs_rq->load_contribution;
+ load_weight += load;
+
+ shares = (tg->shares * load);
+ if (load_weight)
+ shares /= load_weight;
+
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
+ if (shares > tg->shares)
+ shares = tg->shares;
+
+ reweight_entity(cfs_rq_of(se), se, shares);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static inline void update_cfs_load(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
@@ -771,7 +844,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ update_cfs_load(cfs_rq);
account_entity_enqueue(cfs_rq, se);
+ update_cfs_shares(group_cfs_rq(se));

if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
@@ -782,6 +857,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
+ se->on_rq = 1;
}

static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +901,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, st

if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
+ se->on_rq = 0;
+ update_cfs_load(cfs_rq);
account_entity_dequeue(cfs_rq, se);
update_min_vruntime(cfs_rq);
+ update_cfs_shares(group_cfs_rq(se));

/*
* Normalize the entity after updating the min_vruntime because the
@@ -1055,6 +1134,9 @@ enqueue_task_fair(struct rq *rq, struct
flags = ENQUEUE_WAKEUP;
}

+ for_each_sched_entity(se)
+ update_cfs_shares(group_cfs_rq(se));
+
hrtick_update(rq);
}

@@ -1071,12 +1153,16 @@ static void dequeue_task_fair(struct rq
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
+
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight)
break;
flags |= DEQUEUE_SLEEP;
}

+ for_each_sched_entity(se)
+ update_cfs_shares(group_cfs_rq(se));
+
hrtick_update(rq);
}

@@ -1143,51 +1229,20 @@ static void task_waking_fair(struct rq *
* Adding load to a group doesn't make a group heavier, but can cause movement
* of group shares between cpus. Assuming the shares were perfectly aligned one
* can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
*/
-static long effective_load(struct task_group *tg, int cpu,
- long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];

if (!tg->parent)
return wl;

- /*
- * By not taking the decrease of shares on the other cpu into
- * account our error leans towards reducing the affine wakeups.
- */
- if (!wl && sched_feat(ASYM_EFF_LOAD))
- return wl;
-
for_each_sched_entity(se) {
long S, rw, s, a, b;
- long more_w;

- /*
- * Instead of using this increment, also add the difference
- * between when the shares were last updated and now.
- */
- more_w = se->my_q->load.weight - se->my_q->rq_weight;
- wl += more_w;
- wg += more_w;
-
- S = se->my_q->tg->shares;
- s = se->my_q->shares;
- rw = se->my_q->rq_weight;
+ S = tg->shares;
+ s = se->load.weight;
+ rw = se->my_q->load.weight;

a = S*(rw + wl);
b = S*rw + s*wg;
@@ -1509,23 +1564,6 @@ select_task_rq_fair(struct rq *rq, struc
sd = tmp;
}

-#ifdef CONFIG_FAIR_GROUP_SCHED
- if (sched_feat(LB_SHARES_UPDATE)) {
- /*
- * Pick the largest domain to update shares over
- */
- tmp = sd;
- if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
- tmp = affine_sd;
-
- if (tmp) {
- raw_spin_unlock(&rq->lock);
- update_shares(tmp);
- raw_spin_lock(&rq->lock);
- }
- }
-#endif
-
if (affine_sd) {
if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
return select_idle_sibling(p, cpu);
@@ -2980,7 +3018,6 @@ static int load_balance(int this_cpu, st
schedstat_inc(sd, lb_count[idle]);

redo:
- update_shares(sd);
group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
cpus, balance);

@@ -3115,8 +3152,6 @@ out_one_pinned:
else
ld_moved = 0;
out:
- if (ld_moved)
- update_shares(sd);
return ld_moved;
}

@@ -3510,6 +3545,8 @@ static void rebalance_domains(int cpu, e
int update_next_balance = 0;
int need_serialize;

+ update_shares(cpu);
+
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
Index: linux-2.6/kernel/sched_features.h
===================================================================
--- linux-2.6.orig/kernel/sched_features.h
+++ linux-2.6/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
SCHED_FEAT(HRTICK, 0)
SCHED_FEAT(DOUBLE_TICK, 0)
SCHED_FEAT(LB_BIAS, 1)
-SCHED_FEAT(LB_SHARES_UPDATE, 1)
-SCHED_FEAT(ASYM_EFF_LOAD, 1)

/*
* Spin-wait on mutex acquisition when the mutex owner is running on
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -307,15 +307,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_wakeup_granularity_ns,
},
{
- .procname = "sched_shares_ratelimit",
- .data = &sysctl_sched_shares_ratelimit,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_sched_shares_ratelimit,
- .extra2 = &max_sched_shares_ratelimit,
- },
- {
.procname = "sched_tunable_scaling",
.data = &sysctl_sched_tunable_scaling,
.maxlen = sizeof(enum sched_tunable_scaling),
@@ -325,14 +316,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_sched_tunable_scaling,
},
{
- .procname = "sched_shares_thresh",
- .data = &sysctl_sched_shares_thresh,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- },
- {
.procname = "sched_migration_cost",
.data = &sysctl_sched_migration_cost,
.maxlen = sizeof(unsigned int),


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/