[GIT PULL] scheduler fixes

From: Ingo Molnar
Date: Thu Mar 24 2016 - 03:51:59 EST


Linus,

Please pull the latest sched-urgent-for-linus git tree from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus

# HEAD: 73e6aafd9ea81498d31361f01db84a0118da2d1c sched/cpuacct: Simplify the cpuacct code

Misc fixes: a cgroup fix, a fair-scheduler migration accounting fix, a cputime fix
and two cpuacct cleanups.

Thanks,

Ingo

------------------>
Dongsheng Yang (1):
sched/cpuacct: Rename parameter in cpuusage_write() for readability

Matt Fleming (1):
sched/fair: Add comments to explain select_idle_sibling()

Peter Zijlstra (2):
sched/cgroup: Fix/cleanup cgroup teardown/init
sched/fair: Fix fairness issue on migration

Thomas Gleixner (1):
sched/cputime: Fix steal time accounting vs. CPU hotplug

Zhao Lei (1):
sched/cpuacct: Simplify the cpuacct code


kernel/sched/core.c | 36 +++++++++++++++---------------------
kernel/sched/cpuacct.c | 35 ++++++++++-------------------------
kernel/sched/cpuacct.h | 4 ++--
kernel/sched/fair.c | 39 ++++++++++++++++++++++++++++++++-------
kernel/sched/sched.h | 13 +++++++++++++
5 files changed, 72 insertions(+), 55 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ea8f49ae0062..2a87bdde8d4e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5369,6 +5369,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)

case CPU_UP_PREPARE:
rq->calc_load_update = calc_load_update;
+ account_reset_rq(rq);
break;

case CPU_ONLINE:
@@ -7535,7 +7536,7 @@ void set_curr_task(int cpu, struct task_struct *p)
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);

-static void free_sched_group(struct task_group *tg)
+static void sched_free_group(struct task_group *tg)
{
free_fair_sched_group(tg);
free_rt_sched_group(tg);
@@ -7561,7 +7562,7 @@ struct task_group *sched_create_group(struct task_group *parent)
return tg;

err:
- free_sched_group(tg);
+ sched_free_group(tg);
return ERR_PTR(-ENOMEM);
}

@@ -7581,17 +7582,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
}

/* rcu callback to free various structures associated with a task group */
-static void free_sched_group_rcu(struct rcu_head *rhp)
+static void sched_free_group_rcu(struct rcu_head *rhp)
{
/* now it should be safe to free those cfs_rqs */
- free_sched_group(container_of(rhp, struct task_group, rcu));
+ sched_free_group(container_of(rhp, struct task_group, rcu));
}

-/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
/* wait for possible concurrent references to cfs_rqs complete */
- call_rcu(&tg->rcu, free_sched_group_rcu);
+ call_rcu(&tg->rcu, sched_free_group_rcu);
}

void sched_offline_group(struct task_group *tg)
@@ -8050,31 +8050,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);

+ sched_online_group(tg, parent);
+
return &tg->css;
}

-static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
- struct task_group *parent = css_tg(css->parent);

- if (parent)
- sched_online_group(tg, parent);
- return 0;
+ sched_offline_group(tg);
}

static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);

- sched_destroy_group(tg);
-}
-
-static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
-{
- struct task_group *tg = css_tg(css);
-
- sched_offline_group(tg);
+ /*
+ * Relies on the RCU grace period between css_released() and this.
+ */
+ sched_free_group(tg);
}

static void cpu_cgroup_fork(struct task_struct *task)
@@ -8434,9 +8429,8 @@ static struct cftype cpu_files[] = {

struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
+ .css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
- .css_online = cpu_cgroup_css_online,
- .css_offline = cpu_cgroup_css_offline,
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dd7cbb55bbf2..434c2fa41352 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -145,13 +145,16 @@ static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
}

static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
- u64 reset)
+ u64 val)
{
struct cpuacct *ca = css_ca(css);
int err = 0;
int i;

- if (reset) {
+ /*
+ * Only allow '0' here to do a reset.
+ */
+ if (val) {
err = -EINVAL;
goto out;
}
@@ -235,23 +238,10 @@ static struct cftype files[] = {
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
- int cpu;
-
- cpu = task_cpu(tsk);

rcu_read_lock();
-
- ca = task_ca(tsk);
-
- while (true) {
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- *cpuusage += cputime;
-
- ca = parent_ca(ca);
- if (!ca)
- break;
- }
-
+ for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
+ *this_cpu_ptr(ca->cpuusage) += cputime;
rcu_read_unlock();
}

@@ -260,18 +250,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
*
* Note: it's the caller that updates the account of the root cgroup.
*/
-void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
{
- struct kernel_cpustat *kcpustat;
struct cpuacct *ca;

rcu_read_lock();
- ca = task_ca(p);
- while (ca != &root_cpuacct) {
- kcpustat = this_cpu_ptr(ca->cpustat);
- kcpustat->cpustat[index] += val;
- ca = parent_ca(ca);
- }
+ for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
+ this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
rcu_read_unlock();
}

diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
index ed605624a5e7..ba72807c73d4 100644
--- a/kernel/sched/cpuacct.h
+++ b/kernel/sched/cpuacct.h
@@ -1,7 +1,7 @@
#ifdef CONFIG_CGROUP_CPUACCT

extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);

#else

@@ -10,7 +10,7 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
}

static inline void
-cpuacct_account_field(struct task_struct *p, int index, u64 val)
+cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
{
}

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 33130529e9b5..303d6392b389 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3157,17 +3157,25 @@ static inline void check_schedstat_required(void)
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING);
+ bool curr = cfs_rq->curr == se;
+
/*
- * Update the normalized vruntime before updating min_vruntime
- * through calling update_curr().
+ * If we're the current task, we must renormalise before calling
+ * update_curr().
*/
- if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+ if (renorm && curr)
se->vruntime += cfs_rq->min_vruntime;

+ update_curr(cfs_rq);
+
/*
- * Update run-time statistics of the 'current'.
+ * Otherwise, renormalise after, such that we're placed at the current
+ * moment in time, instead of some random moment in the past.
*/
- update_curr(cfs_rq);
+ if (renorm && !curr)
+ se->vruntime += cfs_rq->min_vruntime;
+
enqueue_entity_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -3183,7 +3191,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
}
- if (se != cfs_rq->curr)
+ if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;

@@ -5047,7 +5055,19 @@ static int select_idle_sibling(struct task_struct *p, int target)
return i;

/*
- * Otherwise, iterate the domains and find an elegible idle cpu.
+ * Otherwise, iterate the domains and find an eligible idle cpu.
+ *
+ * A completely idle sched group at higher domains is more
+ * desirable than an idle group at a lower level, because lower
+ * domains have smaller groups and usually share hardware
+ * resources which causes tasks to contend on them, e.g. x86
+ * hyperthread siblings in the lowest domain (SMT) can contend
+ * on the shared cpu pipeline.
+ *
+ * However, while we prefer idle groups at higher domains
+ * finding an idle cpu at the lowest domain is still better than
+ * returning 'target', which we've already established, isn't
+ * idle.
*/
sd = rcu_dereference(per_cpu(sd_llc, target));
for_each_lower_domain(sd) {
@@ -5057,11 +5077,16 @@ static int select_idle_sibling(struct task_struct *p, int target)
tsk_cpus_allowed(p)))
goto next;

+ /* Ensure the entire group is idle */
for_each_cpu(i, sched_group_cpus(sg)) {
if (i == target || !idle_cpu(i))
goto next;
}

+ /*
+ * It doesn't matter which cpu we pick, the
+ * whole group is idle.
+ */
target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
goto done;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2ff5a2bd6df..e6d4a3fa3660 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1793,3 +1793,16 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+static inline void account_reset_rq(struct rq *rq)
+{
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ rq->prev_irq_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT
+ rq->prev_steal_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ rq->prev_steal_time_rq = 0;
+#endif
+}