[GIT pull] scheduler updates for 4.13

From: Thomas Gleixner
Date: Sun Jul 09 2017 - 05:05:05 EST


Linus,

please pull the latest sched-urgent-for-linus git tree from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus

This scheduler update provides:

- The (hopefully) final fix for the vtime accounting issues which
were around for quite some time

- Use types known to user space in UAPI headers to unbreak user space
builds

- Make load balancing respect the current scheduling domain again
instead of evaluating unrelated CPUs.

Thanks,

tglx

------------------>
Dmitry V. Levin (1):
sched/headers/uapi: Fix linux/sched/types.h userspace compilation errors

Frederic Weisbecker (4):
vtime, sched/cputime: Remove vtime_account_user()
sched/cputime: Always set tsk->vtime_snap_whence after accounting vtime
sched/cputime: Rename vtime fields
sched/cputime: Move the vtime task fields to their own struct

Ingo Molnar (1):
Revert "sched/cputime: Refactor the cputime_adjust() code"

Jeffrey Hugo (1):
sched/fair: Fix load_balance() affinity redo path

Wanpeng Li (1):
sched/cputime: Accumulate vtime on top of nsec clocksource


include/linux/init_task.h | 6 +-
include/linux/sched.h | 29 ++++---
include/linux/vtime.h | 9 +-
include/uapi/linux/sched/types.h | 16 ++--
kernel/fork.c | 6 +-
kernel/sched/cputime.c | 180 ++++++++++++++++++++++++---------------
kernel/sched/fair.c | 32 ++++---
7 files changed, 165 insertions(+), 113 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e049526bc188..a2f6707e9fc0 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -170,9 +170,9 @@ extern struct cred init_cred;

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
# define INIT_VTIME(tsk) \
- .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount), \
- .vtime_snap = 0, \
- .vtime_snap_whence = VTIME_SYS,
+ .vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount), \
+ .vtime.starttime = 0, \
+ .vtime.state = VTIME_SYS,
#else
# define INIT_VTIME(tsk)
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9c4ca7433d9d..4818126c5153 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -223,6 +223,24 @@ struct task_cputime {
#define prof_exp stime
#define sched_exp sum_exec_runtime

+enum vtime_state {
+ /* Task is sleeping or running in a CPU with VTIME inactive: */
+ VTIME_INACTIVE = 0,
+ /* Task runs in userspace in a CPU with VTIME active: */
+ VTIME_USER,
+ /* Task runs in kernelspace in a CPU with VTIME active: */
+ VTIME_SYS,
+};
+
+struct vtime {
+ seqcount_t seqcount;
+ unsigned long long starttime;
+ enum vtime_state state;
+ u64 utime;
+ u64 stime;
+ u64 gtime;
+};
+
struct sched_info {
#ifdef CONFIG_SCHED_INFO
/* Cumulative counters: */
@@ -688,16 +706,7 @@ struct task_struct {
u64 gtime;
struct prev_cputime prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- seqcount_t vtime_seqcount;
- unsigned long long vtime_snap;
- enum {
- /* Task is sleeping or running in a CPU with VTIME inactive: */
- VTIME_INACTIVE = 0,
- /* Task runs in userspace in a CPU with VTIME active: */
- VTIME_USER,
- /* Task runs in kernelspace in a CPU with VTIME active: */
- VTIME_SYS,
- } vtime_snap_whence;
+ struct vtime vtime;
#endif

#ifdef CONFIG_NO_HZ_FULL
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 0681fe25abeb..18b405e3cd93 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -67,19 +67,12 @@ static inline void vtime_account_system(struct task_struct *tsk) { }

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern void arch_vtime_task_switch(struct task_struct *tsk);
-extern void vtime_account_user(struct task_struct *tsk);
extern void vtime_user_enter(struct task_struct *tsk);
-
-static inline void vtime_user_exit(struct task_struct *tsk)
-{
- vtime_account_user(tsk);
-}
-
+extern void vtime_user_exit(struct task_struct *tsk);
extern void vtime_guest_enter(struct task_struct *tsk);
extern void vtime_guest_exit(struct task_struct *tsk);
extern void vtime_init_idle(struct task_struct *tsk, int cpu);
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */
-static inline void vtime_account_user(struct task_struct *tsk) { }
static inline void vtime_user_enter(struct task_struct *tsk) { }
static inline void vtime_user_exit(struct task_struct *tsk) { }
static inline void vtime_guest_enter(struct task_struct *tsk) { }
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index 307acbc82d80..34b81aa1a2f7 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -54,21 +54,21 @@ struct sched_param {
* available in the scheduling class file or in Documentation/.
*/
struct sched_attr {
- u32 size;
+ __u32 size;

- u32 sched_policy;
- u64 sched_flags;
+ __u32 sched_policy;
+ __u64 sched_flags;

/* SCHED_NORMAL, SCHED_BATCH */
- s32 sched_nice;
+ __s32 sched_nice;

/* SCHED_FIFO, SCHED_RR */
- u32 sched_priority;
+ __u32 sched_priority;

/* SCHED_DEADLINE */
- u64 sched_runtime;
- u64 sched_deadline;
- u64 sched_period;
+ __u64 sched_runtime;
+ __u64 sched_deadline;
+ __u64 sched_period;
};

#endif /* _UAPI_LINUX_SCHED_TYPES_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index e53770d2bf95..d927ec11aa7a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1637,9 +1637,9 @@ static __latent_entropy struct task_struct *copy_process(
prev_cputime_init(&p->prev_cputime);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- seqcount_init(&p->vtime_seqcount);
- p->vtime_snap = 0;
- p->vtime_snap_whence = VTIME_INACTIVE;
+ seqcount_init(&p->vtime.seqcount);
+ p->vtime.starttime = 0;
+ p->vtime.state = VTIME_INACTIVE;
#endif

#if defined(SPLIT_RSS_COUNTING)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 67c70e287647..6e3ea4ac1bda 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -611,17 +611,23 @@ static void cputime_adjust(struct task_cputime *curr,
utime = curr->utime;

/*
- * If either stime or both stime and utime are 0, assume all runtime is
- * userspace. Once a task gets some ticks, the monotonicy code at
- * 'update' will ensure things converge to the observed ratio.
+ * If either stime or utime are 0, assume all runtime is userspace.
+ * Once a task gets some ticks, the monotonicy code at 'update:'
+ * will ensure things converge to the observed ratio.
*/
- if (stime != 0) {
- if (utime == 0)
- stime = rtime;
- else
- stime = scale_stime(stime, rtime, stime + utime);
+ if (stime == 0) {
+ utime = rtime;
+ goto update;
}

+ if (utime == 0) {
+ stime = rtime;
+ goto update;
+ }
+
+ stime = scale_stime(stime, rtime, stime + utime);
+
+update:
/*
* Make sure stime doesn't go backwards; this preserves monotonicity
* for utime because rtime is monotonic.
@@ -673,20 +679,21 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static u64 vtime_delta(struct task_struct *tsk)
+static u64 vtime_delta(struct vtime *vtime)
{
- unsigned long now = READ_ONCE(jiffies);
+ unsigned long long clock;

- if (time_before(now, (unsigned long)tsk->vtime_snap))
+ clock = sched_clock_cpu(smp_processor_id());
+ if (clock < vtime->starttime)
return 0;

- return jiffies_to_nsecs(now - tsk->vtime_snap);
+ return clock - vtime->starttime;
}

-static u64 get_vtime_delta(struct task_struct *tsk)
+static u64 get_vtime_delta(struct vtime *vtime)
{
- unsigned long now = READ_ONCE(jiffies);
- u64 delta, other;
+ u64 delta = vtime_delta(vtime);
+ u64 other;

/*
* Unlike tick based timing, vtime based timing never has lost
@@ -695,104 +702,138 @@ static u64 get_vtime_delta(struct task_struct *tsk)
* elapsed time. Limit account_other_time to prevent rounding
* errors from causing elapsed vtime to go negative.
*/
- delta = jiffies_to_nsecs(now - tsk->vtime_snap);
other = account_other_time(delta);
- WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
- tsk->vtime_snap = now;
+ WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
+ vtime->starttime += delta;

return delta - other;
}

-static void __vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_system(struct task_struct *tsk,
+ struct vtime *vtime)
+{
+ vtime->stime += get_vtime_delta(vtime);
+ if (vtime->stime >= TICK_NSEC) {
+ account_system_time(tsk, irq_count(), vtime->stime);
+ vtime->stime = 0;
+ }
+}
+
+static void vtime_account_guest(struct task_struct *tsk,
+ struct vtime *vtime)
{
- account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
+ vtime->gtime += get_vtime_delta(vtime);
+ if (vtime->gtime >= TICK_NSEC) {
+ account_guest_time(tsk, vtime->gtime);
+ vtime->gtime = 0;
+ }
}

void vtime_account_system(struct task_struct *tsk)
{
- if (!vtime_delta(tsk))
+ struct vtime *vtime = &tsk->vtime;
+
+ if (!vtime_delta(vtime))
return;

- write_seqcount_begin(&tsk->vtime_seqcount);
- __vtime_account_system(tsk);
- write_seqcount_end(&tsk->vtime_seqcount);
+ write_seqcount_begin(&vtime->seqcount);
+ /* We might have scheduled out from guest path */
+ if (current->flags & PF_VCPU)
+ vtime_account_guest(tsk, vtime);
+ else
+ __vtime_account_system(tsk, vtime);
+ write_seqcount_end(&vtime->seqcount);
}

-void vtime_account_user(struct task_struct *tsk)
+void vtime_user_enter(struct task_struct *tsk)
{
- write_seqcount_begin(&tsk->vtime_seqcount);
- tsk->vtime_snap_whence = VTIME_SYS;
- if (vtime_delta(tsk))
- account_user_time(tsk, get_vtime_delta(tsk));
- write_seqcount_end(&tsk->vtime_seqcount);
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ __vtime_account_system(tsk, vtime);
+ vtime->state = VTIME_USER;
+ write_seqcount_end(&vtime->seqcount);
}

-void vtime_user_enter(struct task_struct *tsk)
+void vtime_user_exit(struct task_struct *tsk)
{
- write_seqcount_begin(&tsk->vtime_seqcount);
- if (vtime_delta(tsk))
- __vtime_account_system(tsk);
- tsk->vtime_snap_whence = VTIME_USER;
- write_seqcount_end(&tsk->vtime_seqcount);
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->utime += get_vtime_delta(vtime);
+ if (vtime->utime >= TICK_NSEC) {
+ account_user_time(tsk, vtime->utime);
+ vtime->utime = 0;
+ }
+ vtime->state = VTIME_SYS;
+ write_seqcount_end(&vtime->seqcount);
}

void vtime_guest_enter(struct task_struct *tsk)
{
+ struct vtime *vtime = &tsk->vtime;
/*
* The flags must be updated under the lock with
- * the vtime_snap flush and update.
+ * the vtime_starttime flush and update.
* That enforces a right ordering and update sequence
* synchronization against the reader (task_gtime())
* that can thus safely catch up with a tickless delta.
*/
- write_seqcount_begin(&tsk->vtime_seqcount);
- if (vtime_delta(tsk))
- __vtime_account_system(tsk);
+ write_seqcount_begin(&vtime->seqcount);
+ __vtime_account_system(tsk, vtime);
current->flags |= PF_VCPU;
- write_seqcount_end(&tsk->vtime_seqcount);
+ write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);

void vtime_guest_exit(struct task_struct *tsk)
{
- write_seqcount_begin(&tsk->vtime_seqcount);
- __vtime_account_system(tsk);
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime_account_guest(tsk, vtime);
current->flags &= ~PF_VCPU;
- write_seqcount_end(&tsk->vtime_seqcount);
+ write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);

void vtime_account_idle(struct task_struct *tsk)
{
- account_idle_time(get_vtime_delta(tsk));
+ account_idle_time(get_vtime_delta(&tsk->vtime));
}

void arch_vtime_task_switch(struct task_struct *prev)
{
- write_seqcount_begin(&prev->vtime_seqcount);
- prev->vtime_snap_whence = VTIME_INACTIVE;
- write_seqcount_end(&prev->vtime_seqcount);
+ struct vtime *vtime = &prev->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_INACTIVE;
+ write_seqcount_end(&vtime->seqcount);
+
+ vtime = &current->vtime;

- write_seqcount_begin(&current->vtime_seqcount);
- current->vtime_snap_whence = VTIME_SYS;
- current->vtime_snap = jiffies;
- write_seqcount_end(&current->vtime_seqcount);
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_SYS;
+ vtime->starttime = sched_clock_cpu(smp_processor_id());
+ write_seqcount_end(&vtime->seqcount);
}

void vtime_init_idle(struct task_struct *t, int cpu)
{
+ struct vtime *vtime = &t->vtime;
unsigned long flags;

local_irq_save(flags);
- write_seqcount_begin(&t->vtime_seqcount);
- t->vtime_snap_whence = VTIME_SYS;
- t->vtime_snap = jiffies;
- write_seqcount_end(&t->vtime_seqcount);
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_SYS;
+ vtime->starttime = sched_clock_cpu(cpu);
+ write_seqcount_end(&vtime->seqcount);
local_irq_restore(flags);
}

u64 task_gtime(struct task_struct *t)
{
+ struct vtime *vtime = &t->vtime;
unsigned int seq;
u64 gtime;

@@ -800,13 +841,13 @@ u64 task_gtime(struct task_struct *t)
return t->gtime;

do {
- seq = read_seqcount_begin(&t->vtime_seqcount);
+ seq = read_seqcount_begin(&vtime->seqcount);

gtime = t->gtime;
- if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
- gtime += vtime_delta(t);
+ if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+ gtime += vtime->gtime + vtime_delta(vtime);

- } while (read_seqcount_retry(&t->vtime_seqcount, seq));
+ } while (read_seqcount_retry(&vtime->seqcount, seq));

return gtime;
}
@@ -818,8 +859,9 @@ u64 task_gtime(struct task_struct *t)
*/
void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
- u64 delta;
+ struct vtime *vtime = &t->vtime;
unsigned int seq;
+ u64 delta;

if (!vtime_accounting_enabled()) {
*utime = t->utime;
@@ -828,25 +870,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
}

do {
- seq = read_seqcount_begin(&t->vtime_seqcount);
+ seq = read_seqcount_begin(&vtime->seqcount);

*utime = t->utime;
*stime = t->stime;

/* Task is sleeping, nothing to add */
- if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
+ if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
continue;

- delta = vtime_delta(t);
+ delta = vtime_delta(vtime);

/*
* Task runs either in user or kernel space, add pending nohz time to
* the right place.
*/
- if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
- *utime += delta;
- else if (t->vtime_snap_whence == VTIME_SYS)
- *stime += delta;
- } while (read_seqcount_retry(&t->vtime_seqcount, seq));
+ if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
+ *utime += vtime->utime + delta;
+ else if (vtime->state == VTIME_SYS)
+ *stime += vtime->stime + delta;
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 008c514dc241..c95880e216f6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6646,10 +6646,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* our sched_group. We may want to revisit it if we couldn't
* meet load balance goals by pulling other tasks on src_cpu.
*
- * Also avoid computing new_dst_cpu if we have already computed
- * one in current iteration.
+ * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
+ * already computed one in current iteration.
*/
- if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+ if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
return 0;

/* Prevent to re-select dst_cpu via env's cpus */
@@ -8022,14 +8022,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.tasks = LIST_HEAD_INIT(env.tasks),
};

- /*
- * For NEWLY_IDLE load_balancing, we don't need to consider
- * other cpus in our group
- */
- if (idle == CPU_NEWLY_IDLE)
- env.dst_grpmask = NULL;
-
- cpumask_copy(cpus, cpu_active_mask);
+ cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);

schedstat_inc(sd->lb_count[idle]);

@@ -8151,7 +8144,15 @@ static int load_balance(int this_cpu, struct rq *this_rq,
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
- if (!cpumask_empty(cpus)) {
+ /*
+ * Attempting to continue load balancing at the current
+ * sched_domain level only makes sense if there are
+ * active CPUs remaining as possible busiest CPUs to
+ * pull load from which are not contained within the
+ * destination group that is receiving any migrated
+ * load.
+ */
+ if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
goto redo;
@@ -8447,6 +8448,13 @@ static int active_load_balance_cpu_stop(void *data)
.src_cpu = busiest_rq->cpu,
.src_rq = busiest_rq,
.idle = CPU_IDLE,
+ /*
+ * can_migrate_task() doesn't need to compute new_dst_cpu
+ * for active balancing. Since we have CPU_IDLE, but no
+ * @dst_grpmask we need to make that test go away with lying
+ * about DST_PINNED.
+ */
+ .flags = LBF_DST_PINNED,
};

schedstat_inc(sd->alb_count);