[RFC PATCH 3/3] sched: Introduce WALT hooks into core and scheduling classes

From: Vikram Mulukutla
Date: Fri Oct 28 2016 - 03:12:05 EST


From: Srivatsa Vaddagiri <vatsa@xxxxxxxxxxxxxx>

Add the necessary hooks to the scheduler core and the various
scheduling classes so that WALT can track CPU utilization and
handle task migration between CPUs.

With CONFIG_SCHED_WALT enabled, schedutil will use WALT's CPU
utilization metric by default. This can be switched to PELT's
util_avg at runtime with the following command:

echo 0 > /proc/sys/kernel/sched_use_walt_metrics

Signed-off-by: Srivatsa Vaddagiri <vatsa@xxxxxxxxxxxxxx>
Signed-off-by: Vikram Mulukutla <markivx@xxxxxxxxxxxxxx>
---
kernel/sched/core.c | 29 ++++++++++++++++++++++++++++-
kernel/sched/deadline.c | 7 +++++++
kernel/sched/debug.c | 9 +++++++++
kernel/sched/fair.c | 9 +++++++--
kernel/sched/rt.c | 6 ++++++
5 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44817c6..3b7f67d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -91,6 +91,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

+#include "walt.h"
+
DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

@@ -991,6 +993,7 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new

p->on_rq = TASK_ON_RQ_MIGRATING;
dequeue_task(rq, p, 0);
+ walt_prepare_migrate(p, rq, true);
set_task_cpu(p, new_cpu);
raw_spin_unlock(&rq->lock);

@@ -998,6 +1001,7 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new

raw_spin_lock(&rq->lock);
BUG_ON(task_cpu(p) != new_cpu);
+ walt_finish_migrate(p, rq, true);
enqueue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
@@ -1257,7 +1261,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)

p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
+ walt_prepare_migrate(p, src_rq, true);
set_task_cpu(p, cpu);
+ walt_finish_migrate(p, dst_rq, true);
activate_task(dst_rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
@@ -2072,13 +2078,19 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);

+ raw_spin_lock(&task_rq(p)->lock);
+ walt_update_task_ravg(p, task_rq(p), TASK_WAKE, walt_ktime_clock(), 0);
+ raw_spin_unlock(&task_rq(p)->lock);
+
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;

cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
+ walt_prepare_migrate(p, task_rq(p), false);
set_task_cpu(p, cpu);
+ walt_finish_migrate(p, cpu_rq(cpu), false);
}
#endif /* CONFIG_SMP */

@@ -2129,8 +2141,10 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie

trace_sched_waking(p);

- if (!task_on_rq_queued(p))
+ if (!task_on_rq_queued(p)) {
+ walt_update_task_ravg(p, rq, TASK_WAKE, walt_ktime_clock(), 0);
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ }

ttwu_do_wakeup(rq, p, 0, cookie);
if (schedstat_enabled())
@@ -2196,6 +2210,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+ walt_init_new_task_load(p);

#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
@@ -2570,6 +2585,8 @@ void wake_up_new_task(struct task_struct *p)
rq = __task_rq_lock(p, &rf);
post_init_entity_util_avg(&p->se);

+ walt_mark_task_starting(p);
+
activate_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
@@ -3071,6 +3088,7 @@ void scheduler_tick(void)
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
cpu_load_update_active(rq);
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, walt_ktime_clock(), 0);
calc_global_load_tick(rq);
raw_spin_unlock(&rq->lock);

@@ -3322,6 +3340,7 @@ static void __sched notrace __schedule(bool preempt)
struct pin_cookie cookie;
struct rq *rq;
int cpu;
+ u64 wallclock;

cpu = smp_processor_id();
rq = cpu_rq(cpu);
@@ -3385,6 +3404,9 @@ static void __sched notrace __schedule(bool preempt)
update_rq_clock(rq);

next = pick_next_task(rq, prev, cookie);
+ wallclock = walt_ktime_clock();
+ walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+ walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->clock_skip_update = 0;
@@ -7284,6 +7306,8 @@ static void sched_rq_cpu_starting(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);

+ walt_set_window_start(rq);
+
rq->calc_load_update = calc_load_update;
update_max_interval();
}
@@ -7304,6 +7328,9 @@ int sched_cpu_dying(unsigned int cpu)
/* Handle pending wakeups and then migrate everything off */
sched_ttwu_pending();
raw_spin_lock_irqsave(&rq->lock, flags);
+
+ walt_migrate_sync_cpu(cpu);
+
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 1ce8867..0dd3c1f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -15,6 +15,7 @@
* Fabio Checconi <fchecconi@xxxxxxxxx>
*/
#include "sched.h"
+#include "walt.h"

#include <linux/slab.h>

@@ -278,7 +279,9 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* By now the task is replenished and enqueued; migrate it.
*/
deactivate_task(rq, p, 0);
+ walt_prepare_migrate(p, rq, true);
set_task_cpu(p, later_rq->cpu);
+ walt_finish_migrate(p, later_rq, true);
activate_task(later_rq, p, 0);

if (!fallback)
@@ -1512,7 +1515,9 @@ retry:
}

deactivate_task(rq, next_task, 0);
+ walt_prepare_migrate(next_task, rq, true);
set_task_cpu(next_task, later_rq->cpu);
+ walt_finish_migrate(next_task, later_rq, true);
activate_task(later_rq, next_task, 0);
ret = 1;

@@ -1600,7 +1605,9 @@ static void pull_dl_task(struct rq *this_rq)
resched = true;

deactivate_task(src_rq, p, 0);
+ walt_prepare_migrate(p, src_rq, true);
set_task_cpu(p, this_cpu);
+ walt_finish_migrate(p, this_rq, true);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2a0a999..ab10031 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -607,6 +607,15 @@ do { \
P(nr_switches);
P(nr_load_updates);
P(nr_uninterruptible);
+#ifdef CONFIG_SMP
+ P(cpu_capacity_orig);
+ P(cpu_capacity);
+#ifdef CONFIG_SCHED_WALT
+ P(window_start);
+ P(curr_runnable_sum);
+ P(prev_runnable_sum);
+#endif
+#endif
PN(next_balance);
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
PN(clock);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 39c826d..182dcd3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -34,6 +34,7 @@
#include <trace/events/sched.h>

#include "sched.h"
+#include "walt.h"

/*
* Targeted preemption latency for CPU-bound tasks:
@@ -2885,6 +2886,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)

if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
unsigned long max = rq->cpu_capacity_orig;
+ unsigned long util = cpu_walt_util(rq);

/*
* There are a few boundary cases this might miss but it should
@@ -2902,8 +2904,8 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
*
* See cpu_util().
*/
- cpufreq_update_util(rq_clock(rq),
- min(cfs_rq->avg.util_avg, max), max);
+
+ cpufreq_update_util(rq_clock(rq), min(util, max), max);
}
}

@@ -6205,7 +6207,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env)

p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(env->src_rq, p, 0);
+ walt_prepare_migrate(p, env->src_rq, true);
set_task_cpu(p, env->dst_cpu);
+ /* update WALT later under the dest rq's lock */
}

/*
@@ -6337,6 +6341,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
lockdep_assert_held(&rq->lock);

BUG_ON(task_rq(p) != rq);
+ walt_finish_migrate(p, rq, true);
activate_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d5690b7..130040c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -8,6 +8,8 @@
#include <linux/slab.h>
#include <linux/irq_work.h>

+#include "walt.h"
+
int sched_rr_timeslice = RR_TIMESLICE;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
@@ -1843,7 +1845,9 @@ retry:
}

deactivate_task(rq, next_task, 0);
+ walt_prepare_migrate(next_task, rq, true);
set_task_cpu(next_task, lowest_rq->cpu);
+ walt_finish_migrate(next_task, lowest_rq, true);
activate_task(lowest_rq, next_task, 0);
ret = 1;

@@ -2097,7 +2101,9 @@ static void pull_rt_task(struct rq *this_rq)
resched = true;

deactivate_task(src_rq, p, 0);
+ walt_prepare_migrate(p, src_rq, true);
set_task_cpu(p, this_cpu);
+ walt_finish_migrate(p, this_rq, true);
activate_task(this_rq, p, 0);
/*
* We continue with the search, just in
--
TheMan