[PATCH 4/4] sched/core: Update core scheduler queue when taking cpu online/offline

From: Tim Chen
Date: Tue Jan 07 2020 - 16:26:54 EST


When we bring a CPU online with core scheduling enabled, tasks that need
core scheduling must be placed in the core's core scheduling queue.
Likewise, when we take a CPU offline or disable core scheduling on a
core, tasks in the core's core scheduling queue must be removed.
Without such mechanisms, the core scheduler can oops because a task's
core scheduling state becomes inconsistent.

Implement enqueue and dequeue of tasks according to a CPU's change in
core scheduling status. Switching a core's scheduling mode, and
enqueueing/dequeueing tasks on the core's queue as a result of that
switch, must run in a separate context because they cannot be done in
the context that takes the CPU online/offline.
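
For reference, the deferral added below looks roughly like this (a
condensed sketch of the hunks that follow; allocation-failure paths and
the per-class task walks are elided, and the stop_machine() callback
__activate_cpu_core_sched() is the one defined in the patch):

	/* Workqueue context: safe to run stop_machine() and walk the rqs. */
	static void core_sched_cpu_activate_fn(struct work_struct *w)
	{
		struct core_sched_cpu_work *work =
			container_of(w, struct core_sched_cpu_work, work);

		if (static_branch_unlikely(&__sched_core_enabled))
			stop_machine(__activate_cpu_core_sched, work, NULL);
		kfree(work);
	}

	/* Hotplug path: only snapshot the SMT mask and defer the real work. */
	static void core_sched_cpu_update(unsigned int cpu, enum cpu_action action)
	{
		struct core_sched_cpu_work *work;

		work = kmalloc(sizeof(*work), GFP_ATOMIC);
		if (!work)
			return;
		INIT_WORK(&work->work, action == CPU_ACTIVATE ?
			  core_sched_cpu_activate_fn : core_sched_cpu_deactivate_fn);
		cpumask_copy(&work->smt_mask, cpu_smt_mask(cpu));
		queue_work(system_highpri_wq, &work->work);
	}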

Signed-off-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
---
kernel/sched/core.c | 156 ++++++++++++++++++++++++++++++++++++----
kernel/sched/deadline.c | 35 +++++++++
kernel/sched/fair.c | 38 ++++++++++
kernel/sched/rt.c | 43 +++++++++++
kernel/sched/sched.h | 7 ++
5 files changed, 264 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9d875d6ed3f3..8db8960c6e69 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,11 @@ int sysctl_sched_rt_runtime = 950000;

#ifdef CONFIG_SCHED_CORE

+struct core_sched_cpu_work {
+ struct work_struct work;
+ cpumask_t smt_mask;
+};
+
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);

/* kernel prio, less is more */
@@ -207,6 +212,18 @@ static void sched_core_dequeue(struct rq *rq, struct task_struct *p)
RB_CLEAR_NODE(&p->core_node);
}

+void sched_core_add(struct rq *rq, struct task_struct *p)
+{
+ if (p->core_cookie && task_on_rq_queued(p))
+ sched_core_enqueue(rq, p);
+}
+
+void sched_core_remove(struct rq *rq, struct task_struct *p)
+{
+ if (sched_core_enqueued(p))
+ sched_core_dequeue(rq, p);
+}
+
/*
* Find left-most (aka, highest priority) task matching @cookie.
*/
@@ -329,11 +346,133 @@ void sched_core_put(void)
mutex_unlock(&sched_core_mutex);
}

+enum cpu_action {
+ CPU_ACTIVATE = 1,
+ CPU_DEACTIVATE = 2
+};
+
+static int __activate_cpu_core_sched(void *data);
+static int __deactivate_cpu_core_sched(void *data);
+static void core_sched_cpu_update(unsigned int cpu, enum cpu_action action);
+
+static int activate_cpu_core_sched(struct core_sched_cpu_work *work)
+{
+ if (static_branch_unlikely(&__sched_core_enabled))
+ stop_machine(__activate_cpu_core_sched, (void *) work, NULL);
+
+ return 0;
+}
+
+static int deactivate_cpu_core_sched(struct core_sched_cpu_work *work)
+{
+ if (static_branch_unlikely(&__sched_core_enabled))
+ stop_machine(__deactivate_cpu_core_sched, (void *) work, NULL);
+
+ return 0;
+}
+
+static void core_sched_cpu_activate_fn(struct work_struct *work)
+{
+ struct core_sched_cpu_work *cpu_work;
+
+ cpu_work = container_of(work, struct core_sched_cpu_work, work);
+ activate_cpu_core_sched(cpu_work);
+ kfree(cpu_work);
+}
+
+static void core_sched_cpu_deactivate_fn(struct work_struct *work)
+{
+ struct core_sched_cpu_work *cpu_work;
+
+ cpu_work = container_of(work, struct core_sched_cpu_work, work);
+ deactivate_cpu_core_sched(cpu_work);
+ kfree(cpu_work);
+}
+
+static void core_sched_cpu_update(unsigned int cpu, enum cpu_action action)
+{
+ struct core_sched_cpu_work *work;
+
+ work = kmalloc(sizeof(struct core_sched_cpu_work), GFP_ATOMIC);
+ if (!work)
+ return;
+
+ if (action == CPU_ACTIVATE)
+ INIT_WORK(&work->work, core_sched_cpu_activate_fn);
+ else
+ INIT_WORK(&work->work, core_sched_cpu_deactivate_fn);
+
+ cpumask_copy(&work->smt_mask, cpu_smt_mask(cpu));
+
+ queue_work(system_highpri_wq, &work->work);
+}
+
+static int __activate_cpu_core_sched(void *data)
+{
+ struct core_sched_cpu_work *work = (struct core_sched_cpu_work *) data;
+ struct rq *rq;
+ int i;
+
+ if (cpumask_weight(&work->smt_mask) < 2)
+ return 0;
+
+ for_each_cpu(i, &work->smt_mask) {
+ const struct sched_class *class;
+
+ rq = cpu_rq(i);
+
+ if (rq->core_enabled)
+ continue;
+
+ for_each_class(class) {
+ if (!class->core_sched_activate)
+ continue;
+
+ if (cpu_online(i))
+ class->core_sched_activate(rq);
+ }
+
+ rq->core_enabled = true;
+ }
+ return 0;
+}
+
+static int __deactivate_cpu_core_sched(void *data)
+{
+ struct core_sched_cpu_work *work = (struct core_sched_cpu_work *) data;
+ struct rq *rq;
+ int i;
+
+ if (cpumask_weight(&work->smt_mask) > 2)
+ return 0;
+
+ for_each_cpu(i, &work->smt_mask) {
+ const struct sched_class *class;
+
+ rq = cpu_rq(i);
+
+ if (!rq->core_enabled)
+ continue;
+
+ for_each_class(class) {
+ if (!class->core_sched_deactivate)
+ continue;
+
+ if (cpu_online(i))
+ class->core_sched_deactivate(cpu_rq(i));
+ }
+
+ rq->core_enabled = false;
+ }
+ return 0;
+}
+
#else /* !CONFIG_SCHED_CORE */

static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
static bool sched_core_enqueued(struct task_struct *task) { return false; }
+static inline void core_sched_cpu_update(unsigned int cpu, int action) { }

#endif /* CONFIG_SCHED_CORE */

@@ -6941,13 +7080,8 @@ int sched_cpu_activate(unsigned int cpu)
*/
if (cpumask_weight(cpu_smt_mask(cpu)) == 2) {
static_branch_inc_cpuslocked(&sched_smt_present);
-#ifdef CONFIG_SCHED_CORE
- if (static_branch_unlikely(&__sched_core_enabled)) {
- rq->core_enabled = true;
- }
-#endif
}
-
+ core_sched_cpu_update(cpu, CPU_ACTIVATE);
#endif
set_cpu_active(cpu, true);

@@ -6996,15 +7130,10 @@ int sched_cpu_deactivate(unsigned int cpu)
* When going down, decrement the number of cores with SMT present.
*/
if (cpumask_weight(cpu_smt_mask(cpu)) == 2) {
-#ifdef CONFIG_SCHED_CORE
- struct rq *rq = cpu_rq(cpu);
- if (static_branch_unlikely(&__sched_core_enabled)) {
- rq->core_enabled = false;
- }
-#endif
static_branch_dec_cpuslocked(&sched_smt_present);

}
+ core_sched_cpu_update(cpu, CPU_DEACTIVATE);
#endif

if (!sched_smp_initialized)
@@ -7081,9 +7210,6 @@ int sched_cpu_dying(unsigned int cpu)
update_max_interval();
nohz_balance_exit_idle(rq);
hrtick_clear(rq);
-#ifdef CONFIG_SCHED_CORE
- rq->core = NULL;
-#endif
return 0;
}
#endif
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 514b6328262f..6bb69d42965b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1755,6 +1755,37 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}

+static void for_each_dl_task(struct rq *rq,
+ void (*fn)(struct rq *rq, struct task_struct *p))
+{
+ struct dl_rq *dl_rq = &rq->dl;
+ struct sched_dl_entity *dl_ent;
+ struct task_struct *task;
+ struct rb_node *rb_node;
+
+ rb_node = rb_first_cached(&dl_rq->root);
+ while (rb_node) {
+ dl_ent = rb_entry(rb_node, struct sched_dl_entity, rb_node);
+ task = dl_task_of(dl_ent);
+ fn(rq, task);
+ rb_node = rb_next(rb_node);
+ }
+}
+
+#ifdef CONFIG_SCHED_CORE
+
+static void core_sched_activate_dl(struct rq *rq)
+{
+ for_each_dl_task(rq, sched_core_add);
+}
+
+static void core_sched_deactivate_dl(struct rq *rq)
+{
+ for_each_dl_task(rq, sched_core_remove);
+}
+
+#endif
+
static struct task_struct *pick_task_dl(struct rq *rq)
{
struct sched_dl_entity *dl_se;
@@ -2430,6 +2461,10 @@ const struct sched_class dl_sched_class = {
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
.task_woken = task_woken_dl,
+#ifdef CONFIG_SCHED_CORE
+ .core_sched_activate = core_sched_activate_dl,
+ .core_sched_deactivate = core_sched_deactivate_dl,
+#endif
#endif

.task_tick = task_tick_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4728f5ed45aa..6cfcced2b0bd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10058,6 +10058,40 @@ static void rq_offline_fair(struct rq *rq)
unthrottle_offline_cfs_rqs(rq);
}

+static void for_each_fair_task(struct rq *rq,
+ void (*fn)(struct rq *rq, struct task_struct *p))
+{
+ struct cfs_rq *cfs_rq, *pos;
+ struct sched_entity *se;
+ struct task_struct *task;
+
+ for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
+ for (se = __pick_first_entity(cfs_rq);
+ se != NULL;
+ se = __pick_next_entity(se)) {
+
+ if (!entity_is_task(se))
+ continue;
+
+ task = task_of(se);
+ fn(rq, task);
+ }
+ }
+}
+
+#ifdef CONFIG_SCHED_CORE
+
+static void core_sched_activate_fair(struct rq *rq)
+{
+ for_each_fair_task(rq, sched_core_add);
+}
+
+static void core_sched_deactivate_fair(struct rq *rq)
+{
+ for_each_fair_task(rq, sched_core_remove);
+}
+
+#endif
#endif /* CONFIG_SMP */

#ifdef CONFIG_SCHED_CORE
@@ -10612,6 +10646,10 @@ const struct sched_class fair_sched_class = {

.task_dead = task_dead_fair,
.set_cpus_allowed = set_cpus_allowed_common,
+#ifdef CONFIG_SCHED_CORE
+ .core_sched_activate = core_sched_activate_fair,
+ .core_sched_deactivate = core_sched_deactivate_fair,
+#endif
#endif

.task_tick = task_tick_fair,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4714630a90b9..c6694e45b255 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1548,6 +1548,45 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return rt_task_of(rt_se);
}

+static void for_each_rt_task(struct rq *rq,
+ void (*fn)(struct rq *rq, struct task_struct *p))
+{
+ rt_rq_iter_t iter;
+ struct rt_prio_array *array;
+ struct list_head *queue;
+ int i;
+ struct rt_rq *rt_rq = &rq->rt;
+ struct sched_rt_entity *rt_se = NULL;
+ struct task_struct *task;
+
+ for_each_rt_rq(rt_rq, iter, rq) {
+ array = &rt_rq->active;
+ for (i = 0; i < MAX_RT_PRIO; i++) {
+ queue = array->queue + i;
+ list_for_each_entry(rt_se, queue, run_list) {
+ if (rt_entity_is_task(rt_se)) {
+ task = rt_task_of(rt_se);
+ fn(rq, task);
+ }
+ }
+ }
+ }
+}
+
+#ifdef CONFIG_SCHED_CORE
+
+static void core_sched_activate_rt(struct rq *rq)
+{
+ for_each_rt_task(rq, sched_core_add);
+}
+
+static void core_sched_deactivate_rt(struct rq *rq)
+{
+ for_each_rt_task(rq, sched_core_remove);
+}
+
+#endif
+
static struct task_struct *pick_task_rt(struct rq *rq)
{
struct task_struct *p;
@@ -2382,6 +2421,10 @@ const struct sched_class rt_sched_class = {
.rq_offline = rq_offline_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
+#ifdef CONFIG_SCHED_CORE
+ .core_sched_activate = core_sched_activate_rt,
+ .core_sched_deactivate = core_sched_deactivate_rt,
+#endif
#endif

.task_tick = task_tick_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4844e703298a..c2068f2e2dd2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1055,6 +1055,9 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)

extern void queue_core_balance(struct rq *rq);

+void sched_core_add(struct rq *rq, struct task_struct *p);
+void sched_core_remove(struct rq *rq, struct task_struct *p);
+
#else /* !CONFIG_SCHED_CORE */

static inline bool sched_core_enabled(struct rq *rq)
@@ -1838,6 +1841,10 @@ struct sched_class {

void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
+#ifdef CONFIG_SCHED_CORE
+ void (*core_sched_activate)(struct rq *rq);
+ void (*core_sched_deactivate)(struct rq *rq);
+#endif
#endif

void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
--
2.20.1