[patch 10/10] sched/core: Make migrate disable and CPU hotplug cooperative

From: Thomas Gleixner
Date: Thu Sep 17 2020 - 06:51:34 EST


On CPU unplug, tasks which are in a migrate disabled region cannot be pushed
to a different CPU until they have returned to a migratable state.

Account the number of tasks on a runqueue which are in a migrate disabled
section and make the hotplug wait mechanism respect that.

Originally-by: Scott Wood <swood@xxxxxxxxxx>
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
---
kernel/sched/core.c | 38 ++++++++++++++++++++++++++++++++++----
kernel/sched/sched.h | 4 ++++
2 files changed, 38 insertions(+), 4 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -494,6 +494,11 @@ static bool task_self_migration(struct t
return true;
}

+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+ return rq->nr_pinned > 0;
+}
+
#else /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) */
static inline void task_lock_migration_ctrl(struct task_struct *p) { }
static inline void task_unlock_migration_ctrl(struct task_struct *p) { }
@@ -504,6 +509,10 @@ static bool task_self_migration(struct t
{
return false;
}
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+ return false;
+}
#endif /* !(defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)) */

/*
@@ -3591,6 +3600,12 @@ void migrate_disable(void)
if (!current->migration_ctrl.disable_cnt) {
raw_spin_lock_irqsave(&current->pi_lock, flags);
current->migration_ctrl.disable_cnt++;
+ /*
+ * Account the pinned task in the runqueue so that a
+ * possible CPU hot unplug operation will wait until
+ * this task has left the migrate disabled section.
+ */
+ this_rq()->nr_pinned++;
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
} else {
current->migration_ctrl.disable_cnt++;
@@ -3619,6 +3634,13 @@ void migrate_enable(void)
p->migration_ctrl.pending = NULL;

/*
+ * Adjust the number of pinned tasks in the runqueue. No further
+ * action is required here. A CPU hot unplug operation which might
+ * be waiting will be woken up once the CPU goes through idle.
+ */
+ this_rq()->nr_pinned--;
+
+ /*
* If the task was never scheduled out while in the migrate
* disabled region and there is no migration request pending,
* return.
@@ -6989,8 +7011,13 @@ static bool balance_push(struct rq *rq)
* last task to vanish. The rcuwait_active() check is
* accurate here because the waiter is pinned on this CPU
* and can't obviously be running in parallel.
+ *
+ * On RT kernels this also has to check whether there are
+ * pinned and scheduled out tasks on the runqueue. They
+ * need to leave the migrate disabled section first.
*/
- if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
+ if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
+ rcuwait_active(&rq->hotplug_wait)) {
raw_spin_unlock(&rq->lock);
rcuwait_wake_up(&rq->hotplug_wait);
raw_spin_lock(&rq->lock);
@@ -7033,13 +7060,16 @@ static void balance_push_set(int cpu, bo
* Invoked from a CPUs hotplug control thread after the CPU has been marked
* inactive. All tasks which are not per CPU kernel threads are either
* pushed off this CPU now via balance_push() or placed on a different CPU
- * during wakeup. Wait until the CPU is quiescent.
+ * during wakeup. Wait until the CPU is quiescent. On RT kernels this also
+ * waits for pinned non-runnable tasks to leave the migrate disabled
+ * section.
*/
static void balance_hotplug_wait(void)
{
struct rq *rq = this_rq();

- rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
+ rcuwait_wait_event(&rq->hotplug_wait,
+ rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
TASK_UNINTERRUPTIBLE);
}

@@ -7279,7 +7309,7 @@ int sched_cpu_dying(unsigned int cpu)
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
- BUG_ON(rq->nr_running != 1);
+ BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
rq_unlock_irqrestore(rq, &rf);

calc_load_migrate(rq);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1053,6 +1053,10 @@ struct rq {
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
#endif
+
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+ unsigned int nr_pinned;
+#endif
};

#ifdef CONFIG_FAIR_GROUP_SCHED