[RFC PATCH 3/4] sched/fair: Do active load balance in cfs_migration

From: Yafang Shao
Date: Thu Nov 04 2021 - 10:57:42 EST


The active load balance has a known issue[1][2]: there is a race window
between waking up the migration thread on the busiest CPU and that thread
actually preempting the currently running CFS task. In this window the
running CFS task may first be preempted by an RT task, and then the RT
task is in turn preempted by the newly woken migration thread. Per our
tracing, the latency caused by this preemption can exceed 1ms, which is
not a small latency for RT tasks.

Now that cfs_migration has been introduced, we can hand the active load
balance work over to it. Once latency-sensitive RT tasks are set to a
higher priority than the cfs_migration threads, they can no longer be
preempted by this work; see the example sketched below the references.

[1]. https://lore.kernel.org/lkml/CAKfTPtBygNcVewbb0GQOP5xxO96am3YeTZNP5dK9BxKHJJAL-g@xxxxxxxxxxxxxx/
[2]. https://lore.kernel.org/lkml/20210615121551.31138-1-laoar.shao@xxxxxxxxx/
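
As an illustration of that last point, a latency-sensitive task can be
raised above the migration thread from userspace roughly as follows. This
is only a sketch: it assumes the cfs_migration kthreads run at the lowest
SCHED_FIFO priority (as set up earlier in this series), and priority 50 is
just an example value; the caller also needs CAP_SYS_NICE or a suitable
RLIMIT_RTPRIO.

#include <sched.h>
#include <stdio.h>

int main(void)
{
	/* Assumed to be above the cfs_migration kthreads' priority. */
	struct sched_param sp = { .sched_priority = 50 };

	if (sched_setscheduler(0, SCHED_FIFO, &sp)) {
		perror("sched_setscheduler");
		return 1;
	}
	/* ... latency-sensitive work ... */
	return 0;
}

The same can be done from a shell with "chrt -f -p 50 <pid>".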

Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx>
Cc: Valentin Schneider <valentin.schneider@xxxxxxx>
Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
---
kernel/sched/fair.c | 37 +++++++++++++++++++++++++++++++++++--
1 file changed, 35 insertions(+), 2 deletions(-)
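
A note for reviewers: the consumer side lives in the cfs_migration kthread
introduced earlier in this series and is not visible in this diff. It is
expected to pop a queued work item and invoke its callback, roughly along
the lines of the sketch below (the field names follow the structures used
in this series, but the function itself is illustrative rather than the
exact code from that patch):

/* Illustrative only: dequeue one queued work item and run its callback. */
static void cfs_migration_thread_fn(unsigned int cpu)
{
	struct cfs_migrater *migrater = &per_cpu(cfs_migrater, cpu);
	struct cpu_stop_work *work = NULL;

	raw_spin_lock_irq(&migrater->lock);
	if (!list_empty(&migrater->works)) {
		work = list_first_entry(&migrater->works,
					struct cpu_stop_work, list);
		list_del_init(&work->list);
	}
	raw_spin_unlock_irq(&migrater->lock);

	if (work)
		work->fn(work->arg);
}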

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56b3fa91828b..932f63baeb82 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9807,6 +9807,8 @@ static int need_active_balance(struct lb_env *env)
}

static int active_load_balance_cpu_stop(void *data);
+static void wakeup_cfs_migrater_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+				       struct cpu_stop_work *work_buf);

static int should_we_balance(struct lb_env *env)
{
@@ -10049,7 +10051,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		raw_spin_rq_unlock_irqrestore(busiest, flags);
 
 		if (active_balance) {
-			stop_one_cpu_nowait(cpu_of(busiest),
+			wakeup_cfs_migrater_nowait(cpu_of(busiest),
 				active_load_balance_cpu_stop, busiest,
 				&busiest->active_balance_work);
 		}
@@ -10147,7 +10149,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
}

/*
- * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
+ * active_load_balance_cpu_stop is run by the CFS migrater. It pushes
  * running tasks off the busiest CPU onto idle CPUs. It requires at
  * least 1 task to be running on each physical CPU where possible, and
  * avoids physical / logical imbalances.
@@ -11927,6 +11929,37 @@ struct cfs_migrater {

DEFINE_PER_CPU(struct cfs_migrater, cfs_migrater);

+static void __cfs_migration_queue_work(struct cfs_migrater *migrater,
+				       struct cpu_stop_work *work,
+				       struct wake_q_head *wakeq)
+{
+	list_add_tail(&work->list, &migrater->works);
+	wake_q_add(wakeq, migrater->thread);
+}
+
+/* Queue @work to @migrater and wake its migration thread to process it. */
+static void cfs_migration_queue_work(unsigned int cpu, struct cpu_stop_work *work)
+{
+	struct cfs_migrater *migrater = &per_cpu(cfs_migrater, cpu);
+	DEFINE_WAKE_Q(wakeq);
+	unsigned long flags;
+
+	preempt_disable();
+	raw_spin_lock_irqsave(&migrater->lock, flags);
+	__cfs_migration_queue_work(migrater, work, &wakeq);
+	raw_spin_unlock_irqrestore(&migrater->lock, flags);
+
+	wake_up_q(&wakeq);
+	preempt_enable();
+}
+
+static void wakeup_cfs_migrater_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+				       struct cpu_stop_work *work_buf)
+{
+	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
+	cfs_migration_queue_work(cpu, work_buf);
+}
+
static int cfs_migration_should_run(unsigned int cpu)
{
 	struct cfs_migrater *migrater = &per_cpu(cfs_migrater, cpu);
--
2.17.1