[RFC PATCH 1/5] sched/fair: Add push task framework

From: K Prateek Nayak
Date: Wed Apr 09 2025 - 07:17:25 EST


From: Vincent Guittot <vincent.guittot@xxxxxxxxxx>

Add the skeleton of the push task infrastructure. The empty
push_fair_task() stub will be fleshed out in subsequent commits to
implement proactive idle balancing.
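
The pieces introduced here hang together roughly as follows
(illustrative call flow only; the helpers are the ones added by the
diff below):

    put_prev_task_fair() / pick_next_task_fair()
        fair_add_pushable_task(rq, prev)    /* prev may be pushable */

    __set_next_task_fair()
        fair_remove_pushable_task(rq, p)    /* running => not pushable */
        fair_queue_pushable_tasks(rq)       /* queue balance callback */

    balance callback
        push_fair_tasks(rq)
            while (push_fair_task(rq))      /* empty stub for now */
                ;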

[ prateek: Broke off relevant bits from [1] ]

Link: https://lore.kernel.org/all/20250302210539.1563190-6-vincent.guittot@xxxxxxxxxx/ [1]
Signed-off-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Signed-off-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
---
kernel/sched/fair.c | 85 ++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 2 ++
2 files changed, 87 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0c19459c8042..98d3ed2078cd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7044,6 +7044,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
hrtick_update(rq);
}

+static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p);
static void set_next_buddy(struct sched_entity *se);

/*
@@ -7074,6 +7075,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
h_nr_idle = task_has_idle_policy(p);
if (task_sleep || task_delayed || !se->sched_delayed)
h_nr_runnable = 1;
+
+ fair_remove_pushable_task(rq, p);
} else {
cfs_rq = group_cfs_rq(se);
slice = cfs_rq_min_slice(cfs_rq);
@@ -8556,6 +8559,64 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
return target;
}

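+/*
+ * A task is a candidate for pushing if it is queued on the rq, is not
+ * a delayed-dequeue task and is allowed to run on more than one CPU.
+ */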
+static inline bool fair_push_task(struct task_struct *p)
+{
+ if (!task_on_rq_queued(p))
+ return false;
+
+ if (p->se.sched_delayed)
+ return false;
+
+ if (p->nr_cpus_allowed == 1)
+ return false;
+
+ return true;
+}
+
+static inline int has_pushable_tasks(struct rq *rq)
+{
+ return !plist_head_empty(&rq->cfs.pushable_tasks);
+}
+
+/*
+ * See if the non-running fair tasks on this rq can be sent to other CPUs
+ * that fit better with their profile.
+ */
+static bool push_fair_task(struct rq *rq)
+{
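+ /* Stub: proactive idle balancing is implemented by later commits. */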
+ return false;
+}
+
+static void push_fair_tasks(struct rq *rq)
+{
+ /* push_fair_task() will return true if it moved a fair task */
+ while (push_fair_task(rq))
+ ;
+}
+
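+/* Per-CPU balance callback head used to queue push_fair_tasks() */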
+static DEFINE_PER_CPU(struct balance_callback, fair_push_head);
+
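+/*
+ * Queue a balance callback to run push_fair_tasks() if this rq has
+ * pushable tasks.
+ */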
+static inline void fair_queue_pushable_tasks(struct rq *rq)
+{
+ if (!has_pushable_tasks(rq))
+ return;
+
+ queue_balance_callback(rq, &per_cpu(fair_push_head, rq->cpu), push_fair_tasks);
+}
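+
+/*
+ * Stop tracking p as pushable once it becomes the running task or
+ * leaves the rq.
+ */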
+static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p)
+{
+ plist_del(&p->pushable_tasks, &rq->cfs.pushable_tasks);
+}
+
+static void fair_add_pushable_task(struct rq *rq, struct task_struct *p)
+{
+ if (fair_push_task(p)) {
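+ /* Remove a possibly stale entry, then (re-)add keyed by p->prio */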
+ plist_del(&p->pushable_tasks, &rq->cfs.pushable_tasks);
+ plist_node_init(&p->pushable_tasks, p->prio);
+ plist_add(&p->pushable_tasks, &rq->cfs.pushable_tasks);
+ }
+}
+
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
@@ -8725,6 +8786,9 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
return sched_balance_newidle(rq, rf) != 0;
}
#else
+static inline void fair_queue_pushable_tasks(struct rq *rq) {}
+static inline void fair_remove_pushable_task(struct rq *rq, struct task_struct *p) {}
+static inline void fair_add_pushable_task(struct rq *rq, struct task_struct *p) {}
static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
#endif /* CONFIG_SMP */

@@ -8914,6 +8978,12 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
put_prev_entity(cfs_rq, pse);
set_next_entity(cfs_rq, se);

+ /*
+ * The previous task might be eligible to be pushed to another
+ * CPU if it is still active.
+ */
+ fair_add_pushable_task(rq, prev);
+
__set_next_task_fair(rq, p, true);
}

@@ -8986,6 +9056,13 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t
cfs_rq = cfs_rq_of(se);
put_prev_entity(cfs_rq, se);
}
+
+ /*
+ * The previous task might be eligible to be pushed to another CPU
+ * if it is still active.
+ */
+ fair_add_pushable_task(rq, prev);
}

/*
@@ -13260,6 +13337,8 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
{
struct sched_entity *se = &p->se;

+ fair_remove_pushable_task(rq, p);
+
#ifdef CONFIG_SMP
if (task_on_rq_queued(p)) {
/*
@@ -13277,6 +13356,11 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
if (hrtick_enabled_fair(rq))
hrtick_start_fair(rq, p);

+ /*
+ * Try to push the prev task before checking misfit for the next
+ * task, as migrating prev can make the next task fit this CPU.
+ */
+ fair_queue_pushable_tasks(rq);
update_misfit_status(p, rq);
sched_fair_update_stop_tick(rq, p);
}
@@ -13307,6 +13391,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifdef CONFIG_SMP
+ plist_head_init(&cfs_rq->pushable_tasks);
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c5a6a503eb6d..aa92c0d75de7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -716,6 +716,8 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */

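+ /* Queued tasks that may be pushed to another CPU */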
+ struct plist_head pushable_tasks;
+
/* Locally cached copy of our task_group's idle value */
int idle;

--
2.34.1