[RFC][PATCH 07/10] sched/fair: Re-organize dequeue_task_fair()

From: Peter Zijlstra
Date: Fri Apr 05 2024 - 07:04:55 EST


Working towards delaying dequeue, notably also inside the hierarchy,
rework dequeue_task_fair() such that it can 'resume' an interrupted
hierarchy walk.
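
In short, the hierarchy walk moves into a new dequeue_entities() helper
that reports how far it got (-1 delayed, 0 throttled, 1 complete), and
dequeue_task_fair() becomes a thin wrapper around it. A condensed sketch
of the resulting shape, with the h_nr_running / PELT accounting of the
real patch elided:

  static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
  {
  	for_each_sched_entity(se) {
  		/*
  		 * The real patch uses two loops here: dequeue_entity()
  		 * up to the first parent that still has other entities,
  		 * then only load/accounting updates the rest of the way.
  		 */
  		dequeue_entity(cfs_rq_of(se), se, flags);

  		/* a throttled cfs_rq ends the evaluation early */
  		if (cfs_rq_throttled(cfs_rq_of(se)))
  			return 0;
  	}
  	return 1;	/* reached the root: dequeue complete */
  }

  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  {
  	/*
  	 * Nothing in this patch returns -1 yet; the < 0 check is
  	 * preparation for the delayed-dequeue patches later in the
  	 * series.
  	 */
  	if (dequeue_entities(rq, &p->se, flags) < 0)
  		return false;
  	return true;
  }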

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
kernel/sched/fair.c | 82 ++++++++++++++++++++++++++++++++++------------------
1 file changed, 55 insertions(+), 27 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6824,33 +6824,45 @@ enqueue_task_fair(struct rq *rq, struct
static void set_next_buddy(struct sched_entity *se);

/*
- * The dequeue_task method is called before nr_running is
- * decreased. We remove the task from the rbtree and
- * update the fair scheduling stats:
+ * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
+ * failing half-way through and resume the dequeue later.
+ *
+ * Returns:
+ * -1 - dequeue delayed
+ * 0 - dequeue throttled
+ * 1 - dequeue complete
*/
-static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
- struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se;
- int task_sleep = flags & DEQUEUE_SLEEP;
- int idle_h_nr_running = task_has_idle_policy(p);
bool was_sched_idle = sched_idle_rq(rq);
+ bool task_sleep = flags & DEQUEUE_SLEEP;
+ struct task_struct *p = NULL;
+ struct cfs_rq *cfs_rq;
+ int idle_h_nr_running;

- util_est_dequeue(&rq->cfs, p);
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ idle_h_nr_running = task_has_idle_policy(p);
+ } else {
+ idle_h_nr_running = cfs_rq_is_idle(group_cfs_rq(se));
+ }

for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);

- cfs_rq->h_nr_running--;
- cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ /* h_nr_running is the hierarchical count of tasks */
+ if (p) {
+ cfs_rq->h_nr_running--;
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;

- if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+ }

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- goto dequeue_throttle;
+ return 0;

/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -6870,33 +6882,49 @@ static bool dequeue_task_fair(struct rq
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);

+ // XXX avoid these load updates for delayed dequeues ?
update_load_avg(cfs_rq, se, UPDATE_TG);
se_update_runnable(se);
update_cfs_group(se);

- cfs_rq->h_nr_running--;
- cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ if (p) {
+ cfs_rq->h_nr_running--;
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;

- if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+ }

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- goto dequeue_throttle;
+ return 0;
+ }
+
+ if (p) {
+ sub_nr_running(rq, 1);

+ /* balance early to pull high priority tasks */
+ if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
+ rq->next_balance = jiffies;
}

- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, 1);
+ return 1;
+}

- /* balance early to pull high priority tasks */
- if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
- rq->next_balance = jiffies;
+/*
+ * The dequeue_task method is called before nr_running is
+ * decreased. We remove the task from the rbtree and
+ * update the fair scheduling stats:
+ */
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+{
+ util_est_dequeue(&rq->cfs, p);

-dequeue_throttle:
- util_est_update(&rq->cfs, p, task_sleep);
- hrtick_update(rq);
+ if (dequeue_entities(rq, &p->se, flags) < 0)
+ return false;

+ util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+ hrtick_update(rq);
return true;
}
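
A note on the 'resume' part mentioned above: because dequeue_entities()
takes a sched_entity rather than a task_struct (and handles the
!entity_is_task() case via group_cfs_rq()), a later patch can restart an
interrupted walk from the cgroup entity where dequeue_entity() stopped,
instead of from the task. Purely as an illustration (this call site is
hypothetical, not part of this patch), resuming is just re-entering the
same walk:

  /*
   * 'se' is the group entity at which an earlier walk stopped; the
   * three-way return value tells the caller whether the hierarchy
   * walk got delayed again (-1), ended on a throttled cfs_rq (0),
   * or finally completed (1).
   */
  switch (dequeue_entities(rq, se, flags)) {
  case -1:	/* dequeue delayed again; resume later */
  	break;
  case 0:	/* evaluation ended on a throttled cfs_rq */
  	break;
  case 1:	/* hierarchy walk complete */
  	break;
  }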