[RFC PATCH 11/22] sched/fair: Prepare unthrottle_cfs_rq() to demote throttle status

From: K Prateek Nayak
Date: Thu Feb 20 2025 - 04:37:51 EST


If an entity that blocked while running in kernel mode wakes up, a
fully throttled hierarchy needs to be demoted to a partially throttled
one.

Prepare unthrottle_cfs_rq() to demote the throttle status when the
caller explicitly requests it via the new "demote_to_partial"
argument.

Modify all current callers of unthrottle_cfs_rq() to pass
"demote_to_partial" as false since all the existing scenarios
completely unthrottle a cfs_rq.

Signed-off-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
---
kernel/sched/core.c | 2 +-
kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++----------
kernel/sched/sched.h | 2 +-
3 files changed, 31 insertions(+), 12 deletions(-)
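
For reference, a minimal sketch (not part of the diff below) of how the
new argument is intended to be used once a later change wires up the
kernel-mode wakeup path; the helper name and call site here are purely
illustrative assumptions, while cfs_rq_h_throttled() is the
full-throttle check already used in the diff below:

	/*
	 * Hypothetical caller: demote a fully throttled hierarchy to a
	 * partially throttled one when an entity that blocked in kernel
	 * mode wakes up.
	 */
	static void demote_throttled_hierarchy(struct cfs_rq *cfs_rq)
	{
		if (cfs_rq_h_throttled(cfs_rq))
			unthrottle_cfs_rq(cfs_rq, true);	/* demote only */
	}

	/* All existing callers keep the old, full-unthrottle behaviour: */
	unthrottle_cfs_rq(cfs_rq, false);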

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0851cdad9242..a797517d3dcf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9488,7 +9488,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
cfs_rq->runtime_remaining = 0;

if (cfs_rq->throttled)
- unthrottle_cfs_rq(cfs_rq);
+ unthrottle_cfs_rq(cfs_rq, false);
}

if (runtime_was_enabled && !runtime_enabled)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8e1df614e82f..091493bc8506 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6082,17 +6082,26 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
return false;
}

-void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
+void unthrottle_cfs_rq(struct cfs_rq *cfs_rq, bool demote_to_partial)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long queued_delta, runnable_delta, idle_delta;
long rq_h_nr_queued = rq->cfs.h_nr_queued;
+ int throttled_state = cfs_rq->throttled;

se = cfs_rq->tg->se[cpu_of(rq)];

- cfs_rq->throttled = CFS_UNTHROTTLED;
+ if (demote_to_partial) {
+ /*
+ * A demotion to partially throttled state can only be
+ * requested on a fully throttled hierarchy.
+ */
+ SCHED_WARN_ON(!cfs_rq_h_throttled(cfs_rq));
+ cfs_rq->throttled = CFS_THROTTLED_PARTIAL;
+ } else
+ cfs_rq->throttled = CFS_UNTHROTTLED;

update_rq_clock(rq);

@@ -6101,9 +6110,16 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
cfs_rq->throttled_clock = 0;
}
- list_del_rcu(&cfs_rq->throttled_list);
+
+ /* Partial throttle should retain itself in the throttled_list */
+ if (!demote_to_partial)
+ list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);

+ /* If cfs_rq was partially throttled, we have nothing to do */
+ if (throttled_state == CFS_THROTTLED_PARTIAL)
+ goto unthrottle_throttle;
+
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);

@@ -6176,8 +6192,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);

- /* Determine whether we need to wake up potentially idle CPU: */
- if (rq->curr == rq->idle && rq->cfs.nr_queued)
+ /*
+ * Determine whether we need to wake up potentially idle CPU or
+ * reevaluate our pick on the throttled hierarchy.
+ */
+ if (cfs_rq->curr || (rq->curr == rq->idle && rq->cfs.nr_queued))
resched_curr(rq);
}

@@ -6212,7 +6231,7 @@ static void __cfsb_csd_unthrottle(void *arg)
list_del_init(&cursor->throttled_csd_list);

if (cfs_rq_throttled(cursor))
- unthrottle_cfs_rq(cursor);
+ unthrottle_cfs_rq(cursor, false);
}

rcu_read_unlock();
@@ -6227,7 +6246,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
bool first;

if (rq == this_rq()) {
- unthrottle_cfs_rq(cfs_rq);
+ unthrottle_cfs_rq(cfs_rq, false);
return;
}

@@ -6243,7 +6262,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
#else
static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
{
- unthrottle_cfs_rq(cfs_rq);
+ unthrottle_cfs_rq(cfs_rq, false);
}
#endif

@@ -6329,7 +6348,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
list_del_init(&cfs_rq->throttled_csd_list);

if (cfs_rq_throttled(cfs_rq))
- unthrottle_cfs_rq(cfs_rq);
+ unthrottle_cfs_rq(cfs_rq, false);

rq_unlock_irqrestore(rq, &rf);
}
@@ -6786,7 +6805,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
* there's some valid quota amount
*/
cfs_rq->runtime_remaining = 1;
- unthrottle_cfs_rq(cfs_rq);
+ unthrottle_cfs_rq(cfs_rq, false);
}
rcu_read_unlock();

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 22567d236f82..bd43271fa166 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -554,7 +554,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth

extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
-extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
+extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq, bool demote_to_partial);
extern bool cfs_task_bw_constrained(struct task_struct *p);

extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
--
2.43.0