[RFC PATCH 06/22] sched/fair: Propagate the min_vruntime of kernel mode preempted entity

From: K Prateek Nayak
Date: Thu Feb 20 2025 - 04:36:04 EST


Propagate the min_vruntime of the kernel mode preempted entity to the
root of the cfs_rq's rbtree. This will soon be used to pick among the
kernel mode entities on a throttled hierarchy using a min-heap approach
similar to the one pick_eevdf() currently implements.

Signed-off-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
---
include/linux/sched.h | 6 ++++++
kernel/sched/fair.c | 47 ++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 52 insertions(+), 1 deletion(-)
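
For context on how the propagated value is intended to be consumed, below is
a rough illustrative sketch (not part of this patch; the helper name
pick_kcs_se_sketch() is made up) of a min-heap style descent over the
augmented rbtree, in the spirit of the walk __pick_eevdf() currently does
with min_deadline. It assumes the se_in_kernel() helper and the
min_kcs_vruntime field introduced by this series, and it glosses over
vruntime comparison subtleties such as wrap-around and eligibility:

	/*
	 * Sketch only: find the queued entity with the smallest vruntime
	 * among those preempted in kernel mode, guided by the propagated
	 * per-subtree minimum.
	 */
	static struct sched_entity *pick_kcs_se_sketch(struct cfs_rq *cfs_rq)
	{
		struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;

		while (node) {
			struct sched_entity *se = __node_2_se(node);
			struct rb_node *left = node->rb_left;

			/* The overall minimum lives in the left subtree. */
			if (left && __node_2_se(left)->min_kcs_vruntime == se->min_kcs_vruntime) {
				node = left;
				continue;
			}

			/* This entity itself realizes the subtree minimum. */
			if (se_in_kernel(se) && (s64)se->vruntime == se->min_kcs_vruntime)
				return se;

			/* Otherwise it must be in the right subtree. */
			node = node->rb_right;
		}

		/* min_kcs_vruntime == LLONG_MAX throughout: nothing to pick. */
		return NULL;
	}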

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63f3f235a5c1..4bb7e45758f4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -593,6 +593,12 @@ struct sched_entity {
*/
int kernel_cs_count;
/* hole */
+
+ /*
+ * min_vruntime of the kernel mode preempted entities
+ * in the subtree of this sched entity.
+ */
+ s64 min_kcs_vruntime;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cbb7a227afe7..ba1bd60ce433 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -828,6 +828,9 @@ static inline void __min_slice_update(struct sched_entity *se, struct rb_node *n
}
}

+static __always_inline void init_se_kcs_stats(struct sched_entity *se);
+static inline bool min_kcs_vruntime_update(struct sched_entity *se);
+
/*
* se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
*/
@@ -836,6 +839,7 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
u64 old_min_vruntime = se->min_vruntime;
u64 old_min_slice = se->min_slice;
struct rb_node *node = &se->run_node;
+ bool kcs_stats_unchanged = min_kcs_vruntime_update(se);

se->min_vruntime = se->vruntime;
__min_vruntime_update(se, node->rb_right);
@@ -846,7 +850,8 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
__min_slice_update(se, node->rb_left);

return se->min_vruntime == old_min_vruntime &&
- se->min_slice == old_min_slice;
+ se->min_slice == old_min_slice &&
+ kcs_stats_unchanged;
}

RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
@@ -858,6 +863,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
avg_vruntime_add(cfs_rq, se);
+ init_se_kcs_stats(se);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
@@ -6778,6 +6784,39 @@ static __always_inline void avg_kcs_vruntime_update(struct cfs_rq *cfs_rq, s64 d
cfs_rq->avg_kcs_vruntime -= cfs_rq->avg_kcs_load * delta;
}

+static __always_inline void init_se_kcs_stats(struct sched_entity *se)
+{
+ /*
+ * With the introduction of EEVDF, the vruntime of entities can go negative when
+ * a lagging entity joins a runqueue with avg_vruntime < vlag. Use LLONG_MAX as
+ * the upper bound to differentiate the case where no kernel mode preempted
+ * entities are queued on the subtree.
+ */
+ se->min_kcs_vruntime = (se_in_kernel(se)) ? se->vruntime : LLONG_MAX;
+}
+
+static inline void __min_kcs_vruntime_update(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+
+ if (rse->min_kcs_vruntime < se->min_kcs_vruntime)
+ se->min_kcs_vruntime = rse->min_kcs_vruntime;
+ }
+}
+
+static inline bool min_kcs_vruntime_update(struct sched_entity *se)
+{
+ s64 old_min_kcs_vruntime = se->min_kcs_vruntime;
+ struct rb_node *node = &se->run_node;
+
+ init_se_kcs_stats(se);
+ __min_kcs_vruntime_update(se, node->rb_right);
+ __min_kcs_vruntime_update(se, node->rb_left);
+
+ return se->min_kcs_vruntime == old_min_kcs_vruntime;
+}
+
#ifdef CONFIG_NO_HZ_FULL
/* called from pick_next_task_fair() */
static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
@@ -6853,6 +6892,12 @@ __always_inline void sched_notify_critical_section_exit(void) {}
static __always_inline void avg_kcs_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static __always_inline void avg_kcs_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static __always_inline void avg_kcs_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) {}
+static __always_inline void init_se_kcs_stats(struct sched_entity *se) {}
+
+static inline bool min_kcs_vruntime_update(struct sched_entity *se)
+{
+ return true;
+}

#endif /* CONFIG_CFS_BANDWIDTH */

--
2.43.0