[PATCH 23/24] sched/eevdf: Propagate min_slice up the cgroup hierarchy

From: Peter Zijlstra
Date: Sat Jul 27 2024 - 07:02:42 EST


In the absence of an explicit cgroup slice configureation, make mixed
slice length work with cgroups by propagating the min_slice up the
hierarchy.

This ensures the cgroup entity gets timely service to service its
entities that have this timing constraint set on them.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
include/linux/sched.h | 1
kernel/sched/fair.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 57 insertions(+), 1 deletion(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -542,6 +542,7 @@ struct sched_entity {
struct rb_node run_node;
u64 deadline;
u64 min_vruntime;
+ u64 min_slice;

struct list_head group_node;
unsigned char on_rq;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -782,6 +782,21 @@ static void update_min_vruntime(struct c
cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
}

+static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *root = __pick_root_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+ u64 min_slice = ~0ULL;
+
+ if (curr && curr->on_rq)
+ min_slice = curr->slice;
+
+ if (root)
+ min_slice = min(min_slice, root->min_slice);
+
+ return min_slice;
+}
+
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
{
return entity_before(__node_2_se(a), __node_2_se(b));
@@ -798,19 +813,34 @@ static inline void __min_vruntime_update
}
}

+static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (rse->min_slice < se->min_slice)
+ se->min_slice = rse->min_slice;
+ }
+}
+
/*
* se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
*/
static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
{
u64 old_min_vruntime = se->min_vruntime;
+ u64 old_min_slice = se->min_slice;
struct rb_node *node = &se->run_node;

se->min_vruntime = se->vruntime;
__min_vruntime_update(se, node->rb_right);
__min_vruntime_update(se, node->rb_left);

- return se->min_vruntime == old_min_vruntime;
+ se->min_slice = se->slice;
+ __min_slice_update(se, node->rb_right);
+ __min_slice_update(se, node->rb_left);
+
+ return se->min_vruntime == old_min_vruntime &&
+ se->min_slice == old_min_slice;
}

RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
@@ -823,6 +853,7 @@ static void __enqueue_entity(struct cfs_
{
avg_vruntime_add(cfs_rq, se);
se->min_vruntime = se->vruntime;
+ se->min_slice = se->slice;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
__entity_less, &min_vruntime_cb);
}
@@ -6917,6 +6948,7 @@ enqueue_task_fair(struct rq *rq, struct
int idle_h_nr_running = task_has_idle_policy(p);
int task_new = !(flags & ENQUEUE_WAKEUP);
int rq_h_nr_running = rq->cfs.h_nr_running;
+ u64 slice = 0;

if (flags & ENQUEUE_DELAYED) {
requeue_delayed_entity(se);
@@ -6946,7 +6978,18 @@ enqueue_task_fair(struct rq *rq, struct
break;
}
cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Basically set the slice of group entries to the min_slice of
+ * their respective cfs_rq. This ensures the group can service
+ * its entities in the desired time-frame.
+ */
+ if (slice) {
+ se->slice = slice;
+ se->custom_slice = 1;
+ }
enqueue_entity(cfs_rq, se, flags);
+ slice = cfs_rq_min_slice(cfs_rq);

cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
@@ -6968,6 +7011,9 @@ enqueue_task_fair(struct rq *rq, struct
se_update_runnable(se);
update_cfs_group(se);

+ se->slice = slice;
+ slice = cfs_rq_min_slice(cfs_rq);
+
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;

@@ -7033,11 +7079,15 @@ static int dequeue_entities(struct rq *r
int idle_h_nr_running = 0;
int h_nr_running = 0;
struct cfs_rq *cfs_rq;
+ u64 slice = 0;

if (entity_is_task(se)) {
p = task_of(se);
h_nr_running = 1;
idle_h_nr_running = task_has_idle_policy(p);
+ } else {
+ cfs_rq = group_cfs_rq(se);
+ slice = cfs_rq_min_slice(cfs_rq);
}

for_each_sched_entity(se) {
@@ -7062,6 +7112,8 @@ static int dequeue_entities(struct rq *r

/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
+ slice = cfs_rq_min_slice(cfs_rq);
+
/* Avoid re-evaluating load for this entity: */
se = parent_entity(se);
/*
@@ -7083,6 +7135,9 @@ static int dequeue_entities(struct rq *r
se_update_runnable(se);
update_cfs_group(se);

+ se->slice = slice;
+ slice = cfs_rq_min_slice(cfs_rq);
+
cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;