Re: [PATCH v2 5/7] sched/fair: Increase weight bits for avg_vruntime

From: Vincent Guittot

Date: Mon Feb 23 2026 - 05:57:51 EST


On Thu, 19 Feb 2026 at 09:10, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> Due to the zero_vruntime patch, the deltas are now a lot smaller and
> measurement with kernel-build and hackbench runs show about 45 bits
> used.
>
> This ensures avg_vruntime() tracks the full weight range, reducing
> numerical artifacts in reweight and the like.

Instead of the PARANOID_AVG variant, would it be better to add a WARN_ONCE() ?

I'm afraid that, with SCHED_FEAT(PARANOID_AVG, false) being the default,
we will not notice a potential overflow without a long study of the
resulting regression.

Couldn't we add a cheaper WARN_ONCE(key > 2^50) in __sum_w_vruntime_add() ?

We should always have:

  key < 110ms (max slice + max tick) * nice_0 (2^20) / weight (2)

and, since 110ms is less than 2^27 ns:

  key < 2^27 * 2^20 / 2 = 2^46

We can use 50 bits to get some margin.

Weight is always less than 27 bits, and key * weight gives us 110ms
(max slice + max tick) * nice_0 (2^20), so we should never add more
than 2^47 to ->sum_w_vruntime per entity.

So a WARN_ONCE() on cfs_rq->sum_w_vruntime getting close to 2^63
should be enough.
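
Something along these lines, completely untested, with the thresholds
only illustrative (I check against 2^62 below since an s64 can never
actually hold a value above 2^63), reusing the avg_vruntime_weight()
and entity_key() helpers from this patch:

static inline void
__sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
	s64 key = entity_key(cfs_rq, se);

	/* Per the bound above, |key| should stay well below 2^50. */
	WARN_ONCE(abs(key) > (1LL << 50),
		  "entity_key out of range: %lld\n", key);

	cfs_rq->sum_w_vruntime += key * weight;
	cfs_rq->sum_weight += weight;

	/*
	 * Each term is < 2^47, so warn once the accumulated sum gets
	 * anywhere near the top of the s64 range.
	 */
	WARN_ONCE(abs(cfs_rq->sum_w_vruntime) > (1LL << 62),
		  "sum_w_vruntime near overflow: %lld\n",
		  cfs_rq->sum_w_vruntime);
}

That would keep the common path at two compares instead of the
check_mul_overflow()/check_add_overflow() pair plus the rebuild loop.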



>
> Also, let's keep the paranoid debug code around for now.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> Tested-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
> Tested-by: Shubhang Kaushik <shubhang@xxxxxxxxxxxxxxxxxxxxxx>
> ---
> kernel/sched/debug.c | 14 ++++++-
> kernel/sched/fair.c | 91 ++++++++++++++++++++++++++++++++++++++----------
> kernel/sched/features.h | 2 +
> kernel/sched/sched.h | 3 +
> 4 files changed, 90 insertions(+), 20 deletions(-)
>
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -8,6 +8,7 @@
> */
> #include <linux/debugfs.h>
> #include <linux/nmi.h>
> +#include <linux/log2.h>
> #include "sched.h"
>
> /*
> @@ -901,10 +902,13 @@ static void print_rq(struct seq_file *m,
>
> void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
> {
> - s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
> + s64 left_vruntime = -1, right_vruntime = -1, left_deadline = -1, spread;
> + s64 zero_vruntime = -1, sum_w_vruntime = -1;
> struct sched_entity *last, *first, *root;
> struct rq *rq = cpu_rq(cpu);
> + unsigned int sum_shift;
> unsigned long flags;
> + u64 sum_weight;
>
> #ifdef CONFIG_FAIR_GROUP_SCHED
> SEQ_printf(m, "\n");
> @@ -925,6 +929,9 @@ void print_cfs_rq(struct seq_file *m, in
> if (last)
> right_vruntime = last->vruntime;
> zero_vruntime = cfs_rq->zero_vruntime;
> + sum_w_vruntime = cfs_rq->sum_w_vruntime;
> + sum_weight = cfs_rq->sum_weight;
> + sum_shift = cfs_rq->sum_shift;
> raw_spin_rq_unlock_irqrestore(rq, flags);
>
> SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
> @@ -933,6 +940,11 @@ void print_cfs_rq(struct seq_file *m, in
> SPLIT_NS(left_vruntime));
> SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime",
> SPLIT_NS(zero_vruntime));
> + SEQ_printf(m, " .%-30s: %Ld (%d bits)\n", "sum_w_vruntime",
> + sum_w_vruntime, ilog2(abs(sum_w_vruntime)));
> + SEQ_printf(m, " .%-30s: %Lu\n", "sum_weight",
> + sum_weight);
> + SEQ_printf(m, " .%-30s: %u\n", "sum_shift", sum_shift);
> SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
> SPLIT_NS(avg_vruntime(cfs_rq)));
> SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -665,15 +665,20 @@ static inline s64 entity_key(struct cfs_
> * Since zero_vruntime closely tracks the per-task service, these
> * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag
> * induced in the system due to quantisation.
> - *
> - * Also, we use scale_load_down() to reduce the size.
> - *
> - * As measured, the max (key * weight) value was ~44 bits for a kernel build.
> */
> -static void
> -sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +static inline unsigned long avg_vruntime_weight(struct cfs_rq *cfs_rq, unsigned long w)
> +{
> +#ifdef CONFIG_64BIT
> + if (cfs_rq->sum_shift)
> + w = max(2UL, w >> cfs_rq->sum_shift);
> +#endif
> + return w;
> +}
> +
> +static inline void
> +__sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> - unsigned long weight = scale_load_down(se->load.weight);
> + unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
> s64 key = entity_key(cfs_rq, se);
>
> cfs_rq->sum_w_vruntime += key * weight;
> @@ -681,9 +686,59 @@ sum_w_vruntime_add(struct cfs_rq *cfs_rq
> }
>
> static void
> +sum_w_vruntime_add_paranoid(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> + unsigned long weight;
> + s64 key, tmp;
> +
> +again:
> + weight = avg_vruntime_weight(cfs_rq, se->load.weight);
> + key = entity_key(cfs_rq, se);
> +
> + if (check_mul_overflow(key, weight, &key))
> + goto overflow;
> +
> + if (check_add_overflow(cfs_rq->sum_w_vruntime, key, &tmp))
> + goto overflow;
> +
> + cfs_rq->sum_w_vruntime = tmp;
> + cfs_rq->sum_weight += weight;
> + return;
> +
> +overflow:
> + /*
> + * There's gotta be a limit -- if we're still failing at this point
> + * there's really nothing much to be done about things.
> + */
> + BUG_ON(cfs_rq->sum_shift >= 10);
> + cfs_rq->sum_shift++;
> +
> + /*
> + * Note: \Sum (k_i * (w_i >> 1)) != (\Sum (k_i * w_i)) >> 1
> + */
> + cfs_rq->sum_w_vruntime = 0;
> + cfs_rq->sum_weight = 0;
> +
> + for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost;
> + node; node = rb_next(node))
> + __sum_w_vruntime_add(cfs_rq, __node_2_se(node));
> +
> + goto again;
> +}
> +
> +static void
> +sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> + if (sched_feat(PARANOID_AVG))
> + return sum_w_vruntime_add_paranoid(cfs_rq, se);
> +
> + __sum_w_vruntime_add(cfs_rq, se);
> +}
> +
> +static void
> sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> - unsigned long weight = scale_load_down(se->load.weight);
> + unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
> s64 key = entity_key(cfs_rq, se);
>
> cfs_rq->sum_w_vruntime -= key * weight;
> @@ -725,7 +780,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
> s64 runtime = cfs_rq->sum_w_vruntime;
>
> if (curr) {
> - unsigned long w = scale_load_down(curr->load.weight);
> + unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight);
>
> runtime += entity_key(cfs_rq, curr) * w;
> weight += w;
> @@ -735,7 +790,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
> if (runtime < 0)
> runtime -= (weight - 1);
>
> - delta = div_s64(runtime, weight);
> + delta = div64_long(runtime, weight);
> } else if (curr) {
> /*
> * When there is but one element, it is the average.
> @@ -801,7 +856,7 @@ static int vruntime_eligible(struct cfs_
> long load = cfs_rq->sum_weight;
>
> if (curr && curr->on_rq) {
> - unsigned long weight = scale_load_down(curr->load.weight);
> + unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight);
>
> avg += entity_key(cfs_rq, curr) * weight;
> load += weight;
> @@ -3871,12 +3926,12 @@ static void reweight_entity(struct cfs_r
> * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
> * we need to scale se->vlag when w_i changes.
> */
> - se->vlag = div_s64(se->vlag * se->load.weight, weight);
> + se->vlag = div64_long(se->vlag * se->load.weight, weight);
> if (se->rel_deadline)
> - se->deadline = div_s64(se->deadline * se->load.weight, weight);
> + se->deadline = div64_long(se->deadline * se->load.weight, weight);
>
> if (rel_vprot)
> - vprot = div_s64(vprot * se->load.weight, weight);
> + vprot = div64_long(vprot * se->load.weight, weight);
>
> update_load_set(&se->load, weight);
>
> @@ -5180,7 +5235,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
> */
> if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
> struct sched_entity *curr = cfs_rq->curr;
> - unsigned long load;
> + long load;
>
> lag = se->vlag;
>
> @@ -5238,12 +5293,12 @@ place_entity(struct cfs_rq *cfs_rq, stru
> */
> load = cfs_rq->sum_weight;
> if (curr && curr->on_rq)
> - load += scale_load_down(curr->load.weight);
> + load += avg_vruntime_weight(cfs_rq, curr->load.weight);
>
> - lag *= load + scale_load_down(se->load.weight);
> + lag *= load + avg_vruntime_weight(cfs_rq, se->load.weight);
> if (WARN_ON_ONCE(!load))
> load = 1;
> - lag = div_s64(lag, load);
> + lag = div64_long(lag, load);
> }
>
> se->vruntime = vruntime - lag;
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -58,6 +58,8 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
> SCHED_FEAT(DELAY_DEQUEUE, true)
> SCHED_FEAT(DELAY_ZERO, true)
>
> +SCHED_FEAT(PARANOID_AVG, false)
> +
> /*
> * Allow wakeup-time preemption of the current task:
> */
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -684,8 +684,9 @@ struct cfs_rq {
>
> s64 sum_w_vruntime;
> u64 sum_weight;
> -
> u64 zero_vruntime;
> + unsigned int sum_shift;
> +
> #ifdef CONFIG_SCHED_CORE
> unsigned int forceidle_seq;
> u64 zero_vruntime_fi;
>
>