Re: [PATCH 4/4] sched/fair: Proportional newidle balance
From: Mohamed Abuelfotoh, Hazem
Date: Sun Jan 25 2026 - 07:22:51 EST
On 18/01/2026 20:46, Mario Roy wrote:
Hi All,
The patch "Proportional newidle balance" introduced a regression in
Linux 6.12.65 and 6.18.5. There is a noticeable regression in easyWave
testing. [1]
The CPU is an AMD Threadripper 9960X (24 cores / 48 threads). I followed
the source to install easyWave [2], which fetches the two tar.gz archives.
#!/bin/bash
# CXXFLAGS="-O3 $CXXFLAGS" ./configure
# make -j8
trap 'rm -f *.ssh *.idx *.log *.sshmax *.time' EXIT
OMP_NUM_THREADS=48 ./src/easywave \
-grid examples/e2Asean.grd -source examples/BengkuluSept2007.flt \
-time 1200
Results before, with the CachyOS 6.12.63-2 and 6.18.3-2 kernels:
easyWave ver.2013-04-11
Model time = 00:00:00, elapsed: 0 msec
Model time = 00:10:00, elapsed: 5 msec
Model time = 00:20:00, elapsed: 10 msec
Model time = 00:30:00, elapsed: 19 msec
...
Model time = 05:00:00, elapsed: 2908 msec
Model time = 05:10:00, elapsed: 3079 msec
Model time = 05:20:00, elapsed: 3307 msec
Model time = 05:30:00, elapsed: 3503 msec
...
Results after, with the CachyOS 6.12.66-2 and 6.18.6-2 kernels:
easyWave ver.2013-04-11
Model time = 00:00:00, elapsed: 0 msec
Model time = 00:10:00, elapsed: 5 msec
Model time = 00:20:00, elapsed: 10 msec
Model time = 00:30:00, elapsed: 18 msec
...
Model time = 05:00:00, elapsed: 13057 msec (normal is < 3.0s)
Model time = 05:10:00, elapsed: 13512 msec
Model time = 05:20:00, elapsed: 13833 msec
Model time = 05:30:00, elapsed: 14206 msec
...
Reverting the patch "sched/fair: Proportional newidle balance" restores
the prior performance.
[1] https://openbenchmarking.org/test/pts/easywave
[2] https://openbenchmarking.org/innhold/da7f1cf159033fdfbb925102284aea8a83e8afdc
On 11/7/25 11:06 AM, Peter Zijlstra wrote:
Add a randomized algorithm that runs newidle balancing proportional to
its success rate.
This improves schbench significantly:
6.18-rc4: 2.22 Mrps/s
6.18-rc4+revert: 2.04 Mrps/s
6.18-rc4+revert+random: 2.18 Mrps/s
Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:
6.17: -6%
6.17+revert: 0%
6.17+revert+random: -1%
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
include/linux/sched/topology.h |  3 +++
kernel/sched/core.c            |  3 +++
kernel/sched/fair.c            | 43 +++++++++++++++++++++++++++++++++++++----
kernel/sched/features.h        |  5 +++++
kernel/sched/sched.h           |  7 +++++++
kernel/sched/topology.c        |  6 ++++++
6 files changed, 63 insertions(+), 4 deletions(-)
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -92,6 +92,9 @@ struct sched_domain {
unsigned int nr_balance_failed; /* initialise to 0 */
/* idle_balance() stats */
+ unsigned int newidle_call;
+ unsigned int newidle_success;
+ unsigned int newidle_ratio;
u64 max_newidle_lb_cost;
unsigned long last_decay_max_lb_cost;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_updat
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
#ifdef CONFIG_SCHED_PROXY_EXEC
DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
@@ -8589,6 +8590,8 @@ void __init sched_init_smp(void)
{
sched_init_numa(NUMA_NO_NODE);
+ prandom_init_once(&sched_rnd_state);
+
/*
* There's no userspace yet to cause hotplug operations; hence all the
* CPU masks are stable and all blatant races in the below code cannot
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12146,11 +12146,26 @@ void update_max_interval(void)
max_load_balance_interval = HZ*num_online_cpus()/10;
}
-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
+{
+ sd->newidle_call++;
+ sd->newidle_success += success;
+
+ if (sd->newidle_call >= 1024) {
+ sd->newidle_ratio = sd->newidle_success;
+ sd->newidle_call /= 2;
+ sd->newidle_success /= 2;
+ }
+}
+
+static inline bool
+update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
{
unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
unsigned long now = jiffies;
+ update_newidle_stats(sd, success);
+
if (cost > sd->max_newidle_lb_cost) {
/*
* Track max cost of a domain to make sure to not delay the
@@ -12198,7 +12213,7 @@ static void sched_balance_domains(struct
* Decay the newidle max times here because this is a regular
* visit to all the domains.
*/
- need_decay = update_newidle_cost(sd, 0);
+ need_decay = update_newidle_cost(sd, 0, 0);
max_cost += sd->max_newidle_lb_cost;
/*
@@ -12843,6 +12858,22 @@ static int sched_balance_newidle(struct
break;
if (sd->flags & SD_BALANCE_NEWIDLE) {
+ unsigned int weight = 1;
+
+ if (sched_feat(NI_RANDOM)) {
+ /*
+ * Throw a 1k sided dice; and only run
+ * newidle_balance according to the success
+ * rate.
+ */
+ u32 d1k = sched_rng() % 1024;
+ weight = 1 + sd->newidle_ratio;
+ if (d1k > weight) {
+ update_newidle_stats(sd, 0);
+ continue;
+ }
+ weight = (1024 + weight/2) / weight;
+ }
pulled_task = sched_balance_rq(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
@@ -12850,10 +12881,14 @@ static int sched_balance_newidle(struct
t1 = sched_clock_cpu(this_cpu);
domain_cost = t1 - t0;
- update_newidle_cost(sd, domain_cost);
-
curr_cost += domain_cost;
t0 = t1;
+
+ /*
+ * Track max cost of a domain to make sure to not delay the
+ * next wakeup on the CPU.
+ */
+ update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
}
/*
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true)
SCHED_FEAT(UTIL_EST, true)
SCHED_FEAT(LATENCY_WARN, false)
+
+/*
+ * Do newidle balancing proportional to its success rate using randomization.
+ */
+SCHED_FEAT(NI_RANDOM, true)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,6 +5,7 @@
#ifndef _KERNEL_SCHED_SCHED_H
#define _KERNEL_SCHED_SCHED_H
+#include <linux/prandom.h>
#include <linux/sched/affinity.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/cpufreq.h>
@@ -1348,6 +1349,12 @@ static inline bool is_migration_disabled
}
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
+
+static inline u32 sched_rng(void)
+{
+ return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
+}
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() this_cpu_ptr(&runqueues)
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1662,6 +1662,12 @@ sd_init(struct sched_domain_topology_lev
.last_balance = jiffies,
.balance_interval = sd_weight,
+
+ /* 50% success rate */
+ .newidle_call = 512,
+ .newidle_success = 256,
+ .newidle_ratio = 512,
+
.max_newidle_lb_cost = 0,
.last_decay_max_lb_cost = jiffies,
.child = child,
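To make the feedback loop in the patch easier to reason about, here is a
minimal userspace model of the NI_RANDOM gating (a sketch only, not kernel
code; the success probability p and the libc RNG are stand-ins). A run that
pulls a task is credited with weight ~= 1024/(1 + newidle_ratio) successes
but only happens on ~(1 + newidle_ratio)/1024 of calls, so in expectation
newidle_ratio converges to about 1024 times the true success rate, and the
dice roll then runs newidle balancing at roughly that same frequency:

/*
 * Userspace model of the NI_RANDOM gate: the identifiers mirror the
 * patch, but this is an illustration, not the kernel implementation.
 */
#include <stdio.h>
#include <stdlib.h>

static unsigned int newidle_call = 512;		/* seeded at a 50% ratio */
static unsigned int newidle_success = 256;
static unsigned int newidle_ratio = 512;

static void update_newidle_stats(unsigned int success)
{
	newidle_call++;
	newidle_success += success;

	if (newidle_call >= 1024) {
		/* publish the ratio, then halve to age out old samples */
		newidle_ratio = newidle_success;
		newidle_call /= 2;
		newidle_success /= 2;
	}
}

int main(void)
{
	const double p = 0.10;	/* assumed true balance success rate */
	srand(42);

	for (int i = 0; i < 1000000; i++) {
		unsigned int weight = 1 + newidle_ratio;
		unsigned int d1k = (unsigned int)rand() % 1024;

		if (d1k > weight) {
			/* gated out: skip the balance, count a failure */
			update_newidle_stats(0);
			continue;
		}
		/* ran the balance: weight the rare sample back up */
		weight = (1024 + weight / 2) / weight;
		unsigned int pulled = ((double)rand() / RAND_MAX) < p;
		update_newidle_stats(weight * pulled);
	}
	printf("newidle_ratio = %u/1024 (~%.0f%% of idle entries balance)\n",
	       newidle_ratio, newidle_ratio * 100.0 / 1024);
	return 0;
}

With p = 0.10 this settles around newidle_ratio ~= 100, i.e. newidle
balancing runs on roughly 10% of idle entries. A workload whose balances
rarely succeed therefore gets throttled aggressively, which may be relevant
to the regressions reported in this thread.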
I can confirm that we are seeing a 4-11% performance regression in v6.12.66
across multiple benchmarks on c7a.4xlarge AWS EC2 instances, powered by AMD
EPYC 9R14-series CPUs (code-named Genoa), and on c7i.4xlarge instances,
powered by 4th-generation Intel Xeon Scalable processors (code-named
Sapphire Rapids). The regression is caused by commit 33cf66d88306
("sched/fair: Proportional newidle balance"); reverting it recovers the
lost performance. We also noticed that the impact is higher on AMD than
on Intel.
Benchmark Name | Description                              | Unit
postgresql     | HammerDB workload (TPC-C-like benchmark) | NOPM
nginx_lb       | Testing NGINX as a load balancer         | RPS
memcached      | Testing using Lancet load generator      | QPS
**Results on v6.12.66**
Benchmark name | SUT EC2 Instance | Regression percentage
postgresql     | c7a.4xlarge      | -4.0%
postgresql     | c7i.4xlarge      | -4.0%
nginx_lb       | c7a.4xlarge      | -5.0%
memcached      | c7a.4xlarge      | -11.0%
We have also seen a smaller impact on v6.1.161, which includes the
mentioned commit.
**Results on v6.1.161**
Benchmark name | SUT EC2 Instance | Regression percentage
nginx_lb       | c7a.4xlarge      | -3.0%
nginx_lb       | c7i.4xlarge      | -4.0%
memcached      | c7a.4xlarge      | -5.0%
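As a cheaper check than rebuilding with the revert: NI_RANDOM is a scheduler
feature bit, so on kernels that expose the sched features file (debugfs
mounted, scheduler debug support enabled) the gating should be togglable at
runtime:

  echo NO_NI_RANDOM > /sys/kernel/debug/sched/features

With the bit cleared the new code always runs the balance, as before the
patch, so rerunning the benchmarks this way (and restoring the bit with
"echo NI_RANDOM > ..." afterwards) should show whether the randomized gating
alone accounts for the regression.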