Re: [RFC PATCH 4/5] sched/fair: Rework inter-NUMA newidle balancing
From: Peter Zijlstra
Date: Thu Apr 10 2025 - 06:15:46 EST
On Wed, Apr 09, 2025 at 11:15:38AM +0000, K Prateek Nayak wrote:
> +static inline int sched_newidle_pull_overloaded(struct sched_domain *sd,
> + struct rq *this_rq,
> + int *continue_balancing)
> +{
> + struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
> + int cpu, this_cpu = cpu_of(this_rq);
> + struct sched_domain *sd_parent;
> + struct lb_env env = {
> + .dst_cpu = this_cpu,
> + .dst_rq = this_rq,
> + .idle = CPU_NEWLY_IDLE,
> + };
> +
> + cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
> +
> +next_domain:
> + env.sd = sd;
> + /* Allow migrating cache_hot tasks too. */
> + sd->nr_balance_failed = sd->cache_nice_tries + 1;
> +
> + for_each_cpu_wrap(cpu, cpus, this_cpu) {
> + struct sched_domain_shared *sd_share;
> + struct cpumask *overloaded_mask;
> + struct sched_domain *cpu_llc;
> + int overloaded_cpu;
> +
> + cpu_llc = rcu_dereference(per_cpu(sd_llc, cpu));
> + if (!cpu_llc)
> + break;
> +
> + sd_share = cpu_llc->shared;
> + if (!sd_share)
> + break;
> +
> + overloaded_mask = sd_share->overloaded_mask;
> + if (!overloaded_mask)
> + break;
> +
> + for_each_cpu_wrap(overloaded_cpu, overloaded_mask, this_cpu + 1) {
> + struct rq *overloaded_rq = cpu_rq(overloaded_cpu);
> + struct task_struct *p = NULL;
> +
> + if (sched_newidle_continue_balance(this_rq)) {
> + *continue_balancing = 0;
> + return 0;
> + }
> +
> + /* Quick peek to find if pushable tasks exist. */
> + if (!has_pushable_tasks(overloaded_rq))
> + continue;
> +
> + scoped_guard (rq_lock, overloaded_rq) {
> + update_rq_clock(overloaded_rq);
> +
> + if (!has_pushable_tasks(overloaded_rq))
> + break;
You can skip the clock update if there aren't any tasks to grab.
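That is, something like this (just a sketch, reusing the names from the
hunk above):

	scoped_guard (rq_lock, overloaded_rq) {
		/* Re-check under the lock; bail before touching the clock. */
		if (!has_pushable_tasks(overloaded_rq))
			break;

		update_rq_clock(overloaded_rq);

		env.src_cpu = overloaded_cpu;
		env.src_rq = overloaded_rq;

		p = detach_one_task(&env);
	}
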
> +
> + env.src_cpu = overloaded_cpu;
> + env.src_rq = overloaded_rq;
> +
> + p = detach_one_task(&env);
Yep, detach_one_task() uses can_migrate_task(), which checks
task_on_cpu(), so that's all good :-)
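For reference, the relevant bit of can_migrate_task() (abridged from
kernel/sched/fair.c):

	/* We never migrate a task that is currently running. */
	if (task_on_cpu(env->src_rq, p)) {
		schedstat_inc(p->stats.nr_failed_migrations_running);
		return 0;
	}
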
> + }
> +
> + if (!p)
> + continue;
> +
> + attach_one_task(this_rq, p);
> + return 1;
> + }
> +
> + cpumask_andnot(cpus, cpus, sched_domain_span(cpu_llc));
> + }
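
As an aside, the nr_balance_failed fudge at the top works because of
this check in can_migrate_task() (abridged, the exact shape varies a
bit between trees):

	/* Hot tasks only migrate once the domain failed often enough. */
	if (tsk_cache_hot <= 0 ||
	    env->sd->nr_balance_failed > env->sd->cache_nice_tries)
		return 1;

	schedstat_inc(p->stats.nr_failed_migrations_hot);
	return 0;

Setting nr_balance_failed to cache_nice_tries + 1 satisfies that
unconditionally, so cache-hot tasks become eligible too.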