[PATCH v5 06/24] sched/core: allow only preferred CPUs in is_cpu_allowed

From: Shrikanth Hegde

Date: Thu Jun 25 2026 - 08:48:38 EST


When possible, pick a preferred CPU.

The push task mechanism uses the stopper thread, which is going to call
select_fallback_rq and use this mechanism to pick only a preferred CPU.

When a task is affined only to non-preferred CPUs, it should continue to
run there. Detect that by checking whether cpus_ptr and cpu_preferred_mask
intersect.

Since is_cpu_allowed can be called either directly or repeatedly from
select_fallback_rq, cache the result in task_struct->has_preferred_cpu_state
when the call comes via select_fallback_rq; a non-zero value encodes that path.
This avoids N**2 complexity in the rare cases.

The additional O(N) overhead is incurred in is_cpu_allowed only when the CPU
is not preferred, so in normal scenarios the overhead is only a bit check.

Signed-off-by: Shrikanth Hegde <sshegde@xxxxxxxxxxxxx>
---
v4->v5:
- Do simple encoding of -1,0,1 instead (K Prateek Nayak)
- Make it s8 (K Prateek Nayak)
- Update changelog to address sashiko concerns of overhead.

include/linux/sched.h | 1 +
kernel/sched/core.c | 35 +++++++++++++++++++++++++++++++++--
kernel/sched/sched.h | 25 +++++++++++++++++++++++++
3 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fc6ecb3869dd..27dbf676113e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1657,6 +1657,7 @@ struct task_struct {
#ifdef CONFIG_UNWIND_USER
struct unwind_task_info unwind_info;
#endif
+ s8 has_preferred_cpu_state;

/* CPU-specific state of this task: */
struct thread_struct thread;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9e16946c9d62..281715a6e88f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2500,6 +2500,8 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
*/
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
+ bool task_check_preferred_cpu;
+
/* When not in the task's cpumask, no point in looking further. */
if (!task_allowed_on_cpu(p, cpu))
return false;
@@ -2508,9 +2510,23 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
if (is_migration_disabled(p))
return cpu_online(cpu);

+ /*
+ * This is essential to maintain user affinities when preferred
+ * CPUs change. A task pinned on non-preferred CPU should continue
+ * to run there, since this is non-user triggered.
+ *
+ * If CPU is non-preferred and task can run on other CPUs which are
+ * currently preferred, then choose those other CPUs instead.
+ * Overhead is minimal when CPU is preferred.
+ */
+ task_check_preferred_cpu = !cpu_preferred(cpu) && task_has_preferred_cpus(p);
+
/* Non kernel threads are not allowed during either online or offline. */
- if (!(p->flags & PF_KTHREAD))
+ if (!(p->flags & PF_KTHREAD)) {
+ if (task_check_preferred_cpu)
+ return false;
return cpu_active(cpu);
+ }

/* KTHREAD_IS_PER_CPU is always allowed. */
if (kthread_is_per_cpu(p))
@@ -2520,6 +2536,10 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
if (cpu_dying(cpu))
return false;

+ /* Try on a preferred CPU first, if possible. */
+ if (task_check_preferred_cpu)
+ return false;
+
/* But are allowed during online. */
return cpu_online(cpu);
}
@@ -3549,6 +3569,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
enum { cpuset, possible, fail } state = cpuset;
int dest_cpu;

+ /*
+ * Cache whether the task's affinity spans any preferred CPUs.
+ * This avoids recomputing it for each CPU later in the loop.
+ * A non-zero value also marks calls to is_cpu_allowed as
+ * coming via select_fallback_rq.
+ */
+ p->has_preferred_cpu_state = task_has_preferred_cpus(p) ? 1 : -1;
+
/*
* If the node that the CPU is on has been offlined, cpu_to_node()
* will return -1. There is no CPU on the node, and we should
@@ -3560,7 +3588,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* Look for allowed, online CPU in same node. */
for_each_cpu(dest_cpu, nodemask) {
if (is_cpu_allowed(p, dest_cpu))
- return dest_cpu;
+ goto clear_and_return;
}
}

@@ -3604,6 +3632,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
}

+clear_and_return:
+ p->has_preferred_cpu_state = 0;
return dest_cpu;
}

@@ -4612,6 +4642,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
init_numa_balancing(clone_flags, p);
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
+ p->has_preferred_cpu_state = 0;
init_sched_mm(p);
}

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c7c2dea65edd..5d009c2529b2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -4213,4 +4213,29 @@ DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)

#include "ext.h"

+/*
+ * has_preferred_cpu_state may hold a value cached by
+ * select_fallback_rq(). It is set/cleared while holding pi_lock
+ * with IRQs disabled.
+ *
+ * 1: Cached; preferred CPUs exist in the task's affinity.
+ * 0: Not cached; needs to be evaluated.
+ * -1: Cached; no preferred CPU exists in the task's affinity.
+ *
+ * Only affects FAIR tasks.
+ */
+static inline bool task_has_preferred_cpus(struct task_struct *p)
+{
+ int cached;
+
+ /* Only FAIR tasks honor preferred CPU state */
+ if (unlikely(p->sched_class != &fair_sched_class))
+ return false;
+
+ cached = READ_ONCE(p->has_preferred_cpu_state);
+ if (cached)
+ return cached > 0;
+ else
+ return cpumask_intersects(p->cpus_ptr, cpu_preferred_mask);
+}
#endif /* _KERNEL_SCHED_SCHED_H */
--
2.47.3