Re: [PATCH 6/6] sched_ext: Move built-in idle CPU selection policy to a separate file

From: Yury Norov
Date: Fri Dec 20 2024 - 09:53:47 EST


On Tue, Dec 17, 2024 at 10:32:31AM +0100, Andrea Righi wrote:
> As ext.c is becoming quite large, move the idle CPU selection policy
> to a separate file (ext_idle.c) for better code readability.
>
> No functional change, this is purely code reorganization.
>
> Suggested-by: Yury Norov <yury.norov@xxxxxxxxx>
> Signed-off-by: Andrea Righi <arighi@xxxxxxxxxx>

I think it should be a preparation patch at the beginning of the
series. The one major downside of patches of that sort is that when
moving code, you wipe git history. If you have to do that, you need
to move code before doing the work, such that at least your updates
will be accessible to git blame.

> ---
> MAINTAINERS | 1 +
> kernel/sched/ext.c | 857 +---------------------------------------
> kernel/sched/ext_idle.c | 835 ++++++++++++++++++++++++++++++++++++++
> 3 files changed, 853 insertions(+), 840 deletions(-)
> create mode 100644 kernel/sched/ext_idle.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1e930c7a58b1..02960d1b9ee9 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -20909,6 +20909,7 @@ T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git
> F: include/linux/sched/ext.h
> F: kernel/sched/ext.h
> F: kernel/sched/ext.c
> +F: kernel/sched/ext_idle.c
> F: tools/sched_ext/
> F: tools/testing/selftests/sched_ext
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 1edf4206aab5..40d13594c1d1 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -6,6 +6,8 @@
> * Copyright (c) 2022 Tejun Heo <tj@xxxxxxxxxx>
> * Copyright (c) 2022 David Vernet <dvernet@xxxxxxxx>
> */
> +#include <linux/btf_ids.h>
> +
> #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
>
> enum scx_consts {
> @@ -890,12 +892,6 @@ static bool scx_warned_zero_slice;
> static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
> static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
> static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
> -static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
> -static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);
> -
> -#ifdef CONFIG_SMP
> -static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
> -#endif
>
> static struct static_key_false scx_has_op[SCX_OPI_END] =
> { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
> @@ -903,6 +899,17 @@ static struct static_key_false scx_has_op[SCX_OPI_END] =
> static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
> static struct scx_exit_info *scx_exit_info;
>
> +#define scx_ops_error_kind(err, fmt, args...) \
> + scx_ops_exit_kind((err), 0, fmt, ##args)
> +
> +#define scx_ops_exit(code, fmt, args...) \
> + scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args)
> +
> +#define scx_ops_error(fmt, args...) \
> + scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args)
> +
> +#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)])
> +
> static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
> static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
>
> @@ -930,97 +937,6 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
>
> static struct delayed_work scx_watchdog_work;
>
> -/* idle tracking */
> -#ifdef CONFIG_SMP
> -#ifdef CONFIG_CPUMASK_OFFSTACK
> -#define CL_ALIGNED_IF_ONSTACK
> -#else
> -#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
> -#endif
> -
> -struct idle_cpumask {
> - cpumask_var_t cpu;
> - cpumask_var_t smt;
> -};
> -
> -/*
> - * Make sure a NUMA node is in a valid range.
> - */
> -static int validate_node(int node)
> -{
> - /* If no node is specified, return the current one */
> - if (node == NUMA_NO_NODE)
> - return numa_node_id();
> -
> - /* Make sure node is in the range of possible nodes */
> - if (node < 0 || node >= num_possible_nodes())
> - return -EINVAL;
> -
> - return node;
> -}
> -
> -/*
> - * cpumasks to track idle CPUs within each NUMA node.
> - *
> - * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not specified, a single flat cpumask
> - * from node 0 is used to track all idle CPUs system-wide.
> - */
> -static struct idle_cpumask **idle_masks CL_ALIGNED_IF_ONSTACK;
> -
> -static struct cpumask *get_idle_mask_node(int node, bool smt)
> -{
> - if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
> - return smt ? idle_masks[0]->smt : idle_masks[0]->cpu;
> -
> - node = validate_node(node);
> - if (node < 0)
> - return cpu_none_mask;
> -
> - return smt ? idle_masks[node]->smt : idle_masks[node]->cpu;
> -}
> -
> -static struct cpumask *get_idle_cpumask_node(int node)
> -{
> - return get_idle_mask_node(node, false);
> -}
> -
> -static struct cpumask *get_idle_smtmask_node(int node)
> -{
> - return get_idle_mask_node(node, true);
> -}
> -
> -static void idle_masks_init(void)
> -{
> - int node;
> -
> - idle_masks = kcalloc(num_possible_nodes(), sizeof(*idle_masks), GFP_KERNEL);
> - BUG_ON(!idle_masks);
> -
> - for_each_node_state(node, N_POSSIBLE) {
> - idle_masks[node] = kzalloc_node(sizeof(**idle_masks), GFP_KERNEL, node);
> - BUG_ON(!idle_masks[node]);
> -
> - BUG_ON(!alloc_cpumask_var_node(&idle_masks[node]->cpu, GFP_KERNEL, node));
> - BUG_ON(!alloc_cpumask_var_node(&idle_masks[node]->smt, GFP_KERNEL, node));
> - }
> -}
> -#else /* !CONFIG_SMP */
> -static int validate_node(int node)
> -{
> - return numa_node_id();
> -}
> -
> -static struct cpumask *get_idle_cpumask_node(int node)
> -{
> - return cpu_none_mask;
> -}
> -
> -static struct cpumask *get_idle_smtmask_node(int node)
> -{
> - return cpu_none_mask;
> -}
> -#endif /* CONFIG_SMP */
> -
> /* for %SCX_KICK_WAIT */
> static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
>
> @@ -1107,17 +1023,6 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
> s64 exit_code,
> const char *fmt, ...);
>
> -#define scx_ops_error_kind(err, fmt, args...) \
> - scx_ops_exit_kind((err), 0, fmt, ##args)
> -
> -#define scx_ops_exit(code, fmt, args...) \
> - scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args)
> -
> -#define scx_ops_error(fmt, args...) \
> - scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args)
> -
> -#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)])
> -
> static long jiffies_delta_msecs(unsigned long at, unsigned long now)
> {
> if (time_after(at, now))
> @@ -1624,6 +1529,9 @@ static int ops_sanitize_err(const char *ops_name, s32 err)
> return -EPROTO;
> }
>
> +/* Built-in idle CPU selection policy */
> +#include "ext_idle.c"
> +
> static void run_deferred(struct rq *rq)
> {
> process_ddsp_deferred_locals(rq);
> @@ -3247,406 +3155,6 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
> #endif /* CONFIG_SCHED_CORE */
>
> #ifdef CONFIG_SMP
> -
> -static bool test_and_clear_cpu_idle(int cpu)
> -{
> - int node = cpu_to_node(cpu);
> - struct cpumask *idle_cpu = get_idle_cpumask_node(node);
> -
> -#ifdef CONFIG_SCHED_SMT
> - /*
> - * SMT mask should be cleared whether we can claim @cpu or not. The SMT
> - * cluster is not wholly idle either way. This also prevents
> - * scx_pick_idle_cpu() from getting caught in an infinite loop.
> - */
> - if (sched_smt_active()) {
> - const struct cpumask *smt = cpu_smt_mask(cpu);
> - struct cpumask *idle_smt = get_idle_smtmask_node(node);
> -
> - /*
> - * If offline, @cpu is not its own sibling and
> - * scx_pick_idle_cpu() can get caught in an infinite loop as
> - * @cpu is never cleared from the idle SMT mask. Ensure that
> - * @cpu is eventually cleared.
> - *
> - * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
> - * reduce memory writes, which may help alleviate cache
> - * coherence pressure.
> - */
> - if (cpumask_intersects(smt, idle_smt))
> - cpumask_andnot(idle_smt, idle_smt, smt);
> - else if (cpumask_test_cpu(cpu, idle_smt))
> - __cpumask_clear_cpu(cpu, idle_smt);
> - }
> -#endif
> - return cpumask_test_and_clear_cpu(cpu, idle_cpu);
> -}
> -
> -static s32 scx_pick_idle_cpu_from_node(int node, const struct cpumask *cpus_allowed, u64 flags)
> -{
> - int cpu;
> -
> -retry:
> - if (sched_smt_active()) {
> - cpu = cpumask_any_and_distribute(get_idle_smtmask_node(node), cpus_allowed);
> - if (cpu < nr_cpu_ids)
> - goto found;
> -
> - if (flags & SCX_PICK_IDLE_CORE)
> - return -EBUSY;
> - }
> -
> - cpu = cpumask_any_and_distribute(get_idle_cpumask_node(node), cpus_allowed);
> - if (cpu >= nr_cpu_ids)
> - return -EBUSY;
> -
> -found:
> - if (test_and_clear_cpu_idle(cpu))
> - return cpu;
> - else
> - goto retry;
> -
> -}
> -
> -static s32
> -scx_pick_idle_cpu_numa(const struct cpumask *cpus_allowed, s32 prev_cpu, u64 flags)
> -{
> - nodemask_t hop_nodes = NODE_MASK_NONE;
> - int start_node = cpu_to_node(prev_cpu);
> - s32 cpu = -EBUSY;
> -
> - /*
> - * Traverse all online nodes in order of increasing distance,
> - * starting from prev_cpu's node.
> - */
> - rcu_read_lock();
> - for_each_numa_hop_node(node, start_node, hop_nodes, N_ONLINE) {
> - cpu = scx_pick_idle_cpu_from_node(node, cpus_allowed, flags);
> - if (cpu >= 0)
> - break;
> - }
> - rcu_read_unlock();
> -
> - return cpu;
> -}
> -
> -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, s32 prev_cpu, u64 flags)
> -{
> - /*
> - * Only node 0 is used if per-node idle cpumasks are disabled.
> - */
> - if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
> - return scx_pick_idle_cpu_from_node(0, cpus_allowed, flags);
> -
> - return scx_pick_idle_cpu_numa(cpus_allowed, prev_cpu, flags);
> -}
> -
> -/*
> - * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC
> - * domain is not defined).
> - */
> -static unsigned int llc_weight(s32 cpu)
> -{
> - struct sched_domain *sd;
> -
> - sd = rcu_dereference(per_cpu(sd_llc, cpu));
> - if (!sd)
> - return 0;
> -
> - return sd->span_weight;
> -}
> -
> -/*
> - * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
> - * domain is not defined).
> - */
> -static struct cpumask *llc_span(s32 cpu)
> -{
> - struct sched_domain *sd;
> -
> - sd = rcu_dereference(per_cpu(sd_llc, cpu));
> - if (!sd)
> - return 0;
> -
> - return sched_domain_span(sd);
> -}
> -
> -/*
> - * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the
> - * NUMA domain is not defined).
> - */
> -static unsigned int numa_weight(s32 cpu)
> -{
> - struct sched_domain *sd;
> - struct sched_group *sg;
> -
> - sd = rcu_dereference(per_cpu(sd_numa, cpu));
> - if (!sd)
> - return 0;
> - sg = sd->groups;
> - if (!sg)
> - return 0;
> -
> - return sg->group_weight;
> -}
> -
> -/*
> - * Return true if the LLC domains do not perfectly overlap with the NUMA
> - * domains, false otherwise.
> - */
> -static bool llc_numa_mismatch(void)
> -{
> - int cpu;
> -
> - /*
> - * We need to scan all online CPUs to verify whether their scheduling
> - * domains overlap.
> - *
> - * While it is rare to encounter architectures with asymmetric NUMA
> - * topologies, CPU hotplugging or virtualized environments can result
> - * in asymmetric configurations.
> - *
> - * For example:
> - *
> - * NUMA 0:
> - * - LLC 0: cpu0..cpu7
> - * - LLC 1: cpu8..cpu15 [offline]
> - *
> - * NUMA 1:
> - * - LLC 0: cpu16..cpu23
> - * - LLC 1: cpu24..cpu31
> - *
> - * In this case, if we only check the first online CPU (cpu0), we might
> - * incorrectly assume that the LLC and NUMA domains are fully
> - * overlapping, which is incorrect (as NUMA 1 has two distinct LLC
> - * domains).
> - */
> - for_each_online_cpu(cpu)
> - if (llc_weight(cpu) != numa_weight(cpu))
> - return true;
> -
> - return false;
> -}
> -
> -/*
> - * Initialize topology-aware scheduling.
> - *
> - * Detect if the system has multiple LLC or multiple NUMA domains and enable
> - * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
> - * selection policy.
> - *
> - * Assumption: the kernel's internal topology representation assumes that each
> - * CPU belongs to a single LLC domain, and that each LLC domain is entirely
> - * contained within a single NUMA node.
> - */
> -static void update_selcpu_topology(struct sched_ext_ops *ops)
> -{
> - bool enable_llc = false;
> - unsigned int nr_cpus;
> - s32 cpu = cpumask_first(cpu_online_mask);
> -
> - /*
> - * Enable LLC domain optimization only when there are multiple LLC
> - * domains among the online CPUs. If all online CPUs are part of a
> - * single LLC domain, the idle CPU selection logic can choose any
> - * online CPU without bias.
> - *
> - * Note that it is sufficient to check the LLC domain of the first
> - * online CPU to determine whether a single LLC domain includes all
> - * CPUs.
> - */
> - rcu_read_lock();
> - nr_cpus = llc_weight(cpu);
> - if (nr_cpus > 0) {
> - if (nr_cpus < num_online_cpus())
> - enable_llc = true;
> - /*
> - * No need to enable LLC optimization if the LLC domains are
> - * perfectly overlapping with the NUMA domains when per-node
> - * cpumasks are enabled.
> - */
> - if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
> - !llc_numa_mismatch())
> - enable_llc = false;
> - pr_debug("sched_ext: LLC=%*pb weight=%u\n",
> - cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
> - }
> - rcu_read_unlock();
> -
> - pr_debug("sched_ext: LLC idle selection %s\n",
> - enable_llc ? "enabled" : "disabled");
> -
> - if (enable_llc)
> - static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
> - else
> - static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
> -
> - /*
> - * Check if we need to enable per-node cpumasks.
> - */
> - if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
> - static_branch_enable_cpuslocked(&scx_builtin_idle_per_node);
> - else
> - static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);
> -}
> -
> -/*
> - * Built-in CPU idle selection policy:
> - *
> - * 1. Prioritize full-idle cores:
> - * - always prioritize CPUs from fully idle cores (both logical CPUs are
> - * idle) to avoid interference caused by SMT.
> - *
> - * 2. Reuse the same CPU:
> - * - prefer the last used CPU to take advantage of cached data (L1, L2) and
> - * branch prediction optimizations.
> - *
> - * 3. Pick a CPU within the same LLC (Last-Level Cache):
> - * - if the above conditions aren't met, pick a CPU that shares the same LLC
> - * to maintain cache locality.
> - *
> - * 4. Pick a CPU within the same NUMA node, if enabled:
> - * - choose a CPU from the same NUMA node to reduce memory access latency.
> - *
> - * 5. Pick any idle CPU usable by the task.
> - *
> - * Step 3 is performed only if the system has multiple LLC domains that are not
> - * perfectly overlapping with the NUMA domains (see scx_selcpu_topo_llc).
> - *
> - * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
> - * we never call ops.select_cpu() for them, see select_task_rq().
> - */
> -static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
> - u64 wake_flags, bool *found)
> -{
> - const struct cpumask *llc_cpus = NULL;
> - int node = cpu_to_node(prev_cpu);
> - s32 cpu;
> -
> - *found = false;
> -
> - /*
> - * This is necessary to protect llc_cpus.
> - */
> - rcu_read_lock();
> -
> - /*
> - * Determine the scheduling domain only if the task is allowed to run
> - * on all CPUs.
> - *
> - * This is done primarily for efficiency, as it avoids the overhead of
> - * updating a cpumask every time we need to select an idle CPU (which
> - * can be costly in large SMP systems), but it also aligns logically:
> - * if a task's scheduling domain is restricted by user-space (through
> - * CPU affinity), the task will simply use the flat scheduling domain
> - * defined by user-space.
> - */
> - if (p->nr_cpus_allowed >= num_possible_cpus())
> - if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
> - llc_cpus = llc_span(prev_cpu);
> -
> - /*
> - * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
> - */
> - if (wake_flags & SCX_WAKE_SYNC) {
> - cpu = smp_processor_id();
> -
> - /*
> - * If the waker's CPU is cache affine and prev_cpu is idle,
> - * then avoid a migration.
> - */
> - if (cpus_share_cache(cpu, prev_cpu) &&
> - test_and_clear_cpu_idle(prev_cpu)) {
> - cpu = prev_cpu;
> - goto cpu_found;
> - }
> -
> - /*
> - * If the waker's local DSQ is empty, and the system is under
> - * utilized, try to wake up @p to the local DSQ of the waker.
> - *
> - * Checking only for an empty local DSQ is insufficient as it
> - * could give the wakee an unfair advantage when the system is
> - * oversaturated.
> - *
> - * Checking only for the presence of idle CPUs is also
> - * insufficient as the local DSQ of the waker could have tasks
> - * piled up on it even if there is an idle core elsewhere on
> - * the system.
> - */
> - if (!(current->flags & PF_EXITING) &&
> - cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
> - !cpumask_empty(get_idle_cpumask_node(cpu_to_node(cpu)))) {
> - if (cpumask_test_cpu(cpu, p->cpus_ptr))
> - goto cpu_found;
> - }
> - }
> -
> - /*
> - * If CPU has SMT, any wholly idle CPU is likely a better pick than
> - * partially idle @prev_cpu.
> - */
> - if (sched_smt_active()) {
> - /*
> - * Keep using @prev_cpu if it's part of a fully idle core.
> - */
> - if (cpumask_test_cpu(prev_cpu, get_idle_smtmask_node(node)) &&
> - test_and_clear_cpu_idle(prev_cpu)) {
> - cpu = prev_cpu;
> - goto cpu_found;
> - }
> -
> - /*
> - * Search for any fully idle core in the same LLC domain.
> - */
> - if (llc_cpus) {
> - cpu = scx_pick_idle_cpu_from_node(node, llc_cpus, SCX_PICK_IDLE_CORE);
> - if (cpu >= 0)
> - goto cpu_found;
> - }
> -
> - /*
> - * Search for any full idle core usable by the task.
> - */
> - cpu = scx_pick_idle_cpu(p->cpus_ptr, prev_cpu, SCX_PICK_IDLE_CORE);
> - if (cpu >= 0)
> - goto cpu_found;
> - }
> -
> - /*
> - * Use @prev_cpu if it's idle.
> - */
> - if (test_and_clear_cpu_idle(prev_cpu)) {
> - cpu = prev_cpu;
> - goto cpu_found;
> - }
> -
> - /*
> - * Search for any idle CPU in the same LLC domain.
> - */
> - if (llc_cpus) {
> - cpu = scx_pick_idle_cpu_from_node(node, llc_cpus, 0);
> - if (cpu >= 0)
> - goto cpu_found;
> - }
> -
> - /*
> - * Search for any idle CPU usable by the task.
> - */
> - cpu = scx_pick_idle_cpu(p->cpus_ptr, prev_cpu, 0);
> - if (cpu >= 0)
> - goto cpu_found;
> -
> - rcu_read_unlock();
> - return prev_cpu;
> -
> -cpu_found:
> - rcu_read_unlock();
> -
> - *found = true;
> - return cpu;
> -}
> -
> static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
> {
> /*
> @@ -3713,66 +3221,6 @@ static void set_cpus_allowed_scx(struct task_struct *p,
> (struct cpumask *)p->cpus_ptr);
> }
>
> -static void reset_idle_masks(void)
> -{
> - int node;
> -
> - if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
> - cpumask_copy(get_idle_cpumask_node(0), cpu_online_mask);
> - cpumask_copy(get_idle_smtmask_node(0), cpu_online_mask);
> - return;
> - }
> -
> - /*
> - * Consider all online cpus idle. Should converge to the actual state
> - * quickly.
> - */
> - for_each_node_state(node, N_POSSIBLE) {
> - const struct cpumask *node_mask = cpumask_of_node(node);
> - struct cpumask *idle_cpu = get_idle_cpumask_node(node);
> - struct cpumask *idle_smt = get_idle_smtmask_node(node);
> -
> - cpumask_and(idle_cpu, cpu_online_mask, node_mask);
> - cpumask_copy(idle_smt, idle_cpu);
> - }
> -}
> -
> -void __scx_update_idle(struct rq *rq, bool idle)
> -{
> - int cpu = cpu_of(rq);
> - int node = cpu_to_node(cpu);
> - struct cpumask *idle_cpu = get_idle_cpumask_node(node);
> -
> - if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
> - SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
> - if (!static_branch_unlikely(&scx_builtin_idle_enabled))
> - return;
> - }
> -
> - assign_cpu(cpu, idle_cpu, idle);
> -
> -#ifdef CONFIG_SCHED_SMT
> - if (sched_smt_active()) {
> - const struct cpumask *smt = cpu_smt_mask(cpu);
> - struct cpumask *idle_smt = get_idle_smtmask_node(node);
> -
> - if (idle) {
> - /*
> - * idle_smt handling is racy but that's fine as it's
> - * only for optimization and self-correcting.
> - */
> - for_each_cpu(cpu, smt) {
> - if (!cpumask_test_cpu(cpu, idle_cpu))
> - return;
> - }
> - cpumask_or(idle_smt, idle_smt, smt);
> - } else {
> - cpumask_andnot(idle_smt, idle_smt, smt);
> - }
> - }
> -#endif
> -}
> -
> static void handle_hotplug(struct rq *rq, bool online)
> {
> int cpu = cpu_of(rq);
> @@ -3811,21 +3259,7 @@ static void rq_offline_scx(struct rq *rq)
> {
> rq->scx.flags &= ~SCX_RQ_ONLINE;
> }
> -
> -#else /* CONFIG_SMP */
> -
> -static bool test_and_clear_cpu_idle(int cpu) { return false; }
> -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, s32 prev_cpu, u64 flags)
> -{
> - return -EBUSY;
> -}
> -static s32 scx_pick_idle_cpu_from_node(int node, const struct cpumask *cpus_allowed, u64 flags)
> -{
> - return -EBUSY;
> -}
> -static void reset_idle_masks(void) {}
> -
> -#endif /* CONFIG_SMP */
> +#endif /* CONFIG_SMP */
>
> static bool check_rq_for_timeouts(struct rq *rq)
> {
> @@ -6399,71 +5833,6 @@ void __init init_sched_ext_class(void)
> /********************************************************************************
> * Helpers that can be called from the BPF scheduler.
> */
> -#include <linux/btf_ids.h>
> -
> -static bool check_builtin_idle_enabled(void)
> -{
> - if (static_branch_likely(&scx_builtin_idle_enabled))
> - return true;
> -
> - scx_ops_error("built-in idle tracking is disabled");
> - return false;
> -}
> -
> -static bool check_builtin_idle_per_node_enabled(void)
> -{
> - if (static_branch_likely(&scx_builtin_idle_per_node))
> - return true;
> -
> - scx_ops_error("per-node idle tracking is disabled");
> - return false;
> -}
> -
> -__bpf_kfunc_start_defs();
> -
> -/**
> - * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
> - * @p: task_struct to select a CPU for
> - * @prev_cpu: CPU @p was on previously
> - * @wake_flags: %SCX_WAKE_* flags
> - * @is_idle: out parameter indicating whether the returned CPU is idle
> - *
> - * Can only be called from ops.select_cpu() if the built-in CPU selection is
> - * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
> - * @p, @prev_cpu and @wake_flags match ops.select_cpu().
> - *
> - * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
> - * currently idle and thus a good candidate for direct dispatching.
> - */
> -__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
> - u64 wake_flags, bool *is_idle)
> -{
> - if (!check_builtin_idle_enabled())
> - goto prev_cpu;
> -
> - if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
> - goto prev_cpu;
> -
> -#ifdef CONFIG_SMP
> - return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
> -#endif
> -
> -prev_cpu:
> - *is_idle = false;
> - return prev_cpu;
> -}
> -
> -__bpf_kfunc_end_defs();
> -
> -BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
> -BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
> -BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
> -
> -static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
> - .owner = THIS_MODULE,
> - .set = &scx_kfunc_ids_select_cpu,
> -};
> -
> static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
> {
> if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
> @@ -7535,189 +6904,6 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
> */
> }
>
> -/**
> - * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the idle-tracking
> - * per-CPU cpumask of a target NUMA node.
> - *
> - * Returns an empty cpumask if idle tracking is not enabled, if @node is not
> - * valid, or running on a UP kernel.
> - */
> -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
> -{
> - if (!check_builtin_idle_per_node_enabled())
> - return cpu_none_mask;
> -
> - return get_idle_cpumask_node(node);
> -}
> -/**
> - * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
> - * per-CPU cpumask of the current NUMA node.
> - *
> - * Returns an emtpy cpumask if idle tracking is not enabled, or running on a UP
> - * kernel.
> - */
> -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
> -{
> - if (!check_builtin_idle_enabled())
> - return cpu_none_mask;
> -
> - return get_idle_cpumask_node(NUMA_NO_NODE);
> -}
> -
> -/**
> - * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the idle-tracking,
> - * per-physical-core cpumask of a target NUMA node. Can be used to determine
> - * if an entire physical core is free.
> - *
> - * Returns an empty cpumask if idle tracking is not enabled, if @node is not
> - * valid, or running on a UP kernel.
> - */
> -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
> -{
> - if (!check_builtin_idle_per_node_enabled())
> - return cpu_none_mask;
> -
> - return get_idle_smtmask_node(node);
> -}
> -
> -/**
> - * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
> - * per-physical-core cpumask of the current NUMA node. Can be used to determine
> - * if an entire physical core is free.
> - *
> - * Returns an empty cumask if idle tracking is not enabled, or running on a UP
> - * kernel.
> - */
> -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
> -{
> - if (!check_builtin_idle_enabled())
> - return cpu_none_mask;
> -
> - return get_idle_smtmask_node(NUMA_NO_NODE);
> -}
> -
> -/**
> - * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
> - * either the percpu, or SMT idle-tracking cpumask.
> - */
> -__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
> -{
> - /*
> - * Empty function body because we aren't actually acquiring or releasing
> - * a reference to a global idle cpumask, which is read-only in the
> - * caller and is never released. The acquire / release semantics here
> - * are just used to make the cpumask a trusted pointer in the caller.
> - */
> -}
> -
> -/**
> - * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
> - * @cpu: cpu to test and clear idle for
> - *
> - * Returns %true if @cpu was idle and its idle state was successfully cleared.
> - * %false otherwise.
> - *
> - * Unavailable if ops.update_idle() is implemented and
> - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
> - */
> -__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
> -{
> - if (!check_builtin_idle_enabled())
> - return false;
> -
> - if (ops_cpu_valid(cpu, NULL))
> - return test_and_clear_cpu_idle(cpu);
> - else
> - return false;
> -}
> -
> -/**
> - * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from a NUMA node
> - * @node: target NUMA node
> - * @cpus_allowed: Allowed cpumask
> - * @flags: %SCX_PICK_IDLE_CPU_* flags
> - *
> - * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node.
> - * Returns the picked idle cpu number on success. -%EBUSY if no matching cpu
> - * was found.
> - *
> - * Unavailable if ops.update_idle() is implemented and
> - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set or if %SCX_OPS_KEEP_BUILTIN_IDLE is
> - * not set.
> - */
> -__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(int node, const struct cpumask *cpus_allowed,
> - u64 flags)
> -{
> - node = validate_node(node);
> - if (node < 0)
> - return node;
> -
> - if (!check_builtin_idle_per_node_enabled())
> - return -EBUSY;
> -
> - return scx_pick_idle_cpu_from_node(node, cpus_allowed, flags);
> -}
> -
> -/**
> - * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
> - * @cpus_allowed: Allowed cpumask
> - * @flags: %SCX_PICK_IDLE_CPU_* flags
> - *
> - * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
> - * number on success. -%EBUSY if no matching cpu was found.
> - *
> - * Idle CPU tracking may race against CPU scheduling state transitions. For
> - * example, this function may return -%EBUSY as CPUs are transitioning into the
> - * idle state. If the caller then assumes that there will be dispatch events on
> - * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
> - * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
> - * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
> - * event in the near future.
> - *
> - * Unavailable if ops.update_idle() is implemented and
> - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
> - */
> -__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
> - u64 flags)
> -{
> - if (!check_builtin_idle_enabled())
> - return -EBUSY;
> -
> - return scx_pick_idle_cpu(cpus_allowed, smp_processor_id(), flags);
> -}
> -
> -/**
> - * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
> - * @cpus_allowed: Allowed cpumask
> - * @flags: %SCX_PICK_IDLE_CPU_* flags
> - *
> - * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
> - * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
> - * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
> - * empty.
> - *
> - * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
> - * set, this function can't tell which CPUs are idle and will always pick any
> - * CPU.
> - */
> -__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
> - u64 flags)
> -{
> - s32 cpu;
> -
> - if (static_branch_likely(&scx_builtin_idle_enabled)) {
> - cpu = scx_pick_idle_cpu(cpus_allowed, smp_processor_id(), flags);
> - if (cpu >= 0)
> - return cpu;
> - }
> -
> - cpu = cpumask_any_distribute(cpus_allowed);
> - if (cpu < nr_cpu_ids)
> - return cpu;
> - else
> - return -EBUSY;
> -}
> -
> /**
> * scx_bpf_task_running - Is task currently running?
> * @p: task of interest
> @@ -7796,15 +6982,6 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_to_node)
> BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
> BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
> BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
> -BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
> -BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE)
> -BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
> -BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE)
> -BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
> -BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
> -BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
> -BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
> -BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
> BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
> BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
> BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
> diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
> new file mode 100644
> index 000000000000..ec36848c1a10
> --- /dev/null
> +++ b/kernel/sched/ext_idle.c
> @@ -0,0 +1,835 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
> + *
> + * Built-in idle CPU tracking policy.
> + *
> + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
> + * Copyright (c) 2022 Tejun Heo <tj@xxxxxxxxxx>
> + * Copyright (c) 2022 David Vernet <dvernet@xxxxxxxx>
> + * Copyright (c) 2024 Andrea Righi <arighi@xxxxxxxxxx>
> + */
> +
> +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
> +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);
> +
> +static bool check_builtin_idle_enabled(void)
> +{
> + if (static_branch_likely(&scx_builtin_idle_enabled))
> + return true;
> +
> + scx_ops_error("built-in idle tracking is disabled");
> + return false;
> +}
> +
> +static bool check_builtin_idle_per_node_enabled(void)
> +{
> + if (static_branch_likely(&scx_builtin_idle_per_node))
> + return true;
> +
> + scx_ops_error("per-node idle tracking is disabled");
> + return false;
> +}
> +
> +#ifdef CONFIG_SMP
> +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
> +
> +#ifdef CONFIG_CPUMASK_OFFSTACK
> +#define CL_ALIGNED_IF_ONSTACK
> +#else
> +#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
> +#endif
> +
> +struct idle_cpumask {
> + cpumask_var_t cpu;
> + cpumask_var_t smt;
> +};
> +
> +/*
> + * cpumasks to track idle CPUs within each NUMA node.
> + *
> + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not specified, a single flat cpumask
> + * from node 0 is used to track all idle CPUs system-wide.
> + */
> +static struct idle_cpumask **idle_masks CL_ALIGNED_IF_ONSTACK;
> +
> +/*
> + * Make sure a NUMA node is in a valid range.
> + */
> +static int validate_node(int node)
> +{
> + /* If no node is specified, return the current one */
> + if (node == NUMA_NO_NODE)
> + return numa_node_id();
> +
> + /* Make sure node is in the range of possible nodes */
> + if (node < 0 || node >= num_possible_nodes())
> + return -EINVAL;
> +
> + return node;
> +}
> +
> +static struct cpumask *get_idle_mask_node(int node, bool smt)
> +{
> + if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
> + return smt ? idle_masks[0]->smt : idle_masks[0]->cpu;
> +
> + node = validate_node(node);
> + if (node < 0)
> + return cpu_none_mask;
> +
> + return smt ? idle_masks[node]->smt : idle_masks[node]->cpu;
> +}
> +
> +static struct cpumask *get_idle_cpumask_node(int node)
> +{
> + return get_idle_mask_node(node, false);
> +}
> +
> +static struct cpumask *get_idle_smtmask_node(int node)
> +{
> + return get_idle_mask_node(node, true);
> +}
> +
> +static void idle_masks_init(void)
> +{
> + int node;
> +
> + idle_masks = kcalloc(num_possible_nodes(), sizeof(*idle_masks), GFP_KERNEL);
> + BUG_ON(!idle_masks);
> +
> + for_each_node_state(node, N_POSSIBLE) {
> + idle_masks[node] = kzalloc_node(sizeof(**idle_masks), GFP_KERNEL, node);
> + BUG_ON(!idle_masks[node]);
> +
> + BUG_ON(!alloc_cpumask_var_node(&idle_masks[node]->cpu, GFP_KERNEL, node));
> + BUG_ON(!alloc_cpumask_var_node(&idle_masks[node]->smt, GFP_KERNEL, node));
> + }
> +}
> +
> +static bool test_and_clear_cpu_idle(int cpu)
> +{
> + int node = cpu_to_node(cpu);
> + struct cpumask *idle_cpu = get_idle_cpumask_node(node);
> +
> +#ifdef CONFIG_SCHED_SMT
> + /*
> + * SMT mask should be cleared whether we can claim @cpu or not. The SMT
> + * cluster is not wholly idle either way. This also prevents
> + * scx_pick_idle_cpu() from getting caught in an infinite loop.
> + */
> + if (sched_smt_active()) {
> + const struct cpumask *smt = cpu_smt_mask(cpu);
> + struct cpumask *idle_smt = get_idle_smtmask_node(node);
> +
> + /*
> + * If offline, @cpu is not its own sibling and
> + * scx_pick_idle_cpu() can get caught in an infinite loop as
> + * @cpu is never cleared from the idle SMT mask. Ensure that
> + * @cpu is eventually cleared.
> + *
> + * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
> + * reduce memory writes, which may help alleviate cache
> + * coherence pressure.
> + */
> + if (cpumask_intersects(smt, idle_smt))
> + cpumask_andnot(idle_smt, idle_smt, smt);
> + else if (cpumask_test_cpu(cpu, idle_smt))
> + __cpumask_clear_cpu(cpu, idle_smt);
> + }
> +#endif
> + return cpumask_test_and_clear_cpu(cpu, idle_cpu);
> +}
> +
> +static s32 scx_pick_idle_cpu_from_node(int node, const struct cpumask *cpus_allowed, u64 flags)
> +{
> + int cpu;
> +
> +retry:
> + if (sched_smt_active()) {
> + cpu = cpumask_any_and_distribute(get_idle_smtmask_node(node), cpus_allowed);
> + if (cpu < nr_cpu_ids)
> + goto found;
> +
> + if (flags & SCX_PICK_IDLE_CORE)
> + return -EBUSY;
> + }
> +
> + cpu = cpumask_any_and_distribute(get_idle_cpumask_node(node), cpus_allowed);
> + if (cpu >= nr_cpu_ids)
> + return -EBUSY;
> +
> +found:
> + if (test_and_clear_cpu_idle(cpu))
> + return cpu;
> + goto retry;
> +
> +}
> +
> +static s32
> +scx_pick_idle_cpu_numa(const struct cpumask *cpus_allowed, s32 prev_cpu, u64 flags)
> +{
> + nodemask_t hop_nodes = NODE_MASK_NONE;
> + int start_node = cpu_to_node(prev_cpu);
> + s32 cpu = -EBUSY;
> +
> + /*
> + * Traverse all online nodes in order of increasing distance,
> + * starting from prev_cpu's node.
> + */
> + rcu_read_lock();
> + for_each_numa_hop_node(node, start_node, hop_nodes, N_ONLINE) {
> + cpu = scx_pick_idle_cpu_from_node(node, cpus_allowed, flags);
> + if (cpu >= 0)
> + break;
> + }
> + rcu_read_unlock();
> +
> + return cpu;
> +}
> +
> +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, s32 prev_cpu, u64 flags)
> +{
> + /*
> + * Only node 0 is used if per-node idle cpumasks are disabled.
> + */
> + if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
> + return scx_pick_idle_cpu_from_node(0, cpus_allowed, flags);
> +
> + return scx_pick_idle_cpu_numa(cpus_allowed, prev_cpu, flags);
> +}
> +
> +/*
> + * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC
> + * domain is not defined).
> + */
> +static unsigned int llc_weight(s32 cpu)
> +{
> + struct sched_domain *sd;
> +
> + sd = rcu_dereference(per_cpu(sd_llc, cpu));
> + if (!sd)
> + return 0;
> +
> + return sd->span_weight;
> +}
> +
> +/*
> + * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
> + * domain is not defined).
> + */
> +static struct cpumask *llc_span(s32 cpu)
> +{
> + struct sched_domain *sd;
> +
> + sd = rcu_dereference(per_cpu(sd_llc, cpu));
> + if (!sd)
> + return 0;
> +
> + return sched_domain_span(sd);
> +}
> +
> +/*
> + * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the
> + * NUMA domain is not defined).
> + */
> +static unsigned int numa_weight(s32 cpu)
> +{
> + struct sched_domain *sd;
> + struct sched_group *sg;
> +
> + sd = rcu_dereference(per_cpu(sd_numa, cpu));
> + if (!sd)
> + return 0;
> + sg = sd->groups;
> + if (!sg)
> + return 0;
> +
> + return sg->group_weight;
> +}
> +
> +/*
> + * Return true if the LLC domains do not perfectly overlap with the NUMA
> + * domains, false otherwise.
> + */
> +static bool llc_numa_mismatch(void)
> +{
> + int cpu;
> +
> + /*
> + * We need to scan all online CPUs to verify whether their scheduling
> + * domains overlap.
> + *
> + * While it is rare to encounter architectures with asymmetric NUMA
> + * topologies, CPU hotplugging or virtualized environments can result
> + * in asymmetric configurations.
> + *
> + * For example:
> + *
> + * NUMA 0:
> + * - LLC 0: cpu0..cpu7
> + * - LLC 1: cpu8..cpu15 [offline]
> + *
> + * NUMA 1:
> + * - LLC 0: cpu16..cpu23
> + * - LLC 1: cpu24..cpu31
> + *
> + * In this case, if we only check the first online CPU (cpu0), we might
> + * incorrectly assume that the LLC and NUMA domains are fully
> + * overlapping, which is incorrect (as NUMA 1 has two distinct LLC
> + * domains).
> + */
> + for_each_online_cpu(cpu)
> + if (llc_weight(cpu) != numa_weight(cpu))
> + return true;
> +
> + return false;
> +}
> +
> +/*
> + * Initialize topology-aware scheduling.
> + *
> + * Detect if the system has multiple LLC or multiple NUMA domains and enable
> + * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
> + * selection policy.
> + *
> + * Assumption: the kernel's internal topology representation assumes that each
> + * CPU belongs to a single LLC domain, and that each LLC domain is entirely
> + * contained within a single NUMA node.
> + */
> +static void update_selcpu_topology(struct sched_ext_ops *ops)
> +{
> + bool enable_llc = false;
> + unsigned int nr_cpus;
> + s32 cpu = cpumask_first(cpu_online_mask);
> +
> + /*
> + * Enable LLC domain optimization only when there are multiple LLC
> + * domains among the online CPUs. If all online CPUs are part of a
> + * single LLC domain, the idle CPU selection logic can choose any
> + * online CPU without bias.
> + *
> + * Note that it is sufficient to check the LLC domain of the first
> + * online CPU to determine whether a single LLC domain includes all
> + * CPUs.
> + */
> + rcu_read_lock();
> + nr_cpus = llc_weight(cpu);
> + if (nr_cpus > 0) {
> + if (nr_cpus < num_online_cpus())
> + enable_llc = true;
> + /*
> + * No need to enable LLC optimization if the LLC domains are
> + * perfectly overlapping with the NUMA domains when per-node
> + * cpumasks are enabled.
> + */
> + if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
> + !llc_numa_mismatch())
> + enable_llc = false;
> + pr_debug("sched_ext: LLC=%*pb weight=%u\n",
> + cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
> + }
> + rcu_read_unlock();
> +
> + pr_debug("sched_ext: LLC idle selection %s\n",
> + enable_llc ? "enabled" : "disabled");
> +
> + if (enable_llc)
> + static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
> + else
> + static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
> +
> + /*
> + * Check if we need to enable per-node cpumasks.
> + */
> + if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
> + static_branch_enable_cpuslocked(&scx_builtin_idle_per_node);
> + else
> + static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);
> +}
> +
> +/*
> + * Built-in CPU idle selection policy:
> + *
> + * 1. Prioritize full-idle cores:
> + * - always prioritize CPUs from fully idle cores (both logical CPUs are
> + * idle) to avoid interference caused by SMT.
> + *
> + * 2. Reuse the same CPU:
> + * - prefer the last used CPU to take advantage of cached data (L1, L2) and
> + * branch prediction optimizations.
> + *
> + * 3. Pick a CPU within the same LLC (Last-Level Cache):
> + * - if the above conditions aren't met, pick a CPU that shares the same LLC
> + * to maintain cache locality.
> + *
> + * 4. Pick a CPU within the same NUMA node, if enabled:
> + * - choose a CPU from the same NUMA node to reduce memory access latency.
> + *
> + * 5. Pick any idle CPU usable by the task.
> + *
> + * Step 3 is performed only if the system has multiple LLC domains that are not
> + * perfectly overlapping with the NUMA domains (see scx_selcpu_topo_llc).
> + *
> + * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
> + * we never call ops.select_cpu() for them, see select_task_rq().
> + */
> +static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
> + u64 wake_flags, bool *found)
> +{
> + const struct cpumask *llc_cpus = NULL;
> + int node = cpu_to_node(prev_cpu);
> + s32 cpu;
> +
> + *found = false;
> +
> + /*
> + * This is necessary to protect llc_cpus.
> + */
> + rcu_read_lock();
> +
> + /*
> + * Determine the scheduling domain only if the task is allowed to run
> + * on all CPUs.
> + *
> + * This is done primarily for efficiency, as it avoids the overhead of
> + * updating a cpumask every time we need to select an idle CPU (which
> + * can be costly in large SMP systems), but it also aligns logically:
> + * if a task's scheduling domain is restricted by user-space (through
> + * CPU affinity), the task will simply use the flat scheduling domain
> + * defined by user-space.
> + */
> + if (p->nr_cpus_allowed >= num_possible_cpus())
> + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
> + llc_cpus = llc_span(prev_cpu);
> +
> + /*
> + * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
> + */
> + if (wake_flags & SCX_WAKE_SYNC) {
> + cpu = smp_processor_id();
> +
> + /*
> + * If the waker's CPU is cache affine and prev_cpu is idle,
> + * then avoid a migration.
> + */
> + if (cpus_share_cache(cpu, prev_cpu) &&
> + test_and_clear_cpu_idle(prev_cpu)) {
> + cpu = prev_cpu;
> + goto cpu_found;
> + }
> +
> + /*
> + * If the waker's local DSQ is empty, and the system is under
> + * utilized, try to wake up @p to the local DSQ of the waker.
> + *
> + * Checking only for an empty local DSQ is insufficient as it
> + * could give the wakee an unfair advantage when the system is
> + * oversaturated.
> + *
> + * Checking only for the presence of idle CPUs is also
> + * insufficient as the local DSQ of the waker could have tasks
> + * piled up on it even if there is an idle core elsewhere on
> + * the system.
> + */
> + if (!(current->flags & PF_EXITING) &&
> + cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
> + !cpumask_empty(get_idle_cpumask_node(cpu_to_node(cpu)))) {
> + if (cpumask_test_cpu(cpu, p->cpus_ptr))
> + goto cpu_found;
> + }
> + }
> +
> + /*
> + * If CPU has SMT, any wholly idle CPU is likely a better pick than
> + * partially idle @prev_cpu.
> + */
> + if (sched_smt_active()) {
> + /*
> + * Keep using @prev_cpu if it's part of a fully idle core.
> + */
> + if (cpumask_test_cpu(prev_cpu, get_idle_smtmask_node(node)) &&
> + test_and_clear_cpu_idle(prev_cpu)) {
> + cpu = prev_cpu;
> + goto cpu_found;
> + }
> +
> + /*
> + * Search for any fully idle core in the same LLC domain.
> + */
> + if (llc_cpus) {
> + cpu = scx_pick_idle_cpu_from_node(node, llc_cpus, SCX_PICK_IDLE_CORE);
> + if (cpu >= 0)
> + goto cpu_found;
> + }
> +
> + /*
> + * Search for any full idle core usable by the task.
> + */
> + cpu = scx_pick_idle_cpu(p->cpus_ptr, prev_cpu, SCX_PICK_IDLE_CORE);
> + if (cpu >= 0)
> + goto cpu_found;
> + }
> +
> + /*
> + * Use @prev_cpu if it's idle.
> + */
> + if (test_and_clear_cpu_idle(prev_cpu)) {
> + cpu = prev_cpu;
> + goto cpu_found;
> + }
> +
> + /*
> + * Search for any idle CPU in the same LLC domain.
> + */
> + if (llc_cpus) {
> + cpu = scx_pick_idle_cpu_from_node(node, llc_cpus, 0);
> + if (cpu >= 0)
> + goto cpu_found;
> + }
> +
> + /*
> + * Search for any idle CPU usable by the task.
> + */
> + cpu = scx_pick_idle_cpu(p->cpus_ptr, prev_cpu, 0);
> + if (cpu >= 0)
> + goto cpu_found;
> +
> + rcu_read_unlock();
> + return prev_cpu;
> +
> +cpu_found:
> + rcu_read_unlock();
> +
> + *found = true;
> + return cpu;
> +}
> +
> +static void reset_idle_masks(void)
> +{
> + int node;
> +
> + if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
> + cpumask_copy(get_idle_cpumask_node(0), cpu_online_mask);
> + cpumask_copy(get_idle_smtmask_node(0), cpu_online_mask);
> + return;
> + }
> +
> + /*
> + * Consider all online cpus idle. Should converge to the actual state
> + * quickly.
> + */
> + for_each_node_state(node, N_POSSIBLE) {
> + const struct cpumask *node_mask = cpumask_of_node(node);
> + struct cpumask *idle_cpu = get_idle_cpumask_node(node);
> + struct cpumask *idle_smt = get_idle_smtmask_node(node);
> +
> + cpumask_and(idle_cpu, cpu_online_mask, node_mask);
> + cpumask_copy(idle_smt, idle_cpu);
> + }
> +}
> +
> +void __scx_update_idle(struct rq *rq, bool idle)
> +{
> + int cpu = cpu_of(rq);
> + int node = cpu_to_node(cpu);
> + struct cpumask *idle_cpu = get_idle_cpumask_node(node);
> +
> + if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
> + SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
> + if (!static_branch_unlikely(&scx_builtin_idle_enabled))
> + return;
> + }
> +
> + assign_cpu(cpu, idle_cpu, idle);
> +
> +#ifdef CONFIG_SCHED_SMT
> + if (sched_smt_active()) {
> + const struct cpumask *smt = cpu_smt_mask(cpu);
> + struct cpumask *idle_smt = get_idle_smtmask_node(node);
> +
> + if (idle) {
> + /*
> + * idle_smt handling is racy but that's fine as it's
> + * only for optimization and self-correcting.
> + */
> + for_each_cpu(cpu, smt) {
> + if (!cpumask_test_cpu(cpu, idle_cpu))
> + return;
> + }
> + cpumask_or(idle_smt, idle_smt, smt);
> + } else {
> + cpumask_andnot(idle_smt, idle_smt, smt);
> + }
> + }
> +#endif
> +}
> +#else /* !CONFIG_SMP */
> +static inline int validate_node(int node)
> +{
> + return numa_node_id();
> +}
> +
> +static struct cpumask *get_idle_cpumask_node(int node)
> +{
> + return cpu_none_mask;
> +}
> +
> +static struct cpumask *get_idle_smtmask_node(int node)
> +{
> + return cpu_none_mask;
> +}
> +
> +static bool test_and_clear_cpu_idle(int cpu) { return false; }
> +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, s32 prev_cpu, u64 flags)
> +{
> + return -EBUSY;
> +}
> +
> +static s32 scx_pick_idle_cpu_from_node(int node, const struct cpumask *cpus_allowed, u64 flags)
> +{
> + return -EBUSY;
> +}
> +static void reset_idle_masks(void) {}
> +#endif /* CONFIG_SMP */
> +
> +
> +/********************************************************************************
> + * Helpers that can be called from the BPF scheduler.
> + */
> +__bpf_kfunc_start_defs();
> +
> +/**
> + * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
> + * @p: task_struct to select a CPU for
> + * @prev_cpu: CPU @p was on previously
> + * @wake_flags: %SCX_WAKE_* flags
> + * @is_idle: out parameter indicating whether the returned CPU is idle
> + *
> + * Can only be called from ops.select_cpu() if the built-in CPU selection is
> + * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
> + * @p, @prev_cpu and @wake_flags match ops.select_cpu().
> + *
> + * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
> + * currently idle and thus a good candidate for direct dispatching.
> + */
> +__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
> + u64 wake_flags, bool *is_idle)
> +{
> + if (!check_builtin_idle_enabled())
> + goto prev_cpu;
> +
> + if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
> + goto prev_cpu;
> +
> +#ifdef CONFIG_SMP
> + return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
> +#endif
> +
> +prev_cpu:
> + *is_idle = false;
> + return prev_cpu;
> +}
> +
> +/**
> + * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the idle-tracking
> + * per-CPU cpumask of a target NUMA node.
> + *
> + * Returns an empty cpumask if idle tracking is not enabled, if @node is not
> + * valid, or running on a UP kernel.
> + */
> +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
> +{
> + if (!check_builtin_idle_per_node_enabled())
> + return cpu_none_mask;
> +
> + return get_idle_cpumask_node(node);
> +}
> +/**
> + * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
> + * per-CPU cpumask of the current NUMA node.
> + *
> + * Returns an emtpy cpumask if idle tracking is not enabled, or running on a UP
> + * kernel.
> + */
> +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
> +{
> + if (!check_builtin_idle_enabled())
> + return cpu_none_mask;
> +
> + return get_idle_cpumask_node(NUMA_NO_NODE);
> +}
> +
> +/**
> + * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the idle-tracking,
> + * per-physical-core cpumask of a target NUMA node. Can be used to determine
> + * if an entire physical core is free.
> + *
> + * Returns an empty cpumask if idle tracking is not enabled, if @node is not
> + * valid, or running on a UP kernel.
> + */
> +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
> +{
> + if (!check_builtin_idle_per_node_enabled())
> + return cpu_none_mask;
> +
> + return get_idle_smtmask_node(node);
> +}
> +
> +/**
> + * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
> + * per-physical-core cpumask of the current NUMA node. Can be used to determine
> + * if an entire physical core is free.
> + *
> + * Returns an empty cumask if idle tracking is not enabled, or running on a UP
> + * kernel.
> + */
> +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
> +{
> + if (!check_builtin_idle_enabled())
> + return cpu_none_mask;
> +
> + return get_idle_smtmask_node(NUMA_NO_NODE);
> +}
> +
> +/**
> + * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
> + * either the percpu, or SMT idle-tracking cpumask.
> + */
> +__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
> +{
> + /*
> + * Empty function body because we aren't actually acquiring or releasing
> + * a reference to a global idle cpumask, which is read-only in the
> + * caller and is never released. The acquire / release semantics here
> + * are just used to make the cpumask a trusted pointer in the caller.
> + */
> +}
> +
> +/**
> + * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
> + * @cpu: cpu to test and clear idle for
> + *
> + * Returns %true if @cpu was idle and its idle state was successfully cleared.
> + * %false otherwise.
> + *
> + * Unavailable if ops.update_idle() is implemented and
> + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
> + */
> +__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
> +{
> + if (!check_builtin_idle_enabled())
> + return false;
> +
> + if (ops_cpu_valid(cpu, NULL))
> + return test_and_clear_cpu_idle(cpu);
> + else
> + return false;
> +}
> +
> +/**
> + * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from a NUMA node
> + * @node: target NUMA node
> + * @cpus_allowed: Allowed cpumask
> + * @flags: %SCX_PICK_IDLE_CPU_* flags
> + *
> + * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node.
> + * Returns the picked idle cpu number on success. -%EBUSY if no matching cpu
> + * was found.
> + *
> + * Unavailable if ops.update_idle() is implemented and
> + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set or if %SCX_OPS_KEEP_BUILTIN_IDLE is
> + * not set.
> + */
> +__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(int node, const struct cpumask *cpus_allowed,
> + u64 flags)
> +{
> + node = validate_node(node);
> + if (node < 0)
> + return node;
> +
> + if (!check_builtin_idle_per_node_enabled())
> + return -EBUSY;
> +
> + return scx_pick_idle_cpu_from_node(node, cpus_allowed, flags);
> +}
> +
> +/**
> + * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
> + * @cpus_allowed: Allowed cpumask
> + * @flags: %SCX_PICK_IDLE_CPU_* flags
> + *
> + * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
> + * number on success. -%EBUSY if no matching cpu was found.
> + *
> + * Idle CPU tracking may race against CPU scheduling state transitions. For
> + * example, this function may return -%EBUSY as CPUs are transitioning into the
> + * idle state. If the caller then assumes that there will be dispatch events on
> + * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
> + * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
> + * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
> + * event in the near future.
> + *
> + * Unavailable if ops.update_idle() is implemented and
> + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
> + */
> +__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
> + u64 flags)
> +{
> + if (!check_builtin_idle_enabled())
> + return -EBUSY;
> +
> + return scx_pick_idle_cpu(cpus_allowed, smp_processor_id(), flags);
> +}
> +
> +/**
> + * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
> + * @cpus_allowed: Allowed cpumask
> + * @flags: %SCX_PICK_IDLE_CPU_* flags
> + *
> + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
> + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
> + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
> + * empty.
> + *
> + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
> + * set, this function can't tell which CPUs are idle and will always pick any
> + * CPU.
> + */
> +__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
> + u64 flags)
> +{
> + s32 cpu;
> +
> + if (static_branch_likely(&scx_builtin_idle_enabled)) {
> + cpu = scx_pick_idle_cpu(cpus_allowed, smp_processor_id(), flags);
> + if (cpu >= 0)
> + return cpu;
> + }
> +
> + cpu = cpumask_any_distribute(cpus_allowed);
> + if (cpu < nr_cpu_ids)
> + return cpu;
> + else
> + return -EBUSY;
> +}
> +
> +__bpf_kfunc_end_defs();
> +
> +BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
> +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
> +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
> +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE)
> +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
> +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE)
> +BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
> +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
> +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
> +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
> +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
> +BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
> +
> +static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
> + .owner = THIS_MODULE,
> + .set = &scx_kfunc_ids_select_cpu,
> +};
> --
> 2.47.1