[PATCH 2/3] sched_ext: Introduce per-NUMA idle cpumasks
From: Andrea Righi
Date: Tue Dec 03 2024 - 10:56:13 EST
Using a single global idle cpumask can lead to inefficiencies and put
significant stress on the cache coherency protocol on large systems
with multiple NUMA nodes, since all CPUs generate intense read/write
activity on the same shared cpumask.
Therefore, split the global cpumask into multiple per-NUMA node cpumasks
to improve scalability and performance on large systems.
The concept is that each cpumask will track only the idle CPUs within
its corresponding NUMA node, treating CPUs in other NUMA nodes as busy.
In this way, concurrent access to the idle cpumasks is confined within
each NUMA node.
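In practice, looking up the idle state becomes a per-node indirection,
roughly as in the sketch below (simplified from the diff further down,
which contains the actual accessors):

static struct {
	cpumask_var_t cpu;
	cpumask_var_t smt;
} **idle_masks;

static struct cpumask *get_idle_cpumask(int cpu)
{
	/* Only the idle cpumask of @cpu's own NUMA node is touched. */
	return idle_masks[cpu_to_node(cpu)]->cpu;
}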
[Open issue]
The scx_bpf_get_idle_cpu/smtmask() kfuncs, which are supposed to return
a single cpumask covering all the CPUs, have been changed to return only
the cpumask of the current NUMA node (determined from the current CPU);
this breaks the old behavior and can potentially introduce regressions
in some scx schedulers.
An alternative approach could be to construct a global cpumask on the
fly, but this could add significant overhead to ops.select_cpu() for
schedulers relying on these kfuncs. It would also be less reliable than
accessing the actual cpumasks, as the copy could quickly go out of sync
and misrepresent the actual idle state.
Probably a better way to solve this issue is to introduce new kfuncs
that explicitly select a specific per-NUMA cpumask and convert the scx
schedulers to this new API, for example:
const struct cpumask *scx_bpf_get_idle_numa_cpumask(int node)
const struct cpumask *scx_bpf_get_idle_numa_smtmask(int node)
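With such an API, a scheduler could for instance check whether a given
node has any idle CPU, roughly as in the hypothetical BPF-side sketch
below (only scx_bpf_get_idle_numa_cpumask() is new; bpf_cpumask_empty()
and scx_bpf_put_cpumask() are existing kfuncs):

static bool node_has_idle_cpu(int node)
{
	const struct cpumask *idle;
	bool ret;

	/* Assume the proposed kfunc returns NULL if idle tracking is off. */
	idle = scx_bpf_get_idle_numa_cpumask(node);
	if (!idle)
		return false;

	ret = !bpf_cpumask_empty(idle);
	scx_bpf_put_cpumask(idle);

	return ret;
}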
Signed-off-by: Andrea Righi <arighi@xxxxxxxxxx>
---
kernel/sched/ext.c | 159 ++++++++++++++++++++++++++++++++-------------
1 file changed, 114 insertions(+), 45 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 3c4a94e4258f..cff4210e9c7b 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -933,7 +933,37 @@ static struct delayed_work scx_watchdog_work;
static struct {
cpumask_var_t cpu;
cpumask_var_t smt;
-} idle_masks CL_ALIGNED_IF_ONSTACK;
+} **idle_masks CL_ALIGNED_IF_ONSTACK;
+
+static struct cpumask *get_idle_cpumask(int cpu)
+{
+ int node = cpu_to_node(cpu);
+
+ return idle_masks[node]->cpu;
+}
+
+static struct cpumask *get_idle_smtmask(int cpu)
+{
+ int node = cpu_to_node(cpu);
+
+ return idle_masks[node]->smt;
+}
+
+static void idle_masks_init(void)
+{
+ int node;
+
+ idle_masks = kcalloc(num_possible_nodes(), sizeof(*idle_masks), GFP_KERNEL);
+ BUG_ON(!idle_masks);
+
+ for_each_node_state(node, N_POSSIBLE) {
+ idle_masks[node] = kzalloc_node(sizeof(**idle_masks), GFP_KERNEL, node);
+ BUG_ON(!idle_masks[node]);
+
+ BUG_ON(!alloc_cpumask_var_node(&idle_masks[node]->cpu, GFP_KERNEL, node));
+ BUG_ON(!alloc_cpumask_var_node(&idle_masks[node]->smt, GFP_KERNEL, node));
+ }
+}
#endif /* CONFIG_SMP */
@@ -3156,29 +3186,34 @@ static bool test_and_clear_cpu_idle(int cpu)
*/
if (sched_smt_active()) {
const struct cpumask *smt = cpu_smt_mask(cpu);
+ struct cpumask *idle_smt = get_idle_smtmask(cpu);
/*
* If offline, @cpu is not its own sibling and
* scx_pick_idle_cpu() can get caught in an infinite loop as
- * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
- * is eventually cleared.
+ * @cpu is never cleared from the idle SMT mask. Ensure that
+ * @cpu is eventually cleared.
+ *
+ * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
+ * reduce memory writes, which may help alleviate cache
+ * coherence pressure.
*/
- if (cpumask_intersects(smt, idle_masks.smt))
- cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
- else if (cpumask_test_cpu(cpu, idle_masks.smt))
- __cpumask_clear_cpu(cpu, idle_masks.smt);
+ if (cpumask_intersects(smt, idle_smt))
+ cpumask_andnot(idle_smt, idle_smt, smt);
+ else if (cpumask_test_cpu(cpu, idle_smt))
+ __cpumask_clear_cpu(cpu, idle_smt);
}
#endif
- return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
+ return cpumask_test_and_clear_cpu(cpu, get_idle_cpumask(cpu));
}
-static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
+static s32 scx_pick_idle_cpu_from_node(int node, const struct cpumask *cpus_allowed, u64 flags)
{
int cpu;
retry:
if (sched_smt_active()) {
- cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed);
+ cpu = cpumask_any_and_distribute(idle_masks[node]->smt, cpus_allowed);
if (cpu < nr_cpu_ids)
goto found;
@@ -3186,15 +3221,42 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
return -EBUSY;
}
- cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed);
- if (cpu >= nr_cpu_ids)
- return -EBUSY;
+ cpu = cpumask_any_and_distribute(idle_masks[node]->cpu, cpus_allowed);
+ if (cpu < nr_cpu_ids)
+ goto found;
+
+ return -EBUSY;
found:
if (test_and_clear_cpu_idle(cpu))
return cpu;
else
goto retry;
+}
+
+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, s32 prev_cpu, u64 flags)
+{
+ int start = cpu_to_node(prev_cpu);
+ int node, cpu;
+
+ for_each_node_state_wrap(node, N_ONLINE, start) {
+ /*
+ * scx_pick_idle_cpu_from_node() can be expensive and redundant
+ * if none of the CPUs in the NUMA node can be used (according
+ * to cpus_allowed).
+ *
+ * Therefore, check if the NUMA node is usable in advance to
+ * save some CPU cycles.
+ */
+ if (!cpumask_intersects(cpumask_of_node(node), cpus_allowed))
+ continue;
+ cpu = scx_pick_idle_cpu_from_node(node, cpus_allowed, flags);
+ if (cpu >= 0)
+ return cpu;
+ }
+
+ return -EBUSY;
}
/*
@@ -3338,11 +3400,11 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
{
const struct cpumask *llc_cpus = NULL;
const struct cpumask *numa_cpus = NULL;
+ int node = cpu_to_node(prev_cpu);
s32 cpu;
*found = false;
-
/*
* This is necessary to protect llc_cpus.
*/
@@ -3361,7 +3423,7 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
*/
if (p->nr_cpus_allowed >= num_possible_cpus()) {
if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
- numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu));
+ numa_cpus = p->cpus_ptr;
if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
struct sched_domain *sd;
@@ -3401,9 +3463,9 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
* piled up on it even if there is an idle core elsewhere on
* the system.
*/
- if (!cpumask_empty(idle_masks.cpu) &&
- !(current->flags & PF_EXITING) &&
- cpu_rq(cpu)->scx.local_dsq.nr == 0) {
+ if (!(current->flags & PF_EXITING) &&
+ cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
+ !cpumask_empty(get_idle_cpumask(cpu))) {
if (cpumask_test_cpu(cpu, p->cpus_ptr))
goto cpu_found;
}
@@ -3417,7 +3479,7 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
/*
* Keep using @prev_cpu if it's part of a fully idle core.
*/
- if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
+ if (cpumask_test_cpu(prev_cpu, get_idle_smtmask(prev_cpu)) &&
test_and_clear_cpu_idle(prev_cpu)) {
cpu = prev_cpu;
goto cpu_found;
@@ -3427,7 +3489,7 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
* Search for any fully idle core in the same LLC domain.
*/
if (llc_cpus) {
- cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE);
+ cpu = scx_pick_idle_cpu_from_node(node, llc_cpus, SCX_PICK_IDLE_CORE);
if (cpu >= 0)
goto cpu_found;
}
@@ -3436,7 +3498,7 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
* Search for any fully idle core in the same NUMA node.
*/
if (numa_cpus) {
- cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
+ cpu = scx_pick_idle_cpu_from_node(node, numa_cpus, SCX_PICK_IDLE_CORE);
if (cpu >= 0)
goto cpu_found;
}
@@ -3444,7 +3506,7 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
/*
* Search for any full idle core usable by the task.
*/
- cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
+ cpu = scx_pick_idle_cpu(p->cpus_ptr, prev_cpu, SCX_PICK_IDLE_CORE);
if (cpu >= 0)
goto cpu_found;
}
@@ -3461,7 +3523,7 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
* Search for any idle CPU in the same LLC domain.
*/
if (llc_cpus) {
- cpu = scx_pick_idle_cpu(llc_cpus, 0);
+ cpu = scx_pick_idle_cpu_from_node(node, llc_cpus, 0);
if (cpu >= 0)
goto cpu_found;
}
@@ -3470,7 +3532,7 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
* Search for any idle CPU in the same NUMA node.
*/
if (numa_cpus) {
- cpu = scx_pick_idle_cpu(numa_cpus, 0);
+ cpu = scx_pick_idle_cpu_from_node(node, numa_cpus, 0);
if (cpu >= 0)
goto cpu_found;
}
@@ -3478,7 +3540,7 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
/*
* Search for any idle CPU usable by the task.
*/
- cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);
+ cpu = scx_pick_idle_cpu(p->cpus_ptr, prev_cpu, 0);
if (cpu >= 0)
goto cpu_found;
@@ -3560,12 +3622,18 @@ static void set_cpus_allowed_scx(struct task_struct *p,
static void reset_idle_masks(void)
{
+ int node;
+
/*
* Consider all online cpus idle. Should converge to the actual state
* quickly.
*/
- cpumask_copy(idle_masks.cpu, cpu_online_mask);
- cpumask_copy(idle_masks.smt, cpu_online_mask);
+ for_each_node_state(node, N_POSSIBLE) {
+ const struct cpumask *node_mask = cpumask_of_node(node);
+
+ cpumask_and(idle_masks[node]->cpu, cpu_online_mask, node_mask);
+ cpumask_copy(idle_masks[node]->smt, idle_masks[node]->cpu);
+ }
}
void __scx_update_idle(struct rq *rq, bool idle)
@@ -3578,14 +3646,13 @@ void __scx_update_idle(struct rq *rq, bool idle)
return;
}
- if (idle)
- cpumask_set_cpu(cpu, idle_masks.cpu);
- else
- cpumask_clear_cpu(cpu, idle_masks.cpu);
+ assign_cpu(cpu, get_idle_cpumask(cpu), idle);
#ifdef CONFIG_SCHED_SMT
if (sched_smt_active()) {
const struct cpumask *smt = cpu_smt_mask(cpu);
+ struct cpumask *idle_cpu = get_idle_cpumask(cpu);
+ struct cpumask *idle_smt = get_idle_smtmask(cpu);
if (idle) {
/*
@@ -3593,12 +3660,12 @@ void __scx_update_idle(struct rq *rq, bool idle)
* it's only for optimization and self-correcting.
*/
for_each_cpu(cpu, smt) {
- if (!cpumask_test_cpu(cpu, idle_masks.cpu))
+ if (!cpumask_test_cpu(cpu, idle_cpu))
return;
}
- cpumask_or(idle_masks.smt, idle_masks.smt, smt);
+ cpumask_or(idle_smt, idle_smt, smt);
} else {
- cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
+ cpumask_andnot(idle_smt, idle_smt, smt);
}
}
#endif
@@ -3646,7 +3713,10 @@ static void rq_offline_scx(struct rq *rq)
#else /* CONFIG_SMP */
static bool test_and_clear_cpu_idle(int cpu) { return false; }
-static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; }
+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, s32 prev_cpu, u64 flags)
+{
+ return -EBUSY;
+}
static void reset_idle_masks(void) {}
#endif /* CONFIG_SMP */
@@ -6174,8 +6244,7 @@ void __init init_sched_ext_class(void)
BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
#ifdef CONFIG_SMP
- BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
- BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
+ idle_masks_init();
#endif
scx_kick_cpus_pnt_seqs =
__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
@@ -7321,7 +7390,7 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
/**
* scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
- * per-CPU cpumask.
+ * per-CPU cpumask of the current NUMA node.
*
* Returns NULL if idle tracking is not enabled, or running on a UP kernel.
*/
@@ -7333,7 +7402,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
}
#ifdef CONFIG_SMP
- return idle_masks.cpu;
+ return get_idle_cpumask(smp_processor_id());
#else
return cpu_none_mask;
#endif
@@ -7341,8 +7410,8 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
/**
* scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
- * per-physical-core cpumask. Can be used to determine if an entire physical
- * core is free.
+ * per-physical-core cpumask of the current NUMA node. Can be used to determine
+ * if an entire physical core is free.
*
* Returns NULL if idle tracking is not enabled, or running on a UP kernel.
*/
@@ -7355,9 +7424,9 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
#ifdef CONFIG_SMP
if (sched_smt_active())
- return idle_masks.smt;
+ return get_idle_smtmask(smp_processor_id());
else
- return idle_masks.cpu;
+ return get_idle_cpumask(smp_processor_id());
#else
return cpu_none_mask;
#endif
@@ -7427,7 +7496,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
return -EBUSY;
}
- return scx_pick_idle_cpu(cpus_allowed, flags);
+ return scx_pick_idle_cpu(cpus_allowed, smp_processor_id(), flags);
}
/**
@@ -7450,7 +7519,7 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
s32 cpu;
if (static_branch_likely(&scx_builtin_idle_enabled)) {
- cpu = scx_pick_idle_cpu(cpus_allowed, flags);
+ cpu = scx_pick_idle_cpu(cpus_allowed, smp_processor_id(), flags);
if (cpu >= 0)
return cpu;
}
--
2.47.1