[PATCH 1/2] sched/numa: Add ability to override task's numa_preferred_nid.
From: Chris Hyser
Date: Mon Apr 14 2025 - 21:36:57 EST
From: chris hyser <chris.hyser@xxxxxxxxxx>
This patch allows a task's "Preferred Node Affinity" to be set directly,
and subsequently overridden, by setting the task's numa_preferred_nid and
relying on the existing NUMA balancing infrastructure.
NUMA balancing introduced the notion of tracking a task's preferred memory
node and using it both to migrate/consolidate the physical pages accessed
by the task and to assist the scheduler in making NUMA-aware placement and
load-balancing decisions.
The existing mechanism for determining this, Auto NUMA Balancing, relies
on periodic removal of virtual mappings for blocks of a task's address
space. The resulting faults can indicate a preference for an accessed
node.
This has two issues that this patch seeks to overcome:
- there is a trade-off between faulting overhead and the ability to detect
dynamic access patterns. In cases where the task or user understands the
NUMA sensitivities, this patch can provide the benefits of setting a
preferred node, either in conjunction with Auto NUMA Balancing's default
parameters or with the NUMA balancing parameters adjusted to reduce the
faulting rate (potentially to 0); see the usage sketch after this list.
- memory pinned to nodes or to physical addresses, such as for RDMA, cannot
be migrated and has thus far been excluded from the scanning. Not taking
those faults, however, can prevent Auto NUMA Balancing from reliably
detecting a node preference, leaving the scheduler load balancer to
operate with incorrect NUMA information.
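As an illustration only, here is a minimal userspace sketch of how the
override might be driven through the prctl() added in patch 2/2. The
PR_SET_PREFERRED_NID name and its value are placeholders assumed for this
sketch (the real command constant is defined by that patch); on a kernel
without the series the calls simply fail and report an error.

#include <stdio.h>
#include <sys/prctl.h>

/*
 * Placeholder only: the actual prctl command constant and its value come
 * from patch 2/2 and may differ from what is assumed here.
 */
#ifndef PR_SET_PREFERRED_NID
#define PR_SET_PREFERRED_NID    1000
#endif

int main(void)
{
        /* Request that node 1 be treated as this task's preferred node. */
        if (prctl(PR_SET_PREFERRED_NID, 1, 0, 0, 0))
                perror("prctl(PR_SET_PREFERRED_NID, 1)");

        /* ... run the NUMA-sensitive part of the workload ... */

        /*
         * Passing -1 (NUMA_NO_NODE) would clear the override and return
         * control of the preferred node to Auto NUMA Balancing.
         */
        if (prctl(PR_SET_PREFERRED_NID, -1, 0, 0, 0))
                perror("prctl(PR_SET_PREFERRED_NID, -1)");

        return 0;
}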
The following results are from TPC-C runs on an Oracle Database. The system
was a 2-node Intel machine with a database instance running on each node
using local memory allocations. No tasks or memory were pinned.
There are four scenarios of interest:
- Auto NUMA Balancing OFF.
    base value
- Auto NUMA Balancing ON.
    1.2% - ANB ON better than ANB OFF.
- Use the prctl(), ANB ON, parameters set to prevent faulting.
    2.4% - prctl() better than ANB OFF.
    1.2% - prctl() better than ANB ON.
- Use the prctl(), ANB parameters normal.
    3.1% - prctl() and ANB ON better than ANB OFF.
    1.9% - prctl() and ANB ON better than just ANB ON.
    0.7% - prctl() and ANB ON better than prctl() and ANB ON/faulting off.
In benchmarks pinning large regions of heavily accessed memory, the
advantage of the prctl() over Auto NUMA Balancing alone is significantly
higher.
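Assuming the debug.c hunk below is what exports the new field, the override
should be observable in /proc/<pid>/sched on kernels that provide that file
(CONFIG_SCHED_DEBUG) with CONFIG_NUMA_BALANCING enabled. A quick check from
userspace could look like the following sketch:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/self/sched", "r");

        if (!f) {
                perror("fopen(/proc/self/sched)");
                return 1;
        }

        /* Print the numa_preferred_nid and numa_preferred_nid_force lines. */
        while (fgets(line, sizeof(line), f)) {
                if (strstr(line, "numa_preferred_nid"))
                        fputs(line, stdout);
        }

        fclose(f);
        return 0;
}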
Signed-off-by: Chris Hyser <chris.hyser@xxxxxxxxxx>
---
include/linux/sched.h | 1 +
init/init_task.c | 1 +
kernel/sched/core.c | 5 ++++-
kernel/sched/debug.c | 1 +
kernel/sched/fair.c | 15 +++++++++++++--
5 files changed, 20 insertions(+), 3 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f96ac1982893..373046c82b35 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1350,6 +1350,7 @@ struct task_struct {
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
+ int numa_preferred_nid_force;
int numa_scan_seq;
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
diff --git a/init/init_task.c b/init/init_task.c
index e557f622bd90..1921a87326db 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -184,6 +184,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
.vtime.state = VTIME_SYS,
#endif
#ifdef CONFIG_NUMA_BALANCING
+ .numa_preferred_nid_force = NUMA_NO_NODE,
.numa_preferred_nid = NUMA_NO_NODE,
.numa_group = NULL,
.numa_faults = NULL,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 79692f85643f..7d1532f35d15 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7980,7 +7980,10 @@ void sched_setnuma(struct task_struct *p, int nid)
if (running)
put_prev_task(rq, p);
- p->numa_preferred_nid = nid;
+ if (p->numa_preferred_nid_force != NUMA_NO_NODE)
+ p->numa_preferred_nid = p->numa_preferred_nid_force;
+ else
+ p->numa_preferred_nid = nid;
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 56ae54e0ce6a..4cba21f5d24d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1154,6 +1154,7 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
P(mm->numa_scan_seq);
P(numa_pages_migrated);
+ P(numa_preferred_nid_force);
P(numa_preferred_nid);
P(total_numa_faults);
SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0c19459c8042..79d3d0840fb2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2642,9 +2642,15 @@ static void numa_migrate_preferred(struct task_struct *p)
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE))
return;
+ /* Execute rest of function if forced PNID */
+ if (p->numa_preferred_nid_force == NUMA_NO_NODE) {
+ if (unlikely(!p->numa_faults))
+ return;
+ }
+
/* Periodically retry migrating the task to the preferred node */
interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
p->numa_migrate_retry = jiffies + interval;
@@ -3578,6 +3584,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
/* New address space, reset the preferred nid */
if (!(clone_flags & CLONE_VM)) {
+ p->numa_preferred_nid_force = NUMA_NO_NODE;
p->numa_preferred_nid = NUMA_NO_NODE;
return;
}
@@ -9301,7 +9308,11 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (!static_branch_likely(&sched_numa_balancing))
return 0;
- if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+ /* Execute rest of function if forced PNID */
+ if (p->numa_preferred_nid_force == NUMA_NO_NODE && !p->numa_faults)
+ return 0;
+
+ if (!(env->sd->flags & SD_NUMA))
return 0;
src_nid = cpu_to_node(env->src_cpu);
--
2.43.5