[PATCH 2/2] sched/numa: prctl to set/override task's numa_preferred_nid
From: Chris Hyser
Date: Mon Apr 14 2025 - 21:36:56 EST
From: chris hyser <chris.hyser@xxxxxxxxxx>
Add a simple prctl() interface for setting or reading a task's
numa_preferred_nid. Once set, this value overrides any value chosen
by auto NUMA balancing.
Signed-off-by: Chris Hyser <chris.hyser@xxxxxxxxxx>
---
.../scheduler/sched-preferred-node.rst | 67 +++++++++++++++++++
include/linux/sched.h | 9 +++
include/uapi/linux/prctl.h | 8 +++
kernel/sched/fair.c | 64 ++++++++++++++++++
kernel/sys.c | 5 ++
tools/include/uapi/linux/prctl.h | 6 ++
6 files changed, 159 insertions(+)
create mode 100644 Documentation/scheduler/sched-preferred-node.rst
diff --git a/Documentation/scheduler/sched-preferred-node.rst b/Documentation/scheduler/sched-preferred-node.rst
new file mode 100644
index 000000000000..753fd0b20993
--- /dev/null
+++ b/Documentation/scheduler/sched-preferred-node.rst
@@ -0,0 +1,67 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Prctl for Explicitly Setting Task's Preferred Node
+####################################################
+
+This feature is an addition to Auto NUMA Balancing. By default, Auto NUMA
+Balancing scans a task's address space and removes address translations so that
+subsequent faults indicate the node from which memory is predominantly being
+accessed. The task's numa_preferred_nid is then set to that node's ID.
+
+The numa_preferred_nid is used to both consolidate physical pages and assist the
+scheduler in making NUMA friendly load balancing decisions.
+
+While quite useful for some workloads, this has two issues that this prctl() can
+help solve:
+
+- There is a trade-off between faulting overhead and the ability to detect
+  dynamic access patterns. In cases where the task or user understands the NUMA
+  sensitivities, this prctl() provides the benefits of a preferred node, used
+  either in conjunction with Auto NUMA Balancing's default parameters or with
+  the NUMA balancing parameters adjusted to reduce the faulting rate
+  (potentially to 0).
+
+- Memory pinned to nodes or to physical addresses, such as for RDMA, cannot be
+  migrated and has thus far been excluded from the scanning. Not taking those
+  faults, however, can prevent Auto NUMA Balancing from reliably detecting a
+  node preference, leaving the scheduler load balancer possibly operating with
+  incorrect NUMA information.
+
+
+Usage
+*******
+
+ Note: Auto NUMA Balancing must be enabled to get the effects.
+
+ #include <sys/prctl.h>
+
+ int prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5);
+
+option:
+ ``PR_PREFERRED_NID``
+
+arg2:
+ Command for operation, must be one of:
+
+ - ``PR_PREFERRED_NID_GET`` -- get the forced preferred node ID for ``pid``.
+ - ``PR_PREFERRED_NID_SET`` -- set the forced preferred node ID for ``pid``.
+
+ Returns ERANGE for an illegal command.
+
+arg3:
+ ``pid`` of the task for which the operation applies. ``0`` implies current.
+
+ Returns ESRCH if ``pid`` is not found.
+
+arg4:
+ ``node_id`` for PR_PREFERRED_NID_SET. Between ``-1`` and ``num_possible_nodes()``.
+ ``-1`` indicates no preference.
+
+ Returns EINVAL for an illegal ``node_id``.
+
+arg5:
+ Userspace pointer to an integer that receives the node ID for
+ ``PR_PREFERRED_NID_GET``; it must be 4-byte aligned, otherwise EINVAL is
+ returned. Should be 0 for all other commands.
+
+The caller must have ``PTRACE_MODE_READ_REALCREDS`` ptrace access to the target
+task to get or set its preferred node ID; otherwise EPERM is returned.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 373046c82b35..8054fd37acdc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2261,6 +2261,15 @@ static inline void sched_core_fork(struct task_struct *p) { }
static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
#endif
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Get or set a task's forced numa_preferred_nid (PR_PREFERRED_NID prctl).
+ * The parameters follow the prctl() argument order: command, pid of the
+ * target task, node ID, and the user address used to return the node ID.
+ */
+int prctl_chg_pref_nid(unsigned long cmd, pid_t pid, int nid,
+		       unsigned long uaddr);
+#else
+static inline int prctl_chg_pref_nid(unsigned long cmd, pid_t pid, int nid,
+				     unsigned long uaddr) { return -ERANGE; }
+#endif
+
extern void sched_set_stop_task(int cpu, struct task_struct *stop);
#ifdef CONFIG_MEM_ALLOC_PROFILING
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 15c18ef4eb11..e8a47777aeb2 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -364,4 +364,12 @@ struct prctl_mm_map {
# define PR_TIMER_CREATE_RESTORE_IDS_ON 1
# define PR_TIMER_CREATE_RESTORE_IDS_GET 2
+/*
+ * Get or set the preferred NUMA node ID forced on a task. Once set, it
+ * overrides the numa_preferred_nid chosen by automatic NUMA balancing.
+ */
+#define PR_PREFERRED_NID 78
+# define PR_PREFERRED_NID_GET 0 /* read the forced node ID */
+# define PR_PREFERRED_NID_SET 1 /* set the forced node ID */
+# define PR_PREFERRED_NID_CMD_MAX 2 /* commands must be below this value */
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79d3d0840fb2..7afff9fa3922 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -49,6 +49,7 @@
#include <linux/ratelimit.h>
#include <linux/task_work.h>
#include <linux/rbtree_augmented.h>
+#include <linux/prctl.h>
#include <asm/switch_to.h>
@@ -3670,6 +3671,69 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
p->numa_scan_period = task_scan_start(p);
}
+/*
+ * prctl_chg_pref_nid - get or set a task's forced preferred NUMA node
+ * @cmd:   PR_PREFERRED_NID_GET or PR_PREFERRED_NID_SET
+ * @pid:   pid of the target task, looked up with find_task_by_vpid();
+ *         0 selects current
+ * @nid:   node ID to force for PR_PREFERRED_NID_SET, or NUMA_NO_NODE (-1)
+ *         for no preference; ignored for PR_PREFERRED_NID_GET
+ * @uaddr: user address of an int that receives the forced node ID for
+ *         PR_PREFERRED_NID_GET; must be int-aligned
+ *
+ * Return: 0 on success, -ERANGE for an unknown @cmd, -ESRCH if @pid is not
+ * found, -EPERM if the ptrace access check fails, -EINVAL for a misaligned
+ * @uaddr or an invalid @nid, or the error returned by put_user().
+ */
+int prctl_chg_pref_nid(unsigned long cmd, pid_t pid, int nid,
+		       unsigned long uaddr)
+{
+	struct task_struct *task;
+	struct rq_flags rf;
+	struct rq *rq;
+	int err = 0;
+
+	if (cmd >= PR_PREFERRED_NID_CMD_MAX)
+		return -ERANGE;
+
+	/* Pin the task with a reference before leaving the RCU section. */
+	rcu_read_lock();
+	task = pid ? find_task_by_vpid(pid) : current;
+	if (!task) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+	get_task_struct(task);
+	rcu_read_unlock();
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. Use the regular "ptrace_may_access()" checks.
+	 */
+	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	switch (cmd) {
+	case PR_PREFERRED_NID_GET:
+		/* The value is written as an int; reject misaligned addresses. */
+		if (uaddr & (sizeof(int) - 1)) {
+			err = -EINVAL;
+			goto out;
+		}
+		err = put_user(task->numa_preferred_nid_force,
+			       (int __user *)uaddr);
+		break;
+
+	case PR_PREFERRED_NID_SET:
+		/*
+		 * Node IDs may be sparse, so the number of possible nodes is
+		 * not an upper bound on valid IDs. Accept NUMA_NO_NODE (no
+		 * preference) or an ID that is set in the possible-node map.
+		 * The unsigned compare also rejects negative IDs.
+		 */
+		if (nid != NUMA_NO_NODE &&
+		    ((unsigned int)nid >= nr_node_ids || !node_possible(nid))) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		rq = task_rq_lock(task, &rf);
+		task->numa_preferred_nid_force = nid;
+		task_rq_unlock(rq, task, &rf);
+		sched_setnuma(task, nid);
+		break;
+	}
+
+out:
+	put_task_struct(task);
+	return err;
+}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
diff --git a/kernel/sys.c b/kernel/sys.c
index c434968e9f5d..20629a3267b1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2746,6 +2746,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SCHED_CORE:
error = sched_core_share_pid(arg2, arg3, arg4, arg5);
break;
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ case PR_PREFERRED_NID:
+ error = prctl_chg_pref_nid(arg2, arg3, arg4, arg5);
+ break;
#endif
case PR_SET_MDWE:
error = prctl_set_mdwe(arg2, arg3, arg4, arg5);
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index 35791791a879..937160e3a77a 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -328,4 +328,10 @@ struct prctl_mm_map {
# define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */
# define PR_PPC_DEXCR_CTRL_MASK 0x1f
+/*
+ * Get or set the preferred NUMA node ID forced on a task. Once set, it
+ * overrides the numa_preferred_nid chosen by automatic NUMA balancing.
+ */
+#define PR_PREFERRED_NID 78
+# define PR_PREFERRED_NID_GET 0 /* read the forced node ID */
+# define PR_PREFERRED_NID_SET 1 /* set the forced node ID */
+# define PR_PREFERRED_NID_CMD_MAX 2 /* commands must be below this value */
#endif /* _LINUX_PRCTL_H */
--
2.43.5