[PATCH v3] sched/core: Use empty mask to reset cpumasks in sched_setaffinity()

From: Waiman Long
Date: Thu Aug 03 2023 - 22:33:22 EST


Since commit 8f9ea86fdf99 ("sched: Always preserve the user requested
cpumask"), user provided CPU affinity via sched_setaffinity(2) is
preserved even if the task is being moved to a different cpuset. However,
that affinity is also being inherited by any subsequently created child
processes which may not want or be aware of that affinity.

One way to solve this problem is to provide a way to back off from
that user provided CPU affinity. This patch implements such a scheme
by using an empty cpumask to signal a reset of the cpumasks to the
default as allowed by the current cpuset.

Before this patch, passing in an empty cpumask to sched_setaffinity(2)
will always return an -EINVAL error. With this patch, an alternative
error of -ENODEV will be returned if sched_setaffinity(2)
has been called before to set up user_cpus_ptr. In this case, the
user_cpus_ptr that stores the user provided affinity will be cleared and
the task's CPU affinity will be reset to that of the current cpuset. This
alternative error code of -ENODEV signals that no CPU is specified
and, at the same time, that the side effect of resetting the CPU affinity
to the cpuset default has taken place.

If sched_setaffinity(2) has not been called previously, an -EINVAL error
will be returned with an empty cpumask just like before. Tests or
tools that rely on the behavior that an empty cpumask will return an
error code will not be affected.

We will have to update the sched_setaffinity(2) manpage to document
this possible side effect of passing in an empty cpumask.

Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
kernel/sched/core.c | 42 +++++++++++++++++++++++++++++++++---------
1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c52c2eba7c73..3ef7397f2a61 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8317,7 +8317,12 @@ __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
}

cpuset_cpus_allowed(p, cpus_allowed);
- cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
+
+ /* Default to cpus_allowed with NULL new_mask */
+ if (ctx->new_mask)
+ cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
+ else
+ cpumask_copy(new_mask, cpus_allowed);

ctx->new_mask = new_mask;
ctx->flags |= SCA_CHECK;
@@ -8366,6 +8371,7 @@ __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)

long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
+ bool reset_cpumasks = cpumask_empty(in_mask);
struct affinity_context ac;
struct cpumask *user_mask;
struct task_struct *p;
@@ -8403,15 +8409,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
goto out_put_task;

/*
- * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
- * alloc_user_cpus_ptr() returns NULL.
+ * If an empty cpumask is passed in and user_cpus_ptr is set,
+ * clear user_cpus_ptr and reset the current cpu affinity to the
+ * default for the current cpuset. If user_cpus_ptr isn't set,
+ * -EINVAL will be returned as before.
*/
- user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
- if (user_mask) {
- cpumask_copy(user_mask, in_mask);
- } else if (IS_ENABLED(CONFIG_SMP)) {
- retval = -ENOMEM;
- goto out_put_task;
+ if (reset_cpumasks && p->user_cpus_ptr) {
+ in_mask = NULL; /* To be updated in __sched_setaffinity */
+ user_mask = NULL;
+ } else {
+ /*
+ * With non-SMP configs, user_cpus_ptr/user_mask isn't used
+ * and alloc_user_cpus_ptr() returns NULL.
+ */
+ user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
+ if (user_mask) {
+ cpumask_copy(user_mask, in_mask);
+ } else if (IS_ENABLED(CONFIG_SMP)) {
+ retval = -ENOMEM;
+ goto out_put_task;
+ }
}

ac = (struct affinity_context){
@@ -8423,6 +8440,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
retval = __sched_setaffinity(p, &ac);
kfree(ac.user_mask);

+ /*
+ * Force an error return (-ENODEV), if no error yet, for the empty
+ * cpumask case to avoid breaking existing tests.
+ */
+ if (reset_cpumasks && !retval)
+ retval = -ENODEV;
+
out_put_task:
put_task_struct(p);
return retval;
--
2.31.1