[PATCH 1/2] sched: Introduce new flags to sched_setaffinity to support soft affinity.

From: Rohit Jain
Date: Tue Sep 19 2017 - 18:36:48 EST


This patch adds support for the new system call and sets the
cpus_preferred mask as requested by the application. It does not yet
make cpus_preferred have any effect on scheduling decisions.

Signed-off-by: Rohit Jain <rohit.k.jain@xxxxxxxxxx>
---
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
include/linux/init_task.h | 1 +
include/linux/sched.h | 4 +-
include/linux/syscalls.h | 3 +
include/uapi/asm-generic/unistd.h | 4 +-
include/uapi/linux/sched.h | 3 +
kernel/compat.c | 2 +-
kernel/sched/core.c | 167 ++++++++++++++++++++++++++++-----
kernel/time/tick-sched.c | 1 +
9 files changed, 159 insertions(+), 27 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183..bd5f346 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
330 common pkey_alloc sys_pkey_alloc
331 common pkey_free sys_pkey_free
332 common statx sys_statx
+333 common sched_setaffinity_flags sys_sched_setaffinity_flags

#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 0e84971..bb8a8e1 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -235,6 +235,7 @@ extern struct cred init_cred;
.normal_prio = MAX_PRIO-20, \
.policy = SCHED_NORMAL, \
.cpus_allowed = CPU_MASK_ALL, \
+ .cpus_preferred = CPU_MASK_ALL, \
.nr_cpus_allowed= NR_CPUS, \
.mm = NULL, \
.active_mm = &init_mm, \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 534542d..7e08ae8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -582,6 +582,7 @@ struct task_struct {
unsigned int policy;
int nr_cpus_allowed;
cpumask_t cpus_allowed;
+ cpumask_t cpus_preferred;

#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
@@ -1647,7 +1648,8 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
# define vcpu_is_preempted(cpu) false
#endif

-extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
+extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask,
+ int flags);
extern long sched_getaffinity(pid_t pid, struct cpumask *mask);

#ifndef TASK_SIZE_OF
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d4dfac8..83d04da 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -326,6 +326,9 @@ asmlinkage long sys_sched_get_priority_max(int policy);
asmlinkage long sys_sched_get_priority_min(int policy);
asmlinkage long sys_sched_rr_get_interval(pid_t pid,
struct timespec __user *interval);
+asmlinkage long sys_sched_setaffinity_flags(pid_t pid, unsigned int len,
+ unsigned long __user *user_mask_ptr,
+ int flags);
asmlinkage long sys_setpriority(int which, int who, int niceval);
asmlinkage long sys_getpriority(int which, int who);

diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 061185a..5e88941 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -376,6 +376,8 @@ __SYSCALL(__NR_sched_getparam, sys_sched_getparam)
#define __NR_sched_setaffinity 122
__SC_COMP(__NR_sched_setaffinity, sys_sched_setaffinity, \
compat_sys_sched_setaffinity)
+#define __NR_sched_setaffinity_flags 293
+__SYSCALL(__NR_sched_setaffinity_flags, sys_sched_setaffinity_flags)
#define __NR_sched_getaffinity 123
__SC_COMP(__NR_sched_getaffinity, sys_sched_getaffinity, \
compat_sys_sched_getaffinity)
@@ -733,7 +735,7 @@ __SYSCALL(__NR_pkey_free, sys_pkey_free)
__SYSCALL(__NR_statx, sys_statx)

#undef __NR_syscalls
-#define __NR_syscalls 292
+#define __NR_syscalls 293

/*
* All syscalls below here should go away really,
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index e2a6c7b..81c17f5 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -49,4 +49,7 @@
#define SCHED_FLAG_RESET_ON_FORK 0x01
#define SCHED_FLAG_RECLAIM 0x02

+#define SCHED_HARD_AFFINITY 0
+#define SCHED_SOFT_AFFINITY 1
+
#endif /* _UAPI_LINUX_SCHED_H */
diff --git a/kernel/compat.c b/kernel/compat.c
index 6f0a0e7..0ec60ea 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -356,7 +356,7 @@ COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid,
if (retval)
goto out;

- retval = sched_setaffinity(pid, new_mask);
+ retval = sched_setaffinity(pid, new_mask, SCHED_HARD_AFFINITY);
out:
free_cpumask_var(new_mask);
return retval;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ec80d2f..2e8d392 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1031,6 +1031,11 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma
p->nr_cpus_allowed = cpumask_weight(new_mask);
}

+void set_cpus_preferred_common(struct task_struct *p, const struct cpumask *new_mask)
+{
+ cpumask_copy(&p->cpus_preferred, new_mask);
+}
+
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
struct rq *rq = task_rq(p);
@@ -1053,6 +1058,36 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
put_prev_task(rq, p);

p->sched_class->set_cpus_allowed(p, new_mask);
+ set_cpus_preferred_common(p, new_mask);
+
+ if (queued)
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
+ if (running)
+ set_curr_task(rq, p);
+}
+
+void do_set_cpus_preferred(struct task_struct *p, const struct cpumask *new_mask)
+{
+ struct rq *rq = task_rq(p);
+ bool queued, running;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+
+ if (queued) {
+ /*
+ * Because __kthread_bind() calls this on blocked tasks without
+ * holding rq->lock.
+ */
+ lockdep_assert_held(&rq->lock);
+ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+ }
+ if (running)
+ put_prev_task(rq, p);
+
+ set_cpus_preferred_common(p, new_mask);

if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -1142,6 +1177,63 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
return ret;
}

+static int
+__set_cpus_preferred_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
+ unsigned int dest_cpu;
+ struct rq_flags rf;
+ struct rq *rq;
+ int ret = 0;
+
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
+ if (p->flags & PF_KTHREAD) {
+ /*
+ * Kernel threads are allowed on online && !active CPUs
+ */
+ cpu_valid_mask = cpu_online_mask;
+ }
+
+ if (cpumask_equal(&p->cpus_preferred, new_mask))
+ goto out;
+
+ if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ do_set_cpus_preferred(p, new_mask);
+
+ if (p->flags & PF_KTHREAD) {
+ /*
+ * For kernel threads that do indeed end up on online &&
+ * !active we want to ensure they are strict per-CPU threads.
+ */
+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
+ !cpumask_intersects(new_mask, cpu_active_mask) &&
+ p->nr_cpus_allowed != 1);
+ }
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
+ goto out;
+
+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+ if (task_on_rq_queued(p)) {
+ /*
+ * OK, since we're going to drop the lock immediately
+ * afterwards anyway.
+ */
+ rq = move_queued_task(rq, &rf, p, dest_cpu);
+ }
+out:
+ task_rq_unlock(rq, p, &rf);
+
+ return ret;
+}
+
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
return __set_cpus_allowed_ptr(p, new_mask, false);
@@ -4620,7 +4712,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
return retval;
}

-long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask, int flags)
{
cpumask_var_t cpus_allowed, new_mask;
struct task_struct *p;
@@ -4686,19 +4778,23 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
}
#endif
again:
- retval = __set_cpus_allowed_ptr(p, new_mask, true);
-
- if (!retval) {
- cpuset_cpus_allowed(p, cpus_allowed);
- if (!cpumask_subset(new_mask, cpus_allowed)) {
- /*
- * We must have raced with a concurrent cpuset
- * update. Just reset the cpus_allowed to the
- * cpuset's cpus_allowed
- */
- cpumask_copy(new_mask, cpus_allowed);
- goto again;
+ if (flags == SCHED_HARD_AFFINITY) {
+ retval = __set_cpus_allowed_ptr(p, new_mask, true);
+
+ if (!retval) {
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset
+ * update. Just reset the cpus_allowed to the
+ * cpuset's cpus_allowed
+ */
+ cpumask_copy(new_mask, cpus_allowed);
+ goto again;
+ }
}
+ } else if (flags == SCHED_SOFT_AFFINITY) {
+ retval = __set_cpus_preferred_ptr(p, new_mask);
}
out_free_new_mask:
free_cpumask_var(new_mask);
@@ -4720,30 +4816,53 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
}

-/**
- * sys_sched_setaffinity - set the CPU affinity of a process
- * @pid: pid of the process
- * @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to the new CPU mask
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
- unsigned long __user *, user_mask_ptr)
+static bool
+valid_affinity_flags(int flags)
+{
+ return flags == SCHED_HARD_AFFINITY || flags == SCHED_SOFT_AFFINITY;
+}
+
+static int
+sched_setaffinity_common(pid_t pid, unsigned int len,
+ unsigned long __user *user_mask_ptr, int flags)
{
cpumask_var_t new_mask;
int retval;

+ if (!valid_affinity_flags(flags))
+ return -EINVAL;
+
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
return -ENOMEM;

retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
if (retval == 0)
- retval = sched_setaffinity(pid, new_mask);
+ retval = sched_setaffinity(pid, new_mask, flags);
free_cpumask_var(new_mask);
return retval;
}

+SYSCALL_DEFINE4(sched_setaffinity_flags, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr, int, flags)
+{
+ return sched_setaffinity_common(pid, len, user_mask_ptr, flags);
+}
+
+/**
+ * sys_sched_setaffinity - set the CPU affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new CPU mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ return sched_setaffinity_common(pid, len, user_mask_ptr,
+ SCHED_HARD_AFFINITY);
+}
+
long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
struct task_struct *p;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index eb0e975..ede1add 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,6 +19,7 @@
#include <linux/percpu.h>
#include <linux/nmi.h>
#include <linux/profile.h>
+#include <linux/vmstat.h>
#include <linux/sched/signal.h>
#include <linux/sched/clock.h>
#include <linux/sched/stat.h>
--
2.7.4