[PATCH 01/13] sched: Add 3 new scheduler syscalls to support an extended scheduling parameters ABI

From: Peter Zijlstra
Date: Tue Dec 17 2013 - 07:50:06 EST


From: Dario Faggioli <raistlin@xxxxxxxx>

Add the syscalls needed for supporting scheduling algorithms
with extended scheduling parameters (e.g., SCHED_DEADLINE).

In general, this makes it possible to specify a periodic/sporadic task
that executes for a given amount of runtime at each instance, and is
scheduled according to the urgency of its own timing constraints,
i.e.:

- a (maximum/typical) instance execution time,
- a minimum interval between consecutive instances,
- a time constraint by which each instance must be completed.

Thus, both the data structure that holds the scheduling parameters of
the tasks and the system calls dealing with it must be extended.
Unfortunately, modifying the existing struct sched_param would break
the ABI and result in potentially serious compatibility issues with
legacy binaries.

For these reasons, this patch:

- defines the new struct sched_attr, containing all the fields
that are necessary for specifying a task in the computational
model described above;
- defines and implements the new scheduling related syscalls that
manipulate it, i.e., sched_setscheduler2(), sched_setattr()
and sched_getattr().

Syscalls are introduced for x86 (32 and 64 bits) and ARM only, as a
proof of concept and for developing and testing purposes. Making them
available on other architectures is straightforward.

Since no "user" for these new parameters is introduced in this patch,
the implementation of the new system calls is identical to that of their
already existing counterparts. Future patches that implement scheduling
policies able to exploit the new data structure must also take care of
modifying the sched_*attr() calls according to their own purposes.

Cc: oleg@xxxxxxxxxx
Cc: darren@xxxxxxxxxx
Cc: paulmck@xxxxxxxxxxxxxxxxxx
Cc: dhaval.giani@xxxxxxxxx
Cc: p.faure@xxxxxxxxxx
Cc: fchecconi@xxxxxxxxx
Cc: fweisbec@xxxxxxxxx
Cc: harald.gustafsson@xxxxxxxxxxxx
Cc: hgu1972@xxxxxxxxx
Cc: insop.song@xxxxxxxxx
Cc: rostedt@xxxxxxxxxxx
Cc: jkacur@xxxxxxxxxx
Cc: tommaso.cucinotta@xxxxxxxx
Cc: johan.eker@xxxxxxxxxxxx
Cc: vincent.guittot@xxxxxxxxxx
Cc: liming.wang@xxxxxxxxxxxxx
Cc: luca.abeni@xxxxxxxx
Cc: michael@xxxxxxxxxxxxxxxxxxxx
Cc: bruce.ashfield@xxxxxxxxxxxxx
Cc: nicola.manica@xxxxxxxxxxxxx
Cc: claudio@xxxxxxxxxxxxxxx
Signed-off-by: Dario Faggioli <raistlin@xxxxxxxx>
Signed-off-by: Juri Lelli <juri.lelli@xxxxxxxxx>
[ Twiddled the changelog. ]
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
---
arch/arm/include/asm/unistd.h | 2
arch/arm/include/uapi/asm/unistd.h | 3
arch/arm/kernel/calls.S | 3
arch/x86/syscalls/syscall_32.tbl | 3
arch/x86/syscalls/syscall_64.tbl | 3
include/linux/sched.h | 54 ++++++++
include/linux/syscalls.h | 8 +
kernel/sched/core.c | 234 +++++++++++++++++++++++++++++++++++--
8 files changed, 298 insertions(+), 12 deletions(-)

--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -15,7 +15,7 @@

#include <uapi/asm/unistd.h>

-#define __NR_syscalls (380)
+#define __NR_syscalls (383)
#define __ARM_NR_cmpxchg (__ARM_NR_BASE+0x00fff0)

#define __ARCH_WANT_STAT64
--- a/arch/arm/include/uapi/asm/unistd.h
+++ b/arch/arm/include/uapi/asm/unistd.h
@@ -406,6 +406,9 @@
#define __NR_process_vm_writev (__NR_SYSCALL_BASE+377)
#define __NR_kcmp (__NR_SYSCALL_BASE+378)
#define __NR_finit_module (__NR_SYSCALL_BASE+379)
+#define __NR_sched_setscheduler2 (__NR_SYSCALL_BASE+380)
+#define __NR_sched_setattr (__NR_SYSCALL_BASE+381)
+#define __NR_sched_getattr (__NR_SYSCALL_BASE+382)

/*
* This may need to be greater than __NR_last_syscall+1 in order to
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -389,6 +389,9 @@
CALL(sys_process_vm_writev)
CALL(sys_kcmp)
CALL(sys_finit_module)
+/* 380 */ CALL(sys_sched_setscheduler2)
+ CALL(sys_sched_setattr)
+ CALL(sys_sched_getattr)
#ifndef syscalls_counted
.equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
#define syscalls_counted
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -357,3 +357,6 @@
348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
349 i386 kcmp sys_kcmp
350 i386 finit_module sys_finit_module
+351 i386 sched_setattr sys_sched_setattr
+352 i386 sched_getattr sys_sched_getattr
+353 i386 sched_setscheduler2 sys_sched_setscheduler2
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -320,6 +320,9 @@
311 64 process_vm_writev sys_process_vm_writev
312 common kcmp sys_kcmp
313 common finit_module sys_finit_module
+314 common sched_setattr sys_sched_setattr
+315 common sched_getattr sys_sched_getattr
+316 common sched_setscheduler2 sys_sched_setscheduler2

#
# x32-specific system call numbers start at 512 to avoid cache impact
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -56,6 +56,58 @@ struct sched_param {

#include <asm/processor.h>

+#define SCHED_ATTR_SIZE_VER0 40 /* sizeof first published struct sched_attr */
+
+/*
+ * Extended scheduling parameters data structure.
+ *
+ * This is needed because the original struct sched_param cannot be
+ * altered without introducing ABI issues with legacy applications
+ * (e.g., in sched_getparam()).
+ *
+ * However, the possibility of specifying more than just a priority for
+ * the tasks may be useful for a wide variety of application fields, e.g.,
+ * multimedia, streaming, automation and control, and many others.
+ *
+ * This variant (sched_attr) is meant to describe a so-called
+ * sporadic time-constrained task. In such a model a task is specified by:
+ * - the activation period or minimum instance inter-arrival time;
+ * - the maximum (or average, depending on the actual scheduling
+ * discipline) computation time of all instances, a.k.a. runtime;
+ * - the deadline (relative to the actual activation time) of each
+ * instance.
+ * Very briefly, a periodic (sporadic) task asks for the execution of
+ * some specific computation --which is typically called an instance--
+ * (at most) every period. Moreover, each instance typically lasts no more
+ * than the runtime and must be completed by time instant t equal to
+ * the instance activation time + the deadline.
+ *
+ * This is reflected by the actual fields of the sched_attr structure:
+ *
+ * @sched_priority task's priority (might still be useful)
+ * @sched_flags for customizing the scheduler behaviour
+ * @sched_deadline representative of the task's deadline
+ * @sched_runtime representative of the task's runtime
+ * @sched_period representative of the task's period
+ * @size size of the structure, for fwd/bwd compat.
+ * @__reserved padding that keeps the structure a multiple of u64
+ *
+ * Given this task model, there is a multiplicity of scheduling algorithms
+ * and policies that can be used to ensure all the tasks meet their
+ * timing constraints.
+ */
+struct sched_attr {
+ int sched_priority;
+ unsigned int sched_flags;
+ u64 sched_runtime;
+ u64 sched_deadline;
+ u64 sched_period;
+ u32 size;
+
+ /* Align to u64. */
+ u32 __reserved;
+};
+
struct exec_domain;
struct futex_pi_state;
struct robust_list_head;
@@ -1960,6 +2012,8 @@ extern int sched_setscheduler(struct tas
const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int,
const struct sched_param *);
+extern int sched_setscheduler2(struct task_struct *, int,
+ const struct sched_attr *);
extern struct task_struct *idle_task(int cpu);
/**
* is_idle_task - is the specified task an idle task?
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -38,6 +38,7 @@ struct rlimit;
struct rlimit64;
struct rusage;
struct sched_param;
+struct sched_attr;
struct sel_arg_struct;
struct semaphore;
struct sembuf;
@@ -277,11 +278,18 @@ asmlinkage long sys_clock_nanosleep(cloc
asmlinkage long sys_nice(int increment);
asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
struct sched_param __user *param);
+asmlinkage long sys_sched_setscheduler2(pid_t pid, int policy,
+ struct sched_attr __user *attr);
asmlinkage long sys_sched_setparam(pid_t pid,
struct sched_param __user *param);
+asmlinkage long sys_sched_setattr(pid_t pid,
+ struct sched_attr __user *attr);
asmlinkage long sys_sched_getscheduler(pid_t pid);
asmlinkage long sys_sched_getparam(pid_t pid,
struct sched_param __user *param);
+asmlinkage long sys_sched_getattr(pid_t pid,
+ struct sched_attr __user *attr,
+ unsigned int size);
asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
unsigned long __user *user_mask_ptr);
asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3023,7 +3023,8 @@ static bool check_same_owner(struct task
}

static int __sched_setscheduler(struct task_struct *p, int policy,
- const struct sched_param *param, bool user)
+ const struct sched_attr *attr,
+ bool user)
{
int retval, oldprio, oldpolicy = -1, on_rq, running;
unsigned long flags;
@@ -3053,11 +3054,11 @@ static int __sched_setscheduler(struct t
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
* SCHED_BATCH and SCHED_IDLE is 0.
*/
- if (param->sched_priority < 0 ||
- (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
- (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+ if (attr->sched_priority < 0 ||
+ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
+ (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
- if (rt_policy(policy) != (param->sched_priority != 0))
+ if (rt_policy(policy) != (attr->sched_priority != 0))
return -EINVAL;

/*
@@ -3073,8 +3074,8 @@ static int __sched_setscheduler(struct t
return -EPERM;

/* can't increase priority */
- if (param->sched_priority > p->rt_priority &&
- param->sched_priority > rlim_rtprio)
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
return -EPERM;
}

@@ -3123,7 +3124,7 @@ static int __sched_setscheduler(struct t
* If not changing anything there's no need to proceed further:
*/
if (unlikely(policy == p->policy && (!rt_policy(policy) ||
- param->sched_priority == p->rt_priority))) {
+ attr->sched_priority == p->rt_priority))) {
task_rq_unlock(rq, p, &flags);
return 0;
}
@@ -3160,7 +3161,7 @@ static int __sched_setscheduler(struct t

oldprio = p->prio;
prev_class = p->sched_class;
- __setscheduler(rq, p, policy, param->sched_priority);
+ __setscheduler(rq, p, policy, attr->sched_priority);

if (running)
p->sched_class->set_curr_task(rq);
@@ -3188,10 +3189,20 @@ static int __sched_setscheduler(struct t
int sched_setscheduler(struct task_struct *p, int policy,
const struct sched_param *param)
{
- return __sched_setscheduler(p, policy, param, true);
+ struct sched_attr attr = {
+ .sched_priority = param->sched_priority
+ };
+ return __sched_setscheduler(p, policy, &attr, true);
}
EXPORT_SYMBOL_GPL(sched_setscheduler);

+/**
+ * sched_setscheduler2 - change the scheduling policy and/or parameters of a thread, using the extended struct sched_attr.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @attr: structure containing the extended scheduling parameters.
+ *
+ * Forwards to __sched_setscheduler() with user == true, as
+ * sched_setscheduler() does, but takes the parameters as a
+ * struct sched_attr instead of a struct sched_param.
+ */
+int sched_setscheduler2(struct task_struct *p, int policy,
+ const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, policy, attr, true);
+}
+EXPORT_SYMBOL_GPL(sched_setscheduler2);
+
/**
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
* @p: the task in question.
@@ -3208,7 +3219,10 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
const struct sched_param *param)
{
- return __sched_setscheduler(p, policy, param, false);
+ struct sched_attr attr = {
+ .sched_priority = param->sched_priority
+ };
+ return __sched_setscheduler(p, policy, &attr, false);
}

static int
@@ -3233,6 +3247,97 @@ do_sched_setscheduler(pid_t pid, int pol
return retval;
}

+/*
+ * Copy a struct sched_attr in from user space.
+ *
+ * Mimics kernel/events/core.c perf_copy_attr(): the size field of the
+ * user structure selects how much is copied.
+ *
+ * - A size of 0 is treated as SCHED_ATTR_SIZE_VER0.
+ * - A size above PAGE_SIZE or below SCHED_ATTR_SIZE_VER0 is rejected.
+ * - If size exceeds sizeof(*attr), every extra byte must be zero and
+ * only sizeof(*attr) bytes are copied.
+ *
+ * Returns 0 on success, -EFAULT if access_ok() or copy_from_user() fails,
+ * the get_user() error if reading from user space fails, and -E2BIG for
+ * a rejected size or a non-zero extra byte. On -E2BIG, sizeof(*attr) is
+ * written back to uattr->size before returning.
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+ struct sched_attr *attr)
+{
+ u32 size;
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+ return -EFAULT;
+
+ /*
+ * Zero the full structure first, so that any field not covered by
+ * a short copy reads as 0.
+ */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE) /* silly large */
+ goto err_size;
+
+ if (!size) /* abi compat */
+ size = SCHED_ATTR_SIZE_VER0;
+
+ if (size < SCHED_ATTR_SIZE_VER0)
+ goto err_size;
+
+ /*
+ * If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. new
+ * user-space does not rely on any kernel feature
+ * extensions we don't know about yet.
+ */
+ if (size > sizeof(*attr)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uattr + sizeof(*attr);
+ end = (void __user *)uattr + size;
+
+ for (; addr < end; addr++) {
+ ret = get_user(val, addr);
+ if (ret)
+ return ret;
+ if (val)
+ goto err_size;
+ }
+ size = sizeof(*attr);
+ }
+
+ ret = copy_from_user(attr, uattr, size);
+ if (ret)
+ return -EFAULT;
+
+out:
+ return ret;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ ret = -E2BIG;
+ goto out;
+}
+
+/*
+ * Common back end of sys_sched_setscheduler2() and sys_sched_setattr():
+ * copy the extended parameters in from user space and apply them to the
+ * task looked up with find_process_by_pid().
+ *
+ * Returns -EINVAL for a NULL @attr_uptr or a negative @pid, the error
+ * reported by sched_copy_attr() if the copy-in fails, -ESRCH if no task
+ * matches @pid, and otherwise whatever sched_setscheduler2() returns.
+ */
+static int
+do_sched_setscheduler2(pid_t pid, int policy,
+ struct sched_attr __user *attr_uptr)
+{
+ struct sched_attr attr;
+ struct task_struct *p;
+ int retval;
+
+ if (!attr_uptr || pid < 0)
+ return -EINVAL;
+
+ /*
+ * Propagate the real error code: sched_copy_attr() returns -E2BIG
+ * after writing the size it expects back to user space, and
+ * flattening that to -EFAULT would hide it from the caller.
+ */
+ retval = sched_copy_attr(attr_uptr, &attr);
+ if (retval)
+ return retval;
+
+ rcu_read_lock();
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (p != NULL)
+ retval = sched_setscheduler2(p, policy, &attr);
+ rcu_read_unlock();
+
+ return retval;
+}
+
/**
* sys_sched_setscheduler - set/change the scheduler policy and RT priority
* @pid: the pid in question.
@@ -3252,6 +3357,21 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_
}

/**
+ * sys_sched_setscheduler2 - same as sys_sched_setscheduler(), but taking the extended struct sched_attr
+ * @pid: the pid in question.
+ * @policy: new policy (could use the extended parameters).
+ * @attr: structure containing the extended parameters.
+ *
+ * Return: -EINVAL if @policy is negative, otherwise the result of
+ * do_sched_setscheduler2().
+ */
+SYSCALL_DEFINE3(sched_setscheduler2, pid_t, pid, int, policy,
+ struct sched_attr __user *, attr)
+{
+ if (policy < 0)
+ return -EINVAL;
+
+ return do_sched_setscheduler2(pid, policy, attr);
+}
+
+/**
* sys_sched_setparam - set/change the RT priority of a thread
* @pid: the pid in question.
* @param: structure containing the new RT priority.
@@ -3264,6 +3384,17 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, p
}

/**
+ * sys_sched_setattr - same as sys_sched_setparam(), but with the extended struct sched_attr
+ * @pid: the pid in question.
+ * @attr: structure containing the extended parameters.
+ *
+ * Calls do_sched_setscheduler2() with a policy of -1.
+ * NOTE(review): -1 presumably means "keep the task's current policy";
+ * confirm against __sched_setscheduler().
+ */
+SYSCALL_DEFINE2(sched_setattr, pid_t, pid,
+ struct sched_attr __user *, attr)
+{
+ return do_sched_setscheduler2(pid, -1, attr);
+}
+
+/**
* sys_sched_getscheduler - get the policy (scheduling class) of a thread
* @pid: the pid in question.
*
@@ -3329,6 +3460,87 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, p
return retval;

out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
+/*
+ * Copy the kernel's struct sched_attr out to user space.
+ *
+ * @uattr: user buffer to fill.
+ * @attr: kernel copy of the parameters; attr->size must already hold the
+ * number of valid bytes in *attr (sys_sched_getattr() sets it to
+ * sizeof(struct sched_attr)).
+ * @usize: size of the user buffer, as passed in by user space.
+ *
+ * Returns 0 on success, -EFAULT if the user buffer cannot be written, and
+ * -E2BIG if the user buffer is smaller than *attr and one of the bytes
+ * that would be cut off is non-zero.
+ */
+static int sched_read_attr(struct sched_attr __user *uattr,
+ struct sched_attr *attr,
+ unsigned int usize)
+{
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, usize))
+ return -EFAULT;
+
+ /*
+ * If we're handed a smaller struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. old
+ * user-space does not get incomplete information.
+ */
+ if (usize < sizeof(*attr)) {
+ unsigned char *addr;
+ unsigned char *end;
+
+ addr = (void *)attr + usize;
+ end = (void *)attr + sizeof(*attr);
+
+ for (; addr < end; addr++) {
+ if (*addr)
+ goto err_size;
+ }
+
+ attr->size = usize;
+ }
+
+ /*
+ * Copy attr->size bytes, never usize: usize may be as large as
+ * PAGE_SIZE, and copying that many bytes from *attr would leak
+ * whatever kernel memory follows the structure to user space.
+ */
+ ret = copy_to_user(uattr, attr, attr->size);
+ if (ret)
+ return -EFAULT;
+
+out:
+ return ret;
+
+err_size:
+ ret = -E2BIG;
+ goto out;
+}
+
+/**
+ * sys_sched_getattr - same as sys_sched_getparam(), but with the extended struct sched_attr
+ * @pid: the pid in question.
+ * @uattr: user buffer that receives the extended parameters.
+ * @size: size of the user buffer, for fwd/bwd compat.
+ *
+ * Only sched_priority (from the task's rt_priority) and size are filled
+ * in; every other field is reported as 0.
+ *
+ * Return: -EINVAL if @uattr is NULL, @pid is negative, or @size is above
+ * PAGE_SIZE or below SCHED_ATTR_SIZE_VER0; -ESRCH if no task matches @pid;
+ * the error from security_task_getscheduler() if it refuses; otherwise the
+ * result of sched_read_attr().
+ */
+SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, size)
+{
+ struct sched_attr attr = {
+ .size = sizeof(struct sched_attr),
+ };
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || size > PAGE_SIZE ||
+ size < SCHED_ATTR_SIZE_VER0)
+ return -EINVAL;
+
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ attr.sched_priority = p->rt_priority;
+ rcu_read_unlock();
+
+ retval = sched_read_attr(uattr, &attr, size);
+ return retval;
+
+out_unlock:
rcu_read_unlock();
return retval;
}


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/