[PATCH resend 3/8] sched: prctl() cookie manipulation for core scheduling

From: Joel Fernandes (Google)
Date: Wed Mar 24 2021 - 17:41:23 EST


From: chris hyser <chris.hyser@xxxxxxxxxx>

This patch provides support for setting, clearing and copying core
scheduling 'task cookies' between threads (PID), processes (TGID), and
process groups (PGID).

The value of core scheduling isn't that tasks don't share a core; 'nosmt'
can do that. The value lies in exploiting all the sharing opportunities
that exist to recover possible lost performance and that requires a degree
of flexibility in the API. From a security perspective (and there are
others), the thread, process and process group distinction is an existing
hierarchical categorization of tasks that reflects many of the security
concerns about 'data sharing'. For example, protecting against
cache-snooping by a thread that can just read the memory directly isn't all
that useful. With this in mind, subcommands to CLEAR/CREATE/SHARE (TO/FROM)
provide a mechanism to create, clear and share cookies.
CLEAR/CREATE/SHARE_TO specify a target pid, with enum pid_type used to
specify the scope of the targeted tasks. For example, PIDTYPE_TGID will
share the cookie with the process and all of its threads, as typically
desired in a security scenario.

API:

prctl(PR_SCHED_CORE_SHARE, PR_SCHED_CORE_CREATE, tgtpid, pidtype, 0)
prctl(PR_SCHED_CORE_SHARE, PR_SCHED_CORE_CLEAR, tgtpid, pidtype, 0)
prctl(PR_SCHED_CORE_SHARE, PR_SCHED_CORE_SHARE_FROM, srcpid, 0, 0)
prctl(PR_SCHED_CORE_SHARE, PR_SCHED_CORE_SHARE_TO, tgtpid, pidtype, 0)

where 'tgtpid/srcpid == 0' implies the current process and pidtype is
kernel enum pid_type {PIDTYPE_PID, PIDTYPE_TGID, PIDTYPE_PGID, ...}.
PIDTYPE_SID, sharing a cookie with an entire session, was considered less
useful given the choice to create a new cookie on task exec().

For return values, EINVAL and ENOMEM are what they say. ESRCH means the
tgtpid/srcpid was not found. EPERM indicates lack of PTRACE permission
access to tgtpid/srcpid. EACCES indicates that a task in the target pidtype
group was not updated due to permission.

In terms of interaction with the cgroup interface, task cookies are set
independently of cgroup core scheduling cookies and thus would allow use
for tasks within a container using cgroup cookies.

Current hard-coded policies are:
- a user can clear the cookie of any process they can set a cookie for.
Lack of a cookie *might* be a security issue if cookies are being used
for that.
- on fork of a parent with a cookie, both process and thread child tasks
get a copy.
- on exec, a task with a cookie is given a new cookie.

Signed-off-by: Chris Hyser <chris.hyser@xxxxxxxxxx>
Signed-off-by: Josh Don <joshdon@xxxxxxxxxx>
---
fs/exec.c | 4 +-
include/linux/sched.h | 11 ++
include/linux/sched/task.h | 4 +-
include/uapi/linux/prctl.h | 7 ++
kernel/sched/core.c | 11 +-
kernel/sched/coretag.c | 196 ++++++++++++++++++++++++++++++-
kernel/sched/sched.h | 2 +
kernel/sys.c | 7 ++
tools/include/uapi/linux/prctl.h | 7 ++
9 files changed, 241 insertions(+), 8 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 18594f11c31f..ab0945508b50 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1807,7 +1807,9 @@ static int bprm_execve(struct linux_binprm *bprm,
if (IS_ERR(file))
goto out_unmark;

- sched_exec();
+ retval = sched_exec();
+ if (retval)
+ goto out;

bprm->file = file;
/*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 833f8d682212..075b15392a4a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2184,8 +2184,19 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd);

#ifdef CONFIG_SCHED_CORE
void sched_tsk_free(struct task_struct *tsk);
+int sched_core_share_pid(unsigned long flags, pid_t pid, enum pid_type type);
+int sched_core_exec(void);
#else
#define sched_tsk_free(tsk) do { } while (0)
+static inline int sched_core_share_pid(unsigned long flags, pid_t pid, enum pid_type type)
+{
+ return 0;
+}
+
+static inline int sched_core_exec(void)
+{
+ return 0;
+}
#endif

#endif
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index ef02be869cf2..d0f5b233f092 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -94,9 +94,9 @@ extern void free_task(struct task_struct *tsk);

/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
-extern void sched_exec(void);
+int sched_exec(void);
#else
-#define sched_exec() {}
+static inline int sched_exec(void) { return 0; }
#endif

static inline struct task_struct *get_task_struct(struct task_struct *t)
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 667f1aed091c..e658dca88f4f 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -255,4 +255,11 @@ struct prctl_mm_map {
# define SYSCALL_DISPATCH_FILTER_ALLOW 0
# define SYSCALL_DISPATCH_FILTER_BLOCK 1

+/* Request the scheduler to share a core */
+#define PR_SCHED_CORE_SHARE 60
+# define PR_SCHED_CORE_CLEAR 0 /* clear core_sched cookie of pid */
+# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */
+# define PR_SCHED_CORE_SHARE_FROM 2 /* get core_sched cookie from pid */
+# define PR_SCHED_CORE_SHARE_TO 3 /* push core_sched cookie to pid */
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1b07687c53d4..3093cb3414c3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4752,11 +4752,17 @@ unsigned long nr_iowait(void)
* sched_exec - execve() is a valuable balancing opportunity, because at
* this point the task has the smallest effective memory and cache footprint.
*/
-void sched_exec(void)
+int sched_exec(void)
{
struct task_struct *p = current;
unsigned long flags;
int dest_cpu;
+ int ret;
+
+ /* this may change what tasks current can share a core with */
+ ret = sched_core_exec();
+ if (ret)
+ return ret;

raw_spin_lock_irqsave(&p->pi_lock, flags);
dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
@@ -4768,10 +4774,11 @@ void sched_exec(void)

raw_spin_unlock_irqrestore(&p->pi_lock, flags);
stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
- return;
+ return 0;
}
unlock:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ return 0;
}

#endif
diff --git a/kernel/sched/coretag.c b/kernel/sched/coretag.c
index ba73569237f0..550f4975eea2 100644
--- a/kernel/sched/coretag.c
+++ b/kernel/sched/coretag.c
@@ -155,6 +155,7 @@ static void sched_core_update_cookie(struct task_struct *p,
task_rq_unlock(rq, p, &rf);
}

+/* Per-task interface: task free. */
static void sched_core_free_task_cookie_work(struct work_struct *ws);

static unsigned long sched_core_alloc_task_cookie(void)
@@ -223,16 +224,205 @@ static inline void sched_core_update_task_cookie(struct task_struct *t,
sched_core_update_cookie(t, c, sched_core_task_cookie_type);
}

-/*
- * Called from sched_fork().
- */
+static int sched_core_create_cookie(struct task_struct *p)
+{
+ unsigned long cookie;
+
+ lockdep_assert_held(&sched_core_tasks_mutex);
+
+ cookie = sched_core_alloc_task_cookie();
+ if (!cookie)
+ return -ENOMEM;
+
+ if (p->core_cookie.task_cookie)
+ sched_core_put_task_cookie(p->core_cookie.task_cookie);
+
+ sched_core_update_task_cookie(p, cookie);
+ return 0;
+}
+
+static void sched_core_clear_cookie(struct task_struct *p)
+{
+ lockdep_assert_held(&sched_core_tasks_mutex);
+ if (p->core_cookie.task_cookie) {
+ sched_core_put_task_cookie(p->core_cookie.task_cookie);
+ sched_core_update_task_cookie(p, 0);
+ }
+}
+
+static unsigned long sched_core_get_copy_cookie(struct task_struct *p)
+{
+ unsigned long cookie = p->core_cookie.task_cookie;
+
+ lockdep_assert_held(&sched_core_tasks_mutex);
+ sched_core_get_task_cookie(cookie);
+ return cookie;
+}
+
+static void sched_core_copy_cookie_frm_to(struct task_struct *ft, struct task_struct *tt)
+{
+ unsigned long cookie;
+
+ lockdep_assert_held(&sched_core_tasks_mutex);
+
+ /* sharing a 0 cookie is a clear */
+ if (!ft->core_cookie.task_cookie) {
+ sched_core_clear_cookie(tt);
+ return;
+ }
+
+ cookie = sched_core_get_copy_cookie(ft);
+ if (tt->core_cookie.task_cookie)
+ sched_core_put_task_cookie(tt->core_cookie.task_cookie);
+ sched_core_update_task_cookie(tt, cookie);
+}
+
+/* Called from prctl interface: PR_SCHED_CORE_SHARE */
+int sched_core_share_pid(unsigned long flags, pid_t pid, enum pid_type type)
+{
+ struct task_struct *task;
+ struct task_struct *p;
+ unsigned long cookie;
+ struct pid *grp;
+ int err = 0;
+
+ if (type > PIDTYPE_PGID || flags > PR_SCHED_CORE_SHARE_TO || pid < 0 ||
+ (flags == PR_SCHED_CORE_SHARE_FROM && type != PIDTYPE_PID))
+ return -EINVAL;
+
+ rcu_read_lock();
+
+ if (pid == 0) {
+ task = current;
+ } else {
+ task = find_task_by_vpid(pid);
+ if (!task) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ }
+
+ get_task_struct(task);
+
+ /* Check if this process has the right to modify the specified
+ * process. Use the regular "ptrace_may_access()" checks.
+ */
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+ rcu_read_unlock();
+ err = -EPERM;
+ goto out;
+ }
+ rcu_read_unlock();
+
+ mutex_lock(&sched_core_tasks_mutex);
+ if (type == PIDTYPE_PID) {
+ if (flags == PR_SCHED_CORE_CREATE) {
+ err = sched_core_create_cookie(task);
+
+ } else if (flags == PR_SCHED_CORE_CLEAR) {
+ sched_core_clear_cookie(task);
+
+ } else if (flags == PR_SCHED_CORE_SHARE_FROM) {
+ sched_core_copy_cookie_frm_to(task, current);
+
+ } else if (flags == PR_SCHED_CORE_SHARE_TO) {
+ sched_core_copy_cookie_frm_to(current, task);
+
+ } else {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+ } else {
+ if (flags == PR_SCHED_CORE_CREATE) {
+ cookie = sched_core_alloc_task_cookie();
+ if (!cookie) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ } else if (flags == PR_SCHED_CORE_CLEAR) {
+ cookie = 0;
+ } else if (flags == PR_SCHED_CORE_SHARE_TO) {
+ cookie = sched_core_get_copy_cookie(current);
+ } else {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ rcu_read_lock();
+ if (type == PIDTYPE_TGID) {
+ grp = task_tgid(task);
+ } else if (type == PIDTYPE_PGID) {
+ grp = task_pgrp(task);
+ } else {
+ err = -EINVAL;
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+
+ do_each_pid_thread(grp, type, p) {
+ /*
+ * If access to p is not allowed, skip it but report -EACCES to the
+ * caller. task and current are already known to be acceptable.
+ */
+ if (p == task || p == current ||
+ ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) {
+ if (cookie)
+ sched_core_get_task_cookie(cookie);
+ if (p->core_cookie.task_cookie)
+ sched_core_put_task_cookie_async(p->core_cookie.task_cookie);
+ sched_core_update_task_cookie(p, cookie);
+ } else {
+ err = -EACCES;
+ }
+ } while_each_pid_thread(grp, type, p);
+
+ rcu_read_unlock();
+
+ /*
+ * Remove the extra reference we took to the cookie
+ * (ie. via alloc/copy).
+ */
+ if (cookie)
+ sched_core_put_task_cookie(cookie);
+ }
+out_unlock:
+ mutex_unlock(&sched_core_tasks_mutex);
+
+out:
+ put_task_struct(task);
+ return err;
+}
+
+int sched_core_exec(void)
+{
+ int ret = 0;
+
+ /* absent a policy mechanism, if the task had a cookie, give it a new one */
+ if (READ_ONCE(current->core_cookie.task_cookie)) {
+ mutex_lock(&sched_core_tasks_mutex);
+ if (current->core_cookie.task_cookie)
+ ret = sched_core_create_cookie(current);
+ mutex_unlock(&sched_core_tasks_mutex);
+ }
+ return ret;
+}
+
+/* Called from sched_fork() */
int sched_core_fork(struct task_struct *p, unsigned long clone_flags)
{
/*
* Task cookie is ref counted; avoid an uncounted reference.
+ * If p should have a task cookie, it will be set below.
*/
__sched_core_set_task_cookie(&p->core_cookie, 0);

+ if (READ_ONCE(current->core_cookie.task_cookie)) {
+ mutex_lock(&sched_core_tasks_mutex);
+ if (current->core_cookie.task_cookie)
+ sched_core_copy_cookie_frm_to(current, p);
+ mutex_unlock(&sched_core_tasks_mutex);
+ }
return 0;
}

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5b49cfaa4a53..1be86d9cc58f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1184,6 +1184,8 @@ void sched_core_dequeue(struct rq *rq, struct task_struct *p);
void sched_core_get(void);
void sched_core_put(void);

+int sched_core_share_pid(unsigned long flags, pid_t pid, enum pid_type type);
+
bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi);

int sched_core_cookie_cmp(const struct sched_core_cookie *a,
diff --git a/kernel/sys.c b/kernel/sys.c
index 2e2e3f378d97..b40243522146 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2534,6 +2534,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = set_syscall_user_dispatch(arg2, arg3, arg4,
(char __user *) arg5);
break;
+#ifdef CONFIG_SCHED_CORE
+ case PR_SCHED_CORE_SHARE:
+ if (arg5)
+ return -EINVAL;
+ error = sched_core_share_pid(arg2, arg3, arg4);
+ break;
+#endif
default:
error = -EINVAL;
break;
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index 667f1aed091c..14900c400e74 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -255,4 +255,11 @@ struct prctl_mm_map {
# define SYSCALL_DISPATCH_FILTER_ALLOW 0
# define SYSCALL_DISPATCH_FILTER_BLOCK 1

+/* Request the scheduler to share a core */
+#define PR_SCHED_CORE_SHARE 60
+# define PR_SCHED_CORE_CLEAR 0 /* clear core_sched cookie of pid */
+# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */
+# define PR_SCHED_CORE_SHARE_FROM 2 /* get core_sched cookie from pid */
+# define PR_SCHED_CORE_SHARE_TO 3 /* push core_sched cookie to pid */
+
#endif /* _LINUX_PRCTL_H */
--
2.31.0.291.g576ba9dcdaf-goog