[PATCH 5.16 119/186] sched: Fix yet more sched_fork() races

From: Greg Kroah-Hartman
Date: Mon Mar 07 2022 - 05:33:02 EST


From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>

commit b1e8206582f9d680cff7d04828708c8b6ab32957 upstream.

Where commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an
invalid sched_task_group") fixed a fork race vs cgroup, it opened up a
race vs syscalls by not placing the task on the runqueue before it
gets exposed through the pidhash.

Commit 13765de8148f ("sched/fair: Fix fault in reweight_entity") is
trying to fix a single instance of this, instead fix the whole class
of issues, effectively reverting this commit.

Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group")
Reported-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Tested-by: Tadeusz Struk <tadeusz.struk@xxxxxxxxxx>
Tested-by: Zhang Qiao <zhangqiao22@xxxxxxxxxx>
Tested-by: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Link: https://lkml.kernel.org/r/YgoeCbwj5mbCR0qA@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
---
include/linux/sched/task.h | 4 ++--
kernel/fork.c | 13 ++++++++++++-
kernel/sched/core.c | 34 +++++++++++++++++++++-------------
3 files changed, 35 insertions(+), 16 deletions(-)

--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -54,8 +54,8 @@ extern asmlinkage void schedule_tail(str
extern void init_idle(struct task_struct *idle, int cpu);

extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
-extern void sched_post_fork(struct task_struct *p,
- struct kernel_clone_args *kargs);
+extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+extern void sched_post_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);

void __noreturn do_task_dead(void);
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2294,6 +2294,17 @@ static __latent_entropy struct task_stru
goto bad_fork_put_pidfd;

/*
+ * Now that the cgroups are pinned, re-clone the parent cgroup and put
+ * the new task on the correct runqueue. All this *before* the task
+ * becomes visible.
+ *
+ * This isn't part of ->can_fork() because while the re-cloning is
+ * cgroup specific, it unconditionally needs to place the task on a
+ * runqueue.
+ */
+ sched_cgroup_fork(p, args);
+
+ /*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
* not want user-space to be able to predict the process start-time by
@@ -2402,7 +2413,7 @@ static __latent_entropy struct task_stru
fd_install(pidfd, pidfile);

proc_fork_connector(p);
- sched_post_fork(p, args);
+ sched_post_fork(p);
cgroup_post_fork(p, args);
perf_event_fork(p);

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1203,9 +1203,8 @@ int tg_nop(struct task_group *tg, void *
}
#endif

-static void set_load_weight(struct task_struct *p)
+static void set_load_weight(struct task_struct *p, bool update_load)
{
- bool update_load = !(READ_ONCE(p->__state) & TASK_NEW);
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;

@@ -4393,7 +4392,7 @@ int sched_fork(unsigned long clone_flags
p->static_prio = NICE_TO_PRIO(0);

p->prio = p->normal_prio = p->static_prio;
- set_load_weight(p);
+ set_load_weight(p, false);

/*
* We don't need the reset flag anymore after the fork. It has
@@ -4411,6 +4410,7 @@ int sched_fork(unsigned long clone_flags

init_entity_runnable_average(&p->se);

+
#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -4426,18 +4426,23 @@ int sched_fork(unsigned long clone_flags
return 0;
}

-void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
{
unsigned long flags;
-#ifdef CONFIG_CGROUP_SCHED
- struct task_group *tg;
-#endif

+ /*
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
+ * required yet, but lockdep gets upset if rules are violated.
+ */
raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_CGROUP_SCHED
- tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
- struct task_group, css);
- p->sched_task_group = autogroup_task_group(p, tg);
+ if (1) {
+ struct task_group *tg;
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+ struct task_group, css);
+ tg = autogroup_task_group(p, tg);
+ p->sched_task_group = tg;
+ }
#endif
rseq_migrate(p);
/*
@@ -4448,7 +4453,10 @@ void sched_post_fork(struct task_struct
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}

+void sched_post_fork(struct task_struct *p)
+{
uclamp_post_fork(p);
}

@@ -6880,7 +6888,7 @@ void set_user_nice(struct task_struct *p
put_prev_task(rq, p);

p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p);
+ set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);

@@ -7171,7 +7179,7 @@ static void __setscheduler_params(struct
*/
p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
- set_load_weight(p);
+ set_load_weight(p, true);
}

/*
@@ -9410,7 +9418,7 @@ void __init sched_init(void)
#endif
}

- set_load_weight(&init_task);
+ set_load_weight(&init_task, false);

/*
* The boot idle thread does lazy MMU switching as well: