Re: [tip:sched/urgent] sched/cgroup: Fix/cleanup cgroup teardown/init

From: Peter Zijlstra
Date: Thu Apr 28 2016 - 14:40:45 EST

Greg,

It looks like the patch below missed 4.5, and I'm starting to get bug
reports that look very much like this issue. Could we get this patch
lifted into 4.5-stable?

On Mon, Mar 21, 2016 at 04:15:41AM -0700, tip-bot for Peter Zijlstra wrote:
> Commit-ID: 2f5177f0fd7e531b26d54633be62d1d4cb94621c
> Gitweb: http://git.kernel.org/tip/2f5177f0fd7e531b26d54633be62d1d4cb94621c
> Author: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> AuthorDate: Wed, 16 Mar 2016 16:22:45 +0100
> Committer: Ingo Molnar <mingo@xxxxxxxxxx>
> CommitDate: Mon, 21 Mar 2016 10:49:23 +0100
>
> sched/cgroup: Fix/cleanup cgroup teardown/init
>
> The CPU controller hasn't kept up with the various changes in the whole
> cgroup initialization / destruction sequence, and commit:
>
> 2e91fa7f6d45 ("cgroup: keep zombies associated with their original cgroups")
>
> caused it to explode.
>
> The reason for this is that zombies do not inhibit css_offline() from
> being called, but they do stall css_released(). We currently tear down
> the cfs_rq structures in css_offline(), but zombies can still run after
> that, leading to use-after-free issues.
>
> The solution is to move the tear-down to css_released(), which
> guarantees that nobody (zombies included) is still using our cgroup.
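>
> As a minimal userspace sketch of that ordering (illustrative only: the
> names tg_model, zombie_refs, offline_free and released_free are
> hypothetical, not kernel code), freeing at offline time can race with a
> zombie that still holds a reference, while freeing at released time
> cannot:
>
>     #include <stdlib.h>
>
>     struct tg_model {
>             int load;               /* stands in for the cfs_rq state */
>             int zombie_refs;        /* exiting tasks still charged here */
>     };
>
>     /*
>      * Models css_offline(): the cgroup core may call this while
>      * zombie_refs > 0, so freeing here is a use-after-free in waiting.
>      */
>     static void offline_free(struct tg_model *tg)
>     {
>             free(tg);
>     }
>
>     /*
>      * Models css_released(): called only once the last user (zombies
>      * included) is gone, so freeing here is safe.
>      */
>     static void released_free(struct tg_model *tg)
>     {
>             free(tg);
>     }
>
>     int main(void)
>     {
>             struct tg_model *tg = calloc(1, sizeof(*tg));
>
>             tg->zombie_refs = 1;    /* a zombie still pins the group */
>             /*
>              * offline_free(tg) here would free memory the zombie can
>              * still reach; the fix defers the teardown to
>              * released_free(), which only runs once zombie_refs is 0.
>              */
>             tg->zombie_refs = 0;    /* zombie reaped */
>             released_free(tg);
>             (void)offline_free;     /* kept only to show the buggy path */
>             return 0;
>     }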
>
> Furthermore, a few simple cleanups are possible too. There doesn't
> appear to be any point in us using css_online() (anymore?), so fold
> that into css_alloc().
>
> And since the cgroup core guarantees an RCU grace period between
> css_released() and css_free(), we can forgo using call_rcu() and free
> the structures immediately.
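>
> As a small userspace model of why that is safe (an assumption on my
> part: this uses liburcu to stand in for kernel RCU and builds with
> "cc rcu_model.c -lurcu"; live_tg and tg_model are hypothetical names),
> the synchronize_rcu() below plays the role of the grace period the
> cgroup core inserts between css_released() and css_free():
>
>     #include <stdlib.h>
>     #include <urcu.h>
>
>     struct tg_model { int load; };
>
>     static struct tg_model *live_tg;    /* what readers look up */
>
>     int main(void)
>     {
>             struct tg_model *tg = calloc(1, sizeof(*tg));
>
>             rcu_register_thread();
>             live_tg = tg;               /* publish */
>
>             /* css_released(): unpublish, no new reader can find tg. */
>             live_tg = NULL;
>
>             /*
>              * The grace period the cgroup core guarantees: any reader
>              * that might still hold a pointer has finished by now.
>              */
>             synchronize_rcu();
>
>             /* css_free(): a plain free suffices, no call_rcu(). */
>             free(tg);
>
>             rcu_unregister_thread();
>             return 0;
>     }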
>
> Suggested-by: Tejun Heo <tj@xxxxxxxxxx>
> Reported-by: Kazuki Yamaguchi <k@xxxxxx>
> Reported-by: Niklas Cassel <niklas.cassel@xxxxxxxx>
> Tested-by: Niklas Cassel <niklas.cassel@xxxxxxxx>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> Acked-by: Tejun Heo <tj@xxxxxxxxxx>
> Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Fixes: 2e91fa7f6d45 ("cgroup: keep zombies associated with their original cgroups")
> Link: http://lkml.kernel.org/r/20160316152245.GY6344@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
> Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
> ---
> kernel/sched/core.c | 35 ++++++++++++++---------------------
> 1 file changed, 14 insertions(+), 21 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 4d872e1..2a87bdd 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7536,7 +7536,7 @@ void set_curr_task(int cpu, struct task_struct *p)
> /* task_group_lock serializes the addition/removal of task groups */
> static DEFINE_SPINLOCK(task_group_lock);
>
> -static void free_sched_group(struct task_group *tg)
> +static void sched_free_group(struct task_group *tg)
> {
> free_fair_sched_group(tg);
> free_rt_sched_group(tg);
> @@ -7562,7 +7562,7 @@ struct task_group *sched_create_group(struct task_group *parent)
> return tg;
>
> err:
> - free_sched_group(tg);
> + sched_free_group(tg);
> return ERR_PTR(-ENOMEM);
> }
>
> @@ -7582,17 +7582,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
> }
>
> /* rcu callback to free various structures associated with a task group */
> -static void free_sched_group_rcu(struct rcu_head *rhp)
> +static void sched_free_group_rcu(struct rcu_head *rhp)
> {
> /* now it should be safe to free those cfs_rqs */
> - free_sched_group(container_of(rhp, struct task_group, rcu));
> + sched_free_group(container_of(rhp, struct task_group, rcu));
> }
>
> -/* Destroy runqueue etc associated with a task group */
> void sched_destroy_group(struct task_group *tg)
> {
> /* wait for possible concurrent references to cfs_rqs complete */
> - call_rcu(&tg->rcu, free_sched_group_rcu);
> + call_rcu(&tg->rcu, sched_free_group_rcu);
> }
>
> void sched_offline_group(struct task_group *tg)
> @@ -8051,31 +8050,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> if (IS_ERR(tg))
> return ERR_PTR(-ENOMEM);
>
> + sched_online_group(tg, parent);
> +
> return &tg->css;
> }
>
> -static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
> +static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
> {
> struct task_group *tg = css_tg(css);
> - struct task_group *parent = css_tg(css->parent);
>
> - if (parent)
> - sched_online_group(tg, parent);
> - return 0;
> + sched_offline_group(tg);
> }
>
> static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
> {
> struct task_group *tg = css_tg(css);
>
> - sched_destroy_group(tg);
> -}
> -
> -static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
> -{
> - struct task_group *tg = css_tg(css);
> -
> - sched_offline_group(tg);
> + /*
> + * Relies on the RCU grace period between css_released() and this.
> + */
> + sched_free_group(tg);
> }
>
> static void cpu_cgroup_fork(struct task_struct *task)
> @@ -8435,9 +8429,8 @@ static struct cftype cpu_files[] = {
>
> struct cgroup_subsys cpu_cgrp_subsys = {
> .css_alloc = cpu_cgroup_css_alloc,
> + .css_released = cpu_cgroup_css_released,
> .css_free = cpu_cgroup_css_free,
> - .css_online = cpu_cgroup_css_online,
> - .css_offline = cpu_cgroup_css_offline,
> .fork = cpu_cgroup_fork,
> .can_attach = cpu_cgroup_can_attach,
> .attach = cpu_cgroup_attach,