Re: [PATCH 2/2] memcg: relocate charge moving from ->attach to ->post_attach

From: Michal Hocko
Date: Mon Apr 25 2016 - 04:26:43 EST


On Thu 21-04-16 19:09:02, Tejun Heo wrote:
> Hello,
>
> So, this ended up a lot simpler than I originally expected. I tested
> it lightly and it seems to work fine. Petr, can you please test these
> two patches w/o the lru drain drop patch and see whether the problem
> is gone?
>
> Thanks.
> ------ 8< ------
> If charge moving is used, memcg performs relabeling of the affected
> pages from its ->attach callback, which is called under
> cgroup_threadgroup_rwsem and thus can't create new kthreads. This is
> fragile as various operations may depend on workqueues making forward
> progress, which in turn relies on the ability to create new kthreads.
>
> There's no reason to perform charge moving from ->attach, which is
> deep in the task migration path. Move it to ->post_attach, which is
> called after the actual migration is finished and
> cgroup_threadgroup_rwsem is dropped.
>
> * move_charge_struct->mm is added and ->can_attach is now responsible
> for pinning and recording the target mm. mem_cgroup_clear_mc() is
> updated accordingly. This also simplifies mem_cgroup_move_task().
>
> * mem_cgroup_move_task() is now called from ->post_attach instead of
> ->attach.
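
To restate the resulting mm ownership rule for future readers, here is
a condensed sketch of the flow (error paths and the mc.lock details are
omitted, see the diff below for the real thing):

	/* ->can_attach: runs under cgroup_threadgroup_rwsem, so it must
	 * not block on anything that needs new kthreads; it only pins
	 * the target mm and hands the reference over to "mc". */
	mc.mm = get_task_mm(p);

	/* ->post_attach: runs after the rwsem is dropped, so blocking
	 * on workqueues (lru_add_drain_all() and friends) is fine. */
	if (mc.to) {
		mem_cgroup_move_charge();	/* walks mc.mm */
		mem_cgroup_clear_mc();		/* mmput()s mc.mm */
	}

	/* ->cancel_attach: mem_cgroup_clear_mc() also runs here on the
	 * failure path and releases mc.mm the same way. */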

This looks much better than the previous quick&dirty workaround. Thanks
for taking an extra step!

Sorry for being so pushy, but shouldn't we at least document that
callbacks which run under cgroup_threadgroup_rwsem must never block
on WQ, directly or indirectly? Maybe even enforce that they are
non-blocking. This would be out of scope for this particular patch, of
course, but it would fit nicely into the series.
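
For illustration only, the documentation half could be as small as a
comment on the callbacks in include/linux/cgroup-defs.h, and the
enforcement half a lockdep-based check; both are untested sketches,
nothing more:

	/*
	 * ->can_attach() and ->attach() run with cgroup_threadgroup_rwsem
	 * held.  They must not wait, directly or indirectly, on workqueue
	 * flushes or anything else that needs a new kthread to make
	 * progress.  Blocking work belongs in ->post_attach(), which runs
	 * after the rwsem is dropped.
	 */

	/* hypothetical assertion, e.g. at the head of flush_work() */
	WARN_ON_ONCE(lockdep_is_held(&cgroup_threadgroup_rwsem.rw_sem));

That would of course only catch the direct waiters; the indirect ones
are the nasty part.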

> Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
> Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
> Cc: Michal Hocko <mhocko@xxxxxxxxxx>
> Debugged-by: Petr Mladek <pmladek@xxxxxxxx>
> Reported-by: Cyril Hrubis <chrubis@xxxxxxx>
> Reported-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> Fixes: 1ed1328792ff ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem")
> Cc: <stable@xxxxxxxxxxxxxxx> # 4.4+

Acked-by: Michal Hocko <mhocko@xxxxxxxx>

Thanks!

> ---
>  mm/memcontrol.c | 37 +++++++++++++++++++------------------
>  1 file changed, 19 insertions(+), 18 deletions(-)
>
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -207,6 +207,7 @@ static void mem_cgroup_oom_notify(struct
>  /* "mc" and its members are protected by cgroup_mutex */
>  static struct move_charge_struct {
>  	spinlock_t	  lock; /* for from, to */
> +	struct mm_struct  *mm;
>  	struct mem_cgroup *from;
>  	struct mem_cgroup *to;
>  	unsigned long flags;
> @@ -4667,6 +4668,8 @@ static void __mem_cgroup_clear_mc(void)
>
>  static void mem_cgroup_clear_mc(void)
>  {
> +	struct mm_struct *mm = mc.mm;
> +
>  	/*
>  	 * we must clear moving_task before waking up waiters at the end of
>  	 * task migration.
> @@ -4676,7 +4679,10 @@ static void mem_cgroup_clear_mc(void)
>  	spin_lock(&mc.lock);
>  	mc.from = NULL;
>  	mc.to = NULL;
> +	mc.mm = NULL;
>  	spin_unlock(&mc.lock);
> +
> +	mmput(mm);
>  }
>
>  static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
> @@ -4733,6 +4739,7 @@ static int mem_cgroup_can_attach(struct
>  		VM_BUG_ON(mc.moved_swap);
>
>  		spin_lock(&mc.lock);
> +		mc.mm = mm;
>  		mc.from = from;
>  		mc.to = memcg;
>  		mc.flags = move_flags;
> @@ -4742,8 +4749,9 @@ static int mem_cgroup_can_attach(struct
>  		ret = mem_cgroup_precharge_mc(mm);
>  		if (ret)
>  			mem_cgroup_clear_mc();
> +	} else {
> +		mmput(mm);
>  	}
> -	mmput(mm);
>  	return ret;
>  }
>
> @@ -4852,11 +4860,11 @@ put:			/* get_mctgt_type() gets the page
>  	return ret;
>  }
>
> -static void mem_cgroup_move_charge(struct mm_struct *mm)
> +static void mem_cgroup_move_charge(void)
>  {
>  	struct mm_walk mem_cgroup_move_charge_walk = {
>  		.pmd_entry = mem_cgroup_move_charge_pte_range,
> -		.mm = mm,
> +		.mm = mc.mm,
>  	};
>
>  	lru_add_drain_all();
> @@ -4868,7 +4876,7 @@ static void mem_cgroup_move_charge(struc
>  	atomic_inc(&mc.from->moving_account);
>  	synchronize_rcu();
>  retry:
> -	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
> +	if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
>  		/*
>  		 * Someone who are holding the mmap_sem might be waiting in
>  		 * waitq. So we cancel all extra charges, wake up all waiters,
> @@ -4885,23 +4893,16 @@ retry:
>  	 * additional charge, the page walk just aborts.
>  	 */
>  	walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
> -	up_read(&mm->mmap_sem);
> +	up_read(&mc.mm->mmap_sem);
>  	atomic_dec(&mc.from->moving_account);
>  }
>
> -static void mem_cgroup_move_task(struct cgroup_taskset *tset)
> +static void mem_cgroup_move_task(void)
>  {
> -	struct cgroup_subsys_state *css;
> -	struct task_struct *p = cgroup_taskset_first(tset, &css);
> -	struct mm_struct *mm = get_task_mm(p);
> -
> -	if (mm) {
> -		if (mc.to)
> -			mem_cgroup_move_charge(mm);
> -		mmput(mm);
> -	}
> -	if (mc.to)
> +	if (mc.to) {
> +		mem_cgroup_move_charge();
>  		mem_cgroup_clear_mc();
> +	}
>  }
>  #else	/* !CONFIG_MMU */
>  static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
> @@ -4911,7 +4912,7 @@ static int mem_cgroup_can_attach(struct
>  static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
>  {
>  }
> -static void mem_cgroup_move_task(struct cgroup_taskset *tset)
> +static void mem_cgroup_move_task(void)
>  {
>  }
>  #endif
> @@ -5195,7 +5196,7 @@ struct cgroup_subsys memory_cgrp_subsys
>  	.css_reset = mem_cgroup_css_reset,
>  	.can_attach = mem_cgroup_can_attach,
>  	.cancel_attach = mem_cgroup_cancel_attach,
> -	.attach = mem_cgroup_move_task,
> +	.post_attach = mem_cgroup_move_task,
>  	.bind = mem_cgroup_bind,
>  	.dfl_cftypes = memory_files,
>  	.legacy_cftypes = mem_cgroup_legacy_files,

--
Michal Hocko
SUSE Labs