Re: 3.13-rc breaks MEMCG_SWAP

From: Michal Hocko
Date: Mon Dec 16 2013 - 11:20:49 EST


On Mon 16-12-13 00:36:05, Hugh Dickins wrote:
[...]

OK, I went through the patch and it looks good, except for the suspicious
ctrl->lock handling in swap_cgroup_reassign (see below). I would just
suggest splitting it into 4 parts:

* swap_cgroup_mutex -> swap_cgroup_lock
* swapon cleanup
* drop irqsave when taking ctrl->lock
* mem_cgroup_reparent_swap

but I will leave the split up to you. Just make sure that the fix is a
separate patch, please.

[...]
> --- 3.13-rc4/mm/page_cgroup.c 2013-02-18 15:58:34.000000000 -0800
> +++ linux/mm/page_cgroup.c 2013-12-15 14:34:36.312485960 -0800
> @@ -322,7 +322,8 @@ void __meminit pgdat_page_cgroup_init(st
>
> #ifdef CONFIG_MEMCG_SWAP
>
> -static DEFINE_MUTEX(swap_cgroup_mutex);
> +static DEFINE_SPINLOCK(swap_cgroup_lock);
> +

This one is worth a separate patch IMO.

> struct swap_cgroup_ctrl {
> struct page **map;
> unsigned long length;
> @@ -353,14 +354,11 @@ struct swap_cgroup {
> /*
> * allocate buffer for swap_cgroup.
> */
> -static int swap_cgroup_prepare(int type)
> +static int swap_cgroup_prepare(struct swap_cgroup_ctrl *ctrl)
> {
> struct page *page;
> - struct swap_cgroup_ctrl *ctrl;
> unsigned long idx, max;
>
> - ctrl = &swap_cgroup_ctrl[type];
> -
> for (idx = 0; idx < ctrl->length; idx++) {
> page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> if (!page)

This, together with the swap_cgroup_swapon change, should go into a
separate cleanup patch.
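
If I read the swapon hunk at the bottom correctly, the ctrl is now built
up locally and only published into swap_cgroup_ctrl[] once it is fully
initialized. Roughly like this, I think - paraphrasing rather than quoting
the exact hunk, assuming swap_cgroup_prepare() still frees the pages it
managed to allocate when it fails, and with the nomem printk left out:

	int swap_cgroup_swapon(int type, unsigned long max_pages)
	{
		struct swap_cgroup_ctrl ctrl;

		if (!do_swap_account)
			return 0;

		ctrl.length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
		ctrl.map = vzalloc(ctrl.length * sizeof(struct page *));
		spin_lock_init(&ctrl.lock);
		if (!ctrl.map || swap_cgroup_prepare(&ctrl)) {
			vfree(ctrl.map);	/* vfree(NULL) is a no-op */
			return -ENOMEM;
		}

		/* publish only the fully initialized ctrl */
		spin_lock(&swap_cgroup_lock);
		swap_cgroup_ctrl[type] = ctrl;
		spin_unlock(&swap_cgroup_lock);
		return 0;
	}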

> @@ -407,18 +405,17 @@ unsigned short swap_cgroup_cmpxchg(swp_e
> {
> struct swap_cgroup_ctrl *ctrl;
> struct swap_cgroup *sc;
> - unsigned long flags;
> unsigned short retval;
>
> sc = lookup_swap_cgroup(ent, &ctrl);
>
> - spin_lock_irqsave(&ctrl->lock, flags);
> + spin_lock(&ctrl->lock);
> retval = sc->id;
> if (retval == old)
> sc->id = new;
> else
> retval = 0;
> - spin_unlock_irqrestore(&ctrl->lock, flags);
> + spin_unlock(&ctrl->lock);
> return retval;
> }
>
> @@ -435,14 +432,13 @@ unsigned short swap_cgroup_record(swp_en
> struct swap_cgroup_ctrl *ctrl;
> struct swap_cgroup *sc;
> unsigned short old;
> - unsigned long flags;
>
> sc = lookup_swap_cgroup(ent, &ctrl);
>
> - spin_lock_irqsave(&ctrl->lock, flags);
> + spin_lock(&ctrl->lock);
> old = sc->id;
> sc->id = id;
> - spin_unlock_irqrestore(&ctrl->lock, flags);
> + spin_unlock(&ctrl->lock);
>
> return old;
> }

I would prefer these two in a separate patch as well. I have no idea why
these were ever IRQ aware; nothing takes ctrl->lock from interrupt context
AFAICS, and e9e58a4ec3b10 is not very specific about it either...

> @@ -451,19 +447,60 @@ unsigned short swap_cgroup_record(swp_en
> * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
> * @ent: swap entry to be looked up.
> *
> - * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
> + * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
> */
> unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
> {
> return lookup_swap_cgroup(ent, NULL)->id;
> }
>
> +/**
> + * swap_cgroup_reassign - assign all old entries to new (before old is freed).
> + * @old: id of emptied memcg whose entries are now to be reassigned
> + * @new: id of parent memcg to which those entries are to be assigned
> + *
> + * Returns number of entries reassigned, for debugging or for statistics.
> + */
> +long swap_cgroup_reassign(unsigned short old, unsigned short new)
> +{
> + long reassigned = 0;
> + int type;
> +
> + for (type = 0; type < MAX_SWAPFILES; type++) {
> + struct swap_cgroup_ctrl *ctrl = &swap_cgroup_ctrl[type];
> + unsigned long idx;
> +
> + for (idx = 0; idx < ACCESS_ONCE(ctrl->length); idx++) {
> + struct swap_cgroup *sc, *scend;
> +
> + spin_lock(&swap_cgroup_lock);
> + if (idx >= ACCESS_ONCE(ctrl->length))
> + goto unlock;
> + sc = page_address(ctrl->map[idx]);
> + for (scend = sc + SC_PER_PAGE; sc < scend; sc++) {
> + if (sc->id != old)
> + continue;

Is this safe? What prevents a race where a concurrent swap_cgroup_record()
sets sc->id to old right after this unlocked check, so that entry is then
missed?

> + spin_lock(&ctrl->lock);
> + if (sc->id == old) {

Also, it seems the compiler is free to optimize this re-test away, no?
You need ACCESS_ONCE here as well, I guess.
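
Something like this is what I have in mind - untested, and if spin_lock()
already acts as a sufficient compiler barrier here then the ACCESS_ONCE is
redundant, but it would at least document the intent:

	spin_lock(&ctrl->lock);
	if (ACCESS_ONCE(sc->id) == old) {	/* re-check under ctrl->lock */
		sc->id = new;
		reassigned++;
	}
	spin_unlock(&ctrl->lock);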

> + sc->id = new;
> + reassigned++;
> + }
> + spin_unlock(&ctrl->lock);
> + }
> +unlock:
> + spin_unlock(&swap_cgroup_lock);
> + cond_resched();
> + }
> + }
> + return reassigned;
> +}
> +
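
And I guess mem_cgroup_reparent_swap (part 4 above) then boils down to
calling this from the offline/reparenting path. Something like the
following is what I would expect - just my guess at the shape, not the
actual patch, and assuming mem_cgroup_id() is what ends up stored in the
swap records:

	static void mem_cgroup_reparent_swap(struct mem_cgroup *memcg)
	{
		struct mem_cgroup *parent = parent_mem_cgroup(memcg);

		if (!do_swap_account)
			return;
		if (!parent)
			parent = root_mem_cgroup;

		/* point all swap entries recorded for memcg at its parent */
		swap_cgroup_reassign(mem_cgroup_id(memcg),
				     mem_cgroup_id(parent));

		/* the memsw charges for those entries have to move as well */
	}
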
> int swap_cgroup_swapon(int type, unsigned long max_pages)
> {
> void *array;
> unsigned long array_size;
> unsigned long length;
> - struct swap_cgroup_ctrl *ctrl;
> + struct swap_cgroup_ctrl ctrl;
>
> if (!do_swap_account)
> return 0;
[...]
--
Michal Hocko
SUSE Labs