Re: [patch 06/10] mm: memcontrol: use root_mem_cgroup res_counter

From: Michal Hocko
Date: Tue Jun 03 2014 - 10:11:46 EST


On Thu 29-05-14 12:15:58, Johannes Weiner wrote:
> Due to an old optimization to keep expensive res_counter changes at a
> minimum, the root_mem_cgroup res_counter is never charged; there is no
> limit at that level anyway, and any statistics can be generated on
> demand by summing up the counters of all other cgroups.
>
> However, with per-cpu charge caches, res_counter operations do not
> even show up in profiles anymore, so this optimization is no longer
> necessary.
>
> Remove it to simplify the code.
>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>

OK, I like the resulting code much more.

Acked-by: Michal Hocko <mhocko@xxxxxxxx>

> ---
> mm/memcontrol.c | 149 ++++++++++++++++----------------------------------------
> 1 file changed, 43 insertions(+), 106 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 184e67cce4e4..84195a80068c 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2573,9 +2573,8 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
> unsigned long nr_reclaimed;
> unsigned long flags = 0;
> unsigned long long size;
> + int ret = 0;
>
> - if (mem_cgroup_is_root(memcg))
> - goto done;
> retry:
> if (consume_stock(memcg, nr_pages))
> goto done;
> @@ -2656,13 +2655,15 @@ nomem:
> if (!(gfp_mask & __GFP_NOFAIL))
> return -ENOMEM;
> bypass:
> - return -EINTR;
> + memcg = root_mem_cgroup;
> + ret = -EINTR;
> + goto retry;
>
> done_restock:
> if (batch > nr_pages)
> refill_stock(memcg, batch - nr_pages);
> done:
> - return 0;
> + return ret;
> }
>
> /**
> @@ -2702,13 +2703,11 @@ static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
> static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
> unsigned int nr_pages)
> {
> - if (!mem_cgroup_is_root(memcg)) {
> - unsigned long bytes = nr_pages * PAGE_SIZE;
> + unsigned long bytes = nr_pages * PAGE_SIZE;
>
> - res_counter_uncharge(&memcg->res, bytes);
> - if (do_swap_account)
> - res_counter_uncharge(&memcg->memsw, bytes);
> - }
> + res_counter_uncharge(&memcg->res, bytes);
> + if (do_swap_account)
> + res_counter_uncharge(&memcg->memsw, bytes);
> }
>
> /*
> @@ -2720,9 +2719,6 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
> {
> unsigned long bytes = nr_pages * PAGE_SIZE;
>
> - if (mem_cgroup_is_root(memcg))
> - return;
> -
> res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
> if (do_swap_account)
> res_counter_uncharge_until(&memcg->memsw,
> @@ -3997,7 +3993,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
> * replacement page, so leave it alone when phasing out the
> * page that is unused after the migration.
> */
> - if (!end_migration && !mem_cgroup_is_root(memcg))
> + if (!end_migration)
> mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
>
> return memcg;
> @@ -4130,8 +4126,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
> * We uncharge this because swap is freed. This memcg can
> * be obsolete one. We avoid calling css_tryget_online().
> */
> - if (!mem_cgroup_is_root(memcg))
> - res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
> + res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
> mem_cgroup_swap_statistics(memcg, false);
> css_put(&memcg->css);
> }
> @@ -4825,78 +4820,24 @@ out:
> return retval;
> }
>
> -
> -static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
> - enum mem_cgroup_stat_index idx)
> -{
> - struct mem_cgroup *iter;
> - long val = 0;
> -
> - /* Per-cpu values can be negative, use a signed accumulator */
> - for_each_mem_cgroup_tree(iter, memcg)
> - val += mem_cgroup_read_stat(iter, idx);
> -
> - if (val < 0) /* race ? */
> - val = 0;
> - return val;
> -}
> -
> -static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
> -{
> - u64 val;
> -
> - if (!mem_cgroup_is_root(memcg)) {
> - if (!swap)
> - return res_counter_read_u64(&memcg->res, RES_USAGE);
> - else
> - return res_counter_read_u64(&memcg->memsw, RES_USAGE);
> - }
> -
> - /*
> - * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
> - * as well as in MEM_CGROUP_STAT_RSS_HUGE.
> - */
> - val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
> - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
> -
> - if (swap)
> - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
> -
> - return val << PAGE_SHIFT;
> -}
> -
> static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
> - struct cftype *cft)
> + struct cftype *cft)
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> - u64 val;
> - int name;
> - enum res_type type;
> -
> - type = MEMFILE_TYPE(cft->private);
> - name = MEMFILE_ATTR(cft->private);
> + enum res_type type = MEMFILE_TYPE(cft->private);
> + int name = MEMFILE_ATTR(cft->private);
>
> switch (type) {
> case _MEM:
> - if (name == RES_USAGE)
> - val = mem_cgroup_usage(memcg, false);
> - else
> - val = res_counter_read_u64(&memcg->res, name);
> - break;
> + return res_counter_read_u64(&memcg->res, name);
> case _MEMSWAP:
> - if (name == RES_USAGE)
> - val = mem_cgroup_usage(memcg, true);
> - else
> - val = res_counter_read_u64(&memcg->memsw, name);
> - break;
> + return res_counter_read_u64(&memcg->memsw, name);
> case _KMEM:
> - val = res_counter_read_u64(&memcg->kmem, name);
> + return res_counter_read_u64(&memcg->kmem, name);
> break;
> default:
> BUG();
> }
> -
> - return val;
> }
>
> #ifdef CONFIG_MEMCG_KMEM
> @@ -5376,7 +5317,10 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
> if (!t)
> goto unlock;
>
> - usage = mem_cgroup_usage(memcg, swap);
> + if (!swap)
> + usage = res_counter_read_u64(&memcg->res, RES_USAGE);
> + else
> + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
>
> /*
> * current_threshold points to threshold just below or equal to usage.
> @@ -5468,15 +5412,15 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
>
> mutex_lock(&memcg->thresholds_lock);
>
> - if (type == _MEM)
> + if (type == _MEM) {
> thresholds = &memcg->thresholds;
> - else if (type == _MEMSWAP)
> + usage = res_counter_read_u64(&memcg->res, RES_USAGE);
> + } else if (type == _MEMSWAP) {
> thresholds = &memcg->memsw_thresholds;
> - else
> + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
> + } else
> BUG();
>
> - usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
> -
> /* Check if a threshold crossed before adding a new one */
> if (thresholds->primary)
> __mem_cgroup_threshold(memcg, type == _MEMSWAP);
> @@ -5556,18 +5500,19 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> int i, j, size;
>
> mutex_lock(&memcg->thresholds_lock);
> - if (type == _MEM)
> +
> + if (type == _MEM) {
> thresholds = &memcg->thresholds;
> - else if (type == _MEMSWAP)
> + usage = res_counter_read_u64(&memcg->res, RES_USAGE);
> + } else if (type == _MEMSWAP) {
> thresholds = &memcg->memsw_thresholds;
> - else
> + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
> + } else
> BUG();
>
> if (!thresholds->primary)
> goto unlock;
>
> - usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
> -
> /* Check if a threshold crossed before removing */
> __mem_cgroup_threshold(memcg, type == _MEMSWAP);
>
> @@ -6329,9 +6274,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
> * core guarantees its existence.
> */
> } else {
> - res_counter_init(&memcg->res, NULL);
> - res_counter_init(&memcg->memsw, NULL);
> - res_counter_init(&memcg->kmem, NULL);
> + res_counter_init(&memcg->res, &root_mem_cgroup->res);
> + res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw);
> + res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
> /*
> * Deeper hierachy with use_hierarchy == false doesn't make
> * much sense so let cgroup subsystem know about this
> @@ -6449,11 +6394,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
> int batch_count = PRECHARGE_COUNT_AT_ONCE;
> struct mem_cgroup *memcg = mc.to;
>
> - if (mem_cgroup_is_root(memcg)) {
> - mc.precharge += count;
> - /* we don't need css_get for root */
> - return ret;
> - }
> /* try to charge at once */
> if (count > 1) {
> struct res_counter *dummy;
> @@ -6769,21 +6709,18 @@ static void __mem_cgroup_clear_mc(void)
> /* we must fixup refcnts and charges */
> if (mc.moved_swap) {
> /* uncharge swap account from the old cgroup */
> - if (!mem_cgroup_is_root(mc.from))
> - res_counter_uncharge(&mc.from->memsw,
> - PAGE_SIZE * mc.moved_swap);
> + res_counter_uncharge(&mc.from->memsw,
> + PAGE_SIZE * mc.moved_swap);
>
> for (i = 0; i < mc.moved_swap; i++)
> css_put(&mc.from->css);
>
> - if (!mem_cgroup_is_root(mc.to)) {
> - /*
> - * we charged both to->res and to->memsw, so we should
> - * uncharge to->res.
> - */
> - res_counter_uncharge(&mc.to->res,
> - PAGE_SIZE * mc.moved_swap);
> - }
> + /*
> + * we charged both to->res and to->memsw, so we should
> + * uncharge to->res.
> + */
> + res_counter_uncharge(&mc.to->res,
> + PAGE_SIZE * mc.moved_swap);
> /* we've already done css_get(mc.to) */
> mc.moved_swap = 0;
> }
> --
> 1.9.3
>

--
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/