Re: [PATCH] memcg: reduce memcg tree traversals for stats collection

From: Michal Hocko
Date: Thu Jul 26 2018 - 09:56:52 EST


On Tue 24-07-18 15:46:35, Shakeel Butt wrote:
> Currently cgroup-v1's memcg_stat_show traverses the memcg tree ~17 times
> to collect the stats while cgroup-v2's memory_stat_show traverses the
> memcg tree thrice. On a large machine, a couple thousand memcgs is very
> normal and if the churn is high and memcgs stick around during to
> several reasons, tens of thousands of nodes in memcg tree can exist.
> This patch has refactored and shared the stat collection code between
> cgroup-v1 and cgroup-v2 and has reduced the tree traversal to just one.
>
> I ran a simple benchmark which reads the root_mem_cgroup's stat file
> 1000 times in the presense of 2500 memcgs on cgroup-v1. The results are:
>
> Without the patch:
> $ time ./read-root-stat-1000-times
>
> real 0m1.663s
> user 0m0.000s
> sys 0m1.660s
>
> With the patch:
> $ time ./read-root-stat-1000-times
>
> real 0m0.468s
> user 0m0.000s
> sys 0m0.467s

The code is not the nicest one and accumulated_stats could really see
some comment explaining v1 vs. v2 differences but other than that this
makes sense. Especially with many zombies we can have.

> Signed-off-by: Shakeel Butt <shakeelb@xxxxxxxxxx>

Acked-by: Michal Hocko <mhocko@xxxxxxxx>

> ---
> mm/memcontrol.c | 150 +++++++++++++++++++++++-------------------------
> 1 file changed, 73 insertions(+), 77 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index a5869b9d5194..d90993ef1d7d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3122,29 +3122,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
> return retval;
> }
>
> -static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
> -{
> - struct mem_cgroup *iter;
> - int i;
> -
> - memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
> -
> - for_each_mem_cgroup_tree(iter, memcg) {
> - for (i = 0; i < MEMCG_NR_STAT; i++)
> - stat[i] += memcg_page_state(iter, i);
> - }
> -}
> +struct accumulated_stats {
> + unsigned long stat[MEMCG_NR_STAT];
> + unsigned long events[NR_VM_EVENT_ITEMS];
> + unsigned long lru_pages[NR_LRU_LISTS];
> + const unsigned int *stats_array;
> + const unsigned int *events_array;
> + int stats_size;
> + int events_size;
> +};
>
> -static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
> +static void accumulate_memcg_tree(struct mem_cgroup *memcg,
> + struct accumulated_stats *acc)
> {
> - struct mem_cgroup *iter;
> + struct mem_cgroup *mi;
> int i;
>
> - memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS);
> + for_each_mem_cgroup_tree(mi, memcg) {
> + for (i = 0; i < acc->stats_size; i++)
> + acc->stat[i] += memcg_page_state(mi,
> + acc->stats_array ? acc->stats_array[i] : i);
>
> - for_each_mem_cgroup_tree(iter, memcg) {
> - for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
> - events[i] += memcg_sum_events(iter, i);
> + for (i = 0; i < acc->events_size; i++)
> + acc->events[i] += memcg_sum_events(mi,
> + acc->events_array ? acc->events_array[i] : i);
> +
> + for (i = 0; i < NR_LRU_LISTS; i++)
> + acc->lru_pages[i] +=
> + mem_cgroup_nr_lru_pages(mi, BIT(i));
> }
> }
>
> @@ -3555,6 +3560,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
> unsigned long memory, memsw;
> struct mem_cgroup *mi;
> unsigned int i;
> + struct accumulated_stats acc;
>
> BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
> BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
> @@ -3587,32 +3593,27 @@ static int memcg_stat_show(struct seq_file *m, void *v)
> seq_printf(m, "hierarchical_memsw_limit %llu\n",
> (u64)memsw * PAGE_SIZE);
>
> - for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
> - unsigned long long val = 0;
> + memset(&acc, 0, sizeof(acc));
> + acc.stats_size = ARRAY_SIZE(memcg1_stats);
> + acc.stats_array = memcg1_stats;
> + acc.events_size = ARRAY_SIZE(memcg1_events);
> + acc.events_array = memcg1_events;
> + accumulate_memcg_tree(memcg, &acc);
>
> + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
> if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
> continue;
> - for_each_mem_cgroup_tree(mi, memcg)
> - val += memcg_page_state(mi, memcg1_stats[i]) *
> - PAGE_SIZE;
> - seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
> + seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
> + (u64)acc.stat[i] * PAGE_SIZE);
> }
>
> - for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) {
> - unsigned long long val = 0;
> -
> - for_each_mem_cgroup_tree(mi, memcg)
> - val += memcg_sum_events(mi, memcg1_events[i]);
> - seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val);
> - }
> -
> - for (i = 0; i < NR_LRU_LISTS; i++) {
> - unsigned long long val = 0;
> + for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
> + seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
> + (u64)acc.events[i]);
>
> - for_each_mem_cgroup_tree(mi, memcg)
> - val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
> - seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
> - }
> + for (i = 0; i < NR_LRU_LISTS; i++)
> + seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
> + (u64)acc.lru_pages[i] * PAGE_SIZE);
>
> #ifdef CONFIG_DEBUG_VM
> {
> @@ -5737,8 +5738,7 @@ static int memory_events_show(struct seq_file *m, void *v)
> static int memory_stat_show(struct seq_file *m, void *v)
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
> - unsigned long stat[MEMCG_NR_STAT];
> - unsigned long events[NR_VM_EVENT_ITEMS];
> + struct accumulated_stats acc;
> int i;
>
> /*
> @@ -5752,66 +5752,62 @@ static int memory_stat_show(struct seq_file *m, void *v)
> * Current memory state:
> */
>
> - tree_stat(memcg, stat);
> - tree_events(memcg, events);
> + memset(&acc, 0, sizeof(acc));
> + acc.stats_size = MEMCG_NR_STAT;
> + acc.events_size = NR_VM_EVENT_ITEMS;
> + accumulate_memcg_tree(memcg, &acc);
>
> seq_printf(m, "anon %llu\n",
> - (u64)stat[MEMCG_RSS] * PAGE_SIZE);
> + (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
> seq_printf(m, "file %llu\n",
> - (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
> + (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
> seq_printf(m, "kernel_stack %llu\n",
> - (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
> + (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
> seq_printf(m, "slab %llu\n",
> - (u64)(stat[NR_SLAB_RECLAIMABLE] +
> - stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
> + (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
> + acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
> seq_printf(m, "sock %llu\n",
> - (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
> + (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
>
> seq_printf(m, "shmem %llu\n",
> - (u64)stat[NR_SHMEM] * PAGE_SIZE);
> + (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
> seq_printf(m, "file_mapped %llu\n",
> - (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
> + (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
> seq_printf(m, "file_dirty %llu\n",
> - (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
> + (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
> seq_printf(m, "file_writeback %llu\n",
> - (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
> + (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
>
> - for (i = 0; i < NR_LRU_LISTS; i++) {
> - struct mem_cgroup *mi;
> - unsigned long val = 0;
> -
> - for_each_mem_cgroup_tree(mi, memcg)
> - val += mem_cgroup_nr_lru_pages(mi, BIT(i));
> - seq_printf(m, "%s %llu\n",
> - mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
> - }
> + for (i = 0; i < NR_LRU_LISTS; i++)
> + seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
> + (u64)acc.lru_pages[i] * PAGE_SIZE);
>
> seq_printf(m, "slab_reclaimable %llu\n",
> - (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
> + (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
> seq_printf(m, "slab_unreclaimable %llu\n",
> - (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
> + (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
>
> /* Accumulated memory events */
>
> - seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
> - seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
> + seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
> + seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
>
> - seq_printf(m, "pgrefill %lu\n", events[PGREFILL]);
> - seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
> - events[PGSCAN_DIRECT]);
> - seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
> - events[PGSTEAL_DIRECT]);
> - seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
> - seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
> - seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
> - seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
> + seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
> + seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
> + acc.events[PGSCAN_DIRECT]);
> + seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
> + acc.events[PGSTEAL_DIRECT]);
> + seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
> + seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
> + seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
> + seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
>
> seq_printf(m, "workingset_refault %lu\n",
> - stat[WORKINGSET_REFAULT]);
> + acc.stat[WORKINGSET_REFAULT]);
> seq_printf(m, "workingset_activate %lu\n",
> - stat[WORKINGSET_ACTIVATE]);
> + acc.stat[WORKINGSET_ACTIVATE]);
> seq_printf(m, "workingset_nodereclaim %lu\n",
> - stat[WORKINGSET_NODERECLAIM]);
> + acc.stat[WORKINGSET_NODERECLAIM]);
>
> return 0;
> }
> --
> 2.18.0.233.g985f88cf7e-goog

--
Michal Hocko
SUSE Labs