Re: [PATCH RFC 1/2] memcg: use percpu_counter for statistics

From: Kamezawa Hiroyuki
Date: Thu Sep 11 2014 - 21:11:57 EST


(2014/09/12 0:41), Vladimir Davydov wrote:
> In the next patch I need a quick way to get a value of
> MEM_CGROUP_STAT_RSS. The current procedure (mem_cgroup_read_stat) is
> slow (iterates over all cpus) and may sleep (uses get/put_online_cpus),
> so it's a no-go.
>
> This patch converts memory cgroup statistics to use percpu_counter so
> that percpu_counter_read will do the trick.
>
> Signed-off-by: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx>


I have no strong objections, but you need a performance comparison to go with this.

I have always thought percpu_counter was messy to use as an "array" like this.
And I don't understand why you are starting with a fix for a future performance
problem before the new feature itself is merged.
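
To be concrete about the comparison I am asking for: percpu_counter_read() is
O(1) but only returns the batched, approximate value, while percpu_counter_sum()
still walks every CPU, much like today's mem_cgroup_read_stat(). A minimal,
illustrative sketch of the two read paths (the demo_* names are made up and are
not part of this patch):

/*
 * Illustrative only, not from the patch: the two read paths offered by
 * lib/percpu_counter.c and what each of them costs.
 */
#include <linux/percpu_counter.h>
#include <linux/gfp.h>

static struct percpu_counter demo_rss;	/* hypothetical counter */

static int demo_setup(void)
{
	/* per-CPU batches feed a central count that can lag behind */
	return percpu_counter_init(&demo_rss, 0, GFP_KERNEL);
}

static void demo_account(long nr_pages)
{
	/* cheap: usually touches only this CPU's batch */
	percpu_counter_add(&demo_rss, nr_pages);
}

static s64 demo_read_fast(void)
{
	/* O(1), but may be off by roughly num_online_cpus() * batch */
	return percpu_counter_read(&demo_rss);
}

static s64 demo_read_exact(void)
{
	/*
	 * O(nr_cpus) under fbc->lock, i.e. the same cost class as the
	 * current for_each_online_cpu() loop in mem_cgroup_read_stat().
	 */
	return percpu_counter_sum(&demo_rss);
}

Whether that error margin is acceptable for memory.stat is exactly what the
comparison should show.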

Thanks,
-Kame


> ---
> mm/memcontrol.c | 217 ++++++++++++++++++-------------------------------------
> 1 file changed, 69 insertions(+), 148 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 085dc6d2f876..7e8d65e0608a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -136,9 +136,7 @@ enum mem_cgroup_events_target {
> #define SOFTLIMIT_EVENTS_TARGET 1024
> #define NUMAINFO_EVENTS_TARGET 1024
>
> -struct mem_cgroup_stat_cpu {
> - long count[MEM_CGROUP_STAT_NSTATS];
> - unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
> +struct mem_cgroup_ratelimit_state {
> unsigned long nr_page_events;
> unsigned long targets[MEM_CGROUP_NTARGETS];
> };
> @@ -341,16 +339,10 @@ struct mem_cgroup {
> atomic_t moving_account;
> /* taken only while moving_account > 0 */
> spinlock_t move_lock;
> - /*
> - * percpu counter.
> - */
> - struct mem_cgroup_stat_cpu __percpu *stat;
> - /*
> - * used when a cpu is offlined or other synchronizations
> - * See mem_cgroup_read_stat().
> - */
> - struct mem_cgroup_stat_cpu nocpu_base;
> - spinlock_t pcp_counter_lock;
> +
> + struct percpu_counter stat[MEM_CGROUP_STAT_NSTATS];
> + struct percpu_counter events[MEM_CGROUP_EVENTS_NSTATS];
> + struct mem_cgroup_ratelimit_state __percpu *ratelimit;
>
> atomic_t dead_count;
> #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
> @@ -849,59 +841,16 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
> return mz;
> }
>
> -/*
> - * Implementation Note: reading percpu statistics for memcg.
> - *
> - * Both of vmstat[] and percpu_counter has threshold and do periodic
> - * synchronization to implement "quick" read. There are trade-off between
> - * reading cost and precision of value. Then, we may have a chance to implement
> - * a periodic synchronizion of counter in memcg's counter.
> - *
> - * But this _read() function is used for user interface now. The user accounts
> - * memory usage by memory cgroup and he _always_ requires exact value because
> - * he accounts memory. Even if we provide quick-and-fuzzy read, we always
> - * have to visit all online cpus and make sum. So, for now, unnecessary
> - * synchronization is not implemented. (just implemented for cpu hotplug)
> - *
> - * If there are kernel internal actions which can make use of some not-exact
> - * value, and reading all cpu value can be performance bottleneck in some
> - * common workload, threashold and synchonization as vmstat[] should be
> - * implemented.
> - */
> static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
> enum mem_cgroup_stat_index idx)
> {
> - long val = 0;
> - int cpu;
> -
> - get_online_cpus();
> - for_each_online_cpu(cpu)
> - val += per_cpu(memcg->stat->count[idx], cpu);
> -#ifdef CONFIG_HOTPLUG_CPU
> - spin_lock(&memcg->pcp_counter_lock);
> - val += memcg->nocpu_base.count[idx];
> - spin_unlock(&memcg->pcp_counter_lock);
> -#endif
> - put_online_cpus();
> - return val;
> + return percpu_counter_read(&memcg->stat[idx]);
> }
>
> static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
> enum mem_cgroup_events_index idx)
> {
> - unsigned long val = 0;
> - int cpu;
> -
> - get_online_cpus();
> - for_each_online_cpu(cpu)
> - val += per_cpu(memcg->stat->events[idx], cpu);
> -#ifdef CONFIG_HOTPLUG_CPU
> - spin_lock(&memcg->pcp_counter_lock);
> - val += memcg->nocpu_base.events[idx];
> - spin_unlock(&memcg->pcp_counter_lock);
> -#endif
> - put_online_cpus();
> - return val;
> + return percpu_counter_read(&memcg->events[idx]);
> }
>
> static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
> @@ -913,25 +862,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
> * counted as CACHE even if it's on ANON LRU.
> */
> if (PageAnon(page))
> - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
> + percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_RSS],
> nr_pages);
> else
> - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
> + percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_CACHE],
> nr_pages);
>
> if (PageTransHuge(page))
> - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
> + percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE],
> nr_pages);
>
> /* pagein of a big page is an event. So, ignore page size */
> if (nr_pages > 0)
> - __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
> - else {
> - __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
> - nr_pages = -nr_pages; /* for event */
> - }
> -
> - __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
> + percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGPGIN]);
> + else
> + percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGPGOUT]);
> }
>
> unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
> @@ -981,8 +926,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
> {
> unsigned long val, next;
>
> - val = __this_cpu_read(memcg->stat->nr_page_events);
> - next = __this_cpu_read(memcg->stat->targets[target]);
> + val = __this_cpu_read(memcg->ratelimit->nr_page_events);
> + next = __this_cpu_read(memcg->ratelimit->targets[target]);
> /* from time_after() in jiffies.h */
> if ((long)next - (long)val < 0) {
> switch (target) {
> @@ -998,7 +943,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
> default:
> break;
> }
> - __this_cpu_write(memcg->stat->targets[target], next);
> + __this_cpu_write(memcg->ratelimit->targets[target], next);
> return true;
> }
> return false;
> @@ -1006,10 +951,15 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
>
> /*
> * Check events in order.
> - *
> */
> -static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
> +static void memcg_check_events(struct mem_cgroup *memcg, struct page *page,
> + unsigned long nr_pages)
> {
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + __this_cpu_add(memcg->ratelimit->nr_page_events, nr_pages);
> +
> /* threshold event is triggered in finer grain than soft limit */
> if (unlikely(mem_cgroup_event_ratelimit(memcg,
> MEM_CGROUP_TARGET_THRESH))) {
> @@ -1030,6 +980,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
> atomic_inc(&memcg->numainfo_events);
> #endif
> }
> + local_irq_restore(flags);
> }
>
> struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
> @@ -1294,10 +1245,10 @@ void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
>
> switch (idx) {
> case PGFAULT:
> - this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
> + percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGFAULT]);
> break;
> case PGMAJFAULT:
> - this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
> + percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
> break;
> default:
> BUG();
> @@ -2306,7 +2257,7 @@ void mem_cgroup_update_page_stat(struct page *page,
> if (unlikely(!memcg || !PageCgroupUsed(pc)))
> return;
>
> - this_cpu_add(memcg->stat->count[idx], val);
> + percpu_counter_add(&memcg->stat[idx], val);
> }
>
> /*
> @@ -2476,37 +2427,12 @@ static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
> mutex_unlock(&percpu_charge_mutex);
> }
>
> -/*
> - * This function drains percpu counter value from DEAD cpu and
> - * move it to local cpu. Note that this function can be preempted.
> - */
> -static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
> -{
> - int i;
> -
> - spin_lock(&memcg->pcp_counter_lock);
> - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
> - long x = per_cpu(memcg->stat->count[i], cpu);
> -
> - per_cpu(memcg->stat->count[i], cpu) = 0;
> - memcg->nocpu_base.count[i] += x;
> - }
> - for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
> - unsigned long x = per_cpu(memcg->stat->events[i], cpu);
> -
> - per_cpu(memcg->stat->events[i], cpu) = 0;
> - memcg->nocpu_base.events[i] += x;
> - }
> - spin_unlock(&memcg->pcp_counter_lock);
> -}
> -
> static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
> unsigned long action,
> void *hcpu)
> {
> int cpu = (unsigned long)hcpu;
> struct memcg_stock_pcp *stock;
> - struct mem_cgroup *iter;
>
> if (action == CPU_ONLINE)
> return NOTIFY_OK;
> @@ -2514,9 +2440,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
> if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
> return NOTIFY_OK;
>
> - for_each_mem_cgroup(iter)
> - mem_cgroup_drain_pcp_counter(iter, cpu);
> -
> stock = &per_cpu(memcg_stock, cpu);
> drain_stock(stock);
> return NOTIFY_OK;
> @@ -3419,8 +3342,8 @@ void mem_cgroup_split_huge_fixup(struct page *head)
> pc->mem_cgroup = memcg;
> pc->flags = head_pc->flags;
> }
> - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
> - HPAGE_PMD_NR);
> + percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE],
> + HPAGE_PMD_NR);
> }
> #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>
> @@ -3475,17 +3398,17 @@ static int mem_cgroup_move_account(struct page *page,
> move_lock_mem_cgroup(from, &flags);
>
> if (!PageAnon(page) && page_mapped(page)) {
> - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
> - nr_pages);
> - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
> - nr_pages);
> + percpu_counter_sub(&from->stat[MEM_CGROUP_STAT_FILE_MAPPED],
> + nr_pages);
> + percpu_counter_add(&to->stat[MEM_CGROUP_STAT_FILE_MAPPED],
> + nr_pages);
> }
>
> if (PageWriteback(page)) {
> - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
> - nr_pages);
> - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
> - nr_pages);
> + percpu_counter_sub(&from->stat[MEM_CGROUP_STAT_WRITEBACK],
> + nr_pages);
> + percpu_counter_add(&to->stat[MEM_CGROUP_STAT_WRITEBACK],
> + nr_pages);
> }
>
> /*
> @@ -3499,12 +3422,10 @@ static int mem_cgroup_move_account(struct page *page,
> move_unlock_mem_cgroup(from, &flags);
> ret = 0;
>
> - local_irq_disable();
> mem_cgroup_charge_statistics(to, page, nr_pages);
> - memcg_check_events(to, page);
> + memcg_check_events(to, page, nr_pages);
> mem_cgroup_charge_statistics(from, page, -nr_pages);
> - memcg_check_events(from, page);
> - local_irq_enable();
> + memcg_check_events(from, page, nr_pages);
> out_unlock:
> unlock_page(page);
> out:
> @@ -3582,7 +3503,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
> bool charge)
> {
> int val = (charge) ? 1 : -1;
> - this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
> + percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_SWAP], val);
> }
>
> /**
> @@ -5413,25 +5334,11 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
>
> static struct mem_cgroup *mem_cgroup_alloc(void)
> {
> - struct mem_cgroup *memcg;
> size_t size;
>
> size = sizeof(struct mem_cgroup);
> size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
> -
> - memcg = kzalloc(size, GFP_KERNEL);
> - if (!memcg)
> - return NULL;
> -
> - memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
> - if (!memcg->stat)
> - goto out_free;
> - spin_lock_init(&memcg->pcp_counter_lock);
> - return memcg;
> -
> -out_free:
> - kfree(memcg);
> - return NULL;
> + return kzalloc(size, GFP_KERNEL);
> }
>
> /*
> @@ -5448,13 +5355,20 @@ out_free:
> static void __mem_cgroup_free(struct mem_cgroup *memcg)
> {
> int node;
> + int i;
>
> mem_cgroup_remove_from_trees(memcg);
>
> for_each_node(node)
> free_mem_cgroup_per_zone_info(memcg, node);
>
> - free_percpu(memcg->stat);
> + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++)
> + percpu_counter_destroy(&memcg->stat[i]);
> +
> + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
> + percpu_counter_destroy(&memcg->events[i]);
> +
> + free_percpu(memcg->ratelimit);
>
> /*
> * We need to make sure that (at least for now), the jump label
> @@ -5511,11 +5425,24 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> struct mem_cgroup *memcg;
> long error = -ENOMEM;
> int node;
> + int i;
>
> memcg = mem_cgroup_alloc();
> if (!memcg)
> return ERR_PTR(error);
>
> + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++)
> + if (percpu_counter_init(&memcg->stat[i], 0, GFP_KERNEL))
> + goto free_out;
> +
> + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
> + if (percpu_counter_init(&memcg->events[i], 0, GFP_KERNEL))
> + goto free_out;
> +
> + memcg->ratelimit = alloc_percpu(struct mem_cgroup_ratelimit_state);
> + if (!memcg->ratelimit)
> + goto free_out;
> +
> for_each_node(node)
> if (alloc_mem_cgroup_per_zone_info(memcg, node))
> goto free_out;
> @@ -6507,10 +6434,8 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
> VM_BUG_ON_PAGE(!PageTransHuge(page), page);
> }
>
> - local_irq_disable();
> mem_cgroup_charge_statistics(memcg, page, nr_pages);
> - memcg_check_events(memcg, page);
> - local_irq_enable();
> + memcg_check_events(memcg, page, nr_pages);
>
> if (do_swap_account && PageSwapCache(page)) {
> swp_entry_t entry = { .val = page_private(page) };
> @@ -6557,8 +6482,6 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
> unsigned long nr_anon, unsigned long nr_file,
> unsigned long nr_huge, struct page *dummy_page)
> {
> - unsigned long flags;
> -
> if (!mem_cgroup_is_root(memcg)) {
> if (nr_mem)
> res_counter_uncharge(&memcg->res,
> @@ -6569,14 +6492,12 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
> memcg_oom_recover(memcg);
> }
>
> - local_irq_save(flags);
> - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
> - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
> - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
> - __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
> - __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
> - memcg_check_events(memcg, dummy_page);
> - local_irq_restore(flags);
> + percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS], nr_anon);
> + percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_CACHE], nr_file);
> + percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
> + percpu_counter_add(&memcg->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
> +
> + memcg_check_events(memcg, dummy_page, nr_anon + nr_file);
> }
>
> static void uncharge_list(struct list_head *page_list)
>

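Coming back to the "array" point, now that the mem_cgroup_css_alloc() and
__mem_cgroup_free() hunks are visible above: one struct percpu_counter per
statistic means every allocation site has to be able to unwind a partially
initialised array, and each element brings its own spinlock and its own per-CPU
allocation. A rough sketch of that pattern (illustrative names only, not code
from the patch):

#include <linux/percpu_counter.h>
#include <linux/gfp.h>

#define NR_DEMO_STATS	5	/* stand-in for MEM_CGROUP_STAT_NSTATS */

static int demo_init_stats(struct percpu_counter *stats)
{
	int i, err;

	for (i = 0; i < NR_DEMO_STATS; i++) {
		err = percpu_counter_init(&stats[i], 0, GFP_KERNEL);
		if (err)
			goto undo;
	}
	return 0;
undo:
	/* unwind only the elements that were actually initialised */
	while (i--)
		percpu_counter_destroy(&stats[i]);
	return err;
}

static void demo_destroy_stats(struct percpu_counter *stats)
{
	int i;

	for (i = 0; i < NR_DEMO_STATS; i++)
		percpu_counter_destroy(&stats[i]);
}

That is the kind of boilerplate, plus NSTATS + NEVENTS separate per-CPU areas
instead of one, that I would like to see weighed against the single
struct mem_cgroup_stat_cpu we have today.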
