[PATCH 2/3] mm: memcontrol: Use cgroup_rstat for stat accounting
From: Tejun Heo
Date: Sat Mar 24 2018 - 12:10:06 EST
To fix a scalability issue, a983b5ebee57 ("mm: memcontrol: fix excessive
complexity in memory.stat reporting") made the per-cpu counters
batch-overflow into the global one instead of summing them up on
reads.
The approach didn't work for events and the previous patch switched event
accounting to cgroup_rstat. Unlike events, it works for stat
accounting but switching to cgroup_rstat has the following benefits
while keeping computational complexity low.
* More accurate accounting. The accumulated per-cpu errors with the
batch approach could add up and cause unintended results with
extreme configurations (e.g. balance_dirty_pages misbehavior with
very low dirty ratio in a cgroup with a low memory limit).
* Consistency with event accounting.
* Cheaper and simpler access to hierarchical stats.
This patch converts stat accounting to use cgroup_rstat.
* mem_cgroup_stat_cpu->last_count[] and mem_cgroup->pending_stat[] are
added to track propagation. As memcg makes use of both local and
hierarchical stats, mem_cgroup->tree_stat[] is added to track
hierarchical numbers.
* An rstat flush wrapper, memcg_stat_flush(), is added for memcg stat
consumers outside memcg proper.
* Accessors are updated / added.
Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxxxx>
Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx>
Cc: Roman Gushchin <guro@xxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---
include/linux/memcontrol.h | 74 +++++++++++++++++++++++++++++---------
mm/memcontrol.c | 90 +++++++++++++++++++++++++---------------------
mm/vmscan.c | 4 ++-
3 files changed, 110 insertions(+), 58 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f1afbf6..0cf6d5a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -93,6 +93,7 @@ struct mem_cgroup_stat_cpu {
unsigned long targets[MEM_CGROUP_NTARGETS];
/* for cgroup rstat delta calculation */
+ unsigned long last_count[MEMCG_NR_STAT];
unsigned long last_events[MEMCG_NR_EVENTS];
};
@@ -235,9 +236,12 @@ struct mem_cgroup {
unsigned long move_lock_flags;
struct mem_cgroup_stat_cpu __percpu *stat_cpu;
- atomic_long_t stat[MEMCG_NR_STAT];
- /* events is managed by cgroup rstat */
+ /* stat and events are managed by cgroup rstat */
+ long stat[MEMCG_NR_STAT]; /* local */
+ long tree_stat[MEMCG_NR_STAT]; /* subtree */
+ long pending_stat[MEMCG_NR_STAT]; /* propagation */
+
unsigned long long events[MEMCG_NR_EVENTS]; /* local */
unsigned long long tree_events[MEMCG_NR_EVENTS]; /* subtree */
unsigned long long pending_events[MEMCG_NR_EVENTS];/* propagation */
@@ -497,11 +501,32 @@ struct mem_cgroup *lock_page_memcg(struct page *page);
void __unlock_page_memcg(struct mem_cgroup *memcg);
void unlock_page_memcg(struct page *page);
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
- int idx)
+/**
+ * memcg_stat_flush - flush stat in a memcg's subtree
+ * @memcg: target memcg
+ *
+ * Flush cgroup_rstat statistics in @memcg's subtree. This brings @memcg's
+ * statistics up-to-date.
+ */
+static inline void memcg_stat_flush(struct mem_cgroup *memcg)
{
- long x = atomic_long_read(&memcg->stat[idx]);
+ if (!memcg)
+ memcg = root_mem_cgroup;
+ cgroup_rstat_flush(memcg->css.cgroup);
+}
+
+/**
+ * __memcg_page_state - read page state counter without bringing it up-to-date
+ * @memcg: target memcg
+ * @idx: page state item to read
+ *
+ * Read a memcg page state counter. @idx can be of type enum
+ * memcg_stat_item or node_stat_item. The caller must have flushed by
+ * calling memcg_stat_flush() to bring the counter up-to-date.
+ */
+static inline unsigned long __memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+ long x = READ_ONCE(memcg->stat[idx]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -509,21 +534,30 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
return x;
}
+/**
+ * memcg_page_state - read page state counter after bringing it up-to-date
+ * @memcg: target memcg
+ * @idx: page state item to read
+ *
+ * __memcg_page_state() with implied flushing. When reading multiple
+ * counters in sequence, flushing explicitly and using __memcg_page_state()
+ * is cheaper.
+ */
+static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+ memcg_stat_flush(memcg);
+ return __memcg_page_state(memcg, idx);
+}
+
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void __mod_memcg_state(struct mem_cgroup *memcg,
int idx, int val)
{
- long x;
-
if (mem_cgroup_disabled())
return;
- x = val + __this_cpu_read(memcg->stat_cpu->count[idx]);
- if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
- atomic_long_add(x, &memcg->stat[idx]);
- x = 0;
- }
- __this_cpu_write(memcg->stat_cpu->count[idx], x);
+ __this_cpu_add(memcg->stat_cpu->count[idx], val);
+ cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
}
/* idx can be of type enum memcg_stat_item or node_stat_item */
@@ -895,8 +929,16 @@ static inline bool mem_cgroup_oom_synchronize(bool wait)
return false;
}
-static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
- int idx)
+static inline void memcg_stat_flush(struct mem_cgroup *memcg)
+{
+}
+
+static inline unsigned long __memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+ return 0;
+}
+
+static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
return 0;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 82cb532..03d1b30 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -307,6 +307,16 @@ struct workqueue_struct *memcg_kmem_cache_wq;
#endif /* !CONFIG_SLOB */
+static unsigned long __memcg_tree_stat(struct mem_cgroup *memcg, int idx)
+{
+ long x = READ_ONCE(memcg->tree_stat[idx]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
+
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest
@@ -1150,6 +1160,8 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
K((u64)page_counter_read(&memcg->kmem)),
K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
+ memcg_stat_flush(memcg);
+
for_each_mem_cgroup_tree(iter, memcg) {
pr_info("Memory cgroup stats for ");
pr_cont_cgroup_path(iter->css.cgroup);
@@ -1159,7 +1171,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
continue;
pr_cont(" %s:%luKB", memcg1_stat_names[i],
- K(memcg_page_state(iter, memcg1_stats[i])));
+ K(__memcg_page_state(iter, memcg1_stats[i])));
}
for (i = 0; i < NR_LRU_LISTS; i++)
@@ -1812,17 +1824,10 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
for_each_mem_cgroup(memcg) {
int i;
- for (i = 0; i < MEMCG_NR_STAT; i++) {
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
int nid;
long x;
- x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
- if (x)
- atomic_long_add(x, &memcg->stat[i]);
-
- if (i >= NR_VM_NODE_STAT_ITEMS)
- continue;
-
for_each_node(nid) {
struct mem_cgroup_per_node *pn;
@@ -2656,32 +2661,16 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
return retval;
}
-static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
-{
- struct mem_cgroup *iter;
- int i;
-
- memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
-
- for_each_mem_cgroup_tree(iter, memcg) {
- for (i = 0; i < MEMCG_NR_STAT; i++)
- stat[i] += memcg_page_state(iter, i);
- }
-}
-
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
unsigned long val = 0;
if (mem_cgroup_is_root(memcg)) {
- struct mem_cgroup *iter;
-
- for_each_mem_cgroup_tree(iter, memcg) {
- val += memcg_page_state(iter, MEMCG_CACHE);
- val += memcg_page_state(iter, MEMCG_RSS);
- if (swap)
- val += memcg_page_state(iter, MEMCG_SWAP);
- }
+ memcg_stat_flush(memcg);
+ val += __memcg_tree_stat(memcg, MEMCG_CACHE);
+ val += __memcg_tree_stat(memcg, MEMCG_RSS);
+ if (swap)
+ val += __memcg_tree_stat(memcg, MEMCG_SWAP);
} else {
if (!swap)
val = page_counter_read(&memcg->memory);
@@ -3086,7 +3075,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
- memcg_page_state(memcg, memcg1_stats[i]) *
+ __memcg_page_state(memcg, memcg1_stats[i]) *
PAGE_SIZE);
}
@@ -3111,14 +3100,11 @@ static int memcg_stat_show(struct seq_file *m, void *v)
(u64)memsw * PAGE_SIZE);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
- unsigned long long val = 0;
-
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
- for_each_mem_cgroup_tree(mi, memcg)
- val += memcg_page_state(mi, memcg1_stats[i]) *
- PAGE_SIZE;
- seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
+ seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
+ (u64)__memcg_tree_stat(memcg, memcg1_stats[i]) *
+ PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
@@ -3592,10 +3578,16 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+ /*
+ * This function is called under a spinlock. Use the irq-safe
+ * version instead of memcg_stat_flush().
+ */
+ cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
+
+ *pdirty = __memcg_page_state(memcg, NR_FILE_DIRTY);
/* this should eventually include NR_UNSTABLE_NFS */
- *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
+ *pwriteback = __memcg_page_state(memcg, NR_WRITEBACK);
*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
(1 << LRU_ACTIVE_FILE));
*pheadroom = PAGE_COUNTER_MAX;
@@ -4310,6 +4302,23 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
unsigned long v, delta;
int i;
+ for (i = 0; i < MEMCG_NR_STAT; i++) {
+ /* calculate the delta to propagate and add to local stat */
+ v = READ_ONCE(statc->count[i]);
+ delta = v - statc->last_count[i];
+ statc->last_count[i] = v;
+ memcg->stat[i] += delta;
+
+ /* transfer the pending stat into delta */
+ delta += memcg->pending_stat[i];
+ memcg->pending_stat[i] = 0;
+
+ /* propagate delta into tree stat and parent's pending */
+ memcg->tree_stat[i] += delta;
+ if (parent)
+ parent->pending_stat[i] += delta;
+ }
+
for (i = 0; i < MEMCG_NR_EVENTS; i++) {
/* calculate the delta to propagate and add to local stat */
v = READ_ONCE(statc->events[i]);
@@ -5207,7 +5216,7 @@ static int memory_events_show(struct seq_file *m, void *v)
static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long stat[MEMCG_NR_STAT];
+ unsigned long *stat = memcg->tree_stat;
unsigned long long *events = memcg->tree_events;
int i;
@@ -5222,7 +5231,6 @@ static int memory_stat_show(struct seq_file *m, void *v)
* Current memory state:
*/
- tree_stat(memcg, stat);
cgroup_rstat_flush_hold(memcg->css.cgroup);
seq_printf(m, "anon %llu\n",
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bee5349..29bf99f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2738,13 +2738,15 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
{
struct mem_cgroup *memcg;
+ memcg_stat_flush(root_memcg);
+
memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
do {
unsigned long refaults;
struct lruvec *lruvec;
if (memcg)
- refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
+ refaults = __memcg_page_state(memcg, WORKINGSET_ACTIVATE);
else
refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
--
2.9.5