Re: [PATCH v4 1/4] mm: memcg: properly name and document unified stats flushing

From: Michal Hocko
Date: Mon Sep 04 2023 - 10:44:48 EST


On Thu 31-08-23 16:56:08, Yosry Ahmed wrote:
> Most contexts that flush memcg stats use "unified" flushing, where
> basically all flushers attempt to flush the entire hierarchy, but only
> one flusher is allowed at a time, others skip flushing.
>
> This is needed because we need to flush the stats from paths such as
> reclaim or refaults, which may have high concurrency, especially on
> large systems. Serializing such performance-sensitive paths can
> introduce regressions, hence, unified flushing offers a tradeoff between
> stats staleness and the performance impact of flushing stats.
>
> Document this properly and explicitly by renaming the common flushing
> helper from do_flush_stats() to do_unified_stats_flush(), and adding
> documentation to describe unified flushing. Additionally, rename
> flushing APIs to add "try" in the name, which implies that flushing will
> not always happen. Also add proper documentation.
>
> No functional change intended.
>
> Signed-off-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx>

No objections to renaming but it would be really nice to simplify this.
It is just "funny" to see 4 different flushing methods (flush from
userspace, flush, try_flush_with_ratelimit_1 and try_flush_with_ratelimit_2).
This is all internal so I am not all that worried that this would get
confused but it surely is rather convoluted.

Acked-by: Michal Hocko <mhocko@xxxxxxxx>

> ---
> include/linux/memcontrol.h | 8 ++---
> mm/memcontrol.c | 61 +++++++++++++++++++++++++-------------
> mm/vmscan.c | 2 +-
> mm/workingset.c | 4 +--
> 4 files changed, 47 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 11810a2cfd2d..d517b0cc5221 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -1034,8 +1034,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> return x;
> }
>
> -void mem_cgroup_flush_stats(void);
> -void mem_cgroup_flush_stats_ratelimited(void);
> +void mem_cgroup_try_flush_stats(void);
> +void mem_cgroup_try_flush_stats_ratelimited(void);
>
> void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
> int val);
> @@ -1519,11 +1519,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> return node_page_state(lruvec_pgdat(lruvec), idx);
> }
>
> -static inline void mem_cgroup_flush_stats(void)
> +static inline void mem_cgroup_try_flush_stats(void)
> {
> }
>
> -static inline void mem_cgroup_flush_stats_ratelimited(void)
> +static inline void mem_cgroup_try_flush_stats_ratelimited(void)
> {
> }
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index cf57fe9318d5..2d0ec828a1c4 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -588,7 +588,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
> static void flush_memcg_stats_dwork(struct work_struct *w);
> static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
> static DEFINE_PER_CPU(unsigned int, stats_updates);
> -static atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
> +static atomic_t stats_unified_flush_ongoing = ATOMIC_INIT(0);
> static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
> static u64 flush_next_time;
>
> @@ -630,7 +630,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
> /*
> * If stats_flush_threshold exceeds the threshold
> * (>num_online_cpus()), cgroup stats update will be triggered
> - * in __mem_cgroup_flush_stats(). Increasing this var further
> + * in mem_cgroup_try_flush_stats(). Increasing this var further
> * is redundant and simply adds overhead in atomic update.
> */
> if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
> @@ -639,15 +639,19 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
> }
> }
>
> -static void do_flush_stats(void)
> +/*
> + * do_unified_stats_flush - do a unified flush of memory cgroup statistics
> + *
> + * A unified flush tries to flush the entire hierarchy, but skips if there is
> + * another ongoing flush. This is meant for flushers that may have a lot of
> + * concurrency (e.g. reclaim, refault, etc), and should not be serialized to
> + * avoid slowing down performance-sensitive paths. A unified flush may skip, and
> + * hence may yield stale stats.
> + */
> +static void do_unified_stats_flush(void)
> {
> - /*
> - * We always flush the entire tree, so concurrent flushers can just
> - * skip. This avoids a thundering herd problem on the rstat global lock
> - * from memcg flushers (e.g. reclaim, refault, etc).
> - */
> - if (atomic_read(&stats_flush_ongoing) ||
> - atomic_xchg(&stats_flush_ongoing, 1))
> + if (atomic_read(&stats_unified_flush_ongoing) ||
> + atomic_xchg(&stats_unified_flush_ongoing, 1))
> return;
>
> WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
> @@ -655,19 +659,34 @@ static void do_flush_stats(void)
> cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
>
> atomic_set(&stats_flush_threshold, 0);
> - atomic_set(&stats_flush_ongoing, 0);
> + atomic_set(&stats_unified_flush_ongoing, 0);
> }
>
> -void mem_cgroup_flush_stats(void)
> +/*
> + * mem_cgroup_try_flush_stats - try to flush memory cgroup statistics
> + *
> + * Try to flush the stats of all memcgs that have stat updates since the last
> + * flush. We do not flush the stats if:
> + * - The magnitude of the pending updates is below a certain threshold.
> + * - There is another ongoing unified flush (see do_unified_stats_flush()).
> + *
> + * Hence, the stats may be stale, but ideally by less than FLUSH_TIME due to
> + * periodic flushing.
> + */
> +void mem_cgroup_try_flush_stats(void)
> {
> if (atomic_read(&stats_flush_threshold) > num_online_cpus())
> - do_flush_stats();
> + do_unified_stats_flush();
> }
>
> -void mem_cgroup_flush_stats_ratelimited(void)
> +/*
> + * Like mem_cgroup_try_flush_stats(), but only flushes if the periodic flusher
> + * is late.
> + */
> +void mem_cgroup_try_flush_stats_ratelimited(void)
> {
> if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
> - mem_cgroup_flush_stats();
> + mem_cgroup_try_flush_stats();
> }
>
> static void flush_memcg_stats_dwork(struct work_struct *w)
> @@ -676,7 +695,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
> * Always flush here so that flushing in latency-sensitive paths is
> * as cheap as possible.
> */
> - do_flush_stats();
> + do_unified_stats_flush();
> queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
> }
>
> @@ -1576,7 +1595,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
> *
> * Current memory state:
> */
> - mem_cgroup_flush_stats();
> + mem_cgroup_try_flush_stats();
>
> for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
> u64 size;
> @@ -4018,7 +4037,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
> int nid;
> struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
>
> - mem_cgroup_flush_stats();
> + mem_cgroup_try_flush_stats();
>
> for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
> seq_printf(m, "%s=%lu", stat->name,
> @@ -4093,7 +4112,7 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
>
> BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
>
> - mem_cgroup_flush_stats();
> + mem_cgroup_try_flush_stats();
>
> for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
> unsigned long nr;
> @@ -4595,7 +4614,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
> struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
> struct mem_cgroup *parent;
>
> - mem_cgroup_flush_stats();
> + mem_cgroup_try_flush_stats();
>
> *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
> *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
> @@ -6610,7 +6629,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
> int i;
> struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
>
> - mem_cgroup_flush_stats();
> + mem_cgroup_try_flush_stats();
>
> for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
> int nid;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index c7c149cb8d66..457a18921fda 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2923,7 +2923,7 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
> * Flush the memory cgroup stats, so that we read accurate per-memcg
> * lruvec stats for heuristics.
> */
> - mem_cgroup_flush_stats();
> + mem_cgroup_try_flush_stats();
>
> /*
> * Determine the scan balance between anon and file LRUs.
> diff --git a/mm/workingset.c b/mm/workingset.c
> index da58a26d0d4d..affb8699e58d 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -520,7 +520,7 @@ void workingset_refault(struct folio *folio, void *shadow)
> }
>
> /* Flush stats (and potentially sleep) before holding RCU read lock */
> - mem_cgroup_flush_stats_ratelimited();
> + mem_cgroup_try_flush_stats_ratelimited();
>
> rcu_read_lock();
>
> @@ -664,7 +664,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
> struct lruvec *lruvec;
> int i;
>
> - mem_cgroup_flush_stats();
> + mem_cgroup_try_flush_stats();
> lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
> for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
> pages += lruvec_page_state_local(lruvec,
> --
> 2.42.0.rc2.253.gd59a3bf2b4-goog

--
Michal Hocko
SUSE Labs