Re: [PATCH v3 4/8] memcg: reduce memory for the lruvec and memcg stats

From: T.J. Mercier
Date: Tue Apr 30 2024 - 13:31:11 EST


On Mon, Apr 29, 2024 at 11:06 PM Shakeel Butt <shakeel.butt@xxxxxxxxx> wrote:
>
> At the moment, the amount of memory allocated for stats related structs
> in the mem_cgroup corresponds to the size of enum node_stat_item.
> However not all fields in enum node_stat_item has corresponding memcg

typo: "have corresponding"

> stats. So, let's use indirection mechanism similar to the one used for
> memcg vmstats management.
>
> For a given x86_64 config, the size of stats with and without patch is:
>
> structs (size in bytes)          w/o    with
>
> struct lruvec_stats              1128     648
> struct lruvec_stats_percpu        752     432
> struct memcg_vmstats             1832    1352
> struct memcg_vmstats_percpu      1280     960
>
> The memory savings is further compounded by the fact that these structs
> are allocated for each cpu and for each node. To be precise, for each
> memcg the memory saved would be:
>
> Memory saved = ((21 * 3 * NR_NODES) + (21 * 2 * NR_NODS * NR_CPUS) +

typo: "NR_NODES"

> (21 * 3) + (21 * 2 * NR_CPUS)) * sizeof(long)
>
> Where 21 is the number of fields eliminated.
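
Just to put rough numbers on that (picking NR_NODES = 2, NR_CPUS = 64
and sizeof(long) == 8 as an example machine), that works out to:

	((21 * 3 * 2) + (21 * 2 * 2 * 64) + (21 * 3) + (21 * 2 * 64)) * 8
	= (126 + 5376 + 63 + 2688) * 8
	= 66024 bytes (~64 KiB) saved per memcg
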
>
> Signed-off-by: Shakeel Butt <shakeel.butt@xxxxxxxxx>
> ---
>
> Changes since v2:
> - N/A
>
> mm/memcontrol.c | 138 ++++++++++++++++++++++++++++++++++++++++--------
> 1 file changed, 115 insertions(+), 23 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 434cff91b65e..f424c5b2ba9b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -576,35 +576,105 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
> return mz;
> }
>
> +/* Subset of node_stat_item for memcg stats */
> +static const unsigned int memcg_node_stat_items[] = {
> + NR_INACTIVE_ANON,
> + NR_ACTIVE_ANON,
> + NR_INACTIVE_FILE,
> + NR_ACTIVE_FILE,
> + NR_UNEVICTABLE,
> + NR_SLAB_RECLAIMABLE_B,
> + NR_SLAB_UNRECLAIMABLE_B,
> + WORKINGSET_REFAULT_ANON,
> + WORKINGSET_REFAULT_FILE,
> + WORKINGSET_ACTIVATE_ANON,
> + WORKINGSET_ACTIVATE_FILE,
> + WORKINGSET_RESTORE_ANON,
> + WORKINGSET_RESTORE_FILE,
> + WORKINGSET_NODERECLAIM,
> + NR_ANON_MAPPED,
> + NR_FILE_MAPPED,
> + NR_FILE_PAGES,
> + NR_FILE_DIRTY,
> + NR_WRITEBACK,
> + NR_SHMEM,
> + NR_SHMEM_THPS,
> + NR_FILE_THPS,
> + NR_ANON_THPS,
> + NR_KERNEL_STACK_KB,
> + NR_PAGETABLE,
> + NR_SECONDARY_PAGETABLE,
> +#ifdef CONFIG_SWAP
> + NR_SWAPCACHE,
> +#endif
> +};
> +
> +static const unsigned int memcg_stat_items[] = {
> + MEMCG_SWAP,
> + MEMCG_SOCK,
> + MEMCG_PERCPU_B,
> + MEMCG_VMALLOC,
> + MEMCG_KMEM,
> + MEMCG_ZSWAP_B,
> + MEMCG_ZSWAPPED,
> +};
> +
> +#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
> +#define NR_MEMCG_STATS (NR_MEMCG_NODE_STAT_ITEMS + ARRAY_SIZE(memcg_stat_items))
> +static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
> +
> +static void init_memcg_stats(void)
> +{
> + int8_t i, j = 0;
> +
> + /* Switch to short once this failure occurs. */
> + BUILD_BUG_ON(NR_MEMCG_STATS >= 127 /* INT8_MAX */);
> +
> + for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i)
> + mem_cgroup_stats_index[memcg_node_stat_items[i]] = ++j;
> +
> + for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i)
> + mem_cgroup_stats_index[memcg_stat_items[i]] = ++j;
> +}
> +
> +static inline int memcg_stats_index(int idx)
> +{
> + return mem_cgroup_stats_index[idx] - 1;

Could this just be:

	return mem_cgroup_stats_index[idx];

with a postfix increment of j in init_memcg_stats instead of the prefix
increment?
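
Untested sketch of what I mean, reusing the names from this patch; the
only extra piece is initializing the unused slots to -1 (e.g. with a
memset) so that items without a memcg counterpart still map to an
invalid index once 0 becomes a valid one:

static void init_memcg_stats(void)
{
	int8_t i, j = 0;

	/* Switch to short once this failure occurs. */
	BUILD_BUG_ON(NR_MEMCG_STATS >= 127 /* INT8_MAX */);

	/* Unused slots must not alias index 0 once 0 is a valid index. */
	memset(mem_cgroup_stats_index, -1, sizeof(mem_cgroup_stats_index));

	for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i)
		mem_cgroup_stats_index[memcg_node_stat_items[i]] = j++;

	for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i)
		mem_cgroup_stats_index[memcg_stat_items[i]] = j++;
}

static inline int memcg_stats_index(int idx)
{
	return mem_cgroup_stats_index[idx];
}

That keeps the i < 0 checks in the callers working as-is. Just a
thought, not a blocker.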


> +}
> +
> struct lruvec_stats_percpu {
> /* Local (CPU and cgroup) state */
> - long state[NR_VM_NODE_STAT_ITEMS];
> + long state[NR_MEMCG_NODE_STAT_ITEMS];
>
> /* Delta calculation for lockless upward propagation */
> - long state_prev[NR_VM_NODE_STAT_ITEMS];
> + long state_prev[NR_MEMCG_NODE_STAT_ITEMS];
> };
>
> struct lruvec_stats {
> /* Aggregated (CPU and subtree) state */
> - long state[NR_VM_NODE_STAT_ITEMS];
> + long state[NR_MEMCG_NODE_STAT_ITEMS];
>
> /* Non-hierarchical (CPU aggregated) state */
> - long state_local[NR_VM_NODE_STAT_ITEMS];
> + long state_local[NR_MEMCG_NODE_STAT_ITEMS];
>
> /* Pending child counts during tree propagation */
> - long state_pending[NR_VM_NODE_STAT_ITEMS];
> + long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
> };
>
> unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
> {
> struct mem_cgroup_per_node *pn;
> - long x;
> + long x = 0;
> + int i;
>
> if (mem_cgroup_disabled())
> return node_page_state(lruvec_pgdat(lruvec), idx);
>
> - pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> - x = READ_ONCE(pn->lruvec_stats->state[idx]);
> + i = memcg_stats_index(idx);
> + if (i >= 0) {
> + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> + x = READ_ONCE(pn->lruvec_stats->state[i]);
> + }
> #ifdef CONFIG_SMP
> if (x < 0)
> x = 0;
> @@ -617,12 +687,16 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> {
> struct mem_cgroup_per_node *pn;
> long x = 0;
> + int i;
>
> if (mem_cgroup_disabled())
> return node_page_state(lruvec_pgdat(lruvec), idx);
>
> - pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> - x = READ_ONCE(pn->lruvec_stats->state_local[idx]);
> + i = memcg_stats_index(idx);
> + if (i >= 0) {
> + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> + x = READ_ONCE(pn->lruvec_stats->state_local[i]);
> + }
> #ifdef CONFIG_SMP
> if (x < 0)
> x = 0;
> @@ -689,11 +763,11 @@ struct memcg_vmstats_percpu {
> /* The above should fit a single cacheline for memcg_rstat_updated() */
>
> /* Local (CPU and cgroup) page state & events */
> - long state[MEMCG_NR_STAT];
> + long state[NR_MEMCG_STATS];
> unsigned long events[NR_MEMCG_EVENTS];
>
> /* Delta calculation for lockless upward propagation */
> - long state_prev[MEMCG_NR_STAT];
> + long state_prev[NR_MEMCG_STATS];
> unsigned long events_prev[NR_MEMCG_EVENTS];
>
> /* Cgroup1: threshold notifications & softlimit tree updates */
> @@ -703,15 +777,15 @@ struct memcg_vmstats_percpu {
>
> struct memcg_vmstats {
> /* Aggregated (CPU and subtree) page state & events */
> - long state[MEMCG_NR_STAT];
> + long state[NR_MEMCG_STATS];
> unsigned long events[NR_MEMCG_EVENTS];
>
> /* Non-hierarchical (CPU aggregated) page state & events */
> - long state_local[MEMCG_NR_STAT];
> + long state_local[NR_MEMCG_STATS];
> unsigned long events_local[NR_MEMCG_EVENTS];
>
> /* Pending child counts during tree propagation */
> - long state_pending[MEMCG_NR_STAT];
> + long state_pending[NR_MEMCG_STATS];
> unsigned long events_pending[NR_MEMCG_EVENTS];
>
> /* Stats updates since the last flush */
> @@ -844,7 +918,13 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
>
> unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
> {
> - long x = READ_ONCE(memcg->vmstats->state[idx]);
> + long x;
> + int i = memcg_stats_index(idx);
> +
> + if (i < 0)
> + return 0;
> +
> + x = READ_ONCE(memcg->vmstats->state[i]);
> #ifdef CONFIG_SMP
> if (x < 0)
> x = 0;
> @@ -876,18 +956,25 @@ static int memcg_state_val_in_pages(int idx, int val)
> */
> void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
> {
> - if (mem_cgroup_disabled())
> + int i = memcg_stats_index(idx);
> +
> + if (mem_cgroup_disabled() || i < 0)
> return;
>
> - __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
> + __this_cpu_add(memcg->vmstats_percpu->state[i], val);
> memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
> }
>
> /* idx can be of type enum memcg_stat_item or node_stat_item. */
> static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
> {
> - long x = READ_ONCE(memcg->vmstats->state_local[idx]);
> + long x;
> + int i = memcg_stats_index(idx);
> +
> + if (i < 0)
> + return 0;
>
> + x = READ_ONCE(memcg->vmstats->state_local[i]);
> #ifdef CONFIG_SMP
> if (x < 0)
> x = 0;
> @@ -901,6 +988,10 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
> {
> struct mem_cgroup_per_node *pn;
> struct mem_cgroup *memcg;
> + int i = memcg_stats_index(idx);
> +
> + if (i < 0)
> + return;
>
> pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> memcg = pn->memcg;
> @@ -930,10 +1021,10 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
> }
>
> /* Update memcg */
> - __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
> + __this_cpu_add(memcg->vmstats_percpu->state[i], val);
>
> /* Update lruvec */
> - __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
> + __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
>
> memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
> memcg_stats_unlock();
> @@ -5702,6 +5793,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> page_counter_init(&memcg->kmem, &parent->kmem);
> page_counter_init(&memcg->tcpmem, &parent->tcpmem);
> } else {
> + init_memcg_stats();
> init_memcg_events();
> page_counter_init(&memcg->memory, NULL);
> page_counter_init(&memcg->swap, NULL);
> @@ -5873,7 +5965,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>
> statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
>
> - for (i = 0; i < MEMCG_NR_STAT; i++) {
> + for (i = 0; i < NR_MEMCG_STATS; i++) {
> /*
> * Collect the aggregated propagation counts of groups
> * below us. We're in a per-cpu loop here and this is
> @@ -5937,7 +6029,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>
> lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
>
> - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
> + for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; i++) {
> delta = lstats->state_pending[i];
> if (delta)
> lstats->state_pending[i] = 0;
> --
> 2.43.0
>