Re: [PATCH v3 10/12] mm/memcg, swap: store cgroup id in cluster table directly

From: Kairui Song

Date: Wed May 13 2026 - 07:54:02 EST


On Sat, May 9, 2026 at 6:46 AM Chris Li <chrisl@xxxxxxxxxx> wrote:
>
> On Tue, Apr 21, 2026 at 2:16 AM Kairui Song via B4 Relay
> <devnull+kasong.tencent.com@xxxxxxxxxx> wrote:
> >
> > From: Kairui Song <kasong@xxxxxxxxxxx>
> >
> > Drop the usage of the swap_cgroup_ctrl, and use the dynamic cluster
> > table instead.
>
> Nice! It takes so many steps to finally drop the static allocated swap
> cgroup ctrl array. Thank you for making it happen.
>
> >
> > The per-cluster memcg table is 1024 / 512 bytes on most archs, and does
> > not need RCU protection: the cgroup data is only read and written under
> > the cluster lock. That keeps things simple, lets the allocation use
> > plain kmalloc with immediate kfree (no deferred free), and keeps
> > fragmentation acceptable.
> >
> > Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
>
> Overall looks good, with some nitpick and question follows.
>
> Acked-by: Chris Li <chrisl@xxxxxxxxxx>
>
> > ---
> > include/linux/memcontrol.h | 6 ++++--
> > include/linux/swap.h | 8 +++----
> > mm/memcontrol-v1.c | 42 +++++++++++++++++++++++-------------
> > mm/memcontrol.c | 14 +++++++-----
> > mm/swap.h | 4 ++++
> > mm/swap_state.c | 6 ++----
> > mm/swap_table.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++
> > mm/swapfile.c | 35 +++++++++++++++++++-----------
> > mm/vmscan.c | 2 +-
> > 9 files changed, 128 insertions(+), 43 deletions(-)
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index a013f37f24aa..bf1a6e131eca 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -29,6 +29,7 @@ struct obj_cgroup;
> > struct page;
> > struct mm_struct;
> > struct kmem_cache;
> > +struct swap_cluster_info;
> >
> > /* Cgroup-specific page state, on top of universal node page state */
> > enum memcg_stat_item {
> > @@ -1899,7 +1900,7 @@ static inline void mem_cgroup_exit_user_fault(void)
> > current->in_user_fault = 0;
> > }
> >
> > -void __memcg1_swapout(struct folio *folio);
> > +void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci);
> > void memcg1_swapin(struct folio *folio);
> >
> > #else /* CONFIG_MEMCG_V1 */
> > @@ -1929,7 +1930,8 @@ static inline void mem_cgroup_exit_user_fault(void)
> > {
> > }
> >
> > -static inline void __memcg1_swapout(struct folio *folio)
> > +static inline void __memcg1_swapout(struct folio *folio,
> > + struct swap_cluster_info *ci)
> > {
> > }
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index f2949f5844a6..57af4647d432 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -582,12 +582,12 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio)
> > return __mem_cgroup_try_charge_swap(folio);
> > }
> >
> > -extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
> > -static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
> > +extern void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages);
> > +static inline void mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages)
> > {
> > if (mem_cgroup_disabled())
> > return;
> > - __mem_cgroup_uncharge_swap(entry, nr_pages);
> > + __mem_cgroup_uncharge_swap(id, nr_pages);
> > }
> >
> > extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
> > @@ -598,7 +598,7 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio)
> > return 0;
> > }
> >
> > -static inline void mem_cgroup_uncharge_swap(swp_entry_t entry,
> > +static inline void mem_cgroup_uncharge_swap(unsigned short id,
> > unsigned int nr_pages)
> > {
> > }
> > diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> > index 36c507d81dc5..494e7b9adc60 100644
> > --- a/mm/memcontrol-v1.c
> > +++ b/mm/memcontrol-v1.c
> > @@ -14,6 +14,7 @@
> >
> > #include "internal.h"
> > #include "swap.h"
> > +#include "swap_table.h"
> > #include "memcontrol-v1.h"
> >
> > /*
> > @@ -606,14 +607,15 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
> > /**
> > * __memcg1_swapout - transfer a memsw charge to swap
> > * @folio: folio whose memsw charge to transfer
> > + * @ci: the locked swap cluster holding the swap entries
> > *
> > * Transfer the memsw charge of @folio to the swap entry stored in
> > * folio->swap.
> > *
> > - * Context: folio must be isolated, unmapped, locked and is just about
> > - * to be freed, and caller must disable IRQs.
> > + * Context: folio must be isolated, unmapped, locked and is just about to
> > + * be freed, and caller must disable IRQs and hold the swap cluster lock.
> > */
> > -void __memcg1_swapout(struct folio *folio)
> > +void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci)
> > {
> > struct mem_cgroup *memcg, *swap_memcg;
> > struct obj_cgroup *objcg;
> > @@ -646,7 +648,8 @@ void __memcg1_swapout(struct folio *folio)
> > swap_memcg = mem_cgroup_private_id_get_online(memcg, nr_entries);
> > mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
> >
> > - swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), folio->swap);
> > + __swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_entries,
> > + mem_cgroup_private_id(swap_memcg));
> >
> > folio_unqueue_deferred_split(folio);
> > folio->memcg_data = 0;
> > @@ -661,8 +664,7 @@ void __memcg1_swapout(struct folio *folio)
> > }
> >
> > /*
> > - * Interrupts should be disabled here because the caller holds the
> > - * i_pages lock which is taken with interrupts-off. It is
> > + * The caller must hold the swap cluster lock with IRQ off. It is
> > * important here to have the interrupts disabled because it is the
> > * only synchronisation we have for updating the per-CPU variables.
> > */
> > @@ -677,7 +679,7 @@ void __memcg1_swapout(struct folio *folio)
> > }
> >
> > /**
> > - * memcg1_swapin - uncharge swap slot
> > + * memcg1_swapin - uncharge swap slot on swapin
> > * @folio: folio being swapped in
> > *
> > * Call this function after successfully adding the charged
> > @@ -687,6 +689,10 @@ void __memcg1_swapout(struct folio *folio)
> > */
> > void memcg1_swapin(struct folio *folio)
> > {
> > + struct swap_cluster_info *ci;
> > + unsigned long nr_pages;
> > + unsigned short id;
> > +
> > VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
> > VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
> >
> > @@ -702,14 +708,20 @@ void memcg1_swapin(struct folio *folio)
> > * correspond 1:1 to page and swap slot lifetimes: we charge the
> > * page to memory here, and uncharge swap when the slot is freed.
> > */
> > - if (do_memsw_account()) {
> > - /*
> > - * The swap entry might not get freed for a long time,
> > - * let's not wait for it. The page already received a
> > - * memory+swap charge, drop the swap entry duplicate.
> > - */
> > - mem_cgroup_uncharge_swap(folio->swap, folio_nr_pages(folio));
> > - }
> > + if (!do_memsw_account())
> > + return;
> > +
> > + /*
> > + * The swap entry might not get freed for a long time,
> > + * let's not wait for it. The page already received a
> > + * memory+swap charge, drop the swap entry duplicate.
> > + */
> > + nr_pages = folio_nr_pages(folio);
> > + ci = swap_cluster_get_and_lock(folio);
> > + id = __swap_cgroup_clear(ci, swp_cluster_offset(folio->swap),
> > + nr_pages);
> > + swap_cluster_unlock(ci);
> > + mem_cgroup_uncharge_swap(id, nr_pages);
> > }
> >
> > void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 641706fa47bf..193c8eb73be7 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -64,6 +64,8 @@
> > #include <linux/sched/isolation.h>
> > #include <linux/kmemleak.h>
> > #include "internal.h"
> > +#include "swap.h"
> > +#include "swap_table.h"
> > #include <net/sock.h>
> > #include <net/ip.h>
> > #include "slab.h"
> > @@ -5462,6 +5464,7 @@ int __init mem_cgroup_init(void)
> > int __mem_cgroup_try_charge_swap(struct folio *folio)
> > {
> > unsigned int nr_pages = folio_nr_pages(folio);
> > + struct swap_cluster_info *ci;
> > struct page_counter *counter;
> > struct mem_cgroup *memcg;
> > struct obj_cgroup *objcg;
> > @@ -5495,22 +5498,23 @@ int __mem_cgroup_try_charge_swap(struct folio *folio)
> > }
> > mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
> >
> > - swap_cgroup_record(folio, mem_cgroup_private_id(memcg), folio->swap);
> > + ci = swap_cluster_get_and_lock(folio);
> > + __swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_pages,
> > + mem_cgroup_private_id(memcg));
> > + swap_cluster_unlock(ci);
> >
> > return 0;
> > }
> >
> > /**
> > * __mem_cgroup_uncharge_swap - uncharge swap space
> > - * @entry: swap entry to uncharge
> > + * @id: cgroup id to uncharge
> > * @nr_pages: the amount of swap space to uncharge
> > */
> > -void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
> > +void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages)
> > {
> > struct mem_cgroup *memcg;
> > - unsigned short id;
> >
> > - id = swap_cgroup_clear(entry, nr_pages);
> > rcu_read_lock();
> > memcg = mem_cgroup_from_private_id(id);
> > if (memcg) {
> > diff --git a/mm/swap.h b/mm/swap.h
> > index 80c2f1bf7a57..e4ac7dbc1080 100644
> > --- a/mm/swap.h
> > +++ b/mm/swap.h
> > @@ -5,6 +5,7 @@
> > #include <linux/atomic.h> /* for atomic_long_t */
> > struct mempolicy;
> > struct swap_iocb;
> > +struct swap_memcg_table;
> >
> > extern int page_cluster;
> >
> > @@ -38,6 +39,9 @@ struct swap_cluster_info {
> > u8 order;
> > atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */
> > unsigned int *extend_table; /* For large swap count, protected by ci->lock */
> > +#ifdef CONFIG_MEMCG
> > + struct swap_memcg_table *memcg_table; /* Swap table entries' cgroup record */
> > +#endif
> > struct list_head list;
> > };
> >
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index 86d517a33a55..71a3f128fcf0 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -176,21 +176,19 @@ static int __swap_cache_add_check(struct swap_cluster_info *ci,
> > if (shadowp && swp_tb_is_shadow(old_tb))
> > *shadowp = swp_tb_to_shadow(old_tb);
> > if (memcg_id)
> > - *memcg_id = lookup_swap_cgroup_id(targ_entry);
> > + *memcg_id = __swap_cgroup_get(ci, ci_off);
> >
> > if (nr == 1)
> > return 0;
> >
> > - targ_entry.val = round_down(targ_entry.val, nr);
> > ci_off = round_down(ci_off, nr);
> > ci_end = ci_off + nr;
> > do {
> > old_tb = __swap_table_get(ci, ci_off);
> > if (unlikely(swp_tb_is_folio(old_tb) ||
> > !__swp_tb_get_count(old_tb) ||
> > - (memcg_id && *memcg_id != lookup_swap_cgroup_id(targ_entry))))
> > + (memcg_id && *memcg_id != __swap_cgroup_get(ci, ci_off))))
> > return -EBUSY;
> > - targ_entry.val++;
> > } while (++ci_off < ci_end);
> >
> > return 0;
> > diff --git a/mm/swap_table.h b/mm/swap_table.h
> > index 8415ffbe2b9c..b2b02ee161b1 100644
> > --- a/mm/swap_table.h
> > +++ b/mm/swap_table.h
> > @@ -11,6 +11,11 @@ struct swap_table {
> > atomic_long_t entries[SWAPFILE_CLUSTER];
> > };
> >
> > +/* For storing memcg private id */
> > +struct swap_memcg_table {
> > + unsigned short id[SWAPFILE_CLUSTER];
> > +};
> > +
> > #define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE)
> >
> > /*
> > @@ -247,4 +252,53 @@ static inline unsigned long swap_table_get(struct swap_cluster_info *ci,
> >
> > return swp_tb;
> > }
> > +
> > +#ifdef CONFIG_MEMCG
> > +static inline void __swap_cgroup_set(struct swap_cluster_info *ci,
> > + unsigned int ci_off, unsigned long nr, unsigned short id)
> > +{
> > + lockdep_assert_held(&ci->lock);
> > + VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER);
> > + do {
> > + ci->memcg_table->id[ci_off++] = id;
>
> Do you need to check the memcg_table is not NULL here? Because this
> function is no longer static. Another caller might invoke this when
> the cluster hasn't allocated the memcg_table. They shouldn't. We might
> want some check and complain here.

Good idea, a NULL check seems good to be defensive. I also noticed
that I should skip memcg table allocation as well if
mem_cgroup_disabled() is true to save some memory. I will update this.