Re: [PATCH v3 08/12] mm, swap: delay and unify memcg lookup and charging for swapin

From: Chris Li

Date: Fri May 08 2026 - 00:47:23 EST


On Tue, Apr 21, 2026 at 2:16 AM Kairui Song via B4 Relay
<devnull+kasong.tencent.com@xxxxxxxxxx> wrote:
>
> From: Kairui Song <kasong@xxxxxxxxxxx>
>
> Instead of checking the cgroup private ID during page table walk in
> swap_pte_batch(), move the memcg lookup into __swap_cache_add_check()
> under the cluster lock.
>
> The first pre-alloc check is speculative and skips the memcg check since
> the post-alloc stable check ensures all slots covered by the folio
> belong to the same memcg. It is very rare for contiguous and aligned
> entries across a contiguous region of a page table of the same process
> or shmem mapping to belong to different memcgs.
>
> This also prepares for recording the memcg info in the cluster's table.
> Also make the order check and fallback more compact.
>
> There should be no user-observable behavior change.
>
> Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>

Acked-by: Chris Li <chrisl@xxxxxxxxxx>

> ---
> include/linux/memcontrol.h | 6 +++---
> mm/internal.h | 10 +---------
> mm/memcontrol.c | 10 ++++------
> mm/swap_state.c | 28 +++++++++++++++++++---------
> 4 files changed, 27 insertions(+), 27 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 7d08128de1fd..a013f37f24aa 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -646,8 +646,8 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
>
> int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp);
>
> -int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> - gfp_t gfp, swp_entry_t entry);
> +int mem_cgroup_swapin_charge_folio(struct folio *folio, unsigned short id,
> + struct mm_struct *mm, gfp_t gfp);
>
> void __mem_cgroup_uncharge(struct folio *folio);
>
> @@ -1137,7 +1137,7 @@ static inline int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp)
> }
>
> static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
> - struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
> + unsigned short id, struct mm_struct *mm, gfp_t gfp)
> {
> return 0;
> }
> diff --git a/mm/internal.h b/mm/internal.h
> index 5a2ddcf68e0b..9d2fec696bd6 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -451,24 +451,16 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
> {
> pte_t expected_pte = pte_next_swp_offset(pte);
> const pte_t *end_ptep = start_ptep + max_nr;
> - const softleaf_t entry = softleaf_from_pte(pte);
> pte_t *ptep = start_ptep + 1;
> - unsigned short cgroup_id;
>
> VM_WARN_ON(max_nr < 1);
> - VM_WARN_ON(!softleaf_is_swap(entry));
> + VM_WARN_ON(!softleaf_is_swap(softleaf_from_pte(pte)));
>
> - cgroup_id = lookup_swap_cgroup_id(entry);
> while (ptep < end_ptep) {
> - softleaf_t entry;
> -
> pte = ptep_get(ptep);
>
> if (!pte_same(pte, expected_pte))
> break;
> - entry = softleaf_from_pte(pte);
> - if (lookup_swap_cgroup_id(entry) != cgroup_id)
> - break;
> expected_pte = pte_next_swp_offset(expected_pte);
> ptep++;
> }
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c7df30ca5aa7..641706fa47bf 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5062,27 +5062,25 @@ int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp)
>
> /**
> * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
> - * @folio: folio to charge.
> + * @folio: the folio to charge
> + * @id: memory cgroup id
> * @mm: mm context of the victim
> * @gfp: reclaim mode
> - * @entry: swap entry for which the folio is allocated
> *
> * This function charges a folio allocated for swapin. Please call this before
> * adding the folio to the swapcache.
> *
> * Returns 0 on success. Otherwise, an error code is returned.
> */
> -int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> - gfp_t gfp, swp_entry_t entry)
> +int mem_cgroup_swapin_charge_folio(struct folio *folio, unsigned short id,
> + struct mm_struct *mm, gfp_t gfp)
> {
> struct mem_cgroup *memcg;
> - unsigned short id;
> int ret;
>
> if (mem_cgroup_disabled())
> return 0;
>
> - id = lookup_swap_cgroup_id(entry);
> rcu_read_lock();
> memcg = mem_cgroup_from_private_id(id);
> if (!memcg || !css_tryget_online(&memcg->css))
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 12b290d43e45..86d517a33a55 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -142,16 +142,20 @@ void *swap_cache_get_shadow(swp_entry_t entry)
> * @ci: The locked swap cluster
> * @targ_entry: The target swap entry to check, will be rounded down by @nr
> * @nr: Number of slots to check, must be a power of 2
> - * @shadowp: Returns the shadow value if one exists in the range.
> + * @shadowp: Returns the shadow value if one exists in the range
> + * @memcg_id: Returns the memory cgroup id, NULL to ignore cgroup check
> *
> * Check if all slots covered by given range have a swap count >= 1.
> - * Retrieves the shadow if there is one.
> + * Retrieves the shadow if there is one. If @memcg_id is not NULL, also
> + * checks if all slots belong to the same cgroup and return the cgroup
> + * private id.
> *
> * Context: Caller must lock the cluster.
> */
> static int __swap_cache_add_check(struct swap_cluster_info *ci,
> swp_entry_t targ_entry,
> - unsigned long nr, void **shadowp)
> + unsigned long nr, void **shadowp,
> + unsigned short *memcg_id)
> {
> unsigned int ci_off, ci_end;
> unsigned long old_tb;
> @@ -169,19 +173,24 @@ static int __swap_cache_add_check(struct swap_cluster_info *ci,
> return -EEXIST;
> if (!__swp_tb_get_count(old_tb))
> return -ENOENT;
> - if (swp_tb_is_shadow(old_tb) && shadowp)
> + if (shadowp && swp_tb_is_shadow(old_tb))
> *shadowp = swp_tb_to_shadow(old_tb);
> + if (memcg_id)
> + *memcg_id = lookup_swap_cgroup_id(targ_entry);

Nitpick: Consider also use a local variable to stare the memcg_id value here.

>
> if (nr == 1)
> return 0;
>
> + targ_entry.val = round_down(targ_entry.val, nr);
> ci_off = round_down(ci_off, nr);
> ci_end = ci_off + nr;
> do {
> old_tb = __swap_table_get(ci, ci_off);
> if (unlikely(swp_tb_is_folio(old_tb) ||
> - !__swp_tb_get_count(old_tb)))
> + !__swp_tb_get_count(old_tb) ||
> + (memcg_id && *memcg_id != lookup_swap_cgroup_id(targ_entry))))

Nitpick: You can use the local variable here to avoid a memory fetch.
Micro optimizations.

Chris