Re: [PATCH v2 07/28] mm: memcontrol: return root object cgroup for root memory cgroup

From: Chen Ridong
Date: Thu Dec 25 2025 - 22:59:01 EST




On 2025/12/26 11:50, Chen Ridong wrote:
>
>
> On 2025/12/26 11:10, Muchun Song wrote:
>>
>>
>> On 2025/12/26 09:03, Chen Ridong wrote:
>>>
>>> On 2025/12/17 15:27, Qi Zheng wrote:
>>>> From: Muchun Song <songmuchun@xxxxxxxxxxxxx>
>>>>
>>>> Memory cgroup functions such as get_mem_cgroup_from_folio() and
>>>> get_mem_cgroup_from_mm() return a valid memory cgroup pointer,
>>>> even for the root memory cgroup. In contrast, the situation for
>>>> object cgroups has been different.
>>>>
>>>> Previously, the root object cgroup couldn't be returned because
>>>> it didn't exist. Now that a valid root object cgroup exists, for
>>>> the sake of consistency, it's necessary to align the behavior of
>>>> object-cgroup-related operations with that of memory cgroup APIs.
>>>>
>>>> Signed-off-by: Muchun Song <songmuchun@xxxxxxxxxxxxx>
>>>> Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
>>>> ---
>>>>   include/linux/memcontrol.h | 26 +++++++++++++++++-----
>>>>   mm/memcontrol.c            | 45 ++++++++++++++++++++------------------
>>>>   mm/percpu.c                |  2 +-
>>>>   3 files changed, 45 insertions(+), 28 deletions(-)
>>>>
>>>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>>>> index 288dd6337f80f..776d9be1f446a 100644
>>>> --- a/include/linux/memcontrol.h
>>>> +++ b/include/linux/memcontrol.h
>>>> @@ -332,6 +332,7 @@ struct mem_cgroup {
>>>>   #define MEMCG_CHARGE_BATCH 64U
>>>>     extern struct mem_cgroup *root_mem_cgroup;
>>>> +extern struct obj_cgroup *root_obj_cgroup;
>>>>     enum page_memcg_data_flags {
>>>>       /* page->memcg_data is a pointer to an slabobj_ext vector */
>>>> @@ -549,6 +550,11 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
>>>>       return (memcg == root_mem_cgroup);
>>>>   }
>>>>   +static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg)
>>>> +{
>>>> +    return objcg == root_obj_cgroup;
>>>> +}
>>>> +
>>>>   static inline bool mem_cgroup_disabled(void)
>>>>   {
>>>>       return !cgroup_subsys_enabled(memory_cgrp_subsys);
>>>> @@ -773,23 +779,26 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
>>>>     static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg)
>>>>   {
>>>> +    if (obj_cgroup_is_root(objcg))
>>>> +        return true;
>>>>       return percpu_ref_tryget(&objcg->refcnt);
>>>>   }
>>>>   -static inline void obj_cgroup_get(struct obj_cgroup *objcg)
>>>> +static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
>>>> +                       unsigned long nr)
>>>>   {
>>>> -    percpu_ref_get(&objcg->refcnt);
>>>> +    if (!obj_cgroup_is_root(objcg))
>>>> +        percpu_ref_get_many(&objcg->refcnt, nr);
>>>>   }
>>>>   -static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
>>>> -                       unsigned long nr)
>>>> +static inline void obj_cgroup_get(struct obj_cgroup *objcg)
>>>>   {
>>>> -    percpu_ref_get_many(&objcg->refcnt, nr);
>>>> +    obj_cgroup_get_many(objcg, 1);
>>>>   }
>>>>     static inline void obj_cgroup_put(struct obj_cgroup *objcg)
>>>>   {
>>>> -    if (objcg)
>>>> +    if (objcg && !obj_cgroup_is_root(objcg))
>>>>           percpu_ref_put(&objcg->refcnt);
>>>>   }
>>>>   @@ -1084,6 +1093,11 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
>>>>       return true;
>>>>   }
>>>>   +static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg)
>>>> +{
>>>> +    return true;
>>>> +}
>>>> +
>>>>   static inline bool mem_cgroup_disabled(void)
>>>>   {
>>>>       return true;
>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>> index 544b3200db12d..21b5aad34cae7 100644
>>>> --- a/mm/memcontrol.c
>>>> +++ b/mm/memcontrol.c
>>>> @@ -83,6 +83,8 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
>>>>   struct mem_cgroup *root_mem_cgroup __read_mostly;
>>>>   EXPORT_SYMBOL(root_mem_cgroup);
>>>>   +struct obj_cgroup *root_obj_cgroup __read_mostly;
>>>> +
>>>>   /* Active memory cgroup to use from an interrupt context */
>>>>   DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
>>>>   EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
>>>> @@ -2634,15 +2636,14 @@ struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
>>>>     static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
>>>>   {
>>>> -    struct obj_cgroup *objcg = NULL;
>>>> +    for (; memcg; memcg = parent_mem_cgroup(memcg)) {
>>>> +        struct obj_cgroup *objcg = rcu_dereference(memcg->objcg);
>>>>   -    for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
>>>> -        objcg = rcu_dereference(memcg->objcg);
>>>>           if (likely(objcg && obj_cgroup_tryget(objcg)))
>>>> -            break;
>>>> -        objcg = NULL;
>>>> +            return objcg;
>>>>       }
>>>> -    return objcg;
>>>> +
>>>> +    return NULL;
>>>>   }
>>>>     static struct obj_cgroup *current_objcg_update(void)
>>>> @@ -2716,18 +2717,17 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
>>>>            * Objcg reference is kept by the task, so it's safe
>>>>            * to use the objcg by the current task.
>>>>            */
>>>> -        return objcg;
>>>> +        return objcg ? : root_obj_cgroup;
>>>>       }
>>>>         memcg = this_cpu_read(int_active_memcg);
>>>>       if (unlikely(memcg))
>>>>           goto from_memcg;
>>>>   -    return NULL;
>>>> +    return root_obj_cgroup;
>>>>     from_memcg:
>>>> -    objcg = NULL;
>>>> -    for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
>>>> +    for (; memcg; memcg = parent_mem_cgroup(memcg)) {
>>>>           /*
>>>>            * Memcg pointer is protected by scope (see set_active_memcg())
>>>>            * and is pinning the corresponding objcg, so objcg can't go
>>>> @@ -2736,10 +2736,10 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
>>>>            */
>>>>           objcg = rcu_dereference_check(memcg->objcg, 1);
>>>>           if (likely(objcg))
>>>> -            break;
>>>> +            return objcg;
>>>>       }
>>>>   -    return objcg;
>>>> +    return root_obj_cgroup;
>>>>   }
>>>>     struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
>>>> @@ -2753,14 +2753,8 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
>>>>           objcg = __folio_objcg(folio);
>>>>           obj_cgroup_get(objcg);
>>>>       } else {
>>>> -        struct mem_cgroup *memcg;
>>>> -
>>>>           rcu_read_lock();
>>>> -        memcg = __folio_memcg(folio);
>>>> -        if (memcg)
>>>> -            objcg = __get_obj_cgroup_from_memcg(memcg);
>>>> -        else
>>>> -            objcg = NULL;
>>>> +        objcg = __get_obj_cgroup_from_memcg(__folio_memcg(folio));
>>>>           rcu_read_unlock();
>>>>       }
>>>>       return objcg;
>>>> @@ -2863,7 +2857,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
>>>>       int ret = 0;
>>>>         objcg = current_obj_cgroup();
>>>> -    if (objcg) {
>>>> +    if (objcg && !obj_cgroup_is_root(objcg)) {
>>>>           ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
>>>>           if (!ret) {
>>>>               obj_cgroup_get(objcg);
>>>> @@ -3164,7 +3158,7 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
>>>>        * obj_cgroup_get() is used to get a permanent reference.
>>>>        */
>>>>       objcg = current_obj_cgroup();
>>>> -    if (!objcg)
>>>> +    if (!objcg || obj_cgroup_is_root(objcg))
>>>>           return true;
>>>>         /*
>>>> @@ -3851,6 +3845,9 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
>>>>       if (!objcg)
>>>>           goto free_shrinker;
>>>>   +    if (unlikely(mem_cgroup_is_root(memcg)))
>>>> +        root_obj_cgroup = objcg;
>>>> +
>>>>       objcg->memcg = memcg;
>>>>       rcu_assign_pointer(memcg->objcg, objcg);
>>>>       obj_cgroup_get(objcg);
>>>> @@ -5471,6 +5468,9 @@ void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
>>>>       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
>>>>           return;
>>>>   +    if (obj_cgroup_is_root(objcg))
>>>> +        return;
>>>> +
>>>>       VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
>>>>         /* PF_MEMALLOC context, charging must succeed */
>>>> @@ -5498,6 +5498,9 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
>>>>       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
>>>>           return;
>>>>   +    if (obj_cgroup_is_root(objcg))
>>>> +        return;
>>>> +
>>>>       obj_cgroup_uncharge(objcg, size);
>>>>  
>>> If we modify zswap by adding MEMCG_ZSWAP_B and MEMCG_ZSWAPPED with obj_cgroup_charge_zswap , then
>>> remove a control group (via rmdir) and reparent its objects to the root cgroup, then for the root
>>> cgroup, obj_cgroup_uncharge_zswap will return directly due to the obj_cgroup_is_root check. Would
>>> this cause us to miss decrementing MEMCG_ZSWAP_B and MEMCG_ZSWAPPED?
>>
>> I'm not sure I fully understand the problem—how could this happen, given that
>> obj_cgroup_charge_zswap also checks for the root objcg?
>>
>> Muchun,
>> Thanks.
>
> That is:
>
> 1. memcg A is under the root memcg.
> 2. obj_cgroup_charge_zswap charges memcg A.
> 3. After rmdir A, the obj of A is reparented to the root memcg.
> 4. obj_cgroup_uncharge_zswap does nothing and returns, since the object is now associated with the root.
>
> Thus, the root memcg will miss decrementing MEMCG_ZSWAP_B and MEMCG_ZSWAPPED, correct? Or am I
> missing something?
>

Oh, I understand now — please ignore my previous question. Apologies for the noise.

Even after reparenting, the object from A should not be treated as a root object.

>>>
>>>>       rcu_read_lock();
>>>> diff --git a/mm/percpu.c b/mm/percpu.c
>>>> index 81462ce5866e1..5c1a9b77d6b93 100644
>>>> --- a/mm/percpu.c
>>>> +++ b/mm/percpu.c
>>>> @@ -1616,7 +1616,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
>>>>           return true;
>>>>         objcg = current_obj_cgroup();
>>>> -    if (!objcg)
>>>> +    if (!objcg || obj_cgroup_is_root(objcg))
>>>>           return true;
>>>>         if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size)))
>>
>

--
Best regards,
Ridong