[RFC PATCH 4/9 v2] mm/memcontrol: charge/uncharge toptier memory to mem_cgroup
From: Joshua Hahn
Date: Thu Apr 23 2026 - 16:42:07 EST
Memory cgroup limits currently offer a way to isolate memory as a
resource, but treat the cost/value of all memory as equal,
regardless of whether it resides in a toptier node or not.
To better capture the asymmetric utility of toptier memory relative to
"lowtier" memory, account toptier memory usage in parallel with the
existing memory accounting mechanisms. To do this, introduce a new
page_counter "toptier" to mem_cgroup.
From a simplified perspective, we can achieve this by checking the
physical location of folios when the memory page_counter is updated, and
deciding whether to also account to toptier. Add a new "toptier" parameter
to try_charge_memcg(), whose value each caller must determine.
However, as of this RFC, this simplified model only works on LRU folios
(i.e. calls to try_charge_memcg() from charge_memcg()). The other two
sites, obj_cgroup_charge_pages() and mem_cgroup_sk_charge(), will be
addressed in future patches that transition enum memcg_stat_item to
a per-lruvec counter (enum node_stat_item).
Enforcement mechanisms are not present at this point. Failing the
toptier limit check leads to nothing, but the charges are accumulated.
Signed-off-by: Joshua Hahn <joshua.hahnjy@xxxxxxxxx>
---
include/linux/memcontrol.h | 1 +
mm/memcontrol.c | 63 ++++++++++++++++++++++++++++++++++----
2 files changed, 58 insertions(+), 6 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index be45641e890e4..0cdb6cd1955dc 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -206,6 +206,7 @@ struct mem_cgroup {
/* Accounted resources */
struct page_counter memory; /* Both v1 & v2 */
+ struct page_counter toptier; /* v2 only */
union {
struct page_counter swap; /* v2 only */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8f7bedb55dbb1..d891cf77cf6d6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -53,6 +53,7 @@
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/memremap.h>
+#include <linux/memory-tiers.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
@@ -2096,6 +2097,7 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
for_each_mem_cgroup(memcg) {
page_counter_drain_cpu(&memcg->memory, cpu);
+ page_counter_drain_cpu(&memcg->toptier, cpu);
page_counter_drain_cpu(&memcg->memsw, cpu);
}
@@ -2370,7 +2372,7 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
}
static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages)
+ unsigned int nr_pages, bool toptier)
{
int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
@@ -2382,9 +2384,11 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
bool raised_max_event = false;
unsigned long pflags;
bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
+ bool toptier_charged;
retry:
reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
+ toptier_charged = false;
if (do_memsw_account() &&
!page_counter_try_charge(&memcg->memsw, nr_pages, &counter)) {
@@ -2393,11 +2397,18 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
goto reclaim;
}
+ if (toptier &&
+ page_counter_try_charge(&memcg->toptier, nr_pages, &counter))
+ toptier_charged = true;
+
if (page_counter_try_charge(&memcg->memory, nr_pages, &counter))
goto done_restock;
+ if (toptier_charged)
+ page_counter_uncharge(&memcg->toptier, nr_pages);
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
+
mem_over_limit = mem_cgroup_from_counter(counter, memory);
reclaim:
@@ -2490,6 +2501,8 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
* being freed very soon. Allow memory usage go over the limit
* temporarily by force charging it.
*/
+ if (toptier)
+ page_counter_charge(&memcg->toptier, nr_pages);
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
@@ -2559,7 +2572,7 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (mem_cgroup_is_root(memcg))
return 0;
- return try_charge_memcg(memcg, gfp_mask, nr_pages);
+ return try_charge_memcg(memcg, gfp_mask, nr_pages, false);
}
static void commit_charge(struct folio *folio, struct obj_cgroup *objcg)
@@ -2859,7 +2872,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
memcg = get_mem_cgroup_from_objcg(objcg);
- ret = try_charge_memcg(memcg, gfp, nr_pages);
+ ret = try_charge_memcg(memcg, gfp, nr_pages, false);
if (ret)
goto out;
@@ -2888,6 +2901,11 @@ static void page_set_objcg(struct page *page, const struct obj_cgroup *objcg)
page->memcg_data = (unsigned long)objcg | MEMCG_DATA_KMEM;
}
+static bool should_charge_toptier(struct folio *folio)
+{
+ return mem_cgroup_tiered_limits() && node_is_toptier(folio_nid(folio));
+}
+
/**
* __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
* @page: page to charge
@@ -3760,6 +3778,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
page_counter_free_stock(&memcg->memory);
+ page_counter_free_stock(&memcg->toptier);
page_counter_free_stock(&memcg->memsw);
lru_gen_exit_memcg(memcg);
memcg_wb_domain_exit(memcg);
@@ -3866,6 +3885,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
page_counter_init(&memcg->memory, &parent->memory, memcg_on_dfl);
+ page_counter_init(&memcg->toptier, &parent->toptier, memcg_on_dfl);
page_counter_init(&memcg->swap, &parent->swap, false);
#ifdef CONFIG_MEMCG_V1
memcg->memory.track_failcnt = !memcg_on_dfl;
@@ -3877,6 +3897,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
init_memcg_stats();
init_memcg_events();
page_counter_init(&memcg->memory, NULL, true);
+ page_counter_init(&memcg->toptier, NULL, true);
page_counter_init(&memcg->swap, NULL, false);
#ifdef CONFIG_MEMCG_V1
page_counter_init(&memcg->kmem, NULL, false);
@@ -3936,6 +3957,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
/* failure is nonfatal, charges fall back to direct hierarchy */
page_counter_enable_stock(&memcg->memory, MEMCG_CHARGE_BATCH);
+ page_counter_enable_stock(&memcg->toptier, MEMCG_CHARGE_BATCH);
if (do_memsw_account())
page_counter_enable_stock(&memcg->memsw, MEMCG_CHARGE_BATCH);
@@ -4013,6 +4035,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
drain_all_stock(memcg);
page_counter_disable_stock(&memcg->memory);
+ page_counter_disable_stock(&memcg->toptier);
page_counter_disable_stock(&memcg->memsw);
mem_cgroup_private_id_put(memcg, 1);
@@ -4825,7 +4848,8 @@ static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
objcg = get_obj_cgroup_from_memcg(memcg);
/* Do not account at the root objcg level. */
if (!obj_cgroup_is_root(objcg))
- ret = try_charge_memcg(memcg, gfp, folio_nr_pages(folio));
+ ret = try_charge_memcg(memcg, gfp, folio_nr_pages(folio),
+ should_charge_toptier(folio));
if (ret) {
obj_cgroup_put(objcg);
return ret;
@@ -4922,6 +4946,7 @@ struct uncharge_gather {
unsigned long nr_memory;
unsigned long pgpgout;
unsigned long nr_kmem;
+ unsigned long nr_toptier;
int nid;
};
@@ -4942,6 +4967,8 @@ static void uncharge_batch(const struct uncharge_gather *ug)
mod_memcg_state(memcg, MEMCG_KMEM, -ug->nr_kmem);
memcg1_account_kmem(memcg, -ug->nr_kmem);
}
+ if (ug->nr_toptier)
+ page_counter_uncharge(&memcg->toptier, ug->nr_toptier);
memcg1_oom_recover(memcg);
}
@@ -4987,8 +5014,11 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
ug->nr_kmem += nr_pages;
} else {
/* LRU pages aren't accounted at the root level */
- if (!obj_cgroup_is_root(objcg))
+ if (!obj_cgroup_is_root(objcg)) {
ug->nr_memory += nr_pages;
+ if (should_charge_toptier(folio))
+ ug->nr_toptier += nr_pages;
+ }
ug->pgpgout++;
WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
@@ -5063,6 +5093,10 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
+
+ /* old folio's toptier usage will be uncharged on free */
+ if (should_charge_toptier(new))
+ page_counter_charge(&memcg->toptier, nr_pages);
}
obj_cgroup_get(objcg);
@@ -5105,6 +5139,23 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
if (!objcg)
return;
+ if (!obj_cgroup_is_root(objcg)) {
+ struct mem_cgroup *memcg;
+ unsigned long nr_pages = folio_nr_pages(old);
+ bool old_toptier, new_toptier;
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ old_toptier = should_charge_toptier(old);
+ new_toptier = should_charge_toptier(new);
+
+ if (old_toptier && !new_toptier)
+ page_counter_uncharge(&memcg->toptier, nr_pages);
+ else if (!old_toptier && new_toptier)
+ page_counter_charge(&memcg->toptier, nr_pages);
+ rcu_read_unlock();
+ }
+
/* Transfer the charge and the objcg ref */
commit_charge(new, objcg);
@@ -5180,7 +5231,7 @@ bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages,
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return memcg1_charge_skmem(memcg, nr_pages, gfp_mask);
- if (try_charge_memcg(memcg, gfp_mask, nr_pages) == 0) {
+ if (try_charge_memcg(memcg, gfp_mask, nr_pages, false) == 0) {
mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
return true;
}
--
2.52.0