Re: [RFC PATCH 6/6] mm/memcontrol: Make memory.high tier-aware

From: Donet Tom

Date: Tue Mar 24 2026 - 07:09:43 EST



On 2/24/26 4:08 AM, Joshua Hahn wrote:
On machines serving multiple workloads whose memory is isolated via the
memory cgroup controller, it is currently impossible to enforce a fair
distribution of toptier memory among the workloads, as the only
enforceable limits have to do with total memory footprint, but not where
that memory resides.

This makes ensuring consistent, baseline performance difficult, as
each workload's performance is heavily impacted by workload-external
factors such as which other workloads are co-located on the same host,
and the order in which different workloads are started.

Extend the existing memory.high protection to be tier-aware in the
charging and enforcement to limit toptier-hogging for workloads.

Also, add a new nodemask parameter to try_to_free_mem_cgroup_pages,
which can be used to selectively reclaim from memory at the
memcg-tier intersection of a cgroup.

Signed-off-by: Joshua Hahn <joshua.hahnjy@xxxxxxxxx>
---
include/linux/swap.h | 3 +-
mm/memcontrol-v1.c | 6 ++--
mm/memcontrol.c | 85 +++++++++++++++++++++++++++++++++++++-------
mm/vmscan.c | 11 +++---
4 files changed, 84 insertions(+), 21 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0effe3cc50f5..c6037ac7bf6e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -368,7 +368,8 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
unsigned int reclaim_options,
- int *swappiness);
+ int *swappiness,
+ nodemask_t *allowed);
extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
pg_data_t *pgdat,
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 0b39ba608109..29630c7f3567 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -1497,7 +1497,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
}
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
+ memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP,
+ NULL, NULL)) {
ret = -EBUSY;
break;
}
@@ -1529,7 +1530,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return -EINTR;
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- MEMCG_RECLAIM_MAY_SWAP, NULL))
+ MEMCG_RECLAIM_MAY_SWAP,
+ NULL, NULL))
nr_retries--;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8aa7ae361a73..ebd4a1b73c51 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2184,18 +2184,30 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
do {
unsigned long pflags;
-
- if (page_counter_read(&memcg->memory) <=
- READ_ONCE(memcg->memory.high))
+ nodemask_t toptier_nodes, *reclaim_nodes;
+ bool mem_high_ok, toptier_high_ok;
+
+ mt_get_toptier_nodemask(&toptier_nodes, NULL);
+ mem_high_ok = page_counter_read(&memcg->memory) <=
+ READ_ONCE(memcg->memory.high);
+ toptier_high_ok = !(tier_aware_memcg_limits &&
+ mem_cgroup_toptier_usage(memcg) >
+ page_counter_toptier_high(&memcg->memory));
+ if (mem_high_ok && toptier_high_ok)
continue;
+ if (mem_high_ok && !toptier_high_ok)
+ reclaim_nodes = &toptier_nodes;
+ else
+ reclaim_nodes = NULL;


IIUC The intent of this patch is to partition cgroup memory such that
0 → toptier_high is backed by higher-tier memory, and
toptier_high → max is backed by lower-tier memory.

Based on this:

1. If top-tier usage exceeds toptier_high, pages should be
  demoted to the lower tier.

2. If lower-tier usage exceeds (max - toptier_high), pages
  should be swapped out.

3. If total memory usage exceeds max, demotion should be
  avoided and reclaim should directly swap out pages.

I think we are only handling case (1) in this patch. When
mem_high_ok && !toptier_high_ok, we are reclaiming pages (demotion first).

However, if !mem_high_ok, the memcg reclaim path works as if
there were no memory tiering in the cgroup. This can lead to excess
demotion and may eventually result in OOM.

Should we also handle cases (2) and (3) in this patch?


+
memcg_memory_event(memcg, MEMCG_HIGH);
psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
gfp_mask,
MEMCG_RECLAIM_MAY_SWAP,
- NULL);
+ NULL, reclaim_nodes);
psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
@@ -2296,6 +2308,24 @@ static u64 mem_find_max_overage(struct mem_cgroup *memcg)
return max_overage;
}
+static u64 toptier_find_max_overage(struct mem_cgroup *memcg)
+{
+ u64 overage, max_overage = 0;
+
+ if (!tier_aware_memcg_limits)
+ return 0;
+
+ do {
+ unsigned long usage = mem_cgroup_toptier_usage(memcg);
+ unsigned long high = page_counter_toptier_high(&memcg->memory);
+
+ overage = calculate_overage(usage, high);
+ max_overage = max(overage, max_overage);
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
+
+ return max_overage;
+}
static u64 swap_find_max_overage(struct mem_cgroup *memcg)
{
u64 overage, max_overage = 0;
@@ -2401,6 +2431,14 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
penalty_jiffies += calculate_high_delay(memcg, nr_pages,
swap_find_max_overage(memcg));
+ /*
+ * Don't double-penalize for toptier high overage if system-wide
+ * memory.high has already been breached.
+ */
+ if (!penalty_jiffies)
+ penalty_jiffies += calculate_high_delay(memcg, nr_pages,
+ toptier_find_max_overage(memcg));
+
/*
* Clamp the max delay per usermode return so as to still keep the
* application moving forwards and also permit diagnostics, albeit
@@ -2503,7 +2541,8 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
- gfp_mask, reclaim_options, NULL);
+ gfp_mask, reclaim_options,
+ NULL, NULL);
psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -2592,23 +2631,26 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
* reclaim, the cost of mismatch is negligible.
*/
do {
- bool mem_high, swap_high;
+ bool mem_high, swap_high, toptier_high = false;
mem_high = page_counter_read(&memcg->memory) >
READ_ONCE(memcg->memory.high);
swap_high = page_counter_read(&memcg->swap) >
READ_ONCE(memcg->swap.high);
+ toptier_high = tier_aware_memcg_limits &&
+ (mem_cgroup_toptier_usage(memcg) >
+ page_counter_toptier_high(&memcg->memory));
/* Don't bother a random interrupted task */
if (!in_task()) {
- if (mem_high) {
+ if (mem_high || toptier_high) {
schedule_work(&memcg->high_work);
break;
}
continue;
}
- if (mem_high || swap_high) {
+ if (mem_high || swap_high || toptier_high) {
/*
* The allocating tasks in this cgroup will need to do
* reclaim or be throttled to prevent further growth
@@ -4476,7 +4518,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_retries = MAX_RECLAIM_RETRIES;
bool drained = false;
- unsigned long high;
+ unsigned long high, toptier_high;
int err;
buf = strstrip(buf);
@@ -4485,15 +4527,22 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
return err;
page_counter_set_high(&memcg->memory, high);
+ toptier_high = page_counter_toptier_high(&memcg->memory);
if (of->file->f_flags & O_NONBLOCK)
goto out;
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
+ unsigned long toptier_pages = mem_cgroup_toptier_usage(memcg);
unsigned long reclaimed;
+ unsigned long to_free;
+ nodemask_t toptier_nodes, *reclaim_nodes;
+ bool mem_high_ok = nr_pages <= high;
+ bool toptier_high_ok = !(tier_aware_memcg_limits &&
+ toptier_pages > toptier_high);
- if (nr_pages <= high)
+ if (mem_high_ok && toptier_high_ok)
break;
if (signal_pending(current))
@@ -4505,8 +4554,17 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
continue;
}
- reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
+ mt_get_toptier_nodemask(&toptier_nodes, NULL);
+ if (mem_high_ok && !toptier_high_ok) {
+ reclaim_nodes = &toptier_nodes;
+ to_free = toptier_pages - toptier_high;
+ } else {
+ reclaim_nodes = NULL;
+ to_free = nr_pages - high;
+ }
+ reclaimed = try_to_free_mem_cgroup_pages(memcg, to_free,
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
+ NULL, reclaim_nodes);
if (!reclaimed && !nr_retries--)
break;
@@ -4558,7 +4616,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
+ NULL, NULL))
nr_reclaims--;
continue;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5b4cb030a477..94498734b4f5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6652,7 +6652,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
unsigned int reclaim_options,
- int *swappiness)
+ int *swappiness, nodemask_t *allowed)
{
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
@@ -6668,6 +6668,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
+ .nodemask = allowed,
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
@@ -6693,7 +6694,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
unsigned int reclaim_options,
- int *swappiness)
+ int *swappiness, nodemask_t *allowed)
{
return 0;
}
@@ -7806,9 +7807,9 @@ int user_proactive_reclaim(char *buf,
reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
MEMCG_RECLAIM_PROACTIVE;
reclaimed = try_to_free_mem_cgroup_pages(memcg,
- batch_size, gfp_mask,
- reclaim_options,
- swappiness == -1 ? NULL : &swappiness);
+ batch_size, gfp_mask, reclaim_options,
+ swappiness == -1 ? NULL : &swappiness,
+ NULL);
} else {
struct scan_control sc = {
.gfp_mask = current_gfp_context(gfp_mask),