[RFC PATCH v1 06/11] mm: Handle top tier memory in cgroup soft limit memory tree utilities

From: Tim Chen
Date: Mon Apr 05 2021 - 14:09:28 EST


Update the utility functions __mem_cgroup_insert_exceeded() and
__mem_cgroup_remove_exceeded(), to allow addition and removal of cgroups
from the new red black tree that tracks the cgroups that exceed their
toptier memory limits.

Update also the function +mem_cgroup_largest_soft_limit_node(),
to allow returning the cgroup that has the largest exceess usage
of toptier memory.

Signed-off-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
---
include/linux/memcontrol.h | 9 +++
mm/memcontrol.c | 152 +++++++++++++++++++++++++++----------
2 files changed, 122 insertions(+), 39 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 609d8590950c..0ed8ddfd5436 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -124,6 +124,15 @@ struct mem_cgroup_per_node {
unsigned long usage_in_excess;/* Set to the value by which */
/* the soft limit is exceeded*/
bool on_tree;
+
+ struct rb_node toptier_tree_node; /* RB tree node */
+ unsigned long toptier_usage_in_excess; /* Set to the value by which */
+ /* the soft limit is exceeded*/
+ bool on_toptier_tree;
+
+ bool congested; /* memcg has many dirty pages */
+ /* backed by a congested BDI */
+
struct mem_cgroup *memcg; /* Back pointer, we cannot */
/* use container_of */
};
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 90a78ff3fca8..8a7648b79635 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -616,24 +616,44 @@ soft_limit_tree_from_page(struct page *page, enum node_states type)

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
struct mem_cgroup_tree_per_node *mctz,
- unsigned long new_usage_in_excess)
+ unsigned long new_usage_in_excess,
+ enum node_states type)
{
struct rb_node **p = &mctz->rb_root.rb_node;
- struct rb_node *parent = NULL;
+ struct rb_node *parent = NULL, *mz_tree_node;
struct mem_cgroup_per_node *mz_node;
- bool rightmost = true;
+ bool rightmost = true, *mz_on_tree;
+ unsigned long usage_in_excess, *mz_usage_in_excess;

- if (mz->on_tree)
+ if (type == N_TOPTIER) {
+ mz_usage_in_excess = &mz->toptier_usage_in_excess;
+ mz_tree_node = &mz->toptier_tree_node;
+ mz_on_tree = &mz->on_toptier_tree;
+ } else {
+ mz_usage_in_excess = &mz->usage_in_excess;
+ mz_tree_node = &mz->tree_node;
+ mz_on_tree = &mz->on_tree;
+ }
+
+ if (*mz_on_tree)
return;

- mz->usage_in_excess = new_usage_in_excess;
- if (!mz->usage_in_excess)
+ if (!new_usage_in_excess)
return;
+
while (*p) {
parent = *p;
- mz_node = rb_entry(parent, struct mem_cgroup_per_node,
+ if (type == N_TOPTIER) {
+ mz_node = rb_entry(parent, struct mem_cgroup_per_node,
+ toptier_tree_node);
+ usage_in_excess = mz_node->toptier_usage_in_excess;
+ } else {
+ mz_node = rb_entry(parent, struct mem_cgroup_per_node,
tree_node);
- if (mz->usage_in_excess < mz_node->usage_in_excess) {
+ usage_in_excess = mz_node->usage_in_excess;
+ }
+
+ if (new_usage_in_excess < usage_in_excess) {
p = &(*p)->rb_left;
rightmost = false;
} else {
@@ -642,33 +662,47 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
}

if (rightmost)
- mctz->rb_rightmost = &mz->tree_node;
+ mctz->rb_rightmost = mz_tree_node;

- rb_link_node(&mz->tree_node, parent, p);
- rb_insert_color(&mz->tree_node, &mctz->rb_root);
- mz->on_tree = true;
+ rb_link_node(mz_tree_node, parent, p);
+ rb_insert_color(mz_tree_node, &mctz->rb_root);
+ *mz_usage_in_excess = new_usage_in_excess;
+ *mz_on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
- struct mem_cgroup_tree_per_node *mctz)
+ struct mem_cgroup_tree_per_node *mctz,
+ enum node_states type)
{
- if (!mz->on_tree)
+ bool *mz_on_tree;
+ struct rb_node *mz_tree_node;
+
+ if (type == N_TOPTIER) {
+ mz_tree_node = &mz->toptier_tree_node;
+ mz_on_tree = &mz->on_toptier_tree;
+ } else {
+ mz_tree_node = &mz->tree_node;
+ mz_on_tree = &mz->on_tree;
+ }
+
+ if (!(*mz_on_tree))
return;

- if (&mz->tree_node == mctz->rb_rightmost)
- mctz->rb_rightmost = rb_prev(&mz->tree_node);
+ if (mz_tree_node == mctz->rb_rightmost)
+ mctz->rb_rightmost = rb_prev(mz_tree_node);

- rb_erase(&mz->tree_node, &mctz->rb_root);
- mz->on_tree = false;
+ rb_erase(mz_tree_node, &mctz->rb_root);
+ *mz_on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
- struct mem_cgroup_tree_per_node *mctz)
+ struct mem_cgroup_tree_per_node *mctz,
+ enum node_states type)
{
unsigned long flags;

spin_lock_irqsave(&mctz->lock, flags);
- __mem_cgroup_remove_exceeded(mz, mctz);
+ __mem_cgroup_remove_exceeded(mz, mctz, type);
spin_unlock_irqrestore(&mctz->lock, flags);
}

@@ -696,13 +730,18 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg, enum node_state
return excess;
}

-static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+static void mem_cgroup_update_tree(struct mem_cgroup *bottom_memcg, struct page *page)
{
unsigned long excess;
struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_node *mctz;
+ enum node_states type = N_MEMORY;
+ struct mem_cgroup *memcg;
+
+repeat_toptier:
+ memcg = bottom_memcg;
+ mctz = soft_limit_tree_from_page(page, type);

- mctz = soft_limit_tree_from_page(page, N_MEMORY);
if (!mctz)
return;
/*
@@ -710,27 +749,37 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
* because their event counter is not touched.
*/
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ bool on_tree;
+
mz = mem_cgroup_page_nodeinfo(memcg, page);
- excess = soft_limit_excess(memcg, N_MEMORY);
+ excess = soft_limit_excess(memcg, type);
+
+ on_tree = (type == N_MEMORY) ? mz->on_tree: mz->on_toptier_tree;
/*
* We have to update the tree if mz is on RB-tree or
* mem is over its softlimit.
*/
- if (excess || mz->on_tree) {
+ if (excess || on_tree) {
unsigned long flags;

spin_lock_irqsave(&mctz->lock, flags);
/* if on-tree, remove it */
- if (mz->on_tree)
- __mem_cgroup_remove_exceeded(mz, mctz);
+ if (on_tree)
+ __mem_cgroup_remove_exceeded(mz, mctz, type);
+
/*
* Insert again. mz->usage_in_excess will be updated.
* If excess is 0, no tree ops.
*/
- __mem_cgroup_insert_exceeded(mz, mctz, excess);
+ __mem_cgroup_insert_exceeded(mz, mctz, excess, type);
+
spin_unlock_irqrestore(&mctz->lock, flags);
}
}
+ if (type == N_MEMORY) {
+ type = N_TOPTIER;
+ goto repeat_toptier;
+ }
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
@@ -743,12 +792,16 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
mz = mem_cgroup_nodeinfo(memcg, nid);
mctz = soft_limit_tree_node(nid, N_MEMORY);
if (mctz)
- mem_cgroup_remove_exceeded(mz, mctz);
+ mem_cgroup_remove_exceeded(mz, mctz, N_MEMORY);
+ mctz = soft_limit_tree_node(nid, N_TOPTIER);
+ if (mctz)
+ mem_cgroup_remove_exceeded(mz, mctz, N_TOPTIER);
}
}

static struct mem_cgroup_per_node *
-__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz,
+ enum node_states type)
{
struct mem_cgroup_per_node *mz;

@@ -757,15 +810,19 @@ __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
if (!mctz->rb_rightmost)
goto done; /* Nothing to reclaim from */

- mz = rb_entry(mctz->rb_rightmost,
+ if (type == N_TOPTIER)
+ mz = rb_entry(mctz->rb_rightmost,
+ struct mem_cgroup_per_node, toptier_tree_node);
+ else
+ mz = rb_entry(mctz->rb_rightmost,
struct mem_cgroup_per_node, tree_node);
/*
* Remove the node now but someone else can add it back,
* we will to add it back at the end of reclaim to its correct
* position in the tree.
*/
- __mem_cgroup_remove_exceeded(mz, mctz);
- if (!soft_limit_excess(mz->memcg, N_MEMORY) ||
+ __mem_cgroup_remove_exceeded(mz, mctz, type);
+ if (!soft_limit_excess(mz->memcg, type) ||
!css_tryget(&mz->memcg->css))
goto retry;
done:
@@ -773,12 +830,13 @@ __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
}

static struct mem_cgroup_per_node *
-mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz,
+ enum node_states type)
{
struct mem_cgroup_per_node *mz;

spin_lock_irq(&mctz->lock);
- mz = __mem_cgroup_largest_soft_limit_node(mctz);
+ mz = __mem_cgroup_largest_soft_limit_node(mctz, type);
spin_unlock_irq(&mctz->lock);
return mz;
}
@@ -3472,7 +3530,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
struct mem_cgroup_per_node *mz, *next_mz = NULL;
unsigned long reclaimed;
int loop = 0;
- struct mem_cgroup_tree_per_node *mctz;
+ struct mem_cgroup_tree_per_node *mctz, *mctz_sibling;
unsigned long excess;
unsigned long nr_scanned;
int migration_nid;
@@ -3481,6 +3539,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
return 0;

mctz = soft_limit_tree_node(pgdat->node_id, N_MEMORY);
+ mctz_sibling = soft_limit_tree_node(pgdat->node_id, N_TOPTIER);

/*
* Do not even bother to check the largest node if the root
@@ -3516,7 +3575,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
if (next_mz)
mz = next_mz;
else
- mz = mem_cgroup_largest_soft_limit_node(mctz);
+ mz = mem_cgroup_largest_soft_limit_node(mctz, N_MEMORY);
if (!mz)
break;

@@ -3526,7 +3585,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
nr_reclaimed += reclaimed;
*total_scanned += nr_scanned;
spin_lock_irq(&mctz->lock);
- __mem_cgroup_remove_exceeded(mz, mctz);
+ __mem_cgroup_remove_exceeded(mz, mctz, N_MEMORY);

/*
* If we failed to reclaim anything from this memory cgroup
@@ -3534,7 +3593,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
*/
next_mz = NULL;
if (!reclaimed)
- next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
+ next_mz =
+ __mem_cgroup_largest_soft_limit_node(mctz, N_MEMORY);

excess = soft_limit_excess(mz->memcg, N_MEMORY);
/*
@@ -3546,8 +3606,20 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
* term TODO.
*/
/* If excess == 0, no tree ops */
- __mem_cgroup_insert_exceeded(mz, mctz, excess);
+ __mem_cgroup_insert_exceeded(mz, mctz, excess, N_MEMORY);
spin_unlock_irq(&mctz->lock);
+
+ /* update both affected N_MEMORY and N_TOPTIER trees */
+ if (mctz_sibling) {
+ spin_lock_irq(&mctz_sibling->lock);
+ __mem_cgroup_remove_exceeded(mz, mctz_sibling,
+ N_TOPTIER);
+ excess = soft_limit_excess(mz->memcg, N_TOPTIER);
+ __mem_cgroup_insert_exceeded(mz, mctz, excess,
+ N_TOPTIER);
+ spin_unlock_irq(&mctz_sibling->lock);
+ }
+
css_put(&mz->memcg->css);
loop++;
/*
@@ -5312,6 +5384,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
lruvec_init(&pn->lruvec);
pn->usage_in_excess = 0;
pn->on_tree = false;
+ pn->toptier_usage_in_excess = 0;
+ pn->on_toptier_tree = false;
pn->memcg = memcg;

memcg->nodeinfo[node] = pn;
--
2.20.1