[PATCH v7 3/4] mm: memcontrol: add interfaces for swap tier selection

From: Youngjun Park

Date: Wed May 27 2026 - 02:24:29 EST


Integrate swap tier infrastructure with cgroup to allow selecting
specific swap devices per cgroup.

Introduce memory.swap.tiers for configuring allowed tiers, and
memory.swap.tiers.effective for exposing the effective tiers.
The effective tiers are the intersection of the configured tiers
and the parent's effective tiers.

Note that cgroups do not pin swap tiers, similar to cpuset and CPU
hotplug, allowing configuration changes regardless of usage.

Signed-off-by: Youngjun Park <youngjun.park@xxxxxxx>
---
Documentation/admin-guide/cgroup-v2.rst | 29 +++++++
include/linux/memcontrol.h | 5 ++
mm/memcontrol.c | 96 +++++++++++++++++++++
mm/swap_state.c | 5 +-
mm/swap_tier.c | 107 +++++++++++++++++++++++-
mm/swap_tier.h | 56 +++++++++++--
6 files changed, 288 insertions(+), 10 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 6efd0095ed99..08253072a252 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1850,6 +1850,35 @@ The following nested keys are defined.
Swap usage hard limit. If a cgroup's swap usage reaches this
limit, anonymous memory of the cgroup will not be swapped out.

+ memory.swap.tiers
+ A read-write file which exists on non-root cgroups.
+ Format is similar to cgroup.subtree_control.
+
+ Controls which swap tiers this cgroup is allowed to swap
+ out to. All tiers are enabled by default.
+
+ ::
+
+ (-|+)TIER [(-|+)TIER ...]
+
+ "-" disables a tier, "+" re-enables it.
+ Entries are whitespace-delimited.
+
+ Changes here are combined with parent restrictions to
+ compute memory.swap.tiers.effective.
+
+ If a tier is removed from /sys/kernel/mm/swap/tiers,
+ any prior disable for that tier is invalidated.
+
+ memory.swap.tiers.effective
+ A read-only file which exists on non-root cgroups.
+
+ Shows the tiers this cgroup can actually swap out to.
+ This is the intersection of the parent's effective tiers
+ and this cgroup's own memory.swap.tiers configuration.
+ A child cannot enable a tier that is disabled in its
+ parent.
+
memory.swap.events
A read-only flat-keyed file which exists on non-root cgroups.
The following entries are defined. Unless specified
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bf1a6e131eca..eb33c8e30c9e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -287,6 +287,11 @@ struct mem_cgroup {
struct lru_gen_mm_list mm_list;
#endif

+#ifdef CONFIG_SWAP
+ int tier_mask;
+ int tier_effective_mask;
+#endif
+
#ifdef CONFIG_MEMCG_V1
/* Legacy consumer-oriented counters */
struct page_counter kmem; /* v1 only */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e24114a4493a..cbc7a519a24d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -68,6 +68,7 @@
#include <net/ip.h>
#include "slab.h"
#include "memcontrol-v1.h"
+#include "swap_tier.h"

#include <linux/uaccess.h>

@@ -4249,6 +4250,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
refcount_set(&memcg->id.ref, 1);
css_get(css);

+ swap_tiers_memcg_inherit_mask(memcg);
+
/*
* Ensure mem_cgroup_from_private_id() works once we're fully online.
*
@@ -5791,6 +5794,88 @@ static int swap_events_show(struct seq_file *m, void *v)
return 0;
}

+static int swap_tier_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ swap_tiers_mask_show(m, READ_ONCE(memcg->tier_mask));
+ return 0;
+}
+
+static ssize_t swap_tier_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ char *pos, *token;
+ int ret = 0;
+ int original_mask = 0;
+
+ pos = strstrip(buf);
+
+ spin_lock(&swap_tier_lock);
+ if (!*pos) {
+ WRITE_ONCE(memcg->tier_mask, TIER_ALL_MASK);
+ goto sync;
+ }
+
+ original_mask = memcg->tier_mask;
+
+ while ((token = strsep(&pos, " \t\n")) != NULL) {
+ int mask;
+
+ if (!*token)
+ continue;
+
+ if (token[0] != '-' && token[0] != '+') {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ mask = swap_tiers_mask_lookup(token+1);
+ if (!mask) {
+ ret = -EINVAL;
+ goto err;
+ }
+ /*
+ * tier_mask can be modified independently at each memcg.
+ * However, the effective mask is restricted to a subset of
+ * the parent's mask in swap_tiers_memcg_sync_mask().
+ */
+ switch (token[0]) {
+ case '-':
+ WRITE_ONCE(memcg->tier_mask,
+ memcg->tier_mask & ~mask);
+ break;
+ case '+':
+ WRITE_ONCE(memcg->tier_mask,
+ memcg->tier_mask | mask);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if (ret)
+ goto err;
+ }
+
+sync:
+ swap_tiers_memcg_sync_mask(memcg);
+err:
+ if (ret)
+ WRITE_ONCE(memcg->tier_mask, original_mask);
+ spin_unlock(&swap_tier_lock);
+ return ret ? ret : nbytes;
+}
+
+static int swap_tier_effective_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ swap_tiers_mask_show(m, READ_ONCE(memcg->tier_effective_mask));
+ return 0;
+}
+
static struct cftype swap_files[] = {
{
.name = "swap.current",
@@ -5823,6 +5908,17 @@ static struct cftype swap_files[] = {
.file_offset = offsetof(struct mem_cgroup, swap_events_file),
.seq_show = swap_events_show,
},
+ {
+ .name = "swap.tiers",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_tier_show,
+ .write = swap_tier_write,
+ },
+ {
+ .name = "swap.tiers.effective",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_tier_effective_show,
+ },
{ } /* terminate */
};

diff --git a/mm/swap_state.c b/mm/swap_state.c
index de285b36e31c..2fda6b61e2de 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -1011,6 +1011,7 @@ static ssize_t tiers_store(struct kobject *kobj,
char *p, *token, *name, *tmp;
int ret = 0;
short prio;
+ int mask = 0;

tmp = kstrdup(buf, GFP_KERNEL);
if (!tmp)
@@ -1043,7 +1044,7 @@ static ssize_t tiers_store(struct kobject *kobj,
goto restore;
break;
case '-':
- ret = swap_tiers_remove(token + 1);
+ ret = swap_tiers_remove(token + 1, &mask);
if (ret)
goto restore;
break;
@@ -1053,7 +1054,7 @@ static ssize_t tiers_store(struct kobject *kobj,
}
}

- if (!swap_tiers_update()) {
+ if (!swap_tiers_update(mask)) {
ret = -EINVAL;
goto restore;
}
diff --git a/mm/swap_tier.c b/mm/swap_tier.c
index 6b57cadb3e95..9c180f55a4e9 100644
--- a/mm/swap_tier.c
+++ b/mm/swap_tier.c
@@ -253,7 +253,7 @@ int swap_tiers_add(const char *name, int prio)
return ret;
}

-int swap_tiers_remove(const char *name)
+int swap_tiers_remove(const char *name, int *mask)
{
int ret = 0;
struct swap_tier *tier;
@@ -276,6 +276,7 @@ int swap_tiers_remove(const char *name)
list_prev_entry(tier, list)->prio = DEF_SWAP_PRIO;

swap_tier_inactivate(tier);
+ *mask |= TIER_MASK(tier);

return ret;
}
@@ -344,7 +345,26 @@ void swap_tiers_assign_dev(struct swap_info_struct *swp)
swp->tier_mask = TIER_DEFAULT_MASK;
}

-bool swap_tiers_update(void)
+#ifdef CONFIG_MEMCG
+static void swap_tier_memcg_propagate(int mask)
+{
+ struct mem_cgroup *child;
+
+ rcu_read_lock();
+ for_each_mem_cgroup_tree(child, root_mem_cgroup) {
+ WRITE_ONCE(child->tier_mask, child->tier_mask | mask);
+ WRITE_ONCE(child->tier_effective_mask,
+ child->tier_effective_mask | mask);
+ }
+ rcu_read_unlock();
+}
+#else
+static void swap_tier_memcg_propagate(int mask)
+{
+}
+#endif
+
+bool swap_tiers_update(int mask)
{
struct swap_tier *tier;
struct swap_info_struct *swp;
@@ -375,5 +395,88 @@ bool swap_tiers_update(void)
swap_tiers_assign_dev(swp);
}

+ /*
+ * When a tier is removed, its index (bit position in the mask) becomes
+ * free for reassignment to a future tier. If a memcg had previously
+ * disabled this tier (cleared the bit in its swap.tiers file), the
+ * effective mask would keep that bit clear -- meaning the new tier at
+ * the same index would be silently unavailable, an invisible cgroup
+ * constraint left behind by a tier that no longer exists.
+ *
+ * To prevent this, OR the removed tier's mask bit into every memcg's
+ * tier_mask and tier_effective_mask. This resets the bit so the new
+ * tier is accessible by default; users who want to restrict it must
+ * explicitly disable it after the tier is re-created.
+ */
+ if (mask)
+ swap_tier_memcg_propagate(mask);
+
return true;
}
+
+#ifdef CONFIG_MEMCG
+void swap_tiers_mask_show(struct seq_file *m, int mask)
+{
+ struct swap_tier *tier;
+
+ spin_lock(&swap_tier_lock);
+ for_each_active_tier(tier) {
+ if (mask & TIER_MASK(tier))
+ seq_printf(m, "%s ", tier->name);
+ }
+ spin_unlock(&swap_tier_lock);
+ seq_puts(m, "\n");
+}
+
+int swap_tiers_mask_lookup(const char *name)
+{
+ struct swap_tier *tier;
+
+ lockdep_assert_held(&swap_tier_lock);
+
+ for_each_active_tier(tier) {
+ if (!strcmp(name, tier->name))
+ return TIER_MASK(tier);
+ }
+
+ return 0;
+}
+
+static void __swap_tier_memcg_inherit_mask(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent)
+{
+ int parent_mask = parent
+ ? READ_ONCE(parent->tier_effective_mask)
+ : TIER_ALL_MASK;
+
+ WRITE_ONCE(memcg->tier_effective_mask,
+ parent_mask & READ_ONCE(memcg->tier_mask));
+}
+
+/* Computes the initial effective mask from the parent's effective mask. */
+void swap_tiers_memcg_inherit_mask(struct mem_cgroup *memcg)
+{
+ spin_lock(&swap_tier_lock);
+ rcu_read_lock();
+ memcg->tier_mask = TIER_ALL_MASK;
+ __swap_tier_memcg_inherit_mask(memcg, parent_mem_cgroup(memcg));
+ rcu_read_unlock();
+ spin_unlock(&swap_tier_lock);
+}
+
+/*
+ * Called when a memcg's tier_mask is modified. Walks the subtree
+ * and recomputes each descendant's effective mask against its parent.
+ */
+void swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *child;
+
+ lockdep_assert_held(&swap_tier_lock);
+
+ rcu_read_lock();
+ for_each_mem_cgroup_tree(child, memcg)
+ __swap_tier_memcg_inherit_mask(child, parent_mem_cgroup(child));
+ rcu_read_unlock();
+}
+#endif
diff --git a/mm/swap_tier.h b/mm/swap_tier.h
index 3e355f857363..49433dcaa1ce 100644
--- a/mm/swap_tier.h
+++ b/mm/swap_tier.h
@@ -10,22 +10,66 @@ struct swap_info_struct;

extern spinlock_t swap_tier_lock;

-#define TIER_ALL_MASK (~0)
-#define TIER_DEFAULT_IDX (31)
-#define TIER_DEFAULT_MASK (1U << TIER_DEFAULT_IDX)
-
/* Initialization and application */
void swap_tiers_init(void);
ssize_t swap_tiers_sysfs_show(char *buf);

int swap_tiers_add(const char *name, int prio);
-int swap_tiers_remove(const char *name);
+int swap_tiers_remove(const char *name, int *mask);

void swap_tiers_snapshot(void);
void swap_tiers_snapshot_restore(void);
-bool swap_tiers_update(void);
+bool swap_tiers_update(int mask);

/* Tier assignment */
void swap_tiers_assign_dev(struct swap_info_struct *swp);

+#define TIER_ALL_MASK (~0)
+#define TIER_DEFAULT_IDX (31)
+#define TIER_DEFAULT_MASK (1U << TIER_DEFAULT_IDX)
+
+#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG)
+/* Memcg related functions */
+void swap_tiers_mask_show(struct seq_file *m, int mask);
+void swap_tiers_memcg_inherit_mask(struct mem_cgroup *memcg);
+void swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg);
+int swap_tiers_mask_lookup(const char *name);
+static inline int folio_tier_effective_mask(struct folio *folio)
+{
+ struct mem_cgroup *memcg;
+ int mask = TIER_ALL_MASK;
+
+ rcu_read_lock();
+ memcg = folio_memcg(folio);
+ if (memcg)
+ mask = READ_ONCE(memcg->tier_effective_mask);
+ rcu_read_unlock();
+
+ return mask;
+}
+#else
+static inline void swap_tiers_mask_show(struct seq_file *m, int mask) {}
+static inline void swap_tiers_memcg_inherit_mask(struct mem_cgroup *memcg) {}
+static inline void swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg) {}
+static inline int swap_tiers_mask_lookup(const char *name)
+{
+ return 0;
+}
+static inline int folio_tier_effective_mask(struct folio *folio)
+{
+ return TIER_ALL_MASK;
+}
+#endif
+
+/**
+ * swap_tiers_mask_test - Check if the tier mask is valid
+ * @tier_mask: The tier mask to check
+ * @mask: The mask to compare against
+ *
+ * Return: true if condition matches, false otherwise
+ */
+static inline bool swap_tiers_mask_test(int tier_mask, int mask)
+{
+ return tier_mask & mask;
+}
#endif /* _SWAP_TIER_H */
--
2.34.1