Re: [PATCH v4 1/5] mm/zswap: Extend shrink_memcg() writeback capability

From: Hao Jia

Date: Tue Jun 23 2026 - 09:23:50 EST

On 2026/6/23 07:33, Yosry Ahmed wrote:

On Thu, Jun 18, 2026 at 12:48:53PM +0800, Hao Jia wrote:

From: Hao Jia <jiahao1@xxxxxxxxxxx>

diff --git a/mm/zswap.c b/mm/zswap.c
index 761cd699e0a3..d7d031dee4cd 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -160,6 +160,11 @@ struct zswap_pool {
char tfm_name[CRYPTO_MAX_ALG_NAME];
};
+struct zswap_shrink_walk_arg {
+ unsigned long bytes_written;
+ bool encountered_page_in_swapcache;
+};
+
/* Global LRU lists shared by all zswap pools. */
static struct list_lru zswap_list_lru;
@@ -1089,8 +1094,9 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
void *arg)
{
struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
- bool *encountered_page_in_swapcache = (bool *)arg;
+ struct zswap_shrink_walk_arg *walk_arg = arg;
swp_entry_t swpentry;
+ unsigned int length;
enum lru_status ret = LRU_REMOVED_RETRY;
int writeback_result;
@@ -1135,8 +1141,13 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
* Once the lru lock is dropped, the entry might get freed. The
* swpentry is copied to the stack, and entry isn't deref'd again
* until the entry is verified to still be alive in the tree.
+ *
+ * entry->length is also copied while the lock is held, because
+ * zswap_writeback_entry() frees the entry on success and we still
+ * need its compressed size to account for writeback.

Hmm that's unnecessary, just update "The swpentry is copied to the
stack.." above to "Copy neded fields to the stack.." or something.

I'll do this, thanks.

*/
swpentry = entry->swpentry;
+ length = entry->length;
/*
* It's safe to drop the lock here because we return either
@@ -1155,12 +1166,13 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
* into the warmer region. We should terminate shrinking (if we're in the dynamic
* shrinker context).
*/
- if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
+ if (writeback_result == -EEXIST) {
ret = LRU_STOP;
- *encountered_page_in_swapcache = true;
+ walk_arg->encountered_page_in_swapcache = true;
}
} else {
zswap_written_back_pages++;
+ walk_arg->bytes_written += length;
}
return ret;
@@ -1169,8 +1181,11 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
struct shrink_control *sc)
{
+ struct zswap_shrink_walk_arg walk_arg = {
+ .bytes_written = 0,
+ .encountered_page_in_swapcache = false,
+ };
unsigned long shrink_ret;
- bool encountered_page_in_swapcache = false;
if (!zswap_shrinker_enabled ||
!mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
@@ -1179,9 +1194,9 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
}
shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb,
- &encountered_page_in_swapcache);
+ &walk_arg);
- if (encountered_page_in_swapcache)
+ if (walk_arg.encountered_page_in_swapcache)
return SHRINK_STOP;
return shrink_ret ? shrink_ret : SHRINK_STOP;
@@ -1275,10 +1290,32 @@ static struct shrinker *zswap_alloc_shrinker(void)
return shrinker;
}
-static int shrink_memcg(struct mem_cgroup *memcg)
-{
- int nid, shrunk = 0, scanned = 0;
+/*
+ * The maximum acceptable scan cost factor for writing back
+ * PAGE_SIZE bytes of compressed data.
+ */
+#define ZSWAP_WB_SCAN_FACTOR 16UL
+#define NR_ZSWAP_WB_BATCH 64UL
+/*
+ * Iterate over the per-node zswap LRUs of @memcg in batches, writing back
+ * up to @nr_to_writeback * PAGE_SIZE bytes of compressed data.
+ *
+ * Return: The number of bytes written back, or -ENOENT if @memcg has
+ * writeback disabled, is a zombie cgroup, or has empty zswap LRUs.
+ */
+static long shrink_memcg(struct mem_cgroup *memcg,
+ unsigned long nr_to_writeback)

Is nr_to_writeback supposed to be the number of pages we want to
writeback (regardless of their compressed size), or the compressed bytes
we want to writeback divided by PAGE_SIZE?

The way it's being used below seems like it's the latter, but the batch
size should be in terms of scanned pages (i.e. uncompressed pages). So
this is confusing.

The zswap_store() path expects to reclaim one uncompressed page, but
this will reclaim PAGE_SIZE worth of compressed memory when passing 1
IIUC (actually maybe more, see below).

+{
+ struct zswap_shrink_walk_arg walk_arg = {
+ .bytes_written = 0,
+ .encountered_page_in_swapcache = false,
+ };
+ u64 bytes_to_writeback = nr_to_writeback << PAGE_SHIFT;
+ bool memcg_list_is_empty = true;
+ int nid;
+
+ /* Memcg with zswap writeback disabled are not candidates. */

The comment is unnecessary here, it should be obvious.

I'll do this, thanks.

if (!mem_cgroup_zswap_writeback_enabled(memcg))
return -ENOENT;
@@ -1290,24 +1327,65 @@ static int shrink_memcg(struct mem_cgroup *memcg)
return -ENOENT;
for_each_node_state(nid, N_NORMAL_MEMORY) {
- unsigned long nr_to_walk = 1;
+ unsigned long nr_to_scan, nr_scanned = 0;
+ unsigned long remain;
+ walk_arg.encountered_page_in_swapcache = false;
+ /*
+ * Cap by LRU length: bounds rewalks when referenced
+ * entries keep rotating to the tail.
+ */
+ nr_to_scan = list_lru_count_one(&zswap_list_lru, nid, memcg);
+ if (!nr_to_scan)
+ continue;

Hmm generally if we are running out of pages to scan then we should scan
the rotated entries, and reclaim them on the second pass, right? So this
should be working as intended. But I guess this doesn't work well when
iterating multiple memcgs, as we don't want to drain referenced entries
in one memcg before reclaiming already rotated entries on another.

So I think the assumption here is that the caller will retry if needed,
handling balancing scanning between multiple memcgs if needed. Maybe we
should document this in the function doc above? We should explain that
referenced entries will be rotated but not reclaimed as part of the same
call.

+ memcg_list_is_empty = false;
+
+ /*
+ * Cap by SCAN_FACTOR * remain budget: bounds scan cost
+ * to the remaining writeback budget.
+ */
+ remain = DIV_ROUND_UP(bytes_to_writeback - walk_arg.bytes_written, PAGE_SIZE);
+ nr_to_scan = min(nr_to_scan,
+ remain * ZSWAP_WB_SCAN_FACTOR);

For the zswap_store() path bytes_to_writeback=PAGE_SIZE, so remain will
initially be 1. But then we multiply by this factor and now to scan 16
pages? Also, where did this factor and equation come from?

We'll also loop over nodes, so we may end up scanning 32 or more pages
depending on the number of nodes in the system.

If this is just a heuristic, we should really just start simple and add
heuristics later as needed. The caller should probably pass in the
number of pages to scan (i.e. uncompressed pages), and leave it to the
caller to decide when to retry if the actual memory savings are
realized.

- shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
- &shrink_memcg_cb, NULL, &nr_to_walk);
- scanned += 1 - nr_to_walk;
+ while (nr_scanned < nr_to_scan) {
+ unsigned long nr_to_walk = min(NR_ZSWAP_WB_BATCH,
+ nr_to_scan - nr_scanned);
+
+ /*
+ * Account for the committed budget rather than the walker's
+ * actual delta. If the list is emptied concurrently, the
+ * walker visits nothing and nr_scanned would never advance.
+ */
+ nr_scanned += nr_to_walk;
+
+ list_lru_walk_one(&zswap_list_lru, nid, memcg,
+ &shrink_memcg_cb,
+ &walk_arg,
+ &nr_to_walk);
+
+ if (walk_arg.bytes_written >= bytes_to_writeback)
+ return walk_arg.bytes_written;
+
+ if (walk_arg.encountered_page_in_swapcache)
+ break;
+
+ cond_resched();
+ }

If the caller is expected to have a retry loop anyway, should we
simplify this and just scan each per-node LRU once?

We should also probably bail early if the number of scanned pages has
already been reached? Currently shrink_memcg() scans one page at a time,
so if it scans a bit more to balance between the nodes it's probably
fine.

But with batching, we could end up scanning hundres of extra pages just
to balance between all nodes. Is node imbalance a real issue?

My initial thought was that if cold memory is evenly distributed across nodes and we are doing a large writeback, it would be better to balance the zswap entry writeback across all nodes rather than just draining node 0 first. However, since we currently lack a proper metric to represent hot/cold memory (such as age-based tracking), doing this probably doesn't make much sense right now.

So, perhaps we want something like this? Please correct me if I'm wrong.

static long shrink_memcg(struct mem_cgroup *memcg,
unsigned long nr_to_scan)
{
struct zswap_shrink_walk_arg walk_arg = {
.bytes_written = 0,
.encountered_page_in_swapcache = false,
};
unsigned long nr_remaining = nr_to_scan;
bool memcg_list_is_empty = true;
int nid;

if (!mem_cgroup_zswap_writeback_enabled(memcg))
return -ENOENT;

if (memcg && !mem_cgroup_online(memcg))
return -ENOENT;

for_each_node_state(nid, N_NORMAL_MEMORY) {
unsigned long nr_to_walk;

/*
* Cap the per-node scan by the current LRU length. A referenced
* entry is only rotated to the tail (second chance) and may be
* revisited within a single walk; without this cap those rotated
* entries could drain the shared scan budget on one node.
*/
nr_to_walk = min(nr_remaining,
list_lru_count_one(&zswap_list_lru, nid, memcg));
if (!nr_to_walk)
continue;
memcg_list_is_empty = false;

nr_remaining -= nr_to_walk;
list_lru_walk_one(&zswap_list_lru, nid, memcg,
&shrink_memcg_cb, &walk_arg, &nr_to_walk);
/* Return the unused share of the budget to the pool. */
nr_remaining += nr_to_walk;

/* Bail out once the whole scan budget has been spent. */
if (!nr_remaining)
break;

cond_resched();
}

if (memcg_list_is_empty)
return -ENOENT;

return walk_arg.bytes_written;
}

Thanks,
Hao