Re: [PATCH 2/3] mm/zswap: Implement proactive writeback

From: Nhat Pham

Date: Mon May 11 2026 - 15:57:10 EST


On Mon, May 11, 2026 at 3:52 AM Hao Jia <jiahao.kernel@xxxxxxxxx> wrote:
>
> From: Hao Jia <jiahao1@xxxxxxxxxxx>
>
> Zswap currently writes back pages to backing swap devices reactively,
> triggered either by memory pressure via the shrinker or by the pool
> reaching its size limit. This reactive approach offers no precise
> control over when writeback happens, which can disturb latency-sensitive
> workloads, and it cannot direct writeback at a specific memory cgroup.
> However, there are scenarios where users might want to proactively
> write back cold pages from zswap to the backing swap device, for
> example, to free up memory for other applications or to prepare for
> upcoming memory-intensive workloads.
>
> Therefore, implement a proactive writeback mechanism for zswap by
> adding a new cgroup interface file memory.zswap.proactive_writeback
> within the memory controller.
>
> Users can trigger writeback by writing to this file with the following
> parameters:
> - max=<bytes>: The maximum amount of memory to write back (optional,
> default: unlimited).
> - <age>: The minimum age, in seconds, of the pages to write back
> (required). Only pages that have been in zswap for at least this
> duration will be written back.
>
> Example usage:
> # Write back pages older than 1 hour (3600 seconds), max 10MB
> echo "max=10M 3600" > memory.zswap.proactive_writeback
>
> The implementation consists of:
> 1. Add store_time to struct zswap_entry to record when each entry was
> inserted into zswap, used for proactive writeback age comparison.
> 2. Introduce struct zswap_shrink_walk_arg, passed as the cb_arg to
> list_lru_walk_one() in both the shrinker and proactive paths. It
> carries the per-invocation cutoff_time and proactive flag down to
> shrink_memcg_cb(), and propagates the encountered_page_in_swapcache
> out-signal from the callback back to the caller.
> 3. Modify the callback function shrink_memcg_cb() to proactively
> write back zswap entries that meet the time threshold.
> 4. Add zswap_proactive_writeback() as the proactive writeback driver:
> a per-node batched list_lru_walk_one() loop bounded by the
> writeback budget.
>
> Signed-off-by: Hao Jia <jiahao1@xxxxxxxxxxx>
> ---
> Documentation/admin-guide/cgroup-v2.rst | 24 ++++
> include/linux/zswap.h | 8 ++
> mm/memcontrol.c | 76 ++++++++++
> mm/zswap.c | 176 ++++++++++++++++++++++--
> 4 files changed, 276 insertions(+), 8 deletions(-)
>
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index 6efd0095ed99..05b664b3b3e8 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -1908,6 +1908,30 @@ The following nested keys are defined.
> This setting has no effect if zswap is disabled, and swapping
> is allowed unless memory.swap.max is set to 0.
>
> + memory.zswap.proactive_writeback
> + A write-only nested-keyed file which exists in non-root cgroups.
> +
> + This interface allows proactive writeback of pages from the zswap
> + pool to the backing swap device. This is useful to offload cold
> + pages from the zswap pool to the slower swap device. It is only
> + available if zswap writeback is enabled.
> +
> + Users can trigger writeback by writing to this file with the following
> + parameters:
> +
> + - "max=<bytes>" : Optional. The maximum amount of data to write back.
> + (default: unlimited). Please note that the kernel may write back
> + more or less than this value.
> +
> + - "<age>" : Required. The minimum age of the pages to write back
> + (in seconds). Only pages that have been in the zswap pool for at
> + least this amount of time will be written back.
> +
> + Example::
> +
> + # Write back pages older than 1 hour (3600 seconds), max 10MB
> + echo "max=10M 3600" > memory.zswap.proactive_writeback
> +
> memory.pressure
> A read-only nested-keyed file.
>
> diff --git a/include/linux/zswap.h b/include/linux/zswap.h
> index efa6b551217e..7a51b4f95017 100644
> --- a/include/linux/zswap.h
> +++ b/include/linux/zswap.h
> @@ -44,6 +44,8 @@ void zswap_lruvec_state_init(struct lruvec *lruvec);
> void zswap_folio_swapin(struct folio *folio);
> bool zswap_is_enabled(void);
> bool zswap_never_enabled(void);
> +int zswap_proactive_writeback(struct mem_cgroup *root, unsigned long nr_max_writeback,
> + ktime_t cutoff);
> #else
>
> struct zswap_lruvec_state {};
> @@ -78,6 +80,12 @@ static inline bool zswap_never_enabled(void)
> return true;
> }
>
> +static inline int zswap_proactive_writeback(struct mem_cgroup *root,
> + unsigned long nr_max_writeback, ktime_t cutoff)
> +{
> + return 0;
> +}
> +
> #endif
>
> #endif /* _LINUX_ZSWAP_H */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 409c41359dc8..ba7f7b1954a8 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -70,6 +70,7 @@
> #include "memcontrol-v1.h"
>
> #include <linux/uaccess.h>
> +#include <linux/parser.h>
>
> #define CREATE_TRACE_POINTS
> #include <trace/events/memcg.h>
> @@ -5891,6 +5892,76 @@ static ssize_t zswap_writeback_write(struct kernfs_open_file *of,
> return nbytes;
> }
>
> +enum {
> + ZSWAP_WRITEBACK_MAX,
> + ZSWAP_WRITEBACK_AGE,
> + ZSWAP_WRITEBACK_ERR,
> +};
> +
> +static const match_table_t zswap_writeback_tokens = {
> + { ZSWAP_WRITEBACK_MAX, "max=%s" },
> + { ZSWAP_WRITEBACK_AGE, "%u" },
> + { ZSWAP_WRITEBACK_ERR, NULL },
> +};
> +
> +static ssize_t zswap_proactive_writeback_write(struct kernfs_open_file *of,
> + char *buf, size_t nbytes,
> + loff_t off)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> + unsigned long nr_max_writeback = ULONG_MAX;
> + substring_t args[MAX_OPT_ARGS];
> + unsigned int age_sec;
> + bool age_set = false;
> + ktime_t cutoff_time;
> + char *token, *end;
> + int err;
> +
> + if (!mem_cgroup_zswap_writeback_enabled(memcg))
> + return -EINVAL;
> +
> + buf = strstrip(buf);
> +
> + while ((token = strsep(&buf, " ")) != NULL) {
> + if (!strlen(token))
> + continue;
> +
> + switch (match_token(token, zswap_writeback_tokens, args)) {
> + case ZSWAP_WRITEBACK_MAX:
> + nr_max_writeback = memparse(args[0].from, &end);
> + if (*end != '\0')
> + return -EINVAL;
> + nr_max_writeback >>= PAGE_SHIFT;
> + break;
> + case ZSWAP_WRITEBACK_AGE:
> + if (age_set)
> + return -EINVAL;
> +
> + if (match_uint(&args[0], &age_sec))
> + return -EINVAL;
> + age_set = true;
> + break;
> + default:
> + return -EINVAL;
> + }
> + }
> +
> + if (!age_set || !age_sec || !nr_max_writeback)
> + return -EINVAL;
> +
> + cutoff_time = ktime_sub(ktime_get_boottime(),
> + ns_to_ktime((u64)age_sec * NSEC_PER_SEC));
> + /* age_sec >= uptime: no entry can be that old, skip the walk. */
> + if (ktime_to_ns(cutoff_time) <= 0)
> + return nbytes;
> +
> + err = zswap_proactive_writeback(memcg, nr_max_writeback, cutoff_time);
> + if (err)
> + return err;
> +
> + return nbytes;
> +}
> +
> static struct cftype zswap_files[] = {
> {
> .name = "zswap.current",
> @@ -5908,6 +5979,11 @@ static struct cftype zswap_files[] = {
> .seq_show = zswap_writeback_show,
> .write = zswap_writeback_write,
> },
> + {
> + .name = "zswap.proactive_writeback",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .write = zswap_proactive_writeback_write,
> + },
> { } /* terminate */
> };
> #endif /* CONFIG_ZSWAP */
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 19538d6f169a..1173ac6836fa 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -36,6 +36,7 @@
> #include <linux/workqueue.h>
> #include <linux/list_lru.h>
> #include <linux/zsmalloc.h>
> +#include <linux/timekeeping.h>
>
> #include "swap.h"
> #include "internal.h"
> @@ -160,6 +161,12 @@ struct zswap_pool {
> char tfm_name[CRYPTO_MAX_ALG_NAME];
> };
>
> +struct zswap_shrink_walk_arg {
> + ktime_t cutoff_time;
> + bool proactive;
> + bool encountered_page_in_swapcache;
> +};
> +
> /* Global LRU lists shared by all zswap pools. */
> static struct list_lru zswap_list_lru;
>
> @@ -183,6 +190,7 @@ static struct shrinker *zswap_shrinker;
> * handle - zsmalloc allocation handle that stores the compressed page data
> * objcg - the obj_cgroup that the compressed memory is charged to
> * lru - handle to the pool's lru used to evict pages.
> + * store_time - Time when the entry was stored, for proactive writeback.
> */
> struct zswap_entry {
> swp_entry_t swpentry;
> @@ -192,6 +200,7 @@ struct zswap_entry {
> unsigned long handle;
> struct obj_cgroup *objcg;
> struct list_head lru;
> + ktime_t store_time;

On the implementation side: will this increase the memory footprint of
struct zswap_entry? If so, and if we are to go this route, can you
guard the new field behind a CONFIG option?