Re: [RFC PATCH v2 3/7] mm, swap: support physical swap as a vswap backend
From: Yosry Ahmed
Date: Tue Jun 23 2026 - 15:02:09 EST
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 466f8a182716..5daff7a25f67 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -993,6 +993,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> struct folio *folio;
> struct mempolicy *mpol;
> struct swap_info_struct *si;
> + swp_entry_t phys = {};
> int ret = 0;
>
> /* try to allocate swap cache folio */
> @@ -1000,16 +1001,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> if (!si)
> return -EEXIST;
>
> - /*
> - * Vswap entries have no physical backing - writeback would fail
> - * and SIGBUS the caller. Bail before we waste a swap-cache folio
> - * allocation.
> - */
> - if (si->flags & SWP_VSWAP) {
> - put_swap_device(si);
> - return -EINVAL;
> - }
> -
> mpol = get_task_policy(current);
> folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, BIT(0), NULL, mpol,
> NO_INTERLEAVE_INDEX);
> @@ -1028,40 +1019,78 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> /*
> * folio is locked, and the swapcache is now secured against
> * concurrent swapping to and from the slot, and concurrent
> - * swapoff so we can safely dereference the zswap tree here.
> - * Verify that the swap entry hasn't been invalidated and recycled
> - * behind our backs, to avoid overwriting a new swap folio with
> - * old compressed data. Only when this is successful can the entry
> - * be dereferenced.
> + * swapoff so we can safely dereference the zswap tree (or vswap
> + * vtable) here. Verify that the swap entry hasn't been
> + * invalidated and recycled behind our backs, to avoid overwriting
> + * a new swap folio with old compressed data. Only when this is
> + * successful can the entry be dereferenced.
> */
> - tree = swap_zswap_tree(swpentry);
> - if (entry != xa_load(tree, offset)) {
> - ret = -ENOMEM;
> - goto out;
> + if (swap_is_vswap(si)) {
> + if (entry != vswap_zswap_load(swpentry)) {
> + ret = -ENOMEM;
> + goto out;
> + }
> + /*
> + * Allocate physical backing BEFORE decompress - if it fails,
> + * no wasted work. folio_realloc_swap sets vtable to PHYS,
> + * overwriting ZSWAP - the old entry pointer is only held
> + * by the caller now.
> + */
> + phys = folio_realloc_swap(folio);
> + if (!phys.val) {
> + ret = -ENOMEM;
> + goto out;
> + }
> + } else {
> + tree = swap_zswap_tree(swpentry);
> + if (entry != xa_load(tree, offset)) {
> + ret = -ENOMEM;
> + goto out;
> + }
There's a lot of divergence in the code (in this patch and previous
ones). Seems like a lot of it is to do xarray operations vs vswap
operations. I wonder if we can abstract these into helpers, e.g.
zswap_tree_store(), zswap_tree_load(), etc. Maybe the name is not the
best, but you get the point :)
Here we can then do zswap_tree_load() for both code paths and only the
folio_realloc_swap() needs to be different for vswap. We can do
similar cleanups for the load/store paths as well.
> }
>
> if (!zswap_decompress(entry, folio)) {
> ret = -EIO;
> + /*
> + * For vswap: folio_realloc_swap already moved the entry
> + * out of the vtable. Restore it via vswap_zswap_store so
> + * the entry stays tracked (and the just-allocated PHYS
> + * slot is freed). For non-vswap: entry is still in the
> + * zswap tree.
> + */
> + if (swap_is_vswap(si) && phys.val)
> + vswap_zswap_store(swpentry, entry);
Should this go in the cleanup path instead (i.e. in the 'out' label?).
> goto out;
> }
>
> - xa_erase(tree, offset);
> + if (!swap_is_vswap(si))
> + xa_erase(tree, offset);
Maybe this can also be abstracted into a helper, but I wonder what the
corresponding vswap operation would be. I think folio_realloc_swap()
will have already "erased" the zswap entry from vswap. Maybe have a
vswap helper that will only remove it if it's a zswap entry? We can
probably do a lockless check first to make it cheap?
It's probably silly to do this, and maybe there's a better way.
Generally, I think the code would be easier to follow if we abstract
away the xarray vs. vswap stuff into helpers (where it's reasonable).
>
> count_vm_event(ZSWPWB);
> if (entry->objcg)
> count_objcg_events(entry->objcg, ZSWPWB, 1);
>
> - zswap_entry_free(entry);
> -
> /* folio is up to date */
> folio_mark_uptodate(folio);
>
> /* move it to the tail of the inactive list after end_writeback */
> folio_set_reclaim(folio);
>
> - /* start writeback */
> - ret = __swap_writepage(folio, NULL);
> - WARN_ON_ONCE(ret);
> + /*
> + * Start writeback. __swap_writepage_phys is void; __swap_writepage
> + * returns 0 today (async IO errors surface in the bio end_io
> + * callback). Either way the entry has been moved out of its prior
> + * location (vtable PHYS for vswap, removed from tree otherwise),
> + * so we own the free.
> + */
> + if (swap_is_vswap(si)) {
> + __swap_writepage_phys(folio, NULL, phys);
> + } else {
> + ret = __swap_writepage(folio, NULL);
> + WARN_ON_ONCE(ret);
> + }
> +
> + zswap_entry_free(entry);
>
> out:
> if (ret) {
> @@ -1212,6 +1241,18 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
> if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
> return 0;
>
> + /*
> + * With CONFIG_VSWAP, vswap-backed zswap entries need a physical
> + * swap slot allocated on demand (via folio_realloc_swap) for
> + * writeback. If no physical slots are available, writeback will
> + * fail - skip the shrinker to avoid spinning on entries we cannot
> + * drain. Vanilla zswap-on-swapfile is unaffected because every
> + * zswap entry already has a backing slot; gate on CONFIG_VSWAP so
> + * the check compiles out there.
> + */
> + if (IS_ENABLED(CONFIG_VSWAP) && !get_nr_swap_pages())
> + return 0;
> +
> /*
> * The shrinker resumes swap writeback, which will enter block
> * and may enter fs. XXX: Harmonize with vmscan.c __GFP_FS
> @@ -1558,7 +1599,7 @@ bool zswap_store(struct folio *folio)
> * writeback could overwrite the new data in the swapfile.
> */
> if (partial_store && is_vswap_entry(swp))
> - folio_release_vswap_backing(folio);
> + folio_release_non_phys_swap_backing(folio);
> else if (!ret && !is_vswap_entry(swp)) {
> unsigned type = swp_type(swp);
> pgoff_t offset = swp_offset(swp);
> --
> 2.53.0-Meta
>