Re: [RFC PATCH v2 3/9] mm/zswap: support fully zswap-backed large folio loads

From: Nhat Pham

Date: Fri May 29 2026 - 14:26:07 EST

On Fri, May 29, 2026 at 5:19 AM fujunjie <fujunjie1@xxxxxx> wrote:
>
> zswap currently refuses large swapcache folios. That is correct for mixed
> backend ranges, but it also prevents the common swapin path from loading a
> range that is still fully backed by zswap.
>
> Teach zswap_load() to fill a locked large swapcache folio by decompressing
> each base-page entry into the matching folio offset, then flushing the
> folio once. A missing entry after zswap data has been seen is reported as
> -EAGAIN so the caller can drop the speculative large folio and retry
> order-0.
>
> The large load keeps the zswap entries in place. It is a clean speculative
> fill: until the swap slots are freed, zswap remains the backing copy if
> reclaim drops the large folio before PTEs are installed.
>
> Signed-off-by: fujunjie <fujunjie1@xxxxxx>
> ---
> mm/zswap.c | 105 ++++++++++++++++++++++++++++++++++++++++++++---------
> 1 file changed, 87 insertions(+), 18 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index da5297f7bd69..94ba112a2982 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -15,6 +15,8 @@
>
> #include <linux/module.h>
> #include <linux/cpu.h>
> +#include <linux/mm.h>
> +#include <linux/huge_mm.h>
> #include <linux/highmem.h>
> #include <linux/slab.h>
> #include <linux/spinlock.h>
> @@ -934,7 +936,8 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
> return comp_ret == 0 && alloc_ret == 0;
> }
>
> -static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> +static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio,
> + unsigned int page_idx, bool flush_dcache)
> {
> struct zswap_pool *pool = entry->pool;
> struct scatterlist input[2]; /* zsmalloc returns an SG list 1-2 entries */
> @@ -952,14 +955,15 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>
> WARN_ON_ONCE(input->length != PAGE_SIZE);
>
> - dst = kmap_local_folio(folio, 0);
> + dst = kmap_local_folio(folio, page_idx * PAGE_SIZE);
> memcpy_from_sglist(dst, input, 0, PAGE_SIZE);
> dlen = PAGE_SIZE;
> kunmap_local(dst);
> - flush_dcache_folio(folio);
> + if (flush_dcache)
> + flush_dcache_folio(folio);
> } else {
> sg_init_table(&output, 1);
> - sg_set_folio(&output, folio, PAGE_SIZE, 0);
> + sg_set_folio(&output, folio, PAGE_SIZE, page_idx * PAGE_SIZE);
> acomp_request_set_params(acomp_ctx->req, input, &output,
> entry->length, PAGE_SIZE);
> ret = crypto_acomp_decompress(acomp_ctx->req);
> @@ -1042,7 +1046,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> goto out;
> }
>
> - if (!zswap_decompress(entry, folio)) {
> + if (!zswap_decompress(entry, folio, 0, true)) {
> ret = -EIO;
> goto out;
> }
> @@ -1615,10 +1619,9 @@ enum zswap_range_state zswap_probe_range(swp_entry_t swp,
> * NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page()
> * will SIGBUS).
> *
> - * -EINVAL: if the swapped out content was in zswap, but the page belongs
> - * to a large folio, which is not supported by zswap. The folio is unlocked,
> - * but NOT marked up-to-date, so that an IO error is emitted (e.g.
> - * do_swap_page() will SIGBUS).
> + * -EAGAIN: if the swapped out content belongs to a large folio, but the
> + * range is mixed or raced with writeback. The folio remains locked so the
> + * caller can drop the large swapcache folio and retry order-0.
> *
> * -ENOENT: if the swapped out content was not in zswap. The folio remains
> * locked on return.
> @@ -1626,9 +1629,12 @@ enum zswap_range_state zswap_probe_range(swp_entry_t swp,
> int zswap_load(struct folio *folio)
> {
> swp_entry_t swp = folio->swap;
> + unsigned int nr_pages = folio_nr_pages(folio);
> + unsigned int type = swp_type(swp);
> pgoff_t offset = swp_offset(swp);
> - struct xarray *tree = swap_zswap_tree(swp);
> + struct xarray *tree;
> struct zswap_entry *entry;
> + unsigned int i;
>
> VM_WARN_ON_ONCE(!folio_test_locked(folio));
> VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> @@ -1636,21 +1642,84 @@ int zswap_load(struct folio *folio)
> if (zswap_never_enabled())
> return -ENOENT;
>
> - /*
> - * Large folios should not be swapped in while zswap is being used, as
> - * they are not properly handled. Zswap does not properly load large
> - * folios, and a large folio may only be partially in zswap.
> - */
> - if (WARN_ON_ONCE(folio_test_large(folio))) {
> + if (folio_test_large(folio)) {
> + struct obj_cgroup *first_objcg = NULL;
> + bool same_objcg = true;
> + bool saw_zswap = false;
> + bool saw_non_zswap = false;
> +
> + /*
> + * The locked large swapcache folio now covers the range and
> + * conflicts with zswap writeback's order-0 swapcache allocation.
> + * If the range is mixed or an entry disappears, retry order-0.
> + */
> + for (i = 0; i < nr_pages; i++) {
> + tree = swap_zswap_tree(swp_entry(type, offset + i));
> + entry = xa_load(tree, offset + i);
> + if (!entry) {
> + if (saw_zswap)
> + return -EAGAIN;
> + saw_non_zswap = true;
> + continue;
> + }

Can we use the xas_load() API here instead of walking down the tree
from the root again for every index?

> + if (saw_non_zswap)
> + return -EAGAIN;
> +
> + if (!saw_zswap)
> + first_objcg = entry->objcg;
> + else if (entry->objcg != first_objcg)
> + same_objcg = false;

Can the entries backing a single large folio actually have different
objcgs at this point?

> + saw_zswap = true;
> + }
> + if (!saw_zswap)
> + return -ENOENT;
> +
> + for (i = 0; i < nr_pages; i++) {
> + tree = swap_zswap_tree(swp_entry(type, offset + i));
> + entry = xa_load(tree, offset + i);
> + if (!entry)
> + return -EAGAIN;
> +
> + if (!zswap_decompress(entry, folio, i, false)) {
> + folio_unlock(folio);
> + return -EIO;
> + }
> + }
> +
> + flush_dcache_folio(folio);
> + /*
> + * Keep zswap entries until swap slots are freed. This is a clean
> + * speculative fill; zswap remains the backing copy if reclaim
> + * drops the large folio before PTEs are installed.
> + */
> + folio_mark_uptodate(folio);
> + count_vm_events(ZSWPIN, nr_pages);
> + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
> +
> + if (same_objcg) {
> + if (first_objcg)
> + count_objcg_events(first_objcg, ZSWPIN, nr_pages);
> + } else {
> + for (i = 0; i < nr_pages; i++) {
> + tree = swap_zswap_tree(swp_entry(type, offset + i));
> + entry = xa_load(tree, offset + i);
> + if (WARN_ON_ONCE(!entry))
> + continue;
> + if (entry->objcg)
> + count_objcg_events(entry->objcg, ZSWPIN, 1);

Same question here: could xas_load() be used instead of repeated
xa_load() calls?

> + }
> + }
> +
> folio_unlock(folio);
> - return -EINVAL;
> + return 0;
> }

>
> + tree = swap_zswap_tree(swp);
> entry = xa_load(tree, offset);
> if (!entry)
> return -ENOENT;
>
> - if (!zswap_decompress(entry, folio)) {
> + if (!zswap_decompress(entry, folio, 0, true)) {
> folio_unlock(folio);
> return -EIO;
> }

I wonder how much of these two paths (order 0 and larger order) can be
unified...

> --
> 2.34.1
>