Re: [RFC PATCH v2 3/9] mm/zswap: support fully zswap-backed large folio loads

From: Fujunjie

Date: Sun May 31 2026 - 16:03:42 EST




On 5/30/2026 2:25 AM, Nhat Pham wrote:
> On Fri, May 29, 2026 at 5:19 AM fujunjie <fujunjie1@xxxxxx> wrote:
>>
>> zswap currently refuses large swapcache folios. That is correct for mixed
>> backend ranges, but it also prevents the common swapin path from loading a
>> range that is still fully backed by zswap.
>>
>> Teach zswap_load() to fill a locked large swapcache folio by decompressing
>> each base-page entry into the matching folio offset, then flushing the
>> folio once. A missing entry after zswap data has been seen is reported as
>> -EAGAIN so the caller can drop the speculative large folio and retry
>> order-0.
>>
>> The large load keeps the zswap entries in place. It is a clean speculative
>> fill: until the swap slots are freed, zswap remains the backing copy if
>> reclaim drops the large folio before PTEs are installed.
>>
>> Signed-off-by: fujunjie <fujunjie1@xxxxxx>
>> ---
>> mm/zswap.c | 105 ++++++++++++++++++++++++++++++++++++++++++++---------
>> 1 file changed, 87 insertions(+), 18 deletions(-)
>>
>> diff --git a/mm/zswap.c b/mm/zswap.c
>> index da5297f7bd69..94ba112a2982 100644
>> --- a/mm/zswap.c
>> +++ b/mm/zswap.c
>> @@ -15,6 +15,8 @@
>>
>> #include <linux/module.h>
>> #include <linux/cpu.h>
>> +#include <linux/mm.h>
>> +#include <linux/huge_mm.h>
>> #include <linux/highmem.h>
>> #include <linux/slab.h>
>> #include <linux/spinlock.h>
>> @@ -934,7 +936,8 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>> return comp_ret == 0 && alloc_ret == 0;
>> }
>>
>> -static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>> +static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio,
>> + unsigned int page_idx, bool flush_dcache)
>> {
>> struct zswap_pool *pool = entry->pool;
>> struct scatterlist input[2]; /* zsmalloc returns an SG list 1-2 entries */
>> @@ -952,14 +955,15 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>>
>> WARN_ON_ONCE(input->length != PAGE_SIZE);
>>
>> - dst = kmap_local_folio(folio, 0);
>> + dst = kmap_local_folio(folio, page_idx * PAGE_SIZE);
>> memcpy_from_sglist(dst, input, 0, PAGE_SIZE);
>> dlen = PAGE_SIZE;
>> kunmap_local(dst);
>> - flush_dcache_folio(folio);
>> + if (flush_dcache)
>> + flush_dcache_folio(folio);
>> } else {
>> sg_init_table(&output, 1);
>> - sg_set_folio(&output, folio, PAGE_SIZE, 0);
>> + sg_set_folio(&output, folio, PAGE_SIZE, page_idx * PAGE_SIZE);
>> acomp_request_set_params(acomp_ctx->req, input, &output,
>> entry->length, PAGE_SIZE);
>> ret = crypto_acomp_decompress(acomp_ctx->req);
>> @@ -1042,7 +1046,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>> goto out;
>> }
>>
>> - if (!zswap_decompress(entry, folio)) {
>> + if (!zswap_decompress(entry, folio, 0, true)) {
>> ret = -EIO;
>> goto out;
>> }
>> @@ -1615,10 +1619,9 @@ enum zswap_range_state zswap_probe_range(swp_entry_t swp,
>> * NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page()
>> * will SIGBUS).
>> *
>> - * -EINVAL: if the swapped out content was in zswap, but the page belongs
>> - * to a large folio, which is not supported by zswap. The folio is unlocked,
>> - * but NOT marked up-to-date, so that an IO error is emitted (e.g.
>> - * do_swap_page() will SIGBUS).
>> + * -EAGAIN: if the swapped out content belongs to a large folio, but the
>> + * range is mixed or raced with writeback. The folio remains locked so the
>> + * caller can drop the large swapcache folio and retry order-0.
>> *
>> * -ENOENT: if the swapped out content was not in zswap. The folio remains
>> * locked on return.
>> @@ -1626,9 +1629,12 @@ enum zswap_range_state zswap_probe_range(swp_entry_t swp,
>> int zswap_load(struct folio *folio)
>> {
>> swp_entry_t swp = folio->swap;
>> + unsigned int nr_pages = folio_nr_pages(folio);
>> + unsigned int type = swp_type(swp);
>> pgoff_t offset = swp_offset(swp);
>> - struct xarray *tree = swap_zswap_tree(swp);
>> + struct xarray *tree;
>> struct zswap_entry *entry;
>> + unsigned int i;
>>
>> VM_WARN_ON_ONCE(!folio_test_locked(folio));
>> VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
>> @@ -1636,21 +1642,84 @@ int zswap_load(struct folio *folio)
>> if (zswap_never_enabled())
>> return -ENOENT;
>>
>> - /*
>> - * Large folios should not be swapped in while zswap is being used, as
>> - * they are not properly handled. Zswap does not properly load large
>> - * folios, and a large folio may only be partially in zswap.
>> - */
>> - if (WARN_ON_ONCE(folio_test_large(folio))) {
>> + if (folio_test_large(folio)) {
>> + struct obj_cgroup *first_objcg = NULL;
>> + bool same_objcg = true;
>> + bool saw_zswap = false;
>> + bool saw_non_zswap = false;
>> +
>> + /*
>> + * The locked large swapcache folio now covers the range and
>> + * conflicts with zswap writeback's order-0 swapcache allocation.
>> + * If the range is mixed or an entry disappears, retry order-0.
>> + */
>> + for (i = 0; i < nr_pages; i++) {
>> + tree = swap_zswap_tree(swp_entry(type, offset + i));
>> + entry = xa_load(tree, offset + i);
>> + if (!entry) {
>> + if (saw_zswap)
>> + return -EAGAIN;
>> + saw_non_zswap = true;
>> + continue;
>> + }
>
> Can we use xas_load API here instead of traversing down the tree again
> and again?

I'll rework it to use xas_load(), while handling zswap tree boundaries correctly.

>
>> + if (saw_non_zswap)
>> + return -EAGAIN;
>> +
>> + if (!saw_zswap)
>> + first_objcg = entry->objcg;
>> + else if (entry->objcg != first_objcg)
>> + same_objcg = false;
>
> Can we get different objcg at this point?

The objcg pointers can be different in principle, for example if
the range is assembled from entries that came from different per-node objcgs
of the same memcg.

But for this accounting path, count_objcg_events() ultimately charges the
event to obj_cgroup_memcg(entry->objcg). Since the large swapcache allocation
has already checked compatible swap ownership for the range, the final memcg
accounting target should be the same even if the objcg pointers differ.

I will simplify this in v3 and avoid the extra objcg equality pass.

>
>> + saw_zswap = true;
>> + }
>> + if (!saw_zswap)
>> + return -ENOENT;
>> +
>> + for (i = 0; i < nr_pages; i++) {
>> + tree = swap_zswap_tree(swp_entry(type, offset + i));
>> + entry = xa_load(tree, offset + i);
>> + if (!entry)
>> + return -EAGAIN;
>> +
>> + if (!zswap_decompress(entry, folio, i, false)) {
>> + folio_unlock(folio);
>> + return -EIO;
>> + }
>> + }
>> +
>> + flush_dcache_folio(folio);
>> + /*
>> + * Keep zswap entries until swap slots are freed. This is a clean
>> + * speculative fill; zswap remains the backing copy if reclaim
>> + * drops the large folio before PTEs are installed.
>> + */
>> + folio_mark_uptodate(folio);
>> + count_vm_events(ZSWPIN, nr_pages);
>> + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
>> +
>> + if (same_objcg) {
>> + if (first_objcg)
>> + count_objcg_events(first_objcg, ZSWPIN, nr_pages);
>> + } else {
>> + for (i = 0; i < nr_pages; i++) {
>> + tree = swap_zswap_tree(swp_entry(type, offset + i));
>> + entry = xa_load(tree, offset + i);
>> + if (WARN_ON_ONCE(!entry))
>> + continue;
>> + if (entry->objcg)
>> + count_objcg_events(entry->objcg, ZSWPIN, 1);
>
> xas_load() here too?

Yes, same issue here.

>
>
>> + }
>> + }
>> +
>> folio_unlock(folio);
>> - return -EINVAL;
>> + return 0;
>> }
>
>>
>> + tree = swap_zswap_tree(swp);
>> entry = xa_load(tree, offset);
>> if (!entry)
>> return -ENOENT;
>>
>> - if (!zswap_decompress(entry, folio)) {
>> + if (!zswap_decompress(entry, folio, 0, true)) {
>> folio_unlock(folio);
>> return -EIO;
>> }
>
> I wonder how much of these two paths (order 0 and larger order) can be
> unified...

I think more of this can be unified than is done in this version.

I split the paths this way because I treated the large-folio load as a
speculative fill and kept the zswap entries as the backing copy. But with
your point that an installed large swapcache folio should block zswap
writeback from turning the range mixed, I should revisit that completion rule
instead of baking it into a separate path.

For v3 I will try to collapse the common load path. If the large-folio
case still needs different entry lifetime rules, I will make that distinction
explicit.

>
>> --
>> 2.34.1
>>