Re: [RFC PATCH v1] mm: zswap: Store large folios without splitting

From: Ryan Roberts
Date: Fri Oct 27 2023 - 08:30:05 EST


Just a polite nudge on this if anyone has any feedback?

Alternatively, happy to repost against -rc1 if that's preferred?


On 19/10/2023 12:05, Ryan Roberts wrote:
> Previously zswap would refuse to store any folio bigger than order-0,
> and therefore all of those folios would be sent directly to the swap
> file. This is a minor inconvenience since swap can currently only
> support order-0 and PMD-sized THP, but with the pending introduction of
> "small-sized THP", and corresponding changes to swapfile to support any
> order of folio, these large folios will become more prevalent and
> without this zswap change, zswap will become unusable. Independently of
> the "small-sized THP" feature, this change makes it possible to store
> existing PMD-sized THPs in zswap.
>
> Modify zswap_store() to allow storing large folios. The function is
> split into 2 parts; zswap_store() does all the per-folio operations
> (i.e. checking there is enough space, etc). Then it calls a new helper,
> zswap_store_page(), for each page in the folio, which are stored as
> their own entries in the zswap pool. (These entries continue to be
> loaded back individually as single pages). If a store fails for any
> single page, then all previously successfully stored folio pages are
> invalidated.
>
> Signed-off-by: Ryan Roberts <ryan.roberts@xxxxxxx>
> ---
> I've tested this on arm64 (m2) with zswap enabled, and running
> vm-scalability's `usemem` across multiple cores from within a
> memory-constrained memcg to force high volumes of swap. I've also run mm
> selftests and observe no regressions (although there is nothing [z]swap
> specific there) - does zswap have any specific tests I should run?
>
> This is based on mm-stable, since mm-unstable contains a zswap patch
> known to be buggy [1]. I thought it would be best to get comments on the
> shape, then do the rebase after that patch has been fixed.
>
> For context, small-sized THP is being discussed here [2], and I'm
> working on changes to swapfile to support non-PMD-sized large folios
> here [3].
>
> [1] https://lore.kernel.org/linux-mm/21606fe5-fb9b-4d37-98ab-38c96819893b@xxxxxxx/
> [2] https://lore.kernel.org/linux-mm/20230929114421.3761121-1-ryan.roberts@xxxxxxx/
> [3] https://lore.kernel.org/linux-mm/20231017161302.2518826-1-ryan.roberts@xxxxxxx/
>
> Thanks,
> Ryan
>
>
> mm/zswap.c | 155 +++++++++++++++++++++++++++++++++--------------------
> 1 file changed, 98 insertions(+), 57 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 37d2b1cb2ecb..51cbfc4e1ef8 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1188,18 +1188,17 @@ static void zswap_fill_page(void *ptr, unsigned long value)
> memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
> }
>
> -bool zswap_store(struct folio *folio)
> +static bool zswap_store_page(struct folio *folio, long index,
> + struct obj_cgroup *objcg, struct zswap_pool *pool)
> {
> swp_entry_t swp = folio->swap;
> int type = swp_type(swp);
> - pgoff_t offset = swp_offset(swp);
> - struct page *page = &folio->page;
> + pgoff_t offset = swp_offset(swp) + index;
> + struct page *page = folio_page(folio, index);
> struct zswap_tree *tree = zswap_trees[type];
> struct zswap_entry *entry, *dupentry;
> struct scatterlist input, output;
> struct crypto_acomp_ctx *acomp_ctx;
> - struct obj_cgroup *objcg = NULL;
> - struct zswap_pool *pool;
> struct zpool *zpool;
> unsigned int dlen = PAGE_SIZE;
> unsigned long handle, value;
> @@ -1208,51 +1207,11 @@ bool zswap_store(struct folio *folio)
> gfp_t gfp;
> int ret;
>
> - VM_WARN_ON_ONCE(!folio_test_locked(folio));
> - VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> -
> - /* Large folios aren't supported */
> - if (folio_test_large(folio))
> - return false;
> -
> - if (!zswap_enabled || !tree)
> + /* entry keeps the references if successfully stored. */
> + if (!zswap_pool_get(pool))
> return false;
> -
> - /*
> - * If this is a duplicate, it must be removed before attempting to store
> - * it, otherwise, if the store fails the old page won't be removed from
> - * the tree, and it might be written back overriding the new data.
> - */
> - spin_lock(&tree->lock);
> - dupentry = zswap_rb_search(&tree->rbroot, offset);
> - if (dupentry) {
> - zswap_duplicate_entry++;
> - zswap_invalidate_entry(tree, dupentry);
> - }
> - spin_unlock(&tree->lock);
> -
> - /*
> - * XXX: zswap reclaim does not work with cgroups yet. Without a
> - * cgroup-aware entry LRU, we will push out entries system-wide based on
> - * local cgroup limits.
> - */
> - objcg = get_obj_cgroup_from_folio(folio);
> - if (objcg && !obj_cgroup_may_zswap(objcg))
> - goto reject;
> -
> - /* reclaim space if needed */
> - if (zswap_is_full()) {
> - zswap_pool_limit_hit++;
> - zswap_pool_reached_full = true;
> - goto shrink;
> - }
> -
> - if (zswap_pool_reached_full) {
> - if (!zswap_can_accept())
> - goto shrink;
> - else
> - zswap_pool_reached_full = false;
> - }
> + if (objcg)
> + obj_cgroup_get(objcg);
>
> /* allocate entry */
> entry = zswap_entry_cache_alloc(GFP_KERNEL);
> @@ -1260,6 +1219,8 @@ bool zswap_store(struct folio *folio)
> zswap_reject_kmemcache_fail++;
> goto reject;
> }
> + entry->objcg = objcg;
> + entry->pool = pool;
>
> if (zswap_same_filled_pages_enabled) {
> src = kmap_atomic(page);
> @@ -1277,11 +1238,6 @@ bool zswap_store(struct folio *folio)
> if (!zswap_non_same_filled_pages_enabled)
> goto freepage;
>
> - /* if entry is successfully added, it keeps the reference */
> - entry->pool = zswap_pool_current_get();
> - if (!entry->pool)
> - goto freepage;
> -
> /* compress */
> acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
>
> @@ -1337,7 +1293,6 @@ bool zswap_store(struct folio *folio)
> entry->length = dlen;
>
> insert_entry:
> - entry->objcg = objcg;
> if (objcg) {
> obj_cgroup_charge_zswap(objcg, entry->length);
> /* Account before objcg ref is moved to tree */
> @@ -1373,19 +1328,105 @@ bool zswap_store(struct folio *folio)
>
> put_dstmem:
> mutex_unlock(acomp_ctx->mutex);
> - zswap_pool_put(entry->pool);
> freepage:
> zswap_entry_cache_free(entry);
> reject:
> if (objcg)
> obj_cgroup_put(objcg);
> + zswap_pool_put(pool);
> return false;
> +}
>
> +bool zswap_store(struct folio *folio)
> +{
> + long nr_pages = folio_nr_pages(folio);
> + swp_entry_t swp = folio->swap;
> + int type = swp_type(swp);
> + pgoff_t offset = swp_offset(swp);
> + struct zswap_tree *tree = zswap_trees[type];
> + struct zswap_entry *entry;
> + struct obj_cgroup *objcg = NULL;
> + struct zswap_pool *pool;
> + bool ret = false;
> + long i;
> +
> + VM_WARN_ON_ONCE(!folio_test_locked(folio));
> + VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> +
> + if (!zswap_enabled || !tree)
> + return false;
> +
> + /*
> + * If this is a duplicate, it must be removed before attempting to store
> + * it, otherwise, if the store fails the old page won't be removed from
> + * the tree, and it might be written back overriding the new data.
> + */
> + spin_lock(&tree->lock);
> + for (i = 0; i < nr_pages; i++) {
> + entry = zswap_rb_search(&tree->rbroot, offset + i);
> + if (entry) {
> + zswap_duplicate_entry++;
> + zswap_invalidate_entry(tree, entry);
> + }
> + }
> + spin_unlock(&tree->lock);
> +
> + /*
> + * XXX: zswap reclaim does not work with cgroups yet. Without a
> + * cgroup-aware entry LRU, we will push out entries system-wide based on
> + * local cgroup limits.
> + */
> + objcg = get_obj_cgroup_from_folio(folio);
> + if (objcg && !obj_cgroup_may_zswap(objcg))
> + goto put_objcg;
> +
> + /* reclaim space if needed */
> + if (zswap_is_full()) {
> + zswap_pool_limit_hit++;
> + zswap_pool_reached_full = true;
> + goto shrink;
> + }
> +
> + if (zswap_pool_reached_full) {
> + if (!zswap_can_accept())
> + goto shrink;
> + else
> + zswap_pool_reached_full = false;
> + }
> +
> + pool = zswap_pool_current_get();
> + if (!pool)
> + goto put_objcg;
> +
> + /*
> + * Store each page of the folio as a separate entry. If we fail to store
> + * a page, unwind by removing all the previous pages we stored.
> + */
> + for (i = 0; i < nr_pages; i++) {
> + if (!zswap_store_page(folio, i, objcg, pool)) {
> + spin_lock(&tree->lock);
> + for (i--; i >= 0; i--) {
> + entry = zswap_rb_search(&tree->rbroot, offset + i);
> + if (entry)
> + zswap_invalidate_entry(tree, entry);
> + }
> + spin_unlock(&tree->lock);
> + goto put_pool;
> + }
> + }
> +
> + ret = true;
> +put_pool:
> + zswap_pool_put(pool);
> +put_objcg:
> + if (objcg)
> + obj_cgroup_put(objcg);
> + return ret;
> shrink:
> pool = zswap_pool_last_get();
> if (pool && !queue_work(shrink_wq, &pool->shrink_work))
> zswap_pool_put(pool);
> - goto reject;
> + goto put_objcg;
> }
>
> bool zswap_load(struct folio *folio)
>
> base-commit: 158978945f3173b8c1a88f8c5684a629736a57ac
> --
> 2.25.1
>