Re: [PATCH v2 1/1] swap: shmem: remove SWAP_MAP_SHMEM

From: Yosry Ahmed
Date: Tue Oct 01 2024 - 21:34:01 EST


On Tue, Oct 1, 2024 at 6:20 PM Nhat Pham <nphamcs@xxxxxxxxx> wrote:
>
> The SWAP_MAP_SHMEM state was introduced in the commit aaa468653b4a
> ("swap_info: note SWAP_MAP_SHMEM"), to quickly determine if a swap entry
> belongs to shmem during swapoff.
>
> However, swapoff has since been rewritten in the commit b56a2d8af914
> ("mm: rid swapoff of quadratic complexity"). Now having swap count ==
> SWAP_MAP_SHMEM value is basically the same as having swap count == 1,
> and swap_shmem_alloc() behaves analogously to swap_duplicate(). The only
> difference of note is that swap_shmem_alloc() does not check for
> -ENOMEM returned from __swap_duplicate(), but it is OK because shmem
> never re-duplicates any swap entry it owns. This will stil be safe if we
> use (batched) swap_duplicate() instead.
>
> This commit adds swap_duplicate_nr(), the batched variant of
> swap_duplicate(), and removes the SWAP_MAP_SHMEM state and the
> associated swap_shmem_alloc() helper to simplify the state machine (both
> mentally and in terms of actual code). We will also have an extra
> state/special value that can be repurposed (for swap entries that never
> gets re-duplicated).
>
> Signed-off-by: Nhat Pham <nphamcs@xxxxxxxxx>
> ---
> include/linux/swap.h | 16 ++++++++--------
> mm/shmem.c | 2 +-
> mm/swapfile.c | 41 +++++++++++++++++++++--------------------
> 3 files changed, 30 insertions(+), 29 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index ca533b478c21..017f3c03ff7a 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -232,7 +232,6 @@ enum {
> /* Special value in first swap_map */
> #define SWAP_MAP_MAX 0x3e /* Max count */
> #define SWAP_MAP_BAD 0x3f /* Note page is bad */
> -#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs */
>
> /* Special value in each swap_map continuation */
> #define SWAP_CONT_MAX 0x7f /* Max count */
> @@ -482,8 +481,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry);
> extern swp_entry_t get_swap_page_of_type(int);
> extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order);
> extern int add_swap_count_continuation(swp_entry_t, gfp_t);
> -extern void swap_shmem_alloc(swp_entry_t, int);
> -extern int swap_duplicate(swp_entry_t);
> +extern int swap_duplicate_nr(swp_entry_t, int);
> extern int swapcache_prepare(swp_entry_t entry, int nr);
> extern void swap_free_nr(swp_entry_t entry, int nr_pages);
> extern void swapcache_free_entries(swp_entry_t *entries, int n);
> @@ -549,11 +547,7 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
> return 0;
> }
>
> -static inline void swap_shmem_alloc(swp_entry_t swp, int nr)
> -{
> -}
> -
> -static inline int swap_duplicate(swp_entry_t swp)
> +static inline int swap_duplicate_nr(swp_entry_t swp, int nr)
> {
> return 0;
> }
> @@ -606,6 +600,12 @@ static inline int add_swap_extent(struct swap_info_struct *sis,
> }
> #endif /* CONFIG_SWAP */
>
> +static inline int swap_duplicate(swp_entry_t entry)
> +{
> + return swap_duplicate_nr(entry, 1);
> +}
> +
> +

Nit: extra blank line.

> static inline void free_swap_and_cache(swp_entry_t entry)
> {
> free_swap_and_cache_nr(entry, 1);
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 0613421e09e7..e3f72f99be32 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1561,7 +1561,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
> __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
> NULL) == 0) {
> shmem_recalc_inode(inode, 0, nr_pages);
> - swap_shmem_alloc(swap, nr_pages);
> + swap_duplicate_nr(swap, nr_pages);
> shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
>
> mutex_unlock(&shmem_swaplist_mutex);
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 0cded32414a1..9bb94e618914 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1381,12 +1381,6 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
> if (usage == SWAP_HAS_CACHE) {
> VM_BUG_ON(!has_cache);
> has_cache = 0;
> - } else if (count == SWAP_MAP_SHMEM) {
> - /*
> - * Or we could insist on shmem.c using a special
> - * swap_shmem_free() and free_shmem_swap_and_cache()...
> - */
> - count = 0;
> } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
> if (count == COUNT_CONTINUED) {
> if (swap_count_continued(si, offset, count))
> @@ -3626,7 +3620,6 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
>
> offset = swp_offset(entry);
> VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
> - VM_WARN_ON(usage == 1 && nr > 1);
> ci = lock_cluster_or_swap_info(si, offset);
>
> err = 0;
> @@ -3652,6 +3645,13 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
> err = -EEXIST;
> } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) {
> err = -EINVAL;
> + } else {
> + /*
> + * The only swap_duplicate_nr() caller that passes nr > 1 is shmem,
> + * who never re-duplicates any swap entry it owns. So this should

nit: I think "which" is the right word here, but I am not a native speaker :)

> + * not happen.
> + */
> + VM_WARN_ON(nr > 1 && (count & ~COUNT_CONTINUED) == SWAP_MAP_MAX);

Why not return an error in this case? I think we should add recovery
for bugs when it's possible and simple, which I believe is the case
here.

In shmem_writepage() we can add a WARN if swap_duplicate_nr() fails,
or propagate an error to the caller as well (perhaps this belongs in a
separate patch that does this for swap_shmem_alloc() first).

Sorry if I am being paranoid here, please let me know if this is the case.

> }
>
> if (err)
> @@ -3686,27 +3686,28 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
> return err;
> }
>
> -/*
> - * Help swapoff by noting that swap entry belongs to shmem/tmpfs
> - * (in which case its reference count is never incremented).
> - */
> -void swap_shmem_alloc(swp_entry_t entry, int nr)
> -{
> - __swap_duplicate(entry, SWAP_MAP_SHMEM, nr);
> -}
> -
> -/*
> - * Increase reference count of swap entry by 1.
> +/**
> + * swap_duplicate_nr() - Increase reference count of nr contiguous swap entries
> + * by 1.

Can we avoid the line break by using "refcount" instead of "reference count"?

> + *
> + * @entry: first swap entry from which we want to increase the refcount.
> + * @nr: Number of entries in range.
> + *
> * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
> * but could not be atomically allocated. Returns 0, just as if it succeeded,
> * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
> * might occur if a page table entry has got corrupted.
> + *
> + * Note that we are currently not handling the case where nr > 1 and we need to
> + * add swap count continuation. This is OK, because no such user exists - shmem
> + * is the only user that can pass nr > 1, and it never re-duplicates any swap
> + * entry it owns.

Do we need this comment when we have the WARN + comment in __swap_duplicate()?

> */
> -int swap_duplicate(swp_entry_t entry)
> +int swap_duplicate_nr(swp_entry_t entry, int nr)
> {
> int err = 0;
>
> - while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM)
> + while (!err && __swap_duplicate(entry, 1, nr) == -ENOMEM)
> err = add_swap_count_continuation(entry, GFP_ATOMIC);
> return err;
> }
> --
> 2.43.5