Re: [PATCH v3 10/13] mm, swap: simplify percpu cluster updating

From: Baoquan He
Date: Wed Jan 08 2025 - 21:08:26 EST


On 12/31/24 at 01:46am, Kairui Song wrote:
> From: Kairui Song <kasong@xxxxxxxxxxx>
>
> Instead of using a returning argument, we can simply store the next
> cluster offset to the fixed percpu location, which reduce the stack
> usage and simplify the function:
>
> Object size:
> ./scripts/bloat-o-meter mm/swapfile.o mm/swapfile.o.new
> add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-271 (-271)
> Function old new delta
> get_swap_pages 2847 2733 -114
> alloc_swap_scan_cluster 894 737 -157
> Total: Before=30833, After=30562, chg -0.88%
>
> Stack usage:
> Before:
> swapfile.c:1190:5:get_swap_pages 240 static
>
> After:
> swapfile.c:1185:5:get_swap_pages 216 static
>
> Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
> ---
> include/linux/swap.h | 4 +--
> mm/swapfile.c | 66 +++++++++++++++++++-------------------------
> 2 files changed, 31 insertions(+), 39 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index c4ff31cb6bde..4c1d2e69689f 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -275,9 +275,9 @@ enum swap_cluster_flags {
> * The first page in the swap file is the swap header, which is always marked
> * bad to prevent it from being allocated as an entry. This also prevents the
> * cluster to which it belongs being marked free. Therefore 0 is safe to use as
> - * a sentinel to indicate next is not valid in percpu_cluster.
> + * a sentinel to indicate an entry is not valid.
> */
> -#define SWAP_NEXT_INVALID 0
> +#define SWAP_ENTRY_INVALID 0
>
> #ifdef CONFIG_THP_SWAP
> #define SWAP_NR_ORDERS (PMD_ORDER + 1)
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index dadd4fead689..60a650ba88fd 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -759,23 +759,23 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
> return true;
> }
>
> -static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset,
> - unsigned int *foundp, unsigned int order,
> +/* Try use a new cluster for current CPU and allocate from it. */
> +static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
> + struct swap_cluster_info *ci,
> + unsigned long offset,
> + unsigned int order,
> unsigned char usage)
> {
> - unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1);
> + unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
> + unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER);
> unsigned long end = min(start + SWAPFILE_CLUSTER, si->max);
> unsigned int nr_pages = 1 << order;
> bool need_reclaim, ret;
> - struct swap_cluster_info *ci;
>
> - ci = &si->cluster_info[offset / SWAPFILE_CLUSTER];
> lockdep_assert_held(&ci->lock);
>
> - if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER) {
> - offset = SWAP_NEXT_INVALID;
> + if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER)
> goto out;
> - }
>
> for (end -= nr_pages; offset <= end; offset += nr_pages) {
> need_reclaim = false;
> @@ -789,34 +789,27 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne
> * cluster has no flag set, and change of list
> * won't cause fragmentation.
> */
> - if (!cluster_is_usable(ci, order)) {
> - offset = SWAP_NEXT_INVALID;
> + if (!cluster_is_usable(ci, order))
> goto out;
> - }
> if (cluster_is_free(ci))
> offset = start;
> /* Reclaim failed but cluster is usable, try next */
> if (!ret)
> continue;
> }
> - if (!cluster_alloc_range(si, ci, offset, usage, order)) {
> - offset = SWAP_NEXT_INVALID;
> - goto out;
> - }
> - *foundp = offset;
> - if (ci->count == SWAPFILE_CLUSTER) {
> - offset = SWAP_NEXT_INVALID;
> - goto out;
> - }
> + if (!cluster_alloc_range(si, ci, offset, usage, order))
> + break;
> + found = offset;
> offset += nr_pages;
> + if (ci->count < SWAPFILE_CLUSTER && offset <= end)
> + next = offset;
> break;
> }
> - if (offset > end)
> - offset = SWAP_NEXT_INVALID;
> out:
> relocate_cluster(si, ci);
> unlock_cluster(ci);
> - return offset;
> + __this_cpu_write(si->percpu_cluster->next[order], next);
> + return found;
> }
>
> /* Return true if reclaimed a whole cluster */
> @@ -885,8 +878,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
> if (cluster_is_usable(ci, order)) {
> if (cluster_is_free(ci))
> offset = cluster_offset(si, ci);
> - offset = alloc_swap_scan_cluster(si, offset, &found,
> - order, usage);
> + found = alloc_swap_scan_cluster(si, ci, offset,
> + order, usage);
> } else {
> unlock_cluster(ci);
> }
> @@ -897,8 +890,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
> new_cluster:
> ci = cluster_isolate_lock(si, &si->free_clusters);
> if (ci) {
> - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
> - &found, order, usage);
> + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
> + order, usage);
> if (found)
> goto done;
> }
> @@ -911,8 +904,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
> unsigned int frags = 0, frags_existing;
>
> while ((ci = cluster_isolate_lock(si, &si->nonfull_clusters[order]))) {
> - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
> - &found, order, usage);
> + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
> + order, usage);
> /*
> * With `fragmenting` set to true, it will surely take
> * the cluster off nonfull list
> @@ -932,8 +925,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
> * per-CPU usage, but they could contain newly released
> * reclaimable (eg. lazy-freed swap cache) slots.
> */
> - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
> - &found, order, usage);
> + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
> + order, usage);
> if (found)
> goto done;
> frags++;
> @@ -959,21 +952,20 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
> */
> while ((ci = cluster_isolate_lock(si, &si->frag_clusters[o]))) {
> atomic_long_dec(&si->frag_cluster_nr[o]);
> - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
> - &found, order, usage);
> + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
> + 0, usage);
> if (found)
> goto done;
> }
>
> while ((ci = cluster_isolate_lock(si, &si->nonfull_clusters[o]))) {
> - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
> - &found, order, usage);
> + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
> + 0, usage);
> if (found)
> goto done;
> }
> }
> done:
> - __this_cpu_write(si->percpu_cluster->next[order], offset);
> local_unlock(&si->percpu_cluster->lock);

Do you think if we still need hold the si->percpu_cluster->lock till the
end of cluster_alloc_swap_entry() invocation? If so, we may need hold the
lock during the whole period when going through percpu_cluster->next,
free_cluster, nonfull, frag_clusters until we get one available slot, even
though we keep upating the si->percpu_cluster->next[order]. I can't see
the point by changing it like this.

>
> return found;
> @@ -3194,7 +3186,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
>
> cluster = per_cpu_ptr(si->percpu_cluster, cpu);
> for (i = 0; i < SWAP_NR_ORDERS; i++)
> - cluster->next[i] = SWAP_NEXT_INVALID;
> + cluster->next[i] = SWAP_ENTRY_INVALID;
> local_lock_init(&cluster->lock);
> }
>
> --
> 2.47.1
>
>