Re: [PATCH v3 09/13] mm, swap: reduce contention on device lock
From: Baoquan He
Date: Wed Jan 08 2025 - 06:10:39 EST
On 12/31/24 at 01:46am, Kairui Song wrote:
......snip.....
> ---
> include/linux/swap.h | 3 +-
> mm/swapfile.c | 435 ++++++++++++++++++++++++-------------------
> 2 files changed, 246 insertions(+), 192 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 339d7f0192ff..c4ff31cb6bde 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -291,6 +291,7 @@ enum swap_cluster_flags {
> * throughput.
> */
> struct percpu_cluster {
> + local_lock_t lock; /* Protect the percpu_cluster above */
> unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
> };
>
> @@ -313,7 +314,7 @@ struct swap_info_struct {
> /* list of cluster that contains at least one free slot */
> struct list_head frag_clusters[SWAP_NR_ORDERS];
> /* list of cluster that are fragmented or contented */
> - unsigned int frag_cluster_nr[SWAP_NR_ORDERS];
> + atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS];
> unsigned int pages; /* total of usable pages of swap */
> atomic_long_t inuse_pages; /* number of those currently in use */
> struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 7795a3d27273..dadd4fead689 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -261,12 +261,10 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
> folio_ref_sub(folio, nr_pages);
> folio_set_dirty(folio);
>
> - spin_lock(&si->lock);
> /* Only sinple page folio can be backed by zswap */
> if (nr_pages == 1)
> zswap_invalidate(entry);
> swap_entry_range_free(si, entry, nr_pages);
> - spin_unlock(&si->lock);
> ret = nr_pages;
> out_unlock:
> folio_unlock(folio);
> @@ -403,7 +401,21 @@ static void discard_swap_cluster(struct swap_info_struct *si,
>
> static inline bool cluster_is_free(struct swap_cluster_info *info)
> {
> - return info->flags == CLUSTER_FLAG_FREE;
> + return info->count == 0;
This is a little confusing. Maybe we should add one and call it
cluster_is_empty(). Because discarded clusters are also be able to pass
the checking here.
> +}
> +
> +static inline bool cluster_is_discard(struct swap_cluster_info *info)
> +{
> + return info->flags == CLUSTER_FLAG_DISCARD;
> +}
> +
> +static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
> +{
> + if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
> + return false;
> + if (!order)
> + return true;
> + return cluster_is_free(ci) || order == ci->order;
> }
>
> static inline unsigned int cluster_index(struct swap_info_struct *si,
> @@ -440,19 +452,20 @@ static void cluster_move(struct swap_info_struct *si,
> {
> VM_WARN_ON(ci->flags == new_flags);
> BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX);
> + lockdep_assert_held(&ci->lock);
>
> - if (ci->flags == CLUSTER_FLAG_NONE) {
> + spin_lock(&si->lock);
> + if (ci->flags == CLUSTER_FLAG_NONE)
> list_add_tail(&ci->list, list);
> - } else {
> - if (ci->flags == CLUSTER_FLAG_FRAG) {
> - VM_WARN_ON(!si->frag_cluster_nr[ci->order]);
> - si->frag_cluster_nr[ci->order]--;
> - }
> + else
> list_move_tail(&ci->list, list);
> - }
> + spin_unlock(&si->lock);
> +
> + if (ci->flags == CLUSTER_FLAG_FRAG)
> + atomic_long_dec(&si->frag_cluster_nr[ci->order]);
> + else if (new_flags == CLUSTER_FLAG_FRAG)
> + atomic_long_inc(&si->frag_cluster_nr[ci->order]);
> ci->flags = new_flags;
> - if (new_flags == CLUSTER_FLAG_FRAG)
> - si->frag_cluster_nr[ci->order]++;
> }
>
> /* Add a cluster to discard list and schedule it to do discard */
> @@ -475,39 +488,90 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
>
> static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
> {
> - lockdep_assert_held(&si->lock);
> lockdep_assert_held(&ci->lock);
> cluster_move(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
> ci->order = 0;
> }
>
> +/*
> + * Isolate and lock the first cluster that is not contented on a list,
> + * clean its flag before taken off-list. Cluster flag must be in sync
> + * with list status, so cluster updaters can always know the cluster
> + * list status without touching si lock.
> + *
> + * Note it's possible that all clusters on a list are contented so
> + * this returns NULL for an non-empty list.
> + */
> +static struct swap_cluster_info *cluster_isolate_lock(
> + struct swap_info_struct *si, struct list_head *list)
> +{
> + struct swap_cluster_info *ci, *ret = NULL;
> +
> + spin_lock(&si->lock);
> +
> + if (unlikely(!(si->flags & SWP_WRITEOK)))
> + goto out;
> +
> + list_for_each_entry(ci, list, list) {
> + if (!spin_trylock(&ci->lock))
> + continue;
> +
> + /* We may only isolate and clear flags of following lists */
> + VM_BUG_ON(!ci->flags);
> + VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE &&
> + ci->flags != CLUSTER_FLAG_FULL);
> +
> + list_del(&ci->list);
> + ci->flags = CLUSTER_FLAG_NONE;
> + ret = ci;
> + break;
> + }
> +out:
> + spin_unlock(&si->lock);
> +
> + return ret;
> +}
> +
> /*
> * Doing discard actually. After a cluster discard is finished, the cluster
> - * will be added to free cluster list. caller should hold si->lock.
> -*/
> -static void swap_do_scheduled_discard(struct swap_info_struct *si)
> + * will be added to free cluster list. Discard cluster is a bit special as
> + * they don't participate in allocation or reclaim, so clusters marked as
> + * CLUSTER_FLAG_DISCARD must remain off-list or on discard list.
> + */
> +static bool swap_do_scheduled_discard(struct swap_info_struct *si)
> {
> struct swap_cluster_info *ci;
> + bool ret = false;
> unsigned int idx;
>
> + spin_lock(&si->lock);
> while (!list_empty(&si->discard_clusters)) {
> ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
> + /*
> + * Delete the cluster from list but don't clear its flags until
> + * discard is done, so isolation and relocation will skip it.
> + */
> list_del(&ci->list);
I don't understand above comment. ci has been taken off list. While
allocation need isolate from a usable list. Even though we clear
ci->flags now, how come isolation and relocation will touch it. I may
miss anything here.
> - /* Must clear flag when taking a cluster off-list */
> - ci->flags = CLUSTER_FLAG_NONE;
> idx = cluster_index(si, ci);
> spin_unlock(&si->lock);
> -
> discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
> SWAPFILE_CLUSTER);
>
> - spin_lock(&si->lock);
> spin_lock(&ci->lock);
> - __free_cluster(si, ci);
> + /*
> + * Discard is done, clear its flags as it's now off-list,
> + * then return the cluster to allocation list.
> + */
> + ci->flags = CLUSTER_FLAG_NONE;
> memset(si->swap_map + idx * SWAPFILE_CLUSTER,
> 0, SWAPFILE_CLUSTER);
> + __free_cluster(si, ci);
> spin_unlock(&ci->lock);
> + ret = true;
> + spin_lock(&si->lock);
> }
> + spin_unlock(&si->lock);
> + return ret;
> }
>
> static void swap_discard_work(struct work_struct *work)
......snip....
> @@ -791,29 +873,34 @@ static void swap_reclaim_work(struct work_struct *work)
> static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
> unsigned char usage)
> {
> - struct percpu_cluster *cluster;
> struct swap_cluster_info *ci;
> unsigned int offset, found = 0;
>
> -new_cluster:
> - lockdep_assert_held(&si->lock);
> - cluster = this_cpu_ptr(si->percpu_cluster);
> - offset = cluster->next[order];
> + /* Fast path using per CPU cluster */
> + local_lock(&si->percpu_cluster->lock);
> + offset = __this_cpu_read(si->percpu_cluster->next[order]);
> if (offset) {
> - offset = alloc_swap_scan_cluster(si, offset, &found, order, usage);
> + ci = lock_cluster(si, offset);
> + /* Cluster could have been used by another order */
> + if (cluster_is_usable(ci, order)) {
> + if (cluster_is_free(ci))
> + offset = cluster_offset(si, ci);
> + offset = alloc_swap_scan_cluster(si, offset, &found,
> + order, usage);
> + } else {
> + unlock_cluster(ci);
> + }
> if (found)
> goto done;
> }
>
> - if (!list_empty(&si->free_clusters)) {
> - ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
> - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage);
> - /*
> - * Either we didn't touch the cluster due to swapoff,
> - * or the allocation must success.
> - */
> - VM_BUG_ON((si->flags & SWP_WRITEOK) && !found);
> - goto done;
> +new_cluster:
> + ci = cluster_isolate_lock(si, &si->free_clusters);
> + if (ci) {
> + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
> + &found, order, usage);
> + if (found)
> + goto done;
> }
>
> /* Try reclaim from full clusters if free clusters list is drained */
> @@ -821,49 +908,45 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
> swap_reclaim_full_clusters(si, false);
>
> if (order < PMD_ORDER) {
> - unsigned int frags = 0;
> + unsigned int frags = 0, frags_existing;
>
> - while (!list_empty(&si->nonfull_clusters[order])) {
> - ci = list_first_entry(&si->nonfull_clusters[order],
> - struct swap_cluster_info, list);
> - cluster_move(si, ci, &si->frag_clusters[order], CLUSTER_FLAG_FRAG);
> + while ((ci = cluster_isolate_lock(si, &si->nonfull_clusters[order]))) {
> offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
> &found, order, usage);
> - frags++;
> + /*
> + * With `fragmenting` set to true, it will surely take
~~~~~~~~~~~
wondering what 'fragmenting' means here.
> + * the cluster off nonfull list
> + */
> if (found)
> goto done;
> + frags++;
> }
>
> - /*
> - * Nonfull clusters are moved to frag tail if we reached
> - * here, count them too, don't over scan the frag list.
> - */
> - while (frags < si->frag_cluster_nr[order]) {
> - ci = list_first_entry(&si->frag_clusters[order],
> - struct swap_cluster_info, list);
> + frags_existing = atomic_long_read(&si->frag_cluster_nr[order]);
> + while (frags < frags_existing &&
> + (ci = cluster_isolate_lock(si, &si->frag_clusters[order]))) {
> + atomic_long_dec(&si->frag_cluster_nr[order]);
> /*
> - * Rotate the frag list to iterate, they were all failing
> - * high order allocation or moved here due to per-CPU usage,
> - * this help keeping usable cluster ahead.
> + * Rotate the frag list to iterate, they were all
> + * failing high order allocation or moved here due to
> + * per-CPU usage, but they could contain newly released
> + * reclaimable (eg. lazy-freed swap cache) slots.
> */
> - list_move_tail(&ci->list, &si->frag_clusters[order]);
> offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
> &found, order, usage);
> - frags++;
> if (found)
> goto done;
> + frags++;
> }
> }
>
> - if (!list_empty(&si->discard_clusters)) {
> - /*
> - * we don't have free cluster but have some clusters in
> - * discarding, do discard now and reclaim them, then
> - * reread cluster_next_cpu since we dropped si->lock
> - */
> - swap_do_scheduled_discard(si);
> + /*
> + * We don't have free cluster but have some clusters in
> + * discarding, do discard now and reclaim them, then
> + * reread cluster_next_cpu since we dropped si->lock
> + */
> + if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si))
> goto new_cluster;
> - }
>
> if (order)
> goto done;
.....