[PATCH RFC] mm: attempt to batch free swap entries for zap_pte_range()
From: Barry Song
Date: Sat Aug 03 2024 - 04:21:14 EST
Zhiguo reported that swap release can be a serious bottleneck
during process exits[1]. With mTHP, the swap entries backing a large
folio are contiguous, which gives us the opportunity to batch-free
them instead of releasing them one at a time.

Thanks to the work of Chris and Kairui[2], I was able to implement
this optimization with minimal code changes by building on their
swap_entry_range_free() infrastructure.
[1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@xxxxxxxx/
[2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@xxxxxxxxxx/
Signed-off-by: Barry Song <v-songbaohua@xxxxxxxx>
---
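Note: for anyone who wants to play with the idea outside the kernel, below is a
minimal standalone C model of the fast path: a range is batch-freed only when
every swap_map entry in it has a swap count of exactly 1 (ignoring the cache
flag). This is only an illustrative sketch; the names, the flag value and the
plain memset() stand in for the real swap_map bookkeeping, locking and
swap-cache reclaim.

/*
 * Illustrative userspace model of the batched swap-free fast path;
 * not kernel code. SWP_CACHE_FLAG only mirrors the idea of the
 * kernel's SWAP_HAS_CACHE bit.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define SWP_CACHE_FLAG	0x40

/*
 * True when every entry in [offset, offset + nr) is mapped exactly once
 * (count == 1 once the cache flag is masked off), i.e. the exiting
 * process holds the last reference and the range can be freed in one go.
 * *any_cached reports whether any entry still has swap cache attached.
 */
static bool range_is_last_map(const unsigned char *swap_map,
			      unsigned long offset, int nr, bool *any_cached)
{
	bool cached = false;

	for (int i = 0; i < nr; i++) {
		unsigned char val = swap_map[offset + i];

		if ((val & ~SWP_CACHE_FLAG) != 1)
			return false;
		if (val & SWP_CACHE_FLAG)
			cached = true;
	}
	*any_cached = cached;
	return true;
}

int main(void)
{
	unsigned char swap_map[16] = { 0 };
	bool any_cached;

	/* 8 contiguous entries of a swapped-out mTHP, each mapped once. */
	memset(swap_map + 4, 1, 8);
	swap_map[6] |= SWP_CACHE_FLAG;	/* one entry still in swap cache */

	if (range_is_last_map(swap_map, 4, 8, &any_cached)) {
		memset(swap_map + 4, 0, 8);	/* batched release */
		printf("batched free done, any_cached=%d\n", any_cached);
	} else {
		printf("fall back to per-entry freeing\n");
	}
	return 0;
}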
mm/swapfile.c | 43 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 43 insertions(+)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ea023fc25d08..9def6dba8d26 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
 	return true;
 }
 
+static bool swap_is_last_map(struct swap_info_struct *si,
+			     unsigned long offset, int nr_pages,
+			     bool *any_only_cache)
+{
+	unsigned char *map = si->swap_map + offset;
+	unsigned char *map_end = map + nr_pages;
+	bool cached = false;
+
+	do {
+		if ((*map & ~SWAP_HAS_CACHE) != 1)
+			return false;
+		if (*map & SWAP_HAS_CACHE)
+			cached = true;
+	} while (++map < map_end);
+
+	*any_only_cache = cached;
+	return true;
+}
+
 /*
  * returns number of pages in the folio that backs the swap entry. If positive,
  * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
@@ -1808,6 +1827,29 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
 	if (WARN_ON(end_offset > si->max))
 		goto out;
 
+	if (nr > 1) {
+		struct swap_cluster_info *ci;
+		bool batched_free;
+		int i;
+
+		ci = lock_cluster_or_swap_info(si, start_offset);
+		batched_free = swap_is_last_map(si, start_offset, nr, &any_only_cache);
+		if (batched_free) {
+			for (i = 0; i < nr; i++)
+				WRITE_ONCE(si->swap_map[start_offset + i], SWAP_HAS_CACHE);
+		}
+		unlock_cluster_or_swap_info(si, ci);
+
+		if (batched_free) {
+			spin_lock(&si->lock);
+			swap_entry_range_free(si, entry, nr);
+			spin_unlock(&si->lock);
+			if (any_only_cache)
+				goto reclaim;
+			goto out;
+		}
+	}
+
 	/*
 	 * First free all entries in the range.
 	 */
@@ -1828,6 +1870,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
 	if (!any_only_cache)
 		goto out;
 
+reclaim:
 	/*
 	 * Now go back over the range trying to reclaim the swap cache. This is
 	 * more efficient for large folios because we will only try to reclaim
--
2.34.1
> ---
>  mm/swapfile.c | 59 ++++++++++++++++++++++++++---------------------------------
>  1 file changed, 26 insertions(+), 33 deletions(-)
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 34e6ea13e8e4..9b63b2262cc2 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -479,20 +479,21 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
>  }
>
>  /*
> - * The cluster ci decreases one usage. If the usage counter becomes 0,
> + * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0,
>   * which means no page in the cluster is in use, we can optionally discard
>   * the cluster and add it to free cluster list.
>   */
> -static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci)
> +static void dec_cluster_info_page(struct swap_info_struct *p,
> +				  struct swap_cluster_info *ci, int nr_pages)
>  {
>  	if (!p->cluster_info)
>  		return;
>
> -	VM_BUG_ON(ci->count == 0);
> +	VM_BUG_ON(ci->count < nr_pages);
>  	VM_BUG_ON(cluster_is_free(ci));
>  	lockdep_assert_held(&p->lock);
>  	lockdep_assert_held(&ci->lock);
> -	ci->count--;
> +	ci->count -= nr_pages;
>
>  	if (!ci->count) {
>  		free_cluster(p, ci);
> @@ -998,19 +999,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
>  	return n_ret;
>  }
>
> -static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
> -{
> -	unsigned long offset = idx * SWAPFILE_CLUSTER;
> -	struct swap_cluster_info *ci;
> -
> -	ci = lock_cluster(si, offset);
> -	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
> -	ci->count = 0;
> -	free_cluster(si, ci);
> -	unlock_cluster(ci);
> -	swap_range_free(si, offset, SWAPFILE_CLUSTER);
> -}
> -
>  int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
>  {
>  	int order = swap_entry_order(entry_order);
> @@ -1269,21 +1257,28 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
>  	return usage;
>  }
>
> -static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
> +/*
> + * Drop the last HAS_CACHE flag of swap entries, caller have to
> + * ensure all entries belong to the same cgroup.
> + */
> +static void swap_entry_range_free(struct swap_info_struct *p, swp_entry_t entry,
> +				  unsigned int nr_pages)
>  {
> -	struct swap_cluster_info *ci;
>  	unsigned long offset = swp_offset(entry);
> -	unsigned char count;
> +	unsigned char *map = p->swap_map + offset;
> +	unsigned char *map_end = map + nr_pages;
> +	struct swap_cluster_info *ci;
>
>  	ci = lock_cluster(p, offset);
> -	count = p->swap_map[offset];
> -	VM_BUG_ON(count != SWAP_HAS_CACHE);
> -	p->swap_map[offset] = 0;
> -	dec_cluster_info_page(p, ci);
> +	do {
> +		VM_BUG_ON(*map != SWAP_HAS_CACHE);
> +		*map = 0;
> +	} while (++map < map_end);
> +	dec_cluster_info_page(p, ci, nr_pages);
>  	unlock_cluster(ci);
>
> -	mem_cgroup_uncharge_swap(entry, 1);
> -	swap_range_free(p, offset, 1);
> +	mem_cgroup_uncharge_swap(entry, nr_pages);
> +	swap_range_free(p, offset, nr_pages);
>  }
>
>  static void cluster_swap_free_nr(struct swap_info_struct *sis,
> @@ -1343,7 +1338,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
>  void put_swap_folio(struct folio *folio, swp_entry_t entry)
>  {
>  	unsigned long offset = swp_offset(entry);
> -	unsigned long idx = offset / SWAPFILE_CLUSTER;
>  	struct swap_cluster_info *ci;
>  	struct swap_info_struct *si;
>  	unsigned char *map;
> @@ -1356,19 +1350,18 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
>  		return;
>
>  	ci = lock_cluster_or_swap_info(si, offset);
> -	if (size == SWAPFILE_CLUSTER) {
> +	if (size > 1) {
>  		map = si->swap_map + offset;
> -		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
> +		for (i = 0; i < size; i++) {
>  			val = map[i];
>  			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
>  			if (val == SWAP_HAS_CACHE)
>  				free_entries++;
>  		}
> -		if (free_entries == SWAPFILE_CLUSTER) {
> +		if (free_entries == size) {
>  			unlock_cluster_or_swap_info(si, ci);
>  			spin_lock(&si->lock);
> -			mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
> -			swap_free_cluster(si, idx);
> +			swap_entry_range_free(si, entry, size);
>  			spin_unlock(&si->lock);
>  			return;
>  		}
> @@ -1413,7 +1406,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
>  	for (i = 0; i < n; ++i) {
>  		p = swap_info_get_cont(entries[i], prev);
>  		if (p)
> -			swap_entry_free(p, entries[i]);
> +			swap_entry_range_free(p, entries[i], 1);
>  		prev = p;
>  	}
>  	if (p)
>
> --
> 2.46.0.rc1.232.g9752f9e123-goog
>
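On the quoted series: what makes the batching above possible is that
swap_entry_range_free() charges a whole contiguous range against its cluster
once, via dec_cluster_info_page(..., nr_pages), instead of decrementing the
cluster count entry by entry. A tiny standalone C model of that bookkeeping
(illustrative names and sizes, not the kernel structures):

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define CLUSTER_ENTRIES	512	/* entries per cluster in this model */

struct cluster {
	int count;	/* in-use entries in this cluster */
	int free;	/* set once every entry has been released */
};

/* Model of dec_cluster_info_page() taking nr_pages instead of 1. */
static void cluster_dec(struct cluster *ci, int nr_pages)
{
	assert(ci->count >= nr_pages);
	ci->count -= nr_pages;
	if (!ci->count)
		ci->free = 1;	/* whole cluster can return to the free list */
}

/*
 * Model of swap_entry_range_free(): clear nr contiguous map slots, then
 * update the cluster count once for the whole range.
 */
static void entry_range_free(unsigned char *swap_map, struct cluster *ci,
			     unsigned long offset, int nr)
{
	memset(swap_map + offset, 0, nr);
	cluster_dec(ci, nr);
}

int main(void)
{
	unsigned char swap_map[CLUSTER_ENTRIES];
	struct cluster ci = { .count = CLUSTER_ENTRIES, .free = 0 };

	memset(swap_map, 1, sizeof(swap_map));

	/* Release the whole cluster as a single 512-entry range. */
	entry_range_free(swap_map, &ci, 0, CLUSTER_ENTRIES);
	printf("count=%d free=%d\n", ci.count, ci.free);
	return 0;
}

This is also why the old swap_free_cluster() special case can go away in the
quoted patch: a full-cluster free is just the nr_pages == SWAPFILE_CLUSTER
case of the same range helper.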
Thanks
Barry